feat(compute): add Phase 11 Synor Compute L2 heterogeneous compute layer

- Add synor-compute crate for heterogeneous compute orchestration
- Implement processor abstraction for CPU/GPU/TPU/NPU/LPU/FPGA/DSP
- Add device registry with cross-vendor capability tracking
- Implement task scheduler with work stealing and load balancing
- Add energy-aware and latency-aware balancing strategies
- Create spot market for compute resources with order matching
- Add memory manager with tensor handles and cross-device transfers
- Support processor capability profiles (H100, TPU v5p, Groq LPU, etc.)
- Implement priority work queues with task decomposition

Processor types supported:
- CPU (x86-64 AVX512, ARM64 SVE, RISC-V Vector)
- GPU (NVIDIA CUDA, AMD ROCm, Intel OneAPI, Apple Metal)
- TPU (v2-v5p, Edge TPU)
- NPU (Apple Neural Engine, Qualcomm Hexagon, Intel VPU)
- LPU (Groq Language Processing Unit)
- FPGA (Xilinx, Intel Altera)
- DSP (TI, Analog Devices)
- WebGPU and WASM runtimes
This commit is contained in:
Gulshan Yadav 2026-01-11 13:53:57 +05:30
parent 8da34bc73d
commit 4c36ddbdc2
19 changed files with 11219 additions and 0 deletions

View file

@@ -9,6 +9,7 @@ members = [
"crates/synor-storage",
"crates/synor-hosting",
"crates/synor-database",
"crates/synor-compute",
"crates/synor-governance",
"crates/synor-rpc",
"crates/synor-vm",

View file

@@ -0,0 +1,51 @@
# Manifest for the synor-compute crate (Phase 11 heterogeneous compute layer).
[package]
name = "synor-compute"
version.workspace = true
edition.workspace = true
description = "Heterogeneous multi-processor compute platform for Synor blockchain"
license.workspace = true
[dependencies]
# Internal crates
synor-types = { path = "../synor-types" }
synor-crypto = { path = "../synor-crypto" }
synor-storage = { path = "../synor-storage" }
# Serialization
serde.workspace = true
serde_json.workspace = true
borsh.workspace = true
# NOTE(review): bincode pinned to 1.x; 2.x has an incompatible API — confirm
# before upgrading.
bincode = "1.3"
# Async runtime
tokio = { workspace = true, features = ["sync", "rt-multi-thread", "time", "macros"] }
async-trait = "0.1"
futures = "0.3"
# Concurrency
parking_lot.workspace = true
crossbeam-deque = "0.8"
crossbeam-channel = "0.5"
dashmap = "5.5"
# Utilities
thiserror.workspace = true
tracing.workspace = true
hex.workspace = true
# Hashing
blake3.workspace = true
# Data structures
indexmap = "2.2"
priority-queue = "2.0"
# Time
chrono = { version = "0.4", features = ["serde"] }
# Random
rand = "0.8"
[dev-dependencies]
tempfile.workspace = true
tokio-test = "0.4"

View file

@@ -0,0 +1,377 @@
//! Device registry and management.
//!
//! Supports all device types:
//! - Data center servers
//! - Desktop workstations
//! - Laptops
//! - Mobile devices (iOS, Android)
//! - Browsers (WebGPU, WASM)
//! - IoT devices
use crate::error::ComputeError;
use crate::processor::{GenericProcessor, Processor, ProcessorCapabilities, ProcessorId, ProcessorType};
use crate::{NodeId, ProcessorInfo};
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
/// Unique 256-bit identifier for a registered device.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct DeviceId(pub [u8; 32]);

impl DeviceId {
    /// Generates a fresh identifier from 32 random bytes.
    pub fn new() -> Self {
        use rand::Rng;
        let mut buf = [0u8; 32];
        rand::thread_rng().fill(&mut buf);
        Self(buf)
    }

    /// Wraps an existing 32-byte value as a `DeviceId`.
    pub fn from_bytes(bytes: [u8; 32]) -> Self {
        Self(bytes)
    }
}

impl Default for DeviceId {
    fn default() -> Self {
        DeviceId::new()
    }
}

impl std::fmt::Display for DeviceId {
    /// Renders as `dev_` followed by the first 8 bytes, hex-encoded.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let short = hex::encode(&self.0[..8]);
        write!(f, "dev_{}", short)
    }
}
/// Broad classification of participating hardware.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DeviceType {
    /// Data center server.
    DataCenter,
    /// Desktop workstation.
    Desktop,
    /// Laptop.
    Laptop,
    /// Mobile phone.
    Mobile,
    /// Tablet.
    Tablet,
    /// IoT device.
    IoT,
    /// Browser (WebGPU/WASM).
    Browser,
    /// Edge server.
    Edge,
}

impl DeviceType {
    /// Typical reliability score on a 0-100 scale (heuristic constants).
    pub fn reliability(&self) -> u32 {
        match self {
            Self::Browser => 30,
            Self::Mobile => 40,
            Self::Tablet => 50,
            Self::Laptop => 60,
            Self::IoT => 70,
            Self::Desktop => 80,
            Self::Edge => 95,
            Self::DataCenter => 99,
        }
    }

    /// Typical hours per day the device is expected to be reachable.
    pub fn availability_hours(&self) -> f32 {
        match self {
            Self::DataCenter | Self::Edge | Self::IoT => 24.0,
            Self::Desktop => 8.0,
            Self::Laptop => 6.0,
            Self::Mobile | Self::Tablet => 4.0,
            Self::Browser => 2.0,
        }
    }
}
/// Device capabilities.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DeviceCapabilities {
    /// Device type.
    ///
    /// NOTE(review): duplicates `DeviceInfo::device_type`; confirm which one
    /// is authoritative before relying on either.
    pub device_type: DeviceType,
    /// Available processors.
    pub processors: Vec<ProcessorType>,
    /// Total memory (GB).
    pub memory_gb: f32,
    /// Network bandwidth (Mbps).
    pub bandwidth_mbps: f32,
    /// Storage available (GB).
    pub storage_gb: f32,
    /// Battery powered.
    pub battery_powered: bool,
    /// Supports background execution.
    pub background_execution: bool,
}
/// Device information.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DeviceInfo {
    /// Device ID.
    pub id: DeviceId,
    /// Device type.
    pub device_type: DeviceType,
    /// Owner address.
    pub owner: [u8; 32],
    /// Capabilities.
    pub capabilities: DeviceCapabilities,
    /// Current status.
    pub status: DeviceStatus,
    /// Reputation score (0-100).
    pub reputation: u32,
    /// Total earnings (atomic SYNOR).
    pub earnings: u64,
    /// Geographic region.
    pub region: String,
}
/// Device status.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum DeviceStatus {
    /// Online and available.
    Online,
    /// Online but busy.
    Busy,
    /// Idle but available.
    Idle,
    /// On battery (reduced capacity).
    OnBattery,
    /// Offline.
    Offline,
    /// Maintenance.
    Maintenance,
}
/// Device registry managing all devices and processors.
///
/// All maps are guarded by independent `parking_lot::RwLock`s; methods take
/// locks in a fixed order (`processors` before `processor_nodes`) where both
/// are needed.
pub struct DeviceRegistry {
    /// Registered devices.
    devices: RwLock<HashMap<DeviceId, DeviceInfo>>,
    /// Node to device mapping.
    ///
    /// NOTE(review): no method visible in this file ever writes this map —
    /// it is either populated elsewhere or dead state; verify.
    node_devices: RwLock<HashMap<NodeId, Vec<DeviceId>>>,
    /// All processors (across all nodes).
    processors: RwLock<HashMap<ProcessorId, Arc<dyn Processor>>>,
    /// Processor to node mapping.
    processor_nodes: RwLock<HashMap<ProcessorId, NodeId>>,
    /// Next processor ID.
    next_processor_id: std::sync::atomic::AtomicU64,
}
impl DeviceRegistry {
/// Creates a new device registry.
pub fn new() -> Self {
Self {
devices: RwLock::new(HashMap::new()),
node_devices: RwLock::new(HashMap::new()),
processors: RwLock::new(HashMap::new()),
processor_nodes: RwLock::new(HashMap::new()),
next_processor_id: std::sync::atomic::AtomicU64::new(0),
}
}
/// Registers a device.
pub fn register_device(&self, device: DeviceInfo) -> Result<DeviceId, ComputeError> {
let id = device.id;
self.devices.write().insert(id, device);
Ok(id)
}
/// Unregisters a device.
pub fn unregister_device(&self, device_id: DeviceId) -> Result<(), ComputeError> {
self.devices.write().remove(&device_id);
Ok(())
}
/// Gets a device by ID.
pub fn get_device(&self, device_id: DeviceId) -> Option<DeviceInfo> {
self.devices.read().get(&device_id).cloned()
}
/// Registers a processor for a node.
pub fn register_processor(
&self,
node_id: NodeId,
info: ProcessorInfo,
) -> Result<(), ComputeError> {
let processor_id = info.id;
// Create a generic processor from the info
let processor: Arc<dyn Processor> = Arc::new(GenericProcessor::new(
processor_id,
info.processor_type,
info.capabilities,
));
self.processors.write().insert(processor_id, processor);
self.processor_nodes.write().insert(processor_id, node_id);
Ok(())
}
/// Unregisters all processors for a node.
pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
let mut processors = self.processors.write();
let mut processor_nodes = self.processor_nodes.write();
// Find and remove all processors for this node
let to_remove: Vec<_> = processor_nodes
.iter()
.filter(|(_, n)| **n == node_id)
.map(|(p, _)| *p)
.collect();
for proc_id in to_remove {
processors.remove(&proc_id);
processor_nodes.remove(&proc_id);
}
Ok(())
}
/// Gets a processor by ID.
pub fn get_processor(&self, processor_id: ProcessorId) -> Result<Arc<dyn Processor>, ComputeError> {
self.processors
.read()
.get(&processor_id)
.cloned()
.ok_or(ComputeError::ProcessorNotFound(processor_id))
}
/// Gets all processors.
pub fn all_processors(&self) -> Vec<Arc<dyn Processor>> {
self.processors.read().values().cloned().collect()
}
/// Gets processors of a specific type.
pub fn processors_by_type(&self, proc_type: ProcessorType) -> Vec<Arc<dyn Processor>> {
self.processors
.read()
.values()
.filter(|p| p.processor_type() == proc_type)
.cloned()
.collect()
}
/// Gets the next processor ID.
pub fn next_processor_id(&self) -> ProcessorId {
ProcessorId(self.next_processor_id.fetch_add(1, std::sync::atomic::Ordering::SeqCst))
}
/// Gets total number of devices.
pub fn device_count(&self) -> usize {
self.devices.read().len()
}
/// Gets total number of processors.
pub fn processor_count(&self) -> usize {
self.processors.read().len()
}
/// Gets devices by type.
pub fn devices_by_type(&self, device_type: DeviceType) -> Vec<DeviceInfo> {
self.devices
.read()
.values()
.filter(|d| d.device_type == device_type)
.cloned()
.collect()
}
/// Gets online devices.
pub fn online_devices(&self) -> Vec<DeviceInfo> {
self.devices
.read()
.values()
.filter(|d| d.status == DeviceStatus::Online || d.status == DeviceStatus::Idle)
.cloned()
.collect()
}
/// Updates device status.
pub fn update_device_status(
&self,
device_id: DeviceId,
status: DeviceStatus,
) -> Result<(), ComputeError> {
if let Some(device) = self.devices.write().get_mut(&device_id) {
device.status = status;
Ok(())
} else {
Err(ComputeError::Internal(format!("Device not found: {}", device_id)))
}
}
}
impl Default for DeviceRegistry {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, AvxSupport};

    // Two freshly generated random IDs should (with overwhelming
    // probability) differ.
    #[test]
    fn test_device_id() {
        let id1 = DeviceId::new();
        let id2 = DeviceId::new();
        assert_ne!(id1.0, id2.0);
    }

    // Register → lookup → unregister round-trip through the registry.
    #[test]
    fn test_device_registry() {
        let registry = DeviceRegistry::new();
        let device = DeviceInfo {
            id: DeviceId::new(),
            device_type: DeviceType::Desktop,
            owner: [1u8; 32],
            capabilities: DeviceCapabilities {
                device_type: DeviceType::Desktop,
                processors: vec![ProcessorType::Cpu(CpuVariant::X86_64 {
                    avx: AvxSupport::Avx512,
                })],
                memory_gb: 64.0,
                bandwidth_mbps: 1000.0,
                storage_gb: 1000.0,
                battery_powered: false,
                background_execution: true,
            },
            status: DeviceStatus::Online,
            reputation: 100,
            earnings: 0,
            region: "us-east".to_string(),
        };
        let device_id = device.id;
        registry.register_device(device).unwrap();
        assert_eq!(registry.device_count(), 1);
        assert!(registry.get_device(device_id).is_some());
        registry.unregister_device(device_id).unwrap();
        assert_eq!(registry.device_count(), 0);
    }

    // Spot-checks the hard-coded per-device-type heuristic constants.
    #[test]
    fn test_device_type_properties() {
        assert_eq!(DeviceType::DataCenter.reliability(), 99);
        assert_eq!(DeviceType::Mobile.reliability(), 40);
        assert_eq!(DeviceType::DataCenter.availability_hours(), 24.0);
        assert_eq!(DeviceType::Browser.availability_hours(), 2.0);
    }
}

View file

@@ -0,0 +1,92 @@
//! Error types for Synor Compute.
use crate::{JobId, NodeId, ProcessorId, ProcessorType};
use thiserror::Error;
/// Compute errors.
///
/// One flat error enum for the whole crate; `thiserror` derives `Display`
/// from the `#[error]` attributes.
#[derive(Debug, Error)]
pub enum ComputeError {
    /// Job not found.
    #[error("Job not found: {0}")]
    JobNotFound(JobId),
    /// Node not found.
    #[error("Node not found: {0}")]
    NodeNotFound(NodeId),
    /// Processor not found.
    #[error("Processor not found: {0}")]
    ProcessorNotFound(ProcessorId),
    /// No suitable processor for operation.
    #[error("No suitable processor for operation: {0}")]
    NoSuitableProcessor(String),
    /// Insufficient resources.
    #[error("Insufficient resources: {0}")]
    InsufficientResources(String),
    /// Task execution failed.
    #[error("Task execution failed: {0}")]
    TaskExecutionFailed(String),
    /// Scheduling failed.
    #[error("Scheduling failed: {0}")]
    SchedulingFailed(String),
    /// Memory allocation failed.
    #[error("Memory allocation failed: {0}")]
    MemoryAllocationFailed(String),
    /// Data transfer failed.
    #[error("Data transfer failed: {0}")]
    DataTransferFailed(String),
    /// Processor type not supported.
    #[error("Processor type not supported: {0:?}")]
    ProcessorTypeNotSupported(ProcessorType),
    /// Operation not supported on processor.
    #[error("Operation not supported on {0:?}: {1}")]
    OperationNotSupported(ProcessorType, String),
    /// Timeout, carrying the elapsed budget in milliseconds.
    #[error("Operation timed out after {0}ms")]
    Timeout(u64),
    /// Budget exceeded. Both amounts are in atomic SYNOR.
    #[error("Budget exceeded: required {required}, available {available}")]
    BudgetExceeded { required: u64, available: u64 },
    /// Node already registered.
    #[error("Node already registered: {0}")]
    NodeAlreadyRegistered(NodeId),
    /// Invalid configuration.
    #[error("Invalid configuration: {0}")]
    InvalidConfiguration(String),
    /// Serialization error (see the `From` impls below for the sources).
    #[error("Serialization error: {0}")]
    Serialization(String),
    /// Network error.
    #[error("Network error: {0}")]
    Network(String),
    /// Internal error.
    #[error("Internal error: {0}")]
    Internal(String),
}
impl From<bincode::Error> for ComputeError {
    /// Folds bincode failures into the generic serialization error.
    fn from(err: bincode::Error) -> Self {
        Self::Serialization(err.to_string())
    }
}

impl From<serde_json::Error> for ComputeError {
    /// Folds serde_json failures into the generic serialization error.
    fn from(err: serde_json::Error) -> Self {
        Self::Serialization(err.to_string())
    }
}

View file

@@ -0,0 +1,631 @@
//! Synor Compute L2 - Heterogeneous Multi-Processor Compute Platform
//!
//! Provides decentralized compute services with:
//!
//! - **Heterogeneous Scheduling**: CPU + GPU + TPU + NPU + LPU working simultaneously
//! - **Consumer Device Mesh**: Mobile, browser, desktop devices contributing compute
//! - **90% Cost Reduction**: Zero margins, spot markets, electricity arbitrage
//! - **10x Speed**: Caching, speculative execution, optimal processor assignment
//!
//! # Architecture
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────────────────────┐
//! │ SYNOR COMPUTE L2 │
//! ├─────────────────────────────────────────────────────────────────────────────┤
//! │ │
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
//! │ │ TASK DECOMPOSER │ │
//! │ │ Analyzes workload → Identifies subtasks → Maps to optimal processors │ │
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
//! │ │ │
//! │ ▼ │
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
//! │ │ HETEROGENEOUS SCHEDULER │ │
//! │ │ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ │
//! │ │ │ CPU │ │ GPU │ │ TPU │ │ NPU │ │ LPU │ │Custom│ │ │
//! │ │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │ │
//! │ │ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ │ │
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
//! │ │
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
//! │ │ UNIFIED MEMORY FABRIC │ │
//! │ │ Zero-copy data sharing │ Automatic placement │ Cache coherency │ │
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
//! │ │
//! └─────────────────────────────────────────────────────────────────────────────┘
//! ```
//!
//! # Pricing
//!
//! | Resource | Unit | Price (SYNOR) |
//! |----------|------|---------------|
//! | GPU (consumer) | hour | 0.10 |
//! | GPU (datacenter) | hour | 0.50-4.00 |
//! | CPU | core/hour | 0.02 |
//! | Memory | GB/hour | 0.005 |
//! | Inference | 1M tokens | 0.10 |
#![allow(dead_code)]
pub mod device;
pub mod error;
pub mod market;
pub mod memory;
pub mod processor;
pub mod scheduler;
pub mod task;
pub use device::{
DeviceCapabilities, DeviceId, DeviceInfo, DeviceRegistry, DeviceStatus, DeviceType,
};
pub use error::ComputeError;
pub use market::{
Auction, AuctionId, CloudComparison, CpuTier as MarketCpuTier, GpuTier as MarketGpuTier,
MarketStats, Order, OrderBook, OrderId, OrderSide, OrderType, PricingEngine, ProviderListing,
ResourceType, SpotMarket, Trade,
};
pub use memory::{MemoryManager, TensorHandle, TransferPath, UnifiedMemory};
pub use processor::{
ComputeThroughput, CpuVariant, GpuVariant, NpuVariant, Operation, OperationType, Processor,
ProcessorCapabilities, ProcessorId, ProcessorType, TpuVersion,
};
pub use scheduler::{
HeterogeneousScheduler, LoadBalancer, Schedule, ScheduleResult, TaskAssignment, WorkQueue,
};
pub use task::{
ComputeTask, DecomposedWorkload, Task, TaskDecomposer, TaskId, TaskPriority, TaskResult,
TaskStatus,
};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use parking_lot::RwLock;
/// Compute node identifier (plain sequence number).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct NodeId(pub u64);

impl std::fmt::Display for NodeId {
    /// Renders as `node_<n>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "node_{}", self.0)
    }
}

/// Job identifier: 32 random bytes.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct JobId(pub [u8; 32]);

impl JobId {
    /// Draws a fresh random job ID.
    pub fn new() -> Self {
        use rand::Rng;
        let mut buf = [0u8; 32];
        rand::thread_rng().fill(&mut buf);
        Self(buf)
    }

    /// Wraps raw bytes as a job ID.
    pub fn from_bytes(bytes: [u8; 32]) -> Self {
        Self(bytes)
    }
}

impl Default for JobId {
    fn default() -> Self {
        JobId::new()
    }
}

impl std::fmt::Display for JobId {
    /// Renders as `job_` plus the first 8 bytes, hex-encoded.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let short = hex::encode(&self.0[..8]);
        write!(f, "job_{}", short)
    }
}
/// Compute job specification.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ComputeJob {
    /// Job ID.
    pub id: JobId,
    /// Owner address.
    pub owner: [u8; 32],
    /// Job type.
    pub job_type: JobType,
    /// Resource requirements.
    pub resources: ResourceRequirements,
    /// Input data reference (CID).
    pub input_cid: Option<String>,
    /// Maximum budget (in atomic SYNOR).
    pub max_budget: u64,
    /// Priority level.
    pub priority: JobPriority,
    /// Created timestamp.
    ///
    /// NOTE(review): unit (seconds vs. milliseconds) is not established
    /// anywhere in this file — confirm against the producer, especially
    /// before comparing with `deadline`.
    pub created_at: u64,
    /// Deadline (optional), in the same unit as `created_at`.
    pub deadline: Option<u64>,
}
/// Job type classification.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum JobType {
    /// AI/ML training job.
    Training {
        /// Framework to run the training loop under.
        framework: MlFramework,
        /// Content ID of the model definition.
        model_cid: String,
        /// Content ID of the training dataset.
        dataset_cid: String,
        /// Number of passes over the dataset.
        epochs: u32,
        /// Per-step batch size.
        batch_size: u32,
    },
    /// AI/ML inference job.
    Inference {
        /// Content ID of the model to serve.
        model_cid: String,
        /// Wire format of the inference inputs (e.g. "json").
        input_format: String,
        /// Maximum batch size per inference call.
        batch_size: u32,
    },
    /// Container workload.
    Container {
        /// Container image reference.
        image: String,
        /// Command and arguments to run in the container.
        command: Vec<String>,
        /// Environment variables passed to the container.
        env: HashMap<String, String>,
    },
    /// Serverless function.
    Serverless {
        /// Language runtime to execute under.
        runtime: FunctionRuntime,
        /// Content ID of the packaged function code.
        code_cid: String,
        /// Entry-point symbol within the package.
        handler: String,
    },
    /// General compute (WASM).
    Wasm {
        /// Content ID of the WASM module.
        module_cid: String,
        /// Exported function to invoke.
        entrypoint: String,
    },
}
/// ML framework specification.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum MlFramework {
    /// PyTorch at a pinned version string.
    PyTorch { version: String },
    /// TensorFlow at a pinned version string.
    TensorFlow { version: String },
    /// JAX at a pinned version string.
    JAX { version: String },
    /// ONNX runtime (no version pin).
    ONNX,
}
/// Function runtime.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum FunctionRuntime {
    /// Node.js 20.
    Node20,
    /// Python 3.12.
    Python312,
    /// Native Rust.
    Rust,
    /// Native Go.
    Go,
    /// User-supplied container image as the runtime.
    Custom { image: String },
}
/// Job priority levels.
///
/// Explicit discriminants give a total order (`Background` < `Normal` <
/// `High` < `Critical`) via the derived `Ord`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum JobPriority {
    /// Background job, can be preempted.
    Background = 0,
    /// Normal priority.
    Normal = 1,
    /// High priority, faster scheduling.
    High = 2,
    /// Critical, guaranteed resources.
    Critical = 3,
}
impl Default for JobPriority {
    // Jobs default to Normal unless the submitter opts in/out.
    fn default() -> Self {
        JobPriority::Normal
    }
}
/// Resource requirements for a job.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ResourceRequirements {
    /// Minimum CPU cores (fractional cores allowed).
    pub min_cpu_cores: f32,
    /// Minimum memory (GB).
    pub min_memory_gb: f32,
    /// GPU requirements (`None` means no GPU needed).
    pub gpu: Option<GpuRequirements>,
    /// Preferred processor types (in priority order).
    pub preferred_processors: Vec<ProcessorType>,
    /// Maximum latency (ms) - for inference.
    pub max_latency_ms: Option<u32>,
    /// Requires distributed execution.
    pub distributed: bool,
}
/// GPU resource requirements.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GpuRequirements {
    /// Minimum number of GPUs.
    pub min_count: u32,
    /// Maximum number of GPUs.
    pub max_count: u32,
    /// Minimum VRAM per GPU (GB).
    pub min_vram_gb: u32,
    /// Minimum compute capability as a (major, minor) pair.
    /// NOTE(review): presumably NVIDIA CUDA compute capability — confirm.
    pub min_compute_capability: Option<(u8, u8)>,
    /// Allow GPU sharing (MPS/MIG).
    pub allow_sharing: bool,
}
/// Job execution status.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum JobStatus {
    /// Queued, waiting for resources.
    Queued,
    /// Resources allocated, starting.
    Starting,
    /// Running.
    Running {
        /// Fractional progress. NOTE(review): range (0..1 vs. 0..100) is not
        /// established in this file — confirm with the producer.
        progress: f32,
        /// Nodes the job is currently placed on.
        assigned_nodes: Vec<NodeId>,
    },
    /// Completed successfully.
    Completed {
        /// Content ID of the result payload.
        result_cid: String,
        /// Wall-clock duration in milliseconds.
        duration_ms: u64,
        /// Final cost in atomic SYNOR.
        cost: u64,
    },
    /// Failed.
    Failed { error: String },
    /// Cancelled by user.
    Cancelled,
}
/// Compute node registration.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ComputeNode {
    /// Node ID.
    pub id: NodeId,
    /// Owner address.
    pub owner: [u8; 32],
    /// Available processors.
    pub processors: Vec<ProcessorInfo>,
    /// Total memory (GB).
    pub total_memory_gb: f32,
    /// Available memory (GB).
    pub available_memory_gb: f32,
    /// Network bandwidth (Gbps).
    pub bandwidth_gbps: f32,
    /// Geographic region.
    pub region: String,
    /// Stake amount (for PoS).
    pub stake: u64,
    /// Reputation score (0-100).
    pub reputation: u32,
    /// Current status.
    pub status: NodeStatus,
}
/// Processor information on a node.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorInfo {
    /// Processor ID (local to node).
    pub id: ProcessorId,
    /// Processor type.
    pub processor_type: ProcessorType,
    /// Capabilities.
    pub capabilities: ProcessorCapabilities,
    /// Current utilization (0.0 - 1.0).
    pub utilization: f32,
    /// Current temperature (Celsius); `None` if the sensor is unavailable.
    pub temperature: Option<f32>,
}
/// Node status.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum NodeStatus {
    /// Online and accepting jobs.
    Online,
    /// Online but not accepting new jobs.
    Draining,
    /// Offline.
    Offline,
    /// Maintenance mode.
    Maintenance,
}
/// Compute cluster manager.
///
/// Top-level facade wiring together the device registry, the heterogeneous
/// scheduler, the spot market, and the memory manager. The registry is shared
/// with the scheduler via `Arc` (see `ComputeCluster::new`).
pub struct ComputeCluster {
    /// Registered nodes.
    nodes: RwLock<HashMap<NodeId, ComputeNode>>,
    /// Device registry.
    device_registry: Arc<DeviceRegistry>,
    /// Heterogeneous scheduler.
    scheduler: Arc<HeterogeneousScheduler>,
    /// Spot market.
    spot_market: Arc<SpotMarket>,
    /// Memory manager.
    memory_manager: Arc<MemoryManager>,
    /// Active jobs.
    jobs: RwLock<HashMap<JobId, ComputeJob>>,
}
impl ComputeCluster {
    /// Creates a new compute cluster with empty registries and a fresh
    /// scheduler, spot market, and memory manager. The device registry is
    /// shared with the scheduler via `Arc`.
    pub fn new() -> Self {
        let device_registry = Arc::new(DeviceRegistry::new());
        let scheduler = Arc::new(HeterogeneousScheduler::new(device_registry.clone()));
        let spot_market = Arc::new(SpotMarket::new());
        let memory_manager = Arc::new(MemoryManager::new());
        Self {
            nodes: RwLock::new(HashMap::new()),
            device_registry,
            scheduler,
            spot_market,
            memory_manager,
            jobs: RwLock::new(HashMap::new()),
        }
    }

    /// Registers a compute node and all of its processors.
    ///
    /// # Errors
    ///
    /// Returns [`ComputeError::NodeAlreadyRegistered`] if a node with the
    /// same ID already exists (previously the node was silently replaced
    /// while the old processor entries leaked in the device registry), and
    /// propagates processor-registration failures after rolling back any
    /// processors that were already added for this node.
    pub fn register_node(&self, node: ComputeNode) -> Result<(), ComputeError> {
        let id = node.id;
        if self.nodes.read().contains_key(&id) {
            return Err(ComputeError::NodeAlreadyRegistered(id));
        }
        // Register processors with device registry.
        for proc in &node.processors {
            if let Err(e) = self.device_registry.register_processor(id, proc.clone()) {
                // Keep the registry consistent: drop any processors added so far.
                let _ = self.device_registry.unregister_node(id);
                return Err(e);
            }
        }
        self.nodes.write().insert(id, node);
        Ok(())
    }

    /// Unregisters a compute node and all of its processors.
    pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
        self.device_registry.unregister_node(node_id)?;
        self.nodes.write().remove(&node_id);
        Ok(())
    }

    /// Submits a job for execution: decomposes it into tasks, schedules
    /// them, stores the job, and spawns schedule execution in the background.
    pub async fn submit_job(&self, job: ComputeJob) -> Result<JobId, ComputeError> {
        let job_id = job.id;
        // Decompose job into tasks
        let tasks = self.decompose_job(&job)?;
        // Schedule tasks
        let schedule = self.scheduler.schedule(tasks).await?;
        // Store job
        self.jobs.write().insert(job_id, job);
        // Execute schedule asynchronously. Execution errors are currently
        // discarded — TODO(review): surface them into the job status.
        tokio::spawn({
            let scheduler = self.scheduler.clone();
            async move {
                let _ = scheduler.execute(&schedule.schedule).await;
            }
        });
        Ok(job_id)
    }

    /// Gets job status.
    ///
    /// NOTE: placeholder — any stored job reports `Queued`; real progress
    /// tracking is not wired up yet.
    pub fn get_job_status(&self, job_id: &JobId) -> Option<JobStatus> {
        self.jobs.read().get(job_id).map(|_| JobStatus::Queued)
    }

    /// Cancels a job, removing it from the active set.
    ///
    /// # Errors
    ///
    /// Returns [`ComputeError::JobNotFound`] if the job is not stored.
    pub fn cancel_job(&self, job_id: &JobId) -> Result<(), ComputeError> {
        if self.jobs.write().remove(job_id).is_some() {
            Ok(())
        } else {
            Err(ComputeError::JobNotFound(*job_id))
        }
    }

    /// Gets cluster statistics (node, GPU, memory, and job counts).
    pub fn stats(&self) -> ClusterStats {
        let nodes = self.nodes.read();
        let jobs = self.jobs.read();
        let total_nodes = nodes.len();
        let online_nodes = nodes.values().filter(|n| n.status == NodeStatus::Online).count();
        let total_gpus: usize = nodes
            .values()
            .flat_map(|n| &n.processors)
            .filter(|p| matches!(p.processor_type, ProcessorType::Gpu(_)))
            .count();
        let total_memory: f32 = nodes.values().map(|n| n.total_memory_gb).sum();
        ClusterStats {
            total_nodes,
            online_nodes,
            total_gpus,
            total_memory_gb: total_memory,
            active_jobs: jobs.len(),
            // No per-job state is tracked yet, so every stored job counts as
            // queued (previously written as `filter(|_| true).count()`).
            queued_jobs: jobs.len(),
        }
    }

    /// Decomposes a job into schedulable tasks.
    fn decompose_job(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        let decomposer = TaskDecomposer::new();
        decomposer.decompose(job)
    }
}
impl Default for ComputeCluster {
fn default() -> Self {
Self::new()
}
}
/// Cluster statistics.
///
/// Point-in-time snapshot produced by `ComputeCluster::stats`.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ClusterStats {
    /// Total registered nodes.
    pub total_nodes: usize,
    /// Online nodes.
    pub online_nodes: usize,
    /// Total GPUs across cluster.
    pub total_gpus: usize,
    /// Total memory (GB).
    pub total_memory_gb: f32,
    /// Active jobs.
    pub active_jobs: usize,
    /// Queued jobs.
    pub queued_jobs: usize,
}
/// Pricing calculator for compute operations.
///
/// All prices are in atomic SYNOR (the in-file defaults imply
/// 1 SYNOR = 1_000_000_000 atomic units).
#[derive(Clone, Debug)]
pub struct ComputePricing {
    /// GPU cost per hour by type.
    pub gpu_hourly: HashMap<GpuTier, u64>,
    /// CPU cost per core-hour.
    pub cpu_core_hour: u64,
    /// Memory cost per GB-hour.
    pub memory_gb_hour: u64,
    /// Network egress per GB.
    pub network_egress_gb: u64,
    /// Inference per million tokens.
    pub inference_per_million_tokens: u64,
}
/// GPU pricing tiers.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum GpuTier {
    /// Consumer GPUs (RTX 30xx, 40xx).
    Consumer,
    /// Professional GPUs (RTX A series).
    Professional,
    /// Data center GPUs (A100).
    DataCenter,
    /// Latest generation (H100).
    Premium,
}
impl Default for ComputePricing {
    /// Default price book in atomic SYNOR (0.10 SYNOR = 100_000_000).
    fn default() -> Self {
        let gpu_hourly = HashMap::from([
            (GpuTier::Consumer, 100_000_000),     // 0.10 SYNOR
            (GpuTier::Professional, 300_000_000), // 0.30 SYNOR
            (GpuTier::DataCenter, 2_000_000_000), // 2.00 SYNOR
            (GpuTier::Premium, 4_000_000_000),    // 4.00 SYNOR
        ]);
        Self {
            gpu_hourly,
            cpu_core_hour: 20_000_000,                 // 0.02 SYNOR
            memory_gb_hour: 5_000_000,                 // 0.005 SYNOR
            network_egress_gb: 50_000_000,             // 0.05 SYNOR
            inference_per_million_tokens: 100_000_000, // 0.10 SYNOR
        }
    }
}
impl ComputePricing {
    /// Estimates the total cost (atomic SYNOR) of running `job` for
    /// `duration_hours`.
    ///
    /// Arithmetic is done in `f64`: the previous `f32` intermediates have a
    /// 24-bit mantissa (exact only up to ~16.7M), which silently corrupted
    /// products of the atomic price constants (20M–4B range).
    pub fn estimate(&self, job: &ComputeJob, duration_hours: f32) -> u64 {
        let hours = f64::from(duration_hours);
        let mut cost = 0u64;
        // CPU cost: per core-hour, fractional cores allowed.
        cost += (self.cpu_core_hour as f64 * f64::from(job.resources.min_cpu_cores) * hours) as u64;
        // Memory cost: per GB-hour.
        cost += (self.memory_gb_hour as f64 * f64::from(job.resources.min_memory_gb) * hours) as u64;
        // GPU cost. Tier selection is still a placeholder: everything is
        // billed at the Consumer rate — TODO(review): derive the tier from
        // the GPU requirements.
        if let Some(gpu) = &job.resources.gpu {
            let rate = self
                .gpu_hourly
                .get(&GpuTier::Consumer)
                .copied()
                .unwrap_or(100_000_000);
            cost += (rate as f64 * f64::from(gpu.min_count) * hours) as u64;
        }
        cost
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Two freshly generated random job IDs should (with overwhelming
    // probability) differ.
    #[test]
    fn test_job_id() {
        let id1 = JobId::new();
        let id2 = JobId::new();
        assert_ne!(id1.0, id2.0);
    }

    // A new cluster starts empty.
    #[test]
    fn test_compute_cluster() {
        let cluster = ComputeCluster::new();
        let stats = cluster.stats();
        assert_eq!(stats.total_nodes, 0);
    }

    // An inference job with CPU + memory + one GPU must price above zero.
    #[test]
    fn test_pricing() {
        let pricing = ComputePricing::default();
        let job = ComputeJob {
            id: JobId::new(),
            owner: [0u8; 32],
            job_type: JobType::Inference {
                model_cid: "model123".to_string(),
                input_format: "json".to_string(),
                batch_size: 32,
            },
            resources: ResourceRequirements {
                min_cpu_cores: 4.0,
                min_memory_gb: 16.0,
                gpu: Some(GpuRequirements {
                    min_count: 1,
                    max_count: 1,
                    min_vram_gb: 16,
                    min_compute_capability: None,
                    allow_sharing: false,
                }),
                ..Default::default()
            },
            input_cid: None,
            max_budget: 1_000_000_000,
            priority: JobPriority::Normal,
            created_at: 0,
            deadline: None,
        };
        let cost = pricing.estimate(&job, 1.0);
        assert!(cost > 0);
    }

    // Registering a node makes it visible in cluster stats.
    #[test]
    fn test_node_registration() {
        let cluster = ComputeCluster::new();
        let node = ComputeNode {
            id: NodeId(1),
            owner: [1u8; 32],
            processors: vec![ProcessorInfo {
                id: ProcessorId(0),
                processor_type: ProcessorType::Cpu(CpuVariant::X86_64 {
                    avx: processor::AvxSupport::Avx512,
                }),
                capabilities: ProcessorCapabilities::default(),
                utilization: 0.0,
                temperature: Some(45.0),
            }],
            total_memory_gb: 64.0,
            available_memory_gb: 60.0,
            bandwidth_gbps: 10.0,
            region: "us-east".to_string(),
            stake: 1000,
            reputation: 100,
            status: NodeStatus::Online,
        };
        cluster.register_node(node).unwrap();
        assert_eq!(cluster.stats().total_nodes, 1);
    }
}

File diff suppressed because it is too large Load diff

View file

@@ -0,0 +1,370 @@
//! Unified memory management for heterogeneous compute.
use crate::error::ComputeError;
use crate::processor::ProcessorType;
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
/// Tensor handle for memory management.
///
/// Pure metadata: shape, element type, byte size, and the processor types
/// currently holding a copy. No buffer pointer lives here.
#[derive(Clone, Debug)]
pub struct TensorHandle {
    /// Unique ID.
    pub id: TensorId,
    /// Shape.
    pub shape: Vec<usize>,
    /// Data type.
    pub dtype: DataType,
    /// Size in bytes.
    pub size_bytes: u64,
    /// Current locations.
    pub locations: Vec<ProcessorType>,
}

impl TensorHandle {
    /// Builds a handle for a tensor of the given shape and element type,
    /// assigning a fresh random ID and no initial placement.
    pub fn new(shape: Vec<usize>, dtype: DataType) -> Self {
        let elems = shape.iter().product::<usize>() as u64;
        let size_bytes = elems * dtype.size_bytes() as u64;
        Self {
            id: TensorId::new(),
            shape,
            dtype,
            size_bytes,
            locations: Vec::new(),
        }
    }

    /// Total number of elements (product of the shape; 1 for a scalar).
    pub fn numel(&self) -> usize {
        self.shape.iter().copied().product()
    }
}
/// Tensor identifier (random 64-bit value).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TensorId(pub u64);

impl TensorId {
    /// Draws a fresh random tensor ID.
    pub fn new() -> Self {
        use rand::Rng;
        let raw: u64 = rand::thread_rng().gen();
        Self(raw)
    }
}

impl Default for TensorId {
    fn default() -> Self {
        TensorId::new()
    }
}
/// Element types supported for tensors.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DataType {
    /// 64-bit IEEE float.
    Float64,
    /// 32-bit IEEE float.
    Float32,
    /// 16-bit IEEE float.
    Float16,
    /// 16-bit brain float.
    BFloat16,
    /// 64-bit signed integer.
    Int64,
    /// 32-bit signed integer.
    Int32,
    /// 16-bit signed integer.
    Int16,
    /// 8-bit signed integer.
    Int8,
    /// 8-bit unsigned integer.
    UInt8,
    /// Boolean (stored as one byte).
    Bool,
}

impl DataType {
    /// Width of a single element in bytes.
    pub fn size_bytes(&self) -> usize {
        use DataType::*;
        match self {
            Float64 | Int64 => 8,
            Float32 | Int32 => 4,
            Float16 | BFloat16 | Int16 => 2,
            Int8 | UInt8 | Bool => 1,
        }
    }
}
/// Physical route used to move tensor data between two processors.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum TransferPath {
    /// Direct GPU-to-GPU via NVLink.
    NvLink,
    /// Direct GPU-to-GPU via PCIe P2P.
    PciePeerToPeer,
    /// Staged through host (CPU) memory.
    CpuMediated,
    /// Shared unified memory (Apple Silicon).
    UnifiedMemory,
    /// Over the network.
    Network,
    /// Source and destination share one memory space; no copy needed.
    SameMemory,
}

impl TransferPath {
    /// Rough sustained bandwidth of this path in GB/s (heuristic constants).
    pub fn bandwidth_gbps(&self) -> f64 {
        match self {
            Self::SameMemory => f64::INFINITY,
            Self::NvLink => 900.0,        // NVLink 4.0
            Self::UnifiedMemory => 400.0, // Apple unified
            Self::PciePeerToPeer => 64.0, // PCIe 5.0 x16
            Self::CpuMediated => 50.0,    // DDR5
            Self::Network => 10.0,        // 100Gbps network
        }
    }

    /// Predicts how long moving `bytes` over this path takes.
    pub fn estimate_transfer_time(&self, bytes: u64) -> std::time::Duration {
        if *self == TransferPath::SameMemory {
            // Shared memory space: no copy at all.
            std::time::Duration::ZERO
        } else {
            // bandwidth_gbps is GB/s; scale to bytes/s before dividing.
            let bytes_per_sec = self.bandwidth_gbps() * 1e9;
            std::time::Duration::from_secs_f64(bytes as f64 / bytes_per_sec)
        }
    }
}
/// Unified memory manager.
///
/// Tracks tensor handles and per-processor-type memory accounting. `limits`
/// is plain (unlocked) because `set_limit` takes `&mut self`, so limits can
/// only change while no shared references exist.
pub struct MemoryManager {
    /// Allocated tensors.
    tensors: RwLock<HashMap<TensorId, TensorHandle>>,
    /// Memory usage per processor type.
    usage: RwLock<HashMap<ProcessorType, u64>>,
    /// Memory limits per processor type.
    limits: HashMap<ProcessorType, u64>,
}
impl MemoryManager {
    /// Creates a new memory manager with no tensors, no recorded usage,
    /// and no per-processor limits (i.e. effectively unlimited).
    pub fn new() -> Self {
        Self {
            tensors: RwLock::new(HashMap::new()),
            usage: RwLock::new(HashMap::new()),
            limits: HashMap::new(),
        }
    }
    /// Sets memory limit for a processor type.
    ///
    /// Only consulted by [`MemoryManager::available`]; nothing currently
    /// rejects placements that exceed the limit.
    pub fn set_limit(&mut self, proc_type: ProcessorType, limit_bytes: u64) {
        self.limits.insert(proc_type, limit_bytes);
    }
    /// Allocates a tensor.
    ///
    /// This only registers a logical handle — no memory accounting happens
    /// until the tensor is placed on a processor via `ensure_on`.
    pub fn allocate(&self, shape: Vec<usize>, dtype: DataType) -> Result<TensorHandle, ComputeError> {
        let handle = TensorHandle::new(shape, dtype);
        self.tensors.write().insert(handle.id, handle.clone());
        Ok(handle)
    }
    /// Frees a tensor.
    ///
    /// Subtracts the tensor's size from every processor type it was placed
    /// on. Freeing an unknown ID is a silent no-op (idempotent free).
    pub fn free(&self, tensor_id: TensorId) -> Result<(), ComputeError> {
        if let Some(handle) = self.tensors.write().remove(&tensor_id) {
            // Update usage for all locations.
            // Lock order: tensors, then usage (matches `ensure_on`).
            let mut usage = self.usage.write();
            for loc in &handle.locations {
                if let Some(u) = usage.get_mut(loc) {
                    *u = u.saturating_sub(handle.size_bytes);
                }
            }
        }
        Ok(())
    }
    /// Gets a tensor handle (a clone of the stored one).
    pub fn get(&self, tensor_id: TensorId) -> Option<TensorHandle> {
        self.tensors.read().get(&tensor_id).cloned()
    }
    /// Ensures tensor is on specified processor.
    ///
    /// Returns the transfer path needed to get it there. On success the
    /// tensor's location list gains `target` and `usage` for `target` grows
    /// by the tensor's size; prior locations are kept, so this models
    /// replication rather than migration.
    pub fn ensure_on(
        &self,
        tensor_id: TensorId,
        target: ProcessorType,
    ) -> Result<TransferPath, ComputeError> {
        // Hold the tensors write lock across the whole placement decision so
        // a concurrent `free` cannot race between lookup and update.
        let mut tensors = self.tensors.write();
        if let Some(handle) = tensors.get_mut(&tensor_id) {
            // Check if already on target
            if handle.locations.contains(&target) {
                return Ok(TransferPath::SameMemory);
            }
            // Determine transfer path
            let path = if handle.locations.is_empty() {
                // New tensor, allocate on target
                TransferPath::SameMemory
            } else {
                // Find best transfer path from existing location
                // NOTE(review): only the *first* recorded location is tried
                // as a source; a faster path from a later replica is missed.
                self.find_best_path(&handle.locations[0], &target)
            };
            // Record new location
            handle.locations.push(target.clone());
            // Update usage (lock order: tensors, then usage, as in `free`).
            // NOTE(review): the per-type limit is not checked here — confirm
            // whether over-limit placement should fail instead.
            let mut usage = self.usage.write();
            *usage.entry(target).or_default() += handle.size_bytes;
            Ok(path)
        } else {
            Err(ComputeError::Internal("Tensor not found".to_string()))
        }
    }
    /// Finds best transfer path between processors.
    fn find_best_path(&self, from: &ProcessorType, to: &ProcessorType) -> TransferPath {
        // Check for unified memory (Apple Silicon)
        if self.shares_memory(from, to) {
            return TransferPath::UnifiedMemory;
        }
        // Check for NVLink between NVIDIA GPUs
        // (assumes NVLink exists whenever both ends are NVIDIA — TODO confirm)
        if matches!(from, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. }))
            && matches!(to, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. }))
        {
            return TransferPath::NvLink;
        }
        // Check for PCIe P2P between GPUs
        if from.is_gpu() && to.is_gpu() {
            return TransferPath::PciePeerToPeer;
        }
        // Default to CPU-mediated transfer
        TransferPath::CpuMediated
    }
    /// Checks if two processor types share memory.
    fn shares_memory(&self, a: &ProcessorType, b: &ProcessorType) -> bool {
        use crate::processor::{CpuVariant, GpuVariant, NpuVariant};
        match (a, b) {
            // Apple Silicon unified memory: ARM CPU, Metal GPU, and the
            // Neural Engine all sit on one pool.
            (ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal))
            | (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
            | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
            | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true,
            // Same type
            _ if a == b => true,
            _ => false,
        }
    }
    /// Gets current memory usage for a processor type.
    pub fn usage(&self, proc_type: ProcessorType) -> u64 {
        self.usage.read().get(&proc_type).copied().unwrap_or(0)
    }
    /// Gets available memory for a processor type.
    ///
    /// With no explicit limit this is effectively unbounded (`u64::MAX`).
    pub fn available(&self, proc_type: ProcessorType) -> u64 {
        let limit = self.limits.get(&proc_type).copied().unwrap_or(u64::MAX);
        let used = self.usage(proc_type);
        limit.saturating_sub(used)
    }
    /// Gets total allocated tensors.
    pub fn tensor_count(&self) -> usize {
        self.tensors.read().len()
    }
}
impl Default for MemoryManager {
fn default() -> Self {
Self::new()
}
}
/// Unified memory abstraction for zero-copy sharing.
pub struct UnifiedMemory {
    /// Base pointer (in unified address space).
    /// Currently always 0 — a placeholder until real mapping is implemented
    /// (see `UnifiedMemory::new`).
    pub base: u64,
    /// Size in bytes.
    pub size: u64,
    /// Accessible from these processor types.
    pub accessible_from: Vec<ProcessorType>,
}
impl UnifiedMemory {
/// Creates new unified memory region.
pub fn new(size: u64) -> Self {
Self {
base: 0, // Would be actual pointer in real implementation
size,
accessible_from: Vec::new(),
}
}
/// Checks if accessible from processor type.
pub fn is_accessible_from(&self, proc_type: &ProcessorType) -> bool {
self.accessible_from.contains(proc_type)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_tensor_handle() {
        // 1M elements of 4-byte f32.
        let t = TensorHandle::new(vec![1024, 1024], DataType::Float32);
        assert_eq!(t.numel(), 1024 * 1024);
        assert_eq!(t.size_bytes, 1024 * 1024 * 4);
    }
    #[test]
    fn test_data_type_sizes() {
        let expected = [
            (DataType::Float64, 8),
            (DataType::Float32, 4),
            (DataType::Float16, 2),
            (DataType::Int8, 1),
        ];
        for (dtype, size) in expected {
            assert_eq!(dtype.size_bytes(), size);
        }
    }
    #[test]
    fn test_transfer_path_bandwidth() {
        let nvlink = TransferPath::NvLink.bandwidth_gbps();
        let pcie = TransferPath::PciePeerToPeer.bandwidth_gbps();
        assert!(nvlink > pcie);
        assert!(TransferPath::SameMemory.bandwidth_gbps().is_infinite());
    }
    #[test]
    fn test_memory_manager() {
        let mgr = MemoryManager::new();
        let t = mgr.allocate(vec![1024, 1024], DataType::Float32).unwrap();
        assert_eq!(mgr.tensor_count(), 1);
        mgr.free(t.id).unwrap();
        assert_eq!(mgr.tensor_count(), 0);
    }
    #[test]
    fn test_ensure_on() {
        let mgr = MemoryManager::new();
        let t = mgr.allocate(vec![1024], DataType::Float32).unwrap();
        let gpu = || {
            ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda {
                compute_capability: (8, 0),
            })
        };
        // Initial placement allocates in place...
        assert_eq!(mgr.ensure_on(t.id, gpu()).unwrap(), TransferPath::SameMemory);
        // ...and a repeat request for the same location is already satisfied.
        assert_eq!(mgr.ensure_on(t.id, gpu()).unwrap(), TransferPath::SameMemory);
    }
}

View file

@ -0,0 +1,547 @@
//! Processor capability definitions.
use super::operation::OperationType;
use super::types::PowerTier;
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
/// Detailed processor capabilities.
///
/// Built via the constructor helpers (`cpu`, `nvidia_gpu`, `tpu`, `lpu`,
/// `apple_neural_engine`) or `Default`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorCapabilities {
    /// Compute throughput.
    pub compute: ComputeThroughput,
    /// Memory specifications.
    pub memory: MemorySpecs,
    /// Supported operations (consulted by schedulers via set membership).
    pub operations: HashSet<OperationType>,
    /// Power characteristics.
    pub power: PowerCharacteristics,
    /// Optimal workload characteristics.
    pub optimal_for: Vec<WorkloadCharacteristic>,
}
impl Default for ProcessorCapabilities {
fn default() -> Self {
Self {
compute: ComputeThroughput::default(),
memory: MemorySpecs::default(),
operations: Self::default_operations(),
power: PowerCharacteristics::default(),
optimal_for: vec![],
}
}
}
impl ProcessorCapabilities {
    /// Default operations supported by most processors.
    fn default_operations() -> HashSet<OperationType> {
        [
            OperationType::MatMul,
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::Softmax,
            OperationType::DataLoad,
            OperationType::DataPreprocess,
        ]
        .into_iter()
        .collect()
    }
    /// Creates CPU capabilities.
    ///
    /// FP32 throughput is derived as `cores * clock * FLOPs-per-cycle / 1000`
    /// (GFLOPS -> TFLOPS); other precisions are scaled from that.
    /// NOTE(review): 64/32 FLOPs per cycle assumes dual FMA pipes at full
    /// vector width on every core — optimistic for many parts; confirm.
    pub fn cpu(cores: u32, clock_ghz: f32, avx512: bool) -> Self {
        let flops_per_cycle = if avx512 { 64.0 } else { 32.0 }; // FP32 ops per cycle with AVX
        let fp32_tflops = (cores as f64 * clock_ghz as f64 * flops_per_cycle) / 1000.0;
        Self {
            compute: ComputeThroughput {
                fp64_tflops: fp32_tflops / 2.0,
                fp32_tflops,
                fp16_tflops: fp32_tflops * 2.0,
                bf16_tflops: fp32_tflops * 2.0,
                int8_tops: fp32_tflops * 4.0,
                int4_tops: fp32_tflops * 8.0,
                sparsity_speedup: 1.0, // no structured-sparsity acceleration
            },
            memory: MemorySpecs {
                capacity_bytes: 64 * 1024 * 1024 * 1024, // 64 GB typical
                bandwidth_gbps: 200, // DDR5
                type_: MemoryType::Ddr5,
            },
            operations: Self::cpu_operations(),
            power: PowerCharacteristics {
                tdp_watts: 125,
                efficiency: 0.8,
                power_tier: PowerTier::Medium,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::MemoryBound,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
    /// Operations typically supported by CPUs.
    fn cpu_operations() -> HashSet<OperationType> {
        [
            // Matrix operations (slow but supported)
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            // Element-wise
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::Softmax,
            // Data operations (optimal)
            OperationType::DataLoad,
            OperationType::DataPreprocess,
            OperationType::Tokenization,
            OperationType::Detokenization,
            // Memory operations
            OperationType::Transpose,
            OperationType::Reshape,
            OperationType::Concat,
            OperationType::Split,
            // I/O
            OperationType::Checkpoint,
        ]
        .into_iter()
        .collect()
    }
    /// Creates NVIDIA GPU capabilities.
    ///
    /// NOTE(review): the `tensor_cores` parameter is currently unused — the
    /// tensor-core contribution is approximated via `tensor_multiplier`
    /// keyed off the compute capability. Kept for API stability; confirm
    /// whether it should feed the throughput model.
    pub fn nvidia_gpu(
        cuda_cores: u32,
        tensor_cores: u32,
        vram_gb: u32,
        bandwidth_gbps: u32,
        compute_capability: (u8, u8),
    ) -> Self {
        // Approximate TFLOPS based on cores and typical clocks
        let base_clock_ghz = 1.5;
        let fp32_tflops = (cuda_cores as f64 * base_clock_ghz * 2.0) / 1000.0;
        // Ampere (SM 8.x) and newer get a larger reduced-precision boost.
        let tensor_multiplier = if compute_capability.0 >= 8 { 4.0 } else { 2.0 };
        Self {
            compute: ComputeThroughput {
                fp64_tflops: fp32_tflops / 2.0,
                fp32_tflops,
                fp16_tflops: fp32_tflops * tensor_multiplier,
                bf16_tflops: fp32_tflops * tensor_multiplier,
                int8_tops: fp32_tflops * tensor_multiplier * 2.0,
                int4_tops: fp32_tflops * tensor_multiplier * 4.0,
                sparsity_speedup: if compute_capability.0 >= 8 { 2.0 } else { 1.0 },
            },
            memory: MemorySpecs {
                capacity_bytes: vram_gb as u64 * 1024 * 1024 * 1024,
                bandwidth_gbps,
                type_: if compute_capability.0 >= 9 {
                    MemoryType::Hbm3
                } else {
                    MemoryType::Hbm2e
                },
            },
            operations: Self::gpu_operations(compute_capability),
            power: PowerCharacteristics {
                tdp_watts: if compute_capability.0 >= 9 { 700 } else { 350 },
                efficiency: 0.9,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::ComputeBound,
            ],
        }
    }
    /// Operations supported by GPUs.
    fn gpu_operations(compute_capability: (u8, u8)) -> HashSet<OperationType> {
        let mut ops: HashSet<OperationType> = [
            // Matrix operations (optimal)
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::Conv3d,
            OperationType::DepthwiseConv,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            // Attention
            OperationType::SelfAttention,
            OperationType::CrossAttention,
            // Element-wise
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::SiLU,
            OperationType::Softmax,
            // Reduction
            OperationType::Sum,
            OperationType::Mean,
            OperationType::Max,
            OperationType::ArgMax,
            // Memory operations
            OperationType::Transpose,
            OperationType::Reshape,
            OperationType::Concat,
            OperationType::Split,
            OperationType::Gather,
            OperationType::Scatter,
            // LLM specific
            OperationType::Embedding,
            OperationType::RoPE,
            OperationType::KVCache,
            OperationType::TopK,
            OperationType::Sampling,
        ]
        .into_iter()
        .collect();
        // FlashAttention for newer GPUs (Ampere / SM 8.x and later)
        if compute_capability.0 >= 8 {
            ops.insert(OperationType::FlashAttention);
        }
        ops
    }
    /// Creates TPU capabilities.
    ///
    /// Figures are per-chip vendor-published numbers per TPU generation.
    pub fn tpu(version: super::TpuVersion) -> Self {
        let (bf16_tflops, memory_gb, bandwidth_gbps) = match version {
            super::TpuVersion::V5p => (918.0, 95, 4800),
            super::TpuVersion::V5e => (197.0, 16, 1600),
            super::TpuVersion::V4 => (275.0, 32, 2400),
            super::TpuVersion::V4i => (138.0, 32, 1200),
            super::TpuVersion::V3 => (123.0, 16, 900),
            super::TpuVersion::V2 => (46.0, 8, 600),
            super::TpuVersion::Edge => (4.0, 0, 0), // Uses host memory
        };
        Self {
            compute: ComputeThroughput {
                fp64_tflops: 0.0, // TPUs don't support FP64
                fp32_tflops: bf16_tflops / 2.0,
                fp16_tflops: bf16_tflops,
                bf16_tflops,
                int8_tops: bf16_tflops * 2.0,
                int4_tops: bf16_tflops * 4.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: memory_gb as u64 * 1024 * 1024 * 1024,
                bandwidth_gbps,
                type_: MemoryType::Hbm2e,
            },
            operations: Self::tpu_operations(),
            power: PowerCharacteristics {
                tdp_watts: if matches!(version, super::TpuVersion::Edge) {
                    2
                } else {
                    400
                },
                efficiency: 0.95,
                power_tier: if matches!(version, super::TpuVersion::Edge) {
                    PowerTier::UltraLow
                } else {
                    PowerTier::High
                },
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::ComputeBound,
                WorkloadCharacteristic::FixedShape,
                WorkloadCharacteristic::LargeBatch,
            ],
        }
    }
    /// Operations supported by TPUs.
    ///
    /// Deliberately excludes host-side I/O ops (`DataLoad`, `Tokenization`,
    /// `Checkpoint`, ...) — the test suite relies on that.
    fn tpu_operations() -> HashSet<OperationType> {
        [
            // Matrix operations (optimal)
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            // Attention
            OperationType::SelfAttention,
            OperationType::CrossAttention,
            OperationType::FlashAttention,
            // Element-wise
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::SiLU,
            OperationType::Softmax,
            // Reduction
            OperationType::Sum,
            OperationType::Mean,
            OperationType::Max,
            // LLM specific
            OperationType::Embedding,
            OperationType::RoPE,
            OperationType::KVCache,
        ]
        .into_iter()
        .collect()
    }
    /// Creates LPU (Groq) capabilities.
    ///
    /// NOTE(review): per-chip Groq SRAM is ~230 MB; 230 GB here looks like a
    /// multi-chip-rack figure or a units slip — confirm the intended scale.
    pub fn lpu() -> Self {
        Self {
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: 0.0,
                fp16_tflops: 188.0,
                bf16_tflops: 188.0,
                int8_tops: 750.0,
                int4_tops: 1500.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 230 * 1024 * 1024 * 1024, // 230 GB SRAM!
                bandwidth_gbps: 80_000, // 80 TB/s internal
                type_: MemoryType::Sram,
            },
            operations: Self::lpu_operations(),
            power: PowerCharacteristics {
                tdp_watts: 300,
                efficiency: 0.98, // Very efficient for inference
                power_tier: PowerTier::Medium,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::SmallBatch,
                WorkloadCharacteristic::VariableLength,
                WorkloadCharacteristic::LowLatency,
            ],
        }
    }
    /// Operations supported by Groq LPU.
    fn lpu_operations() -> HashSet<OperationType> {
        [
            // Optimized for inference
            OperationType::MatMul,
            OperationType::LayerNorm,
            OperationType::SelfAttention,
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::SiLU,
            OperationType::Softmax,
            OperationType::Embedding,
            OperationType::RoPE,
            OperationType::KVCache,
            OperationType::TopK,
            OperationType::Sampling,
        ]
        .into_iter()
        .collect()
    }
    /// Creates Apple Neural Engine capabilities.
    ///
    /// Known core counts map to published TOPS; other counts are
    /// extrapolated linearly at ~1.1 TOPS/core.
    pub fn apple_neural_engine(cores: u32) -> Self {
        let int8_tops = match cores {
            16 => 18.0, // M3
            32 => 35.0, // M3 Max
            _ => cores as f64 * 1.1,
        };
        Self {
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: int8_tops / 4.0,
                fp16_tflops: int8_tops / 2.0,
                bf16_tflops: int8_tops / 2.0,
                int8_tops,
                int4_tops: int8_tops * 2.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses unified memory
                bandwidth_gbps: 400,
                type_: MemoryType::Unified,
            },
            operations: Self::npu_operations(),
            power: PowerCharacteristics {
                tdp_watts: 15,
                efficiency: 0.95,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
    /// Operations supported by NPUs.
    fn npu_operations() -> HashSet<OperationType> {
        [
            // Inference optimized
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::DepthwiseConv,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::Softmax,
            OperationType::Embedding,
        ]
        .into_iter()
        .collect()
    }
}
/// Compute throughput metrics.
///
/// All figures are peak theoretical rates; `Default` is all-zero.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ComputeThroughput {
    /// FP64 TFLOPS (0.0 when the precision is unsupported, e.g. TPU/LPU).
    pub fp64_tflops: f64,
    /// FP32 TFLOPS.
    pub fp32_tflops: f64,
    /// FP16 TFLOPS.
    pub fp16_tflops: f64,
    /// BF16 TFLOPS.
    pub bf16_tflops: f64,
    /// INT8 TOPS.
    pub int8_tops: f64,
    /// INT4 TOPS.
    pub int4_tops: f64,
    /// Speedup for sparse operations (1.0 = no benefit).
    pub sparsity_speedup: f64,
}
/// Memory specifications.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MemorySpecs {
    /// Total capacity (bytes). 0 means the device uses host/unified memory.
    pub capacity_bytes: u64,
    /// Bandwidth (GB/s).
    /// NOTE(review): despite the "gbps" suffix, the values used throughout
    /// this crate are giga*bytes*/s — consider renaming.
    pub bandwidth_gbps: u32,
    /// Memory type.
    pub type_: MemoryType,
}
impl Default for MemorySpecs {
fn default() -> Self {
Self {
capacity_bytes: 16 * 1024 * 1024 * 1024, // 16 GB
bandwidth_gbps: 500,
type_: MemoryType::Ddr5,
}
}
}
/// Memory types.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum MemoryType {
    /// DDR4 system RAM.
    Ddr4,
    /// DDR5 system RAM.
    Ddr5,
    /// GDDR6/6X video memory.
    Gddr6,
    /// HBM2 (high-bandwidth memory).
    Hbm2,
    /// HBM2e.
    Hbm2e,
    /// HBM3.
    Hbm3,
    /// SRAM (on-chip, e.g. Groq LPU).
    Sram,
    /// Unified memory (Apple Silicon).
    Unified,
    /// LPDDR (mobile).
    Lpddr,
}
/// Power characteristics.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PowerCharacteristics {
    /// TDP in watts.
    pub tdp_watts: u32,
    /// Efficiency factor (0.0 - 1.0).
    /// NOTE(review): energy estimates multiply TDP x time by this factor
    /// (see `GenericProcessor::estimate_energy`), i.e. it acts as the
    /// fraction of TDP actually drawn — confirm that interpretation.
    pub efficiency: f64,
    /// Power tier.
    pub power_tier: PowerTier,
}
impl Default for PowerCharacteristics {
fn default() -> Self {
Self {
tdp_watts: 100,
efficiency: 0.8,
power_tier: PowerTier::Medium,
}
}
}
/// Workload characteristics for processor matching.
///
/// Each processor profile lists the characteristics it is best suited for
/// (parenthesized examples below follow the constructor helpers).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum WorkloadCharacteristic {
    /// High parallelism (GPU, TPU).
    HighlyParallel,
    /// Sequential dependencies (CPU, LPU).
    Sequential,
    /// Memory bandwidth bound (GPU).
    MemoryBound,
    /// Compute bound (TPU).
    ComputeBound,
    /// Low latency required (NPU, edge).
    LowLatency,
    /// Low power required (NPU, mobile).
    LowPower,
    /// Large batch sizes (GPU, TPU).
    LargeBatch,
    /// Small batch sizes (CPU, LPU).
    SmallBatch,
    /// Variable length sequences (LPU).
    VariableLength,
    /// Fixed tensor shapes (TPU).
    FixedShape,
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_cpu_capabilities() {
        let cpu = ProcessorCapabilities::cpu(32, 3.5, true);
        assert!(cpu.compute.fp32_tflops > 0.0);
        // CPUs handle the data-pipeline operations.
        for op in [OperationType::DataLoad, OperationType::Tokenization] {
            assert!(cpu.operations.contains(&op));
        }
    }
    #[test]
    fn test_gpu_capabilities() {
        let gpu = ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1000, (8, 9));
        // Tensor cores make reduced precision faster than FP32.
        assert!(gpu.compute.fp16_tflops > gpu.compute.fp32_tflops);
        assert!(gpu.operations.contains(&OperationType::FlashAttention));
    }
    #[test]
    fn test_tpu_capabilities() {
        let tpu = ProcessorCapabilities::tpu(super::super::TpuVersion::V5p);
        assert!(tpu.compute.bf16_tflops > 900.0);
        // TPUs don't do host-side I/O.
        assert!(!tpu.operations.contains(&OperationType::DataLoad));
    }
    #[test]
    fn test_lpu_capabilities() {
        let lpu = ProcessorCapabilities::lpu();
        // On-chip SRAM gives the LPU very high internal bandwidth.
        assert!(lpu.memory.bandwidth_gbps > 10000);
        assert!(lpu.optimal_for.contains(&WorkloadCharacteristic::Sequential));
    }
}

View file

@ -0,0 +1,339 @@
//! Processor abstractions for heterogeneous compute.
//!
//! Supports all processor types:
//! - CPU (x86_64, ARM64, RISC-V)
//! - GPU (NVIDIA CUDA, AMD ROCm, Intel OneAPI, Apple Metal)
//! - TPU (Google TPU v2-v5)
//! - NPU (Apple Neural Engine, Qualcomm Hexagon, Intel VPU)
//! - LPU (Groq Language Processing Unit)
//! - FPGA (Xilinx, Intel/Altera)
//! - DSP (Digital Signal Processors)
//! - Custom accelerators
mod capabilities;
mod operation;
mod profiles;
mod types;
pub use capabilities::{ComputeThroughput, MemorySpecs, MemoryType, ProcessorCapabilities};
pub use operation::{Operation, OperationType};
pub use profiles::ProcessorProfiles;
pub use types::*;
use crate::error::ComputeError;
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::time::Duration;
/// Unique processor identifier (within a node).
///
/// IDs are only meaningful on the node that assigned them; they are not
/// globally unique across the cluster.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ProcessorId(pub u64);
impl std::fmt::Display for ProcessorId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "proc_{}", self.0)
}
}
/// Unified abstraction for any processor type.
///
/// Implementors provide identity, capability introspection, cost estimation
/// (time/energy) and asynchronous execution.
#[async_trait]
pub trait Processor: Send + Sync {
    /// Get processor ID.
    fn id(&self) -> ProcessorId;
    /// Get processor type.
    fn processor_type(&self) -> ProcessorType;
    /// Get capabilities.
    fn capabilities(&self) -> &ProcessorCapabilities;
    /// Check if processor can execute operation.
    fn can_execute(&self, op: &Operation) -> bool;
    /// Estimate execution time for operation.
    fn estimate_time(&self, op: &Operation) -> Duration;
    /// Estimate energy consumption for operation (Joules).
    fn estimate_energy(&self, op: &Operation) -> f64;
    /// Execute operation.
    async fn execute(&self, op: Operation) -> Result<OperationResult, ComputeError>;
    /// Current utilization (0.0 - 1.0).
    fn utilization(&self) -> f64;
    /// Available memory (bytes).
    fn available_memory(&self) -> u64;
    /// Check if this processor shares memory with another type.
    ///
    /// Default: only with its own exact type.
    fn shares_memory_with(&self, other: &ProcessorType) -> bool {
        // By default, processors don't share memory
        // Override for unified memory architectures (Apple Silicon, AMD APUs)
        self.processor_type() == *other
    }
}
/// Result of an operation execution.
#[derive(Clone, Debug)]
pub struct OperationResult {
    /// Output data (opaque bytes; may be empty for simulated execution).
    pub output: Vec<u8>,
    /// Execution time.
    pub duration: Duration,
    /// Energy consumed (Joules).
    pub energy: f64,
    /// Peak memory used (bytes).
    pub peak_memory: u64,
}
/// Generic processor implementation for simulation/testing.
pub struct GenericProcessor {
    // Stable identity within the node.
    id: ProcessorId,
    processor_type: ProcessorType,
    capabilities: ProcessorCapabilities,
    // Utilization as an integer percentage (0-100); read via `utilization()`.
    utilization: std::sync::atomic::AtomicU64,
    // Remaining memory in bytes, initialized to the full capacity.
    available_memory: std::sync::atomic::AtomicU64,
}
impl GenericProcessor {
    /// Creates a new generic processor, starting idle with its full memory
    /// capacity free.
    pub fn new(
        id: ProcessorId,
        processor_type: ProcessorType,
        capabilities: ProcessorCapabilities,
    ) -> Self {
        let free_memory =
            std::sync::atomic::AtomicU64::new(capabilities.memory.capacity_bytes);
        Self {
            id,
            processor_type,
            capabilities,
            utilization: std::sync::atomic::AtomicU64::new(0),
            available_memory: free_memory,
        }
    }
    /// Creates a CPU processor with the default CPU profile.
    pub fn cpu(id: ProcessorId, variant: CpuVariant) -> Self {
        Self::new(id, ProcessorType::Cpu(variant), ProcessorProfiles::cpu_default())
    }
    /// Creates an NVIDIA GPU processor; the profile is keyed off the CUDA
    /// compute capability (known SKUs get exact profiles).
    pub fn nvidia_gpu(id: ProcessorId, compute_capability: (u8, u8)) -> Self {
        let profile = match compute_capability {
            (9, 0) => ProcessorProfiles::nvidia_h100(),
            (8, 9) => ProcessorProfiles::nvidia_rtx_4090(),
            (8, 6) => ProcessorProfiles::nvidia_rtx_3090(),
            _ => ProcessorProfiles::nvidia_default(),
        };
        Self::new(
            id,
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability }),
            profile,
        )
    }
    /// Creates a TPU processor for the given hardware generation.
    pub fn tpu(id: ProcessorId, version: TpuVersion) -> Self {
        let profile = match version {
            TpuVersion::V5p => ProcessorProfiles::google_tpu_v5p(),
            TpuVersion::V4 => ProcessorProfiles::google_tpu_v4(),
            _ => ProcessorProfiles::google_tpu_default(),
        };
        Self::new(id, ProcessorType::Tpu(version), profile)
    }
    /// Creates a Groq LPU processor.
    pub fn lpu(id: ProcessorId) -> Self {
        Self::new(id, ProcessorType::Lpu, ProcessorProfiles::groq_lpu())
    }
    /// Creates an Apple Neural Engine processor with the given core count.
    pub fn apple_neural_engine(id: ProcessorId, cores: u32) -> Self {
        let variant = NpuVariant::AppleNeuralEngine { cores };
        Self::new(
            id,
            ProcessorType::Npu(variant),
            ProcessorProfiles::apple_neural_engine(cores),
        )
    }
}
#[async_trait]
impl Processor for GenericProcessor {
fn id(&self) -> ProcessorId {
self.id
}
fn processor_type(&self) -> ProcessorType {
self.processor_type.clone()
}
fn capabilities(&self) -> &ProcessorCapabilities {
&self.capabilities
}
fn can_execute(&self, op: &Operation) -> bool {
self.capabilities.operations.contains(&op.op_type())
}
fn estimate_time(&self, op: &Operation) -> Duration {
// Estimate based on FLOPS and operation complexity
let flops_needed = op.estimated_flops();
let throughput = match op.precision() {
Precision::Fp32 => self.capabilities.compute.fp32_tflops,
Precision::Fp16 => self.capabilities.compute.fp16_tflops,
Precision::Bf16 => self.capabilities.compute.bf16_tflops,
Precision::Int8 => self.capabilities.compute.int8_tops,
Precision::Int4 => self.capabilities.compute.int4_tops,
Precision::Fp64 => self.capabilities.compute.fp64_tflops,
};
if throughput > 0.0 {
let tflops = throughput;
let flops_per_second = tflops * 1e12;
let seconds = flops_needed / flops_per_second;
Duration::from_secs_f64(seconds)
} else {
Duration::from_secs(1) // Fallback
}
}
fn estimate_energy(&self, op: &Operation) -> f64 {
// Estimate based on TDP and execution time
let duration = self.estimate_time(op);
let watts = self.capabilities.power.tdp_watts as f64;
let efficiency = self.capabilities.power.efficiency;
watts * duration.as_secs_f64() * efficiency
}
async fn execute(&self, op: Operation) -> Result<OperationResult, ComputeError> {
// Check if we can execute
if !self.can_execute(&op) {
return Err(ComputeError::OperationNotSupported(
self.processor_type.clone(),
format!("{:?}", op.op_type()),
));
}
// Simulate execution
let duration = self.estimate_time(&op);
let energy = self.estimate_energy(&op);
// Update utilization
self.utilization
.store(50, std::sync::atomic::Ordering::Relaxed);
// Simulate work
tokio::time::sleep(Duration::from_micros(100)).await;
// Reset utilization
self.utilization
.store(0, std::sync::atomic::Ordering::Relaxed);
Ok(OperationResult {
output: vec![],
duration,
energy,
peak_memory: op.estimated_memory(),
})
}
fn utilization(&self) -> f64 {
self.utilization.load(std::sync::atomic::Ordering::Relaxed) as f64 / 100.0
}
fn available_memory(&self) -> u64 {
self.available_memory
.load(std::sync::atomic::Ordering::Relaxed)
}
fn shares_memory_with(&self, other: &ProcessorType) -> bool {
match (&self.processor_type, other) {
// Apple Silicon has unified memory
(ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal))
| (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
| (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
| (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true,
// Same type always shares
(a, b) if a == b => true,
_ => false,
}
}
}
/// Precision for operations.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Precision {
    /// 64-bit float.
    Fp64,
    /// 32-bit float (the default reported by `Operation::precision`).
    Fp32,
    /// 16-bit IEEE float.
    Fp16,
    /// 16-bit brain float.
    Bf16,
    /// 8-bit integer.
    Int8,
    /// 4-bit integer.
    Int4,
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_processor_creation() {
        let cpu = GenericProcessor::cpu(
            ProcessorId(0),
            CpuVariant::X86_64 {
                avx: AvxSupport::Avx512,
            },
        );
        assert_eq!(cpu.id(), ProcessorId(0));
        assert!(matches!(cpu.processor_type(), ProcessorType::Cpu(_)));
    }
    #[test]
    fn test_gpu_creation() {
        let gpu = GenericProcessor::nvidia_gpu(ProcessorId(1), (9, 0));
        assert_eq!(gpu.id(), ProcessorId(1));
        assert!(matches!(
            gpu.processor_type(),
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. })
        ));
    }
    #[test]
    fn test_unified_memory() {
        let apple_cpu = GenericProcessor::new(
            ProcessorId(0),
            ProcessorType::Cpu(CpuVariant::Arm64 { sve: false }),
            ProcessorCapabilities::default(),
        );
        assert!(apple_cpu.shares_memory_with(&ProcessorType::Gpu(GpuVariant::AppleMetal)));
    }
    #[tokio::test]
    async fn test_operation_execution() {
        let cpu = GenericProcessor::cpu(
            ProcessorId(0),
            CpuVariant::X86_64 {
                avx: AvxSupport::Avx512,
            },
        );
        let op = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };
        // `execute` must succeed exactly when the processor advertises
        // support for the operation. (The previous assertion,
        // `result.is_ok() || result.is_err()`, was a tautology that could
        // never fail and therefore tested nothing.)
        let supported = cpu.can_execute(&op);
        let result = cpu.execute(op).await;
        assert_eq!(result.is_ok(), supported);
        if let Ok(res) = result {
            // Simulated execution reports a non-negative energy cost.
            assert!(res.energy >= 0.0);
        }
    }
}

View file

@ -0,0 +1,543 @@
//! Operation definitions for heterogeneous compute.
use super::Precision;
use serde::{Deserialize, Serialize};
/// Operation types for processor matching.
///
/// Used as a bare tag (via `Operation::op_type`) for capability-set lookups;
/// the parameterized counterpart is the [`Operation`] enum.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum OperationType {
    // Matrix operations
    MatMul,
    Conv2d,
    Conv3d,
    DepthwiseConv,
    BatchNorm,
    LayerNorm,
    // Attention operations
    SelfAttention,
    CrossAttention,
    FlashAttention,
    // Element-wise operations
    Add,
    Mul,
    ReLU,
    GeLU,
    SiLU,
    Softmax,
    // Reduction operations
    Sum,
    Mean,
    Max,
    ArgMax,
    // Data movement
    Transpose,
    Reshape,
    Concat,
    Split,
    Gather,
    Scatter,
    // LLM specific
    Embedding,
    RoPE, // Rotary Position Embedding
    KVCache,
    TopK,
    Sampling,
    // I/O operations (typically CPU-only; see capability profiles)
    DataLoad,
    DataPreprocess,
    Tokenization,
    Detokenization,
    Checkpoint,
    // Distributed operations
    AllReduce,
    AllGather,
    ReduceScatter,
    // Training specific
    Backward,
    OptimizerStep,
    GradientClip,
}
/// Concrete operation with parameters.
///
/// Each variant carries the shape/precision information needed by the cost
/// model (`estimated_flops`, `precision`); `op_type` maps a variant to its
/// bare [`OperationType`] tag.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum Operation {
    /// Matrix multiplication.
    MatMul {
        m: usize,
        n: usize,
        k: usize,
        precision: Precision,
    },
    /// 2D Convolution.
    Conv2d {
        batch: usize,
        in_channels: usize,
        out_channels: usize,
        height: usize,
        width: usize,
        kernel_size: usize,
        precision: Precision,
    },
    /// Batch normalization.
    BatchNorm {
        batch: usize,
        channels: usize,
        spatial: usize,
        precision: Precision,
    },
    /// Layer normalization.
    LayerNorm {
        batch: usize,
        seq_len: usize,
        hidden: usize,
        precision: Precision,
    },
    /// Self-attention.
    SelfAttention {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },
    /// Flash attention (fused, memory efficient).
    FlashAttention {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },
    /// Element-wise addition.
    Add {
        elements: usize,
        precision: Precision,
    },
    /// Element-wise multiplication.
    Mul {
        elements: usize,
        precision: Precision,
    },
    /// ReLU activation.
    ReLU { elements: usize },
    /// GeLU activation.
    GeLU { elements: usize },
    /// SiLU (Swish) activation.
    SiLU { elements: usize },
    /// Softmax.
    Softmax {
        batch: usize,
        seq_len: usize,
        precision: Precision,
    },
    /// Embedding lookup.
    Embedding {
        batch: usize,
        seq_len: usize,
        vocab_size: usize,
        embed_dim: usize,
        precision: Precision,
    },
    /// Rotary Position Embedding.
    RoPE {
        batch: usize,
        seq_len: usize,
        head_dim: usize,
        precision: Precision,
    },
    /// KV Cache update.
    KVCache {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },
    /// Top-K sampling.
    TopK {
        batch: usize,
        vocab_size: usize,
        k: usize,
    },
    /// Token sampling.
    Sampling {
        batch: usize,
        vocab_size: usize,
        temperature: f32,
    },
    /// Data loading from storage.
    DataLoad {
        bytes: usize,
        async_: bool,
    },
    /// Data preprocessing.
    DataPreprocess {
        batch: usize,
        transforms: Vec<String>,
    },
    /// Tokenization.
    Tokenization {
        text_bytes: usize,
        vocab_size: usize,
    },
    /// Detokenization.
    Detokenization {
        tokens: usize,
        vocab_size: usize,
    },
    /// Checkpoint save.
    Checkpoint {
        bytes: usize,
        async_: bool,
    },
    /// All-reduce across devices.
    AllReduce {
        elements: usize,
        precision: Precision,
        devices: usize,
    },
    /// Backward pass for a layer.
    Backward {
        // Boxed to keep this variant (and the enum) small; also breaks
        // the recursive type.
        forward_op: Box<Operation>,
    },
    /// Optimizer step.
    OptimizerStep {
        parameters: usize,
        optimizer: String,
        precision: Precision,
    },
    /// Transpose.
    Transpose {
        shape: Vec<usize>,
        axes: Vec<usize>,
    },
    /// Reshape.
    Reshape {
        from: Vec<usize>,
        to: Vec<usize>,
    },
    /// Concatenate tensors.
    Concat {
        shapes: Vec<Vec<usize>>,
        axis: usize,
    },
    /// Generic operation with caller-supplied cost figures.
    Generic {
        op_type: OperationType,
        flops: f64,
        memory: u64,
    },
}
impl Operation {
    /// Returns the operation type.
    ///
    /// A pure one-to-one mapping from the parameterized variant to its bare
    /// [`OperationType`] tag, used for capability-set lookups.
    pub fn op_type(&self) -> OperationType {
        match self {
            Operation::MatMul { .. } => OperationType::MatMul,
            Operation::Conv2d { .. } => OperationType::Conv2d,
            Operation::BatchNorm { .. } => OperationType::BatchNorm,
            Operation::LayerNorm { .. } => OperationType::LayerNorm,
            Operation::SelfAttention { .. } => OperationType::SelfAttention,
            Operation::FlashAttention { .. } => OperationType::FlashAttention,
            Operation::Add { .. } => OperationType::Add,
            Operation::Mul { .. } => OperationType::Mul,
            Operation::ReLU { .. } => OperationType::ReLU,
            Operation::GeLU { .. } => OperationType::GeLU,
            Operation::SiLU { .. } => OperationType::SiLU,
            Operation::Softmax { .. } => OperationType::Softmax,
            Operation::Embedding { .. } => OperationType::Embedding,
            Operation::RoPE { .. } => OperationType::RoPE,
            Operation::KVCache { .. } => OperationType::KVCache,
            Operation::TopK { .. } => OperationType::TopK,
            Operation::Sampling { .. } => OperationType::Sampling,
            Operation::DataLoad { .. } => OperationType::DataLoad,
            Operation::DataPreprocess { .. } => OperationType::DataPreprocess,
            Operation::Tokenization { .. } => OperationType::Tokenization,
            Operation::Detokenization { .. } => OperationType::Detokenization,
            Operation::Checkpoint { .. } => OperationType::Checkpoint,
            Operation::AllReduce { .. } => OperationType::AllReduce,
            // Note: reported as Backward even though the boxed forward op
            // carries its own type.
            Operation::Backward { .. } => OperationType::Backward,
            Operation::OptimizerStep { .. } => OperationType::OptimizerStep,
            Operation::Transpose { .. } => OperationType::Transpose,
            Operation::Reshape { .. } => OperationType::Reshape,
            Operation::Concat { .. } => OperationType::Concat,
            Operation::Generic { op_type, .. } => *op_type,
        }
    }
/// Returns the precision used.
pub fn precision(&self) -> Precision {
match self {
Operation::MatMul { precision, .. }
| Operation::Conv2d { precision, .. }
| Operation::BatchNorm { precision, .. }
| Operation::LayerNorm { precision, .. }
| Operation::SelfAttention { precision, .. }
| Operation::FlashAttention { precision, .. }
| Operation::Add { precision, .. }
| Operation::Mul { precision, .. }
| Operation::Softmax { precision, .. }
| Operation::Embedding { precision, .. }
| Operation::RoPE { precision, .. }
| Operation::KVCache { precision, .. }
| Operation::AllReduce { precision, .. }
| Operation::OptimizerStep { precision, .. } => *precision,
Operation::Backward { forward_op } => forward_op.precision(),
_ => Precision::Fp32, // Default
}
}
/// Estimates FLOPS for the operation.
pub fn estimated_flops(&self) -> f64 {
match self {
// MatMul: 2 * M * N * K (multiply-add)
Operation::MatMul { m, n, k, .. } => 2.0 * (*m as f64) * (*n as f64) * (*k as f64),
// Conv2d: 2 * batch * out * H * W * in * K * K
Operation::Conv2d {
batch,
in_channels,
out_channels,
height,
width,
kernel_size,
..
} => {
2.0 * (*batch as f64)
* (*out_channels as f64)
* (*height as f64)
* (*width as f64)
* (*in_channels as f64)
* (*kernel_size as f64)
* (*kernel_size as f64)
}
// Self-attention: 4 * batch * seq * seq * head_dim * heads
Operation::SelfAttention {
batch,
seq_len,
num_heads,
head_dim,
..
}
| Operation::FlashAttention {
batch,
seq_len,
num_heads,
head_dim,
..
} => {
4.0 * (*batch as f64)
* (*seq_len as f64)
* (*seq_len as f64)
* (*head_dim as f64)
* (*num_heads as f64)
}
// Element-wise: 1 FLOP per element
Operation::Add { elements, .. }
| Operation::Mul { elements, .. }
| Operation::ReLU { elements }
| Operation::GeLU { elements }
| Operation::SiLU { elements } => *elements as f64,
// Softmax: ~5 ops per element (exp, sum, div)
Operation::Softmax {
batch, seq_len, ..
} => 5.0 * (*batch as f64) * (*seq_len as f64),
// Embedding: just lookup, minimal FLOPS
Operation::Embedding {
batch,
seq_len,
embed_dim,
..
} => (*batch as f64) * (*seq_len as f64) * (*embed_dim as f64) * 0.1,
// Backward: ~2x forward
Operation::Backward { forward_op } => forward_op.estimated_flops() * 2.0,
// Generic
Operation::Generic { flops, .. } => *flops,
// I/O operations: minimal compute
_ => 1000.0,
}
}
/// Estimates memory usage (bytes).
pub fn estimated_memory(&self) -> u64 {
let precision_bytes = match self.precision() {
Precision::Fp64 => 8,
Precision::Fp32 => 4,
Precision::Fp16 | Precision::Bf16 => 2,
Precision::Int8 => 1,
Precision::Int4 => 1, // Rounded up
};
match self {
Operation::MatMul { m, n, k, .. } => {
// Input A (m×k) + Input B (k×n) + Output (m×n)
((*m * *k) + (*k * *n) + (*m * *n)) as u64 * precision_bytes
}
Operation::SelfAttention {
batch,
seq_len,
num_heads,
head_dim,
..
} => {
// Q, K, V, Output, intermediate attention
5 * (*batch as u64)
* (*seq_len as u64)
* (*num_heads as u64)
* (*head_dim as u64)
* precision_bytes
}
Operation::FlashAttention {
batch,
seq_len,
num_heads,
head_dim,
..
} => {
// FlashAttention uses much less memory
2 * (*batch as u64)
* (*seq_len as u64)
* (*num_heads as u64)
* (*head_dim as u64)
* precision_bytes
}
Operation::KVCache {
batch,
seq_len,
num_heads,
head_dim,
..
} => {
// K and V caches
2 * (*batch as u64)
* (*seq_len as u64)
* (*num_heads as u64)
* (*head_dim as u64)
* precision_bytes
}
Operation::Generic { memory, .. } => *memory,
_ => 1024 * 1024, // 1 MB default
}
}
/// Creates the backward operation for this operation.
pub fn backward(&self) -> Option<Operation> {
match self {
Operation::MatMul { .. }
| Operation::Conv2d { .. }
| Operation::SelfAttention { .. }
| Operation::FlashAttention { .. }
| Operation::LayerNorm { .. }
| Operation::BatchNorm { .. } => Some(Operation::Backward {
forward_op: Box::new(self.clone()),
}),
_ => None,
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_matmul_flops() {
        let matmul = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };
        // 2 * 1024^3 ≈ 2.147e9 multiply-adds.
        let estimate = matmul.estimated_flops();
        assert!(estimate > 2e9 && estimate < 2.2e9);
    }

    #[test]
    fn test_attention_memory() {
        let dense = Operation::SelfAttention {
            batch: 1,
            seq_len: 4096,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };
        let tiled = Operation::FlashAttention {
            batch: 1,
            seq_len: 4096,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };
        // The tiled FlashAttention estimate must come in below the
        // materialized self-attention estimate for identical shapes.
        assert!(tiled.estimated_memory() < dense.estimated_memory());
    }

    #[test]
    fn test_backward_creation() {
        let forward = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };
        match forward.backward() {
            Some(Operation::Backward { forward_op }) => {
                assert!(matches!(*forward_op, Operation::MatMul { .. }));
            }
            _ => panic!("MatMul must produce a Backward wrapper"),
        }
    }
}

View file

@ -0,0 +1,513 @@
//! Pre-defined processor profiles for common hardware.
use super::capabilities::{
ComputeThroughput, MemorySpecs, MemoryType, PowerCharacteristics, ProcessorCapabilities,
WorkloadCharacteristic,
};
use super::operation::OperationType;
use super::types::PowerTier;
use super::TpuVersion;
use std::collections::HashSet;
/// Pre-defined processor profiles.
///
/// A zero-sized namespace type: every profile is an associated function
/// returning a fully populated `ProcessorCapabilities` for a specific SKU.
pub struct ProcessorProfiles;
impl ProcessorProfiles {
    // ═══════════════════════════════════════════════════════════════
    // CPU PROFILES
    // ═══════════════════════════════════════════════════════════════
    /// Default CPU profile.
    pub fn cpu_default() -> ProcessorCapabilities {
        ProcessorCapabilities::cpu(8, 3.5, false)
    }
    /// AMD EPYC 9654 (96 cores).
    // NOTE(review): throughput numbers here and below look like vendor
    // peak figures — confirm against datasheets before relying on them.
    pub fn amd_epyc_9654() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 2.7,
                fp32_tflops: 5.4,
                fp16_tflops: 10.8,
                bf16_tflops: 10.8,
                int8_tops: 21.6,
                int4_tops: 43.2,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 6 * 1024 * 1024 * 1024 * 1024, // 6 TB max
                bandwidth_gbps: 460,
                type_: MemoryType::Ddr5,
            },
            // Reuse the generic CPU constructor's supported-op set;
            // only throughput/memory/power specs differ per SKU.
            operations: ProcessorCapabilities::cpu(96, 2.4, false)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 360,
                efficiency: 0.85,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::MemoryBound,
            ],
        }
    }
    /// Intel Xeon w9-3595X (56 cores).
    pub fn intel_xeon_w9_3595x() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 3.2,
                fp32_tflops: 6.4,
                fp16_tflops: 12.8,
                bf16_tflops: 12.8,
                int8_tops: 25.6,
                int4_tops: 51.2,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 4 * 1024 * 1024 * 1024 * 1024, // 4 TB max
                bandwidth_gbps: 307,
                type_: MemoryType::Ddr5,
            },
            operations: ProcessorCapabilities::cpu(56, 2.9, true)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 350,
                efficiency: 0.80,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::MemoryBound,
            ],
        }
    }
    /// Apple M3 Max CPU cores.
    pub fn apple_m3_max_cpu() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.3,
                fp32_tflops: 0.6,
                fp16_tflops: 1.2,
                bf16_tflops: 1.2,
                int8_tops: 2.4,
                int4_tops: 4.8,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 128 * 1024 * 1024 * 1024, // 128 GB unified
                bandwidth_gbps: 400,
                type_: MemoryType::Unified,
            },
            operations: ProcessorCapabilities::cpu(16, 4.0, false)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 40,
                efficiency: 0.95,
                power_tier: PowerTier::Low,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::LowPower,
            ],
        }
    }
    // ═══════════════════════════════════════════════════════════════
    // NVIDIA GPU PROFILES
    // ═══════════════════════════════════════════════════════════════
    /// Default NVIDIA GPU profile.
    // Args: CUDA cores, tensor cores, memory GB, bandwidth GB/s,
    // compute capability (major, minor) — assumed from usage below.
    pub fn nvidia_default() -> ProcessorCapabilities {
        ProcessorCapabilities::nvidia_gpu(8192, 256, 12, 600, (8, 0))
    }
    /// NVIDIA H100 SXM (80GB).
    pub fn nvidia_h100() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 67.0,
                fp32_tflops: 67.0,
                fp16_tflops: 1979.0, // With sparsity
                bf16_tflops: 1979.0,
                int8_tops: 3958.0,
                int4_tops: 7916.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 80 * 1024 * 1024 * 1024,
                bandwidth_gbps: 3350,
                type_: MemoryType::Hbm3,
            },
            operations: ProcessorCapabilities::nvidia_gpu(16896, 528, 80, 3350, (9, 0))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 700,
                efficiency: 0.90,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::ComputeBound,
            ],
        }
    }
    /// NVIDIA A100 (80GB).
    pub fn nvidia_a100() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 19.5,
                fp32_tflops: 19.5,
                fp16_tflops: 624.0, // With sparsity
                bf16_tflops: 624.0,
                int8_tops: 1248.0,
                int4_tops: 2496.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 80 * 1024 * 1024 * 1024,
                bandwidth_gbps: 2039,
                type_: MemoryType::Hbm2e,
            },
            operations: ProcessorCapabilities::nvidia_gpu(6912, 432, 80, 2039, (8, 0))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 400,
                efficiency: 0.88,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::ComputeBound,
            ],
        }
    }
    /// NVIDIA RTX 4090.
    pub fn nvidia_rtx_4090() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 1.3,
                fp32_tflops: 82.6,
                fp16_tflops: 330.4, // With sparsity
                bf16_tflops: 330.4,
                int8_tops: 660.8,
                int4_tops: 1321.6,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 24 * 1024 * 1024 * 1024,
                bandwidth_gbps: 1008,
                type_: MemoryType::Gddr6,
            },
            operations: ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1008, (8, 9))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 450,
                efficiency: 0.85,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
            ],
        }
    }
    /// NVIDIA RTX 3090.
    pub fn nvidia_rtx_3090() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.6,
                fp32_tflops: 35.6,
                fp16_tflops: 71.2,
                bf16_tflops: 71.2,
                int8_tops: 142.4,
                int4_tops: 284.8,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 24 * 1024 * 1024 * 1024,
                bandwidth_gbps: 936,
                type_: MemoryType::Gddr6,
            },
            operations: ProcessorCapabilities::nvidia_gpu(10496, 328, 24, 936, (8, 6))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 350,
                efficiency: 0.82,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
            ],
        }
    }
    // ═══════════════════════════════════════════════════════════════
    // AMD GPU PROFILES
    // ═══════════════════════════════════════════════════════════════
    /// AMD MI300X.
    pub fn amd_mi300x() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 163.4,
                fp32_tflops: 163.4,
                fp16_tflops: 1307.0,
                bf16_tflops: 1307.0,
                int8_tops: 2614.0,
                int4_tops: 5228.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 192 * 1024 * 1024 * 1024, // 192 GB HBM3
                bandwidth_gbps: 5300,
                type_: MemoryType::Hbm3,
            },
            // NOTE(review): AMD parts borrow the NVIDIA constructor's op
            // set as an approximation, then drop ops that differ on ROCm.
            operations: {
                let mut ops = ProcessorCapabilities::nvidia_gpu(16384, 512, 80, 5300, (9, 0))
                    .operations;
                ops.remove(&OperationType::FlashAttention); // Different implementation
                ops
            },
            power: PowerCharacteristics {
                tdp_watts: 750,
                efficiency: 0.88,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::MemoryBound, // High memory bandwidth
            ],
        }
    }
    /// AMD RX 7900 XTX.
    pub fn amd_rx_7900_xtx() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 1.9,
                fp32_tflops: 61.0,
                fp16_tflops: 122.0,
                bf16_tflops: 122.0,
                int8_tops: 244.0,
                int4_tops: 488.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 24 * 1024 * 1024 * 1024,
                bandwidth_gbps: 960,
                type_: MemoryType::Gddr6,
            },
            operations: {
                // Zero tensor cores passed here — consumer RDNA part.
                let mut ops = ProcessorCapabilities::nvidia_gpu(6144, 0, 24, 960, (8, 0))
                    .operations;
                ops.remove(&OperationType::FlashAttention);
                ops
            },
            power: PowerCharacteristics {
                tdp_watts: 355,
                efficiency: 0.80,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
            ],
        }
    }
    // ═══════════════════════════════════════════════════════════════
    // GOOGLE TPU PROFILES
    // ═══════════════════════════════════════════════════════════════
    /// Default TPU profile.
    pub fn google_tpu_default() -> ProcessorCapabilities {
        ProcessorCapabilities::tpu(TpuVersion::V4)
    }
    /// Google TPU v5p.
    pub fn google_tpu_v5p() -> ProcessorCapabilities {
        ProcessorCapabilities::tpu(TpuVersion::V5p)
    }
    /// Google TPU v4.
    pub fn google_tpu_v4() -> ProcessorCapabilities {
        ProcessorCapabilities::tpu(TpuVersion::V4)
    }
    /// Google Edge TPU.
    pub fn google_edge_tpu() -> ProcessorCapabilities {
        ProcessorCapabilities {
            // Integer-only accelerator: all floating-point rates are zero.
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: 0.0,
                fp16_tflops: 0.0,
                bf16_tflops: 0.0,
                int8_tops: 4.0,
                int4_tops: 8.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses host memory
                bandwidth_gbps: 0,
                type_: MemoryType::Unified,
            },
            // Hand-built op set: the Edge TPU supports only a small
            // fixed menu of inference primitives.
            operations: {
                let mut ops = HashSet::new();
                ops.insert(OperationType::MatMul);
                ops.insert(OperationType::Conv2d);
                ops.insert(OperationType::DepthwiseConv);
                ops.insert(OperationType::Add);
                ops.insert(OperationType::Mul);
                ops.insert(OperationType::ReLU);
                ops.insert(OperationType::Softmax);
                ops
            },
            power: PowerCharacteristics {
                tdp_watts: 2,
                efficiency: 0.95,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
    // ═══════════════════════════════════════════════════════════════
    // GROQ LPU PROFILE
    // ═══════════════════════════════════════════════════════════════
    /// Groq LPU.
    pub fn groq_lpu() -> ProcessorCapabilities {
        ProcessorCapabilities::lpu()
    }
    // ═══════════════════════════════════════════════════════════════
    // APPLE NEURAL ENGINE PROFILES
    // ═══════════════════════════════════════════════════════════════
    /// Apple Neural Engine (generic).
    pub fn apple_neural_engine(cores: u32) -> ProcessorCapabilities {
        ProcessorCapabilities::apple_neural_engine(cores)
    }
    /// Apple M3 Neural Engine (16 cores).
    pub fn apple_m3_neural_engine() -> ProcessorCapabilities {
        ProcessorCapabilities::apple_neural_engine(16)
    }
    /// Apple M3 Max Neural Engine (16 cores).
    pub fn apple_m3_max_neural_engine() -> ProcessorCapabilities {
        ProcessorCapabilities::apple_neural_engine(16) // Same as M3
    }
    /// Apple A17 Pro Neural Engine (35 TOPS).
    pub fn apple_a17_pro_neural_engine() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: 4.4,
                fp16_tflops: 8.8,
                bf16_tflops: 8.8,
                int8_tops: 35.0,
                int4_tops: 70.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses unified memory
                bandwidth_gbps: 200,
                type_: MemoryType::Unified,
            },
            operations: ProcessorCapabilities::apple_neural_engine(16)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 8,
                efficiency: 0.98,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
    // ═══════════════════════════════════════════════════════════════
    // QUALCOMM NPU PROFILES
    // ═══════════════════════════════════════════════════════════════
    /// Qualcomm Hexagon NPU (Snapdragon 8 Gen 3).
    pub fn qualcomm_hexagon_8g3() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: 3.0,
                fp16_tflops: 6.0,
                bf16_tflops: 6.0,
                int8_tops: 73.0, // 73 TOPS
                int4_tops: 146.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses system memory
                bandwidth_gbps: 77,
                type_: MemoryType::Lpddr,
            },
            // NOTE(review): borrows the Apple ANE op set as an
            // approximation of Hexagon's supported operations — verify.
            operations: ProcessorCapabilities::apple_neural_engine(16)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 10,
                efficiency: 0.95,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_h100_profile() {
        let profile = ProcessorProfiles::nvidia_h100();
        // Sparse FP16 throughput is quoted at nearly 2 PFLOPS.
        assert!(profile.compute.fp16_tflops > 1000.0);
        assert_eq!(profile.memory.capacity_bytes, 80 * 1024 * 1024 * 1024);
    }

    #[test]
    fn test_tpu_v5p_profile() {
        assert!(ProcessorProfiles::google_tpu_v5p().compute.bf16_tflops > 900.0);
    }

    #[test]
    fn test_groq_lpu_profile() {
        // The LPU's on-chip SRAM bandwidth dwarfs any HBM part.
        assert!(ProcessorProfiles::groq_lpu().memory.bandwidth_gbps > 50000);
    }

    #[test]
    fn test_apple_ane_profile() {
        let profile = ProcessorProfiles::apple_m3_neural_engine();
        assert!(profile.power.tdp_watts < 20);
        assert!(profile
            .optimal_for
            .contains(&WorkloadCharacteristic::LowPower));
    }
}

View file

@ -0,0 +1,367 @@
//! Processor type definitions.
use serde::{Deserialize, Serialize};
/// All supported processor types.
///
/// The variant identifies the vendor/architecture family; the payload
/// (where present) carries the family-specific detail needed for
/// capability and routing decisions.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ProcessorType {
    /// Central Processing Unit.
    Cpu(CpuVariant),
    /// Graphics Processing Unit.
    Gpu(GpuVariant),
    /// Tensor Processing Unit (Google).
    Tpu(TpuVersion),
    /// Neural Processing Unit (various vendors).
    Npu(NpuVariant),
    /// Language Processing Unit (Groq).
    Lpu,
    /// Field Programmable Gate Array.
    Fpga(FpgaVendor),
    /// Digital Signal Processor.
    Dsp(DspVariant),
    /// WebGPU (browser).
    WebGpu,
    /// WebAssembly runtime.
    Wasm,
    /// Custom/Unknown accelerator.
    Custom {
        /// Free-form vendor name.
        vendor: String,
        /// Free-form model identifier.
        model: String,
    },
}
impl Default for ProcessorType {
    /// Defaults to a CPU with the default [`CpuVariant`]
    /// (x86-64 with AVX2).
    fn default() -> Self {
        Self::Cpu(CpuVariant::default())
    }
}
/// CPU architecture variants.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum CpuVariant {
    /// x86-64 architecture.
    X86_64 {
        /// Highest supported AVX instruction-set level.
        avx: AvxSupport,
    },
    /// ARM 64-bit architecture.
    Arm64 {
        /// Whether SVE (Scalable Vector Extension) is available.
        sve: bool,
    },
    /// RISC-V architecture.
    RiscV {
        /// Whether the "V" vector extension is available.
        vector: bool,
    },
}
impl Default for CpuVariant {
fn default() -> Self {
CpuVariant::X86_64 {
avx: AvxSupport::Avx2,
}
}
}
/// AVX instruction set support levels.
///
/// Variants are declared from least to most capable, so the derived
/// `PartialOrd`/`Ord` compare support levels directly
/// (e.g. `Avx512 > Avx2`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AvxSupport {
    /// No AVX.
    None,
    /// AVX (Sandy Bridge+).
    Avx,
    /// AVX2 (Haswell+).
    Avx2,
    /// AVX-512 (Skylake-X+).
    Avx512,
    /// AVX10 (future).
    Avx10,
}
/// GPU vendor variants.
///
/// The variant encodes the vendor/API family; payloads carry the
/// version or model detail where it matters for capability checks.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum GpuVariant {
    /// NVIDIA CUDA GPU.
    NvidiaCuda {
        /// Compute capability (major, minor).
        compute_capability: (u8, u8),
    },
    /// AMD ROCm GPU.
    AmdRocm {
        /// GFX version (e.g., 1100 for RDNA3).
        gfx_version: u32,
    },
    /// Intel OneAPI GPU.
    IntelOneApi,
    /// Apple Metal GPU.
    AppleMetal,
    /// Qualcomm Adreno GPU.
    QualcommAdreno {
        /// Adreno model number.
        model: u32,
    },
    /// ARM Mali GPU.
    ArmMali {
        /// Mali generation (e.g., G710).
        model: u32,
    },
    /// IMG PowerVR GPU.
    ImgPowerVr,
}
/// Google TPU versions.
///
/// `Edge` is the low-power inference part (see
/// `ProcessorType::is_low_power`); the numbered versions are data-center
/// hardware.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum TpuVersion {
    /// TPU v2.
    V2,
    /// TPU v3.
    V3,
    /// TPU v4.
    V4,
    /// TPU v4i (inference).
    V4i,
    /// TPU v5e (efficiency).
    V5e,
    /// TPU v5p (performance).
    V5p,
    /// Edge TPU.
    Edge,
}
/// NPU (Neural Processing Unit) variants.
///
/// Covers the major mobile/embedded AI accelerators; anything not
/// listed can be described via `Custom { tops }`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum NpuVariant {
    /// Apple Neural Engine.
    AppleNeuralEngine {
        /// Number of cores.
        cores: u32,
    },
    /// Qualcomm Hexagon DSP/NPU.
    QualcommHexagon {
        /// Version number.
        version: u32,
    },
    /// Intel VPU (Movidius).
    IntelVpu,
    /// Huawei Ascend.
    HuaweiAscend {
        /// Model (310, 910, etc.).
        model: u32,
    },
    /// Google Edge TPU.
    GoogleEdgeTpu,
    /// Samsung NPU.
    SamsungNpu,
    /// MediaTek APU.
    MediaTekApu {
        /// Version.
        version: u32,
    },
    /// Custom NPU.
    Custom {
        /// TOPS (Tera Operations Per Second).
        tops: u32,
    },
}
/// FPGA vendors.
///
/// Vendor identity only — no family/part detail is tracked here.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FpgaVendor {
    /// Xilinx (AMD).
    Xilinx,
    /// Intel (Altera).
    Intel,
    /// Lattice.
    Lattice,
    /// Microchip.
    Microchip,
}
/// DSP (Digital Signal Processor) variants.
///
/// Note Qualcomm Hexagon also appears in [`NpuVariant`]; this variant
/// is for the part acting as a plain DSP rather than an NPU.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DspVariant {
    /// Texas Instruments DSP.
    TexasInstruments,
    /// Analog Devices DSP.
    AnalogDevices,
    /// Qualcomm Hexagon DSP.
    QualcommHexagon,
    /// Custom DSP.
    Custom,
}
impl ProcessorType {
/// Returns whether this processor type supports CUDA.
pub fn supports_cuda(&self) -> bool {
matches!(self, ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. }))
}
/// Returns whether this processor type supports ROCm.
pub fn supports_rocm(&self) -> bool {
matches!(self, ProcessorType::Gpu(GpuVariant::AmdRocm { .. }))
}
/// Returns whether this processor type supports Metal.
pub fn supports_metal(&self) -> bool {
matches!(self, ProcessorType::Gpu(GpuVariant::AppleMetal))
}
/// Returns whether this processor type is a GPU.
pub fn is_gpu(&self) -> bool {
matches!(self, ProcessorType::Gpu(_))
}
/// Returns whether this processor type is a CPU.
pub fn is_cpu(&self) -> bool {
matches!(self, ProcessorType::Cpu(_))
}
/// Returns whether this processor type is suitable for parallel workloads.
pub fn is_parallel(&self) -> bool {
matches!(
self,
ProcessorType::Gpu(_) | ProcessorType::Tpu(_) | ProcessorType::Fpga(_)
)
}
/// Returns whether this processor type is suitable for sequential workloads.
pub fn is_sequential(&self) -> bool {
matches!(self, ProcessorType::Cpu(_) | ProcessorType::Lpu)
}
/// Returns whether this processor type is power-efficient.
pub fn is_low_power(&self) -> bool {
matches!(
self,
ProcessorType::Npu(_) | ProcessorType::Tpu(TpuVersion::Edge) | ProcessorType::Wasm
)
}
/// Returns the typical power consumption tier.
pub fn power_tier(&self) -> PowerTier {
match self {
ProcessorType::Npu(_) | ProcessorType::Wasm => PowerTier::UltraLow,
ProcessorType::Cpu(CpuVariant::Arm64 { .. }) => PowerTier::Low,
ProcessorType::Cpu(_) => PowerTier::Medium,
ProcessorType::Gpu(GpuVariant::AppleMetal) => PowerTier::Medium,
ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability })
if compute_capability.0 >= 8 =>
{
PowerTier::High
}
ProcessorType::Gpu(_) => PowerTier::Medium,
ProcessorType::Tpu(TpuVersion::Edge) => PowerTier::UltraLow,
ProcessorType::Tpu(_) => PowerTier::High,
ProcessorType::Lpu => PowerTier::Medium,
ProcessorType::Fpga(_) => PowerTier::Medium,
ProcessorType::Dsp(_) => PowerTier::Low,
ProcessorType::WebGpu => PowerTier::Low,
ProcessorType::Custom { .. } => PowerTier::Medium,
}
}
}
/// Power consumption tiers.
///
/// Declared from lowest to highest draw, so the derived
/// `PartialOrd`/`Ord` give `UltraLow < Low < Medium < High`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PowerTier {
    /// < 5W (mobile, IoT).
    UltraLow,
    /// 5-30W (laptop, tablet).
    Low,
    /// 30-150W (desktop, workstation).
    Medium,
    /// > 150W (server, data center).
    High,
}
/// Device class for routing decisions.
///
/// Drives the availability and reliability heuristics below
/// (`typical_availability_hours`, `reliability_score`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DeviceClass {
    /// Data center equipment.
    DataCenter,
    /// Desktop/workstation.
    Desktop,
    /// Laptop.
    Laptop,
    /// Mobile phone.
    Mobile,
    /// Tablet.
    Tablet,
    /// IoT device.
    IoT,
    /// Browser (WebGPU/WASM).
    Browser,
    /// Edge server.
    Edge,
}
impl DeviceClass {
/// Returns typical available compute hours per day.
pub fn typical_availability_hours(&self) -> f32 {
match self {
DeviceClass::DataCenter => 24.0,
DeviceClass::Desktop => 8.0,
DeviceClass::Laptop => 6.0,
DeviceClass::Mobile => 4.0,
DeviceClass::Tablet => 4.0,
DeviceClass::IoT => 24.0,
DeviceClass::Browser => 2.0,
DeviceClass::Edge => 24.0,
}
}
/// Returns reliability score (0-100).
pub fn reliability_score(&self) -> u32 {
match self {
DeviceClass::DataCenter => 99,
DeviceClass::Edge => 95,
DeviceClass::Desktop => 80,
DeviceClass::Laptop => 60,
DeviceClass::Mobile => 40,
DeviceClass::Tablet => 50,
DeviceClass::IoT => 70,
DeviceClass::Browser => 30,
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_processor_type_properties() {
        let cuda_gpu = ProcessorType::Gpu(GpuVariant::NvidiaCuda {
            compute_capability: (9, 0),
        });
        assert!(cuda_gpu.supports_cuda());
        assert!(cuda_gpu.is_gpu());
        assert!(cuda_gpu.is_parallel());

        let x86 = ProcessorType::Cpu(CpuVariant::X86_64 {
            avx: AvxSupport::Avx512,
        });
        assert!(x86.is_cpu());
        assert!(x86.is_sequential());

        // LPUs are sequential; NPUs are power-efficient.
        assert!(ProcessorType::Lpu.is_sequential());
        assert!(ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 }).is_low_power());
    }

    #[test]
    fn test_power_tiers() {
        let hopper = ProcessorType::Gpu(GpuVariant::NvidiaCuda {
            compute_capability: (9, 0),
        });
        assert_eq!(hopper.power_tier(), PowerTier::High);

        let ane = ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 });
        assert_eq!(ane.power_tier(), PowerTier::UltraLow);

        let arm_cpu = ProcessorType::Cpu(CpuVariant::Arm64 { sve: false });
        assert_eq!(arm_cpu.power_tier(), PowerTier::Low);
    }

    #[test]
    fn test_device_class() {
        assert_eq!(DeviceClass::DataCenter.typical_availability_hours(), 24.0);
        assert_eq!(DeviceClass::Mobile.typical_availability_hours(), 4.0);
        assert_eq!(DeviceClass::DataCenter.reliability_score(), 99);
        assert_eq!(DeviceClass::Browser.reliability_score(), 30);
    }
}

View file

@ -0,0 +1,810 @@
//! Load balancer with work stealing for heterogeneous compute.
//!
//! Supports:
//! - Cross-processor-type work migration
//! - Energy-aware balancing
//! - Latency-aware scheduling
//! - Real-time utilization metrics
use crate::device::{DeviceInfo, DeviceRegistry};
use crate::processor::{Operation, OperationType, ProcessorId, ProcessorType};
use crate::task::{Task, TaskId, TaskPriority};
use super::TaskAssignment;
use parking_lot::RwLock;
use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};
/// Balancing strategy for the load balancer.
///
/// The `Default` impl selects `Balanced`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BalancingStrategy {
    /// Optimize for speed (minimize execution time).
    Speed,
    /// Optimize for energy efficiency.
    Energy,
    /// Balance speed and energy.
    Balanced,
    /// Optimize for cost (spot pricing).
    Cost,
    /// Optimize for latency (inference workloads).
    Latency,
}
impl Default for BalancingStrategy {
fn default() -> Self {
BalancingStrategy::Balanced
}
}
/// Real-time processor metrics.
///
/// A plain snapshot struct; values are supplied by callers of
/// `LoadBalancer::update_metrics`, which stamps `last_updated`.
#[derive(Clone, Debug, Default)]
pub struct ProcessorMetrics {
    /// Current utilization (0.0 - 1.0).
    pub utilization: f64,
    /// Queue depth (pending tasks).
    pub queue_depth: u64,
    /// Average task completion time (ms).
    pub avg_completion_ms: f64,
    /// Tasks completed in last minute.
    pub throughput_per_min: u64,
    /// Current power draw (watts).
    pub power_watts: f64,
    /// Temperature (celsius).
    pub temperature: f64,
    /// Last updated timestamp.
    ///
    /// `None` until the first metrics update is recorded.
    pub last_updated: Option<Instant>,
}
/// Load balancer for heterogeneous compute environments.
///
/// All mutable state lives behind `parking_lot::RwLock`s so a single
/// instance can be shared (e.g. in an `Arc`) across scheduler threads.
pub struct LoadBalancer {
    /// Device registry for processor info.
    device_registry: Option<Arc<DeviceRegistry>>,
    /// Current load per processor (task count).
    ///
    /// `AtomicU64` values let counters be bumped while holding only the
    /// map's *read* lock (see `increment_load`/`decrement_load`).
    loads: RwLock<HashMap<ProcessorId, AtomicU64>>,
    /// Real-time metrics per processor.
    metrics: RwLock<HashMap<ProcessorId, ProcessorMetrics>>,
    /// Processor type mapping.
    processor_types: RwLock<HashMap<ProcessorId, ProcessorType>>,
    /// Work stealing threshold (0.0 - 1.0).
    steal_threshold: f64,
    /// Rebalance threshold (0.0 - 1.0).
    rebalance_threshold: f64,
    /// Current balancing strategy.
    strategy: RwLock<BalancingStrategy>,
    /// Migration history (to prevent thrashing).
    migration_history: RwLock<Vec<MigrationRecord>>,
}
/// Record of a task migration.
#[derive(Clone, Debug)]
struct MigrationRecord {
    /// The task that was migrated.
    task_id: TaskId,
    /// Processor the task was taken from.
    from: ProcessorId,
    /// Processor the task was moved to.
    to: ProcessorId,
    /// When the migration happened.
    timestamp: Instant,
}
impl LoadBalancer {
/// Creates a standalone load balancer: no device registry, empty
/// state maps, the default (`Balanced`) strategy, and default
/// thresholds (steal 0.3, rebalance 0.2).
pub fn new() -> Self {
    Self {
        device_registry: None,
        strategy: RwLock::new(BalancingStrategy::default()),
        loads: RwLock::new(HashMap::new()),
        metrics: RwLock::new(HashMap::new()),
        processor_types: RwLock::new(HashMap::new()),
        migration_history: RwLock::new(Vec::new()),
        steal_threshold: 0.3,
        rebalance_threshold: 0.2,
    }
}
/// Creates a load balancer with device registry.
pub fn with_registry(device_registry: Arc<DeviceRegistry>) -> Self {
Self {
device_registry: Some(device_registry),
loads: RwLock::new(HashMap::new()),
metrics: RwLock::new(HashMap::new()),
processor_types: RwLock::new(HashMap::new()),
steal_threshold: 0.3,
rebalance_threshold: 0.2,
strategy: RwLock::new(BalancingStrategy::default()),
migration_history: RwLock::new(Vec::new()),
}
}
/// Sets the balancing strategy used for subsequent decisions.
pub fn set_strategy(&self, strategy: BalancingStrategy) {
    let mut guard = self.strategy.write();
    *guard = strategy;
}
/// Gets the current balancing strategy (a cheap `Copy`).
pub fn strategy(&self) -> BalancingStrategy {
    let guard = self.strategy.read();
    *guard
}
/// Register a processor with its type.
///
/// Seeds a zero load counter and default metrics for the processor.
pub fn register_processor(&self, processor_id: ProcessorId, processor_type: ProcessorType) {
    self.processor_types
        .write()
        .insert(processor_id, processor_type);
    self.metrics
        .write()
        .insert(processor_id, ProcessorMetrics::default());
    self.loads.write().insert(processor_id, AtomicU64::new(0));
}
/// Unregister a processor, dropping its load counter, metrics, and
/// type mapping.
pub fn unregister_processor(&self, processor_id: ProcessorId) {
    self.processor_types.write().remove(&processor_id);
    self.metrics.write().remove(&processor_id);
    self.loads.write().remove(&processor_id);
}
/// Update real-time metrics for a processor.
pub fn update_metrics(&self, processor_id: ProcessorId, metrics: ProcessorMetrics) {
if let Some(existing) = self.metrics.write().get_mut(&processor_id) {
*existing = ProcessorMetrics {
last_updated: Some(Instant::now()),
..metrics
};
}
}
/// Get current load (pending task count) for a processor.
///
/// Unregistered processors report zero.
pub fn get_load(&self, processor_id: ProcessorId) -> u64 {
    match self.loads.read().get(&processor_id) {
        Some(counter) => counter.load(Ordering::Relaxed),
        None => 0,
    }
}
/// Increment the load counter for a processor.
///
/// No-op for unregistered processors. Only the map's read lock is
/// taken; the counter itself is atomic.
pub fn increment_load(&self, processor_id: ProcessorId) {
    let guard = self.loads.read();
    if let Some(counter) = guard.get(&processor_id) {
        counter.fetch_add(1, Ordering::Relaxed);
    }
}
/// Decrement the load counter for a processor.
///
/// Saturates at zero: the previous `fetch_sub(1)` would wrap to
/// `u64::MAX` on a stray extra decrement (e.g. a duplicated completion
/// callback), making the processor look infinitely loaded forever.
pub fn decrement_load(&self, processor_id: ProcessorId) {
    if let Some(load) = self.loads.read().get(&processor_id) {
        // `fetch_update` retries on contention; Relaxed suffices — this
        // is a scheduling heuristic, not a synchronization primitive.
        let _ = load.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |v| {
            Some(v.saturating_sub(1))
        });
    }
}
    /// Check if an operation can run on a processor type.
    ///
    /// This is a static capability table keyed on the operation's
    /// [`OperationType`]; it does not consider tensor sizes, precision, or
    /// current device load. Unknown/custom hardware (FPGA, `Custom`) is
    /// assumed fully capable.
    pub fn can_execute(&self, op: &Operation, processor_type: &ProcessorType) -> bool {
        let op_type = op.op_type();
        match processor_type {
            // CPUs can handle most sequential operations.
            // Note: attention kernels (Self/Cross/FlashAttention) are
            // deliberately excluded here — see the unit test below.
            ProcessorType::Cpu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::Conv3d
                    | OperationType::DepthwiseConv
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Max
                    | OperationType::ArgMax
                    | OperationType::Embedding
                    | OperationType::TopK
                    | OperationType::Sampling
                    | OperationType::Tokenization
                    | OperationType::Detokenization
                    | OperationType::DataLoad
                    | OperationType::DataPreprocess
                    | OperationType::Transpose
                    | OperationType::Reshape
                    | OperationType::Concat
                    | OperationType::Split
            ),
            // GPUs excel at parallel operations, including attention,
            // collectives and training ops (Backward/OptimizerStep).
            ProcessorType::Gpu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::Conv3d
                    | OperationType::DepthwiseConv
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::CrossAttention
                    | OperationType::FlashAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Max
                    | OperationType::ArgMax
                    | OperationType::Embedding
                    | OperationType::RoPE
                    | OperationType::KVCache
                    | OperationType::TopK
                    | OperationType::Sampling
                    | OperationType::Transpose
                    | OperationType::Reshape
                    | OperationType::Concat
                    | OperationType::Split
                    | OperationType::Gather
                    | OperationType::Scatter
                    | OperationType::AllReduce
                    | OperationType::AllGather
                    | OperationType::ReduceScatter
                    | OperationType::Backward
                    | OperationType::OptimizerStep
                    | OperationType::GradientClip
            ),
            // TPUs optimized for ML: dense linear algebra, attention,
            // collectives, and training — but no generic layout ops.
            ProcessorType::Tpu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::CrossAttention
                    | OperationType::FlashAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Embedding
                    | OperationType::RoPE
                    | OperationType::KVCache
                    | OperationType::AllReduce
                    | OperationType::AllGather
                    | OperationType::ReduceScatter
                    | OperationType::Backward
                    | OperationType::OptimizerStep
            ),
            // NPUs for neural network inference: forward-pass kernels only.
            ProcessorType::Npu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::DepthwiseConv
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
            ),
            // LPUs for sequential inference (optimized for LLMs):
            // transformer-decode ops plus sampling.
            ProcessorType::Lpu => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::FlashAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Embedding
                    | OperationType::RoPE
                    | OperationType::KVCache
                    | OperationType::TopK
                    | OperationType::Sampling
            ),
            // FPGAs can be programmed for anything.
            ProcessorType::Fpga(_) => true,
            // DSPs for signal processing: convolutions and elementwise math.
            ProcessorType::Dsp(_) => matches!(
                op_type,
                OperationType::Conv2d
                    | OperationType::DepthwiseConv
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Max
            ),
            // WebGPU has limited operations.
            ProcessorType::WebGpu => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Transpose
                    | OperationType::Reshape
            ),
            // WASM for portable compute, including CPU-side tokenization.
            ProcessorType::Wasm => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Tokenization
                    | OperationType::Detokenization
            ),
            // Custom processors - assume they can handle anything.
            ProcessorType::Custom { .. } => true,
        }
    }
    /// Calculate a score for assigning a task to a processor.
    ///
    /// Higher is better. Returns `f64::NEG_INFINITY` when the processor
    /// type cannot run the task's operation at all. When no metrics have
    /// been reported for the processor, utilization falls back to
    /// `load / 100`, power to 100 W and average completion to 100 ms.
    fn calculate_score(
        &self,
        task: &Task,
        processor_id: ProcessorId,
        processor_type: &ProcessorType,
    ) -> f64 {
        let strategy = *self.strategy.read();
        let load = self.get_load(processor_id);
        let metrics = self.metrics.read();
        let proc_metrics = metrics.get(&processor_id);
        // Base score from compatibility: incompatible processors can never win.
        if !self.can_execute(&task.operation, processor_type) {
            return f64::NEG_INFINITY;
        }
        // Get utilization and metrics (with the fallbacks described above).
        let utilization = proc_metrics.map(|m| m.utilization).unwrap_or(load as f64 / 100.0);
        let power = proc_metrics.map(|m| m.power_watts).unwrap_or(100.0);
        let avg_completion = proc_metrics.map(|m| m.avg_completion_ms).unwrap_or(100.0);
        // Calculate score based on strategy.
        match strategy {
            BalancingStrategy::Speed => {
                // Prioritize low utilization and fast completion.
                let speed_score = 1.0 / (avg_completion.max(1.0) * (1.0 + utilization));
                // Bonus for powerful processor types.
                let type_bonus = match processor_type {
                    ProcessorType::Gpu(_) => 2.0,
                    ProcessorType::Tpu(_) => 2.5,
                    ProcessorType::Lpu => 3.0, // Fastest for inference
                    ProcessorType::Npu(_) => 1.5,
                    _ => 1.0,
                };
                speed_score * type_bonus
            }
            BalancingStrategy::Energy => {
                // Prioritize low power consumption.
                let energy_score = 1.0 / power.max(1.0);
                // Bonus for efficient processor types.
                let efficiency_bonus = match processor_type {
                    ProcessorType::Npu(_) => 3.0, // Most efficient
                    ProcessorType::Lpu => 2.0,
                    ProcessorType::Cpu(_) => 1.5,
                    ProcessorType::Wasm => 2.0, // Low overhead
                    _ => 1.0,
                };
                energy_score * efficiency_bonus * (1.0 - utilization * 0.5)
            }
            BalancingStrategy::Balanced => {
                // Balance speed and energy with fixed 40/30/30 weights.
                let speed = 1.0 / avg_completion.max(1.0);
                let efficiency = 1.0 / power.max(1.0);
                let load_factor = 1.0 - utilization;
                (speed * 0.4 + efficiency * 0.3 + load_factor * 0.3)
            }
            BalancingStrategy::Cost => {
                // Prioritize cheaper resources (consumer devices).
                let cost_factor = match processor_type {
                    ProcessorType::Wasm => 0.1, // Cheapest (browser)
                    ProcessorType::WebGpu => 0.15,
                    ProcessorType::Cpu(_) => 0.2,
                    ProcessorType::Npu(_) => 0.3, // Mobile NPUs
                    ProcessorType::Gpu(_) => 0.5,
                    ProcessorType::Lpu => 0.8,
                    ProcessorType::Tpu(_) => 1.0, // Most expensive
                    _ => 0.5,
                };
                (1.0 - cost_factor) * (1.0 - utilization)
            }
            BalancingStrategy::Latency => {
                // Prioritize low latency for inference.
                let latency_score = 1.0 / avg_completion.max(0.1);
                // Bonus for low-latency processors.
                let latency_bonus = match processor_type {
                    ProcessorType::Lpu => 5.0, // Designed for low latency
                    ProcessorType::Npu(_) => 3.0,
                    ProcessorType::Gpu(_) => 2.0,
                    ProcessorType::Tpu(_) => 1.5,
                    _ => 1.0,
                };
                // Priority boost for critical tasks.
                let priority_boost = match task.priority {
                    TaskPriority::Critical => 2.0,
                    TaskPriority::High => 1.5,
                    TaskPriority::Normal => 1.0,
                    TaskPriority::Background => 0.5,
                };
                latency_score * latency_bonus * priority_boost * (1.0 - utilization)
            }
        }
    }
    /// Maybe rebalance a task to a different processor.
    ///
    /// Scores the suggested processor against every other registered
    /// processor and only switches when an alternative beats it by more
    /// than `rebalance_threshold` (hysteresis against thrashing). A switch
    /// is recorded in `migration_history`.
    ///
    /// NOTE(review): `current_assignment` is currently unused — confirm
    /// whether existing assignments were meant to influence the decision.
    pub fn maybe_rebalance(
        &self,
        task: &Task,
        suggested_processor: ProcessorId,
        current_assignment: &TaskAssignment,
    ) -> ProcessorId {
        // Get all registered processors.
        let processor_types = self.processor_types.read();
        // If we don't have processor info, use suggested.
        let suggested_type = match processor_types.get(&suggested_processor) {
            Some(t) => t.clone(),
            None => return suggested_processor,
        };
        // Calculate score for suggested processor.
        let suggested_score = self.calculate_score(task, suggested_processor, &suggested_type);
        // Find best alternative.
        let mut best_processor = suggested_processor;
        let mut best_score = suggested_score;
        for (proc_id, proc_type) in processor_types.iter() {
            if *proc_id == suggested_processor {
                continue;
            }
            let score = self.calculate_score(task, *proc_id, proc_type);
            // Only switch if significantly better (prevents thrashing).
            // Note: the bar compounds as `best_score` is raised.
            if score > best_score * (1.0 + self.rebalance_threshold) {
                best_score = score;
                best_processor = *proc_id;
            }
        }
        // Record migration if different.
        if best_processor != suggested_processor {
            self.migration_history.write().push(MigrationRecord {
                task_id: task.id,
                from: suggested_processor,
                to: best_processor,
                timestamp: Instant::now(),
            });
        }
        best_processor
    }
/// Check if work stealing should happen between two processors.
pub fn should_steal(&self, from: ProcessorId, to: ProcessorId) -> bool {
let from_load = self.get_load(from) as f64;
let to_load = self.get_load(to) as f64;
if from_load == 0.0 {
return false;
}
// Check if processor types are compatible for the queued work
let processor_types = self.processor_types.read();
let from_type = processor_types.get(&from);
let to_type = processor_types.get(&to);
// Only steal between same processor types by default
// (cross-type stealing requires operation compatibility check)
match (from_type, to_type) {
(Some(ft), Some(tt)) if std::mem::discriminant(ft) == std::mem::discriminant(tt) => {
let diff = (from_load - to_load) / from_load;
diff > self.steal_threshold
}
_ => false,
}
}
    /// Get rebalancing suggestions based on current load.
    ///
    /// Returns `(overloaded, underloaded)` processor pairs where both sides
    /// deviate from the mean load by more than `rebalance_threshold` and
    /// share the same processor type. Every compatible overloaded ×
    /// underloaded pair is emitted, so one underloaded processor may appear
    /// as the target of several suggestions.
    pub fn get_rebalance_suggestions(&self) -> Vec<(ProcessorId, ProcessorId)> {
        let mut suggestions = Vec::new();
        let loads = self.loads.read();
        // Snapshot the atomic counters so the thresholds below are computed
        // against one consistent set of values.
        let load_values: Vec<_> = loads.iter()
            .map(|(id, load)| (*id, load.load(Ordering::Relaxed)))
            .collect();
        if load_values.is_empty() {
            return suggestions;
        }
        let avg_load: f64 = load_values.iter().map(|(_, l)| *l as f64).sum::<f64>()
            / load_values.len() as f64;
        let processor_types = self.processor_types.read();
        let overloaded: Vec<_> = load_values.iter()
            .filter(|(_, l)| *l as f64 > avg_load * (1.0 + self.rebalance_threshold))
            .collect();
        let underloaded: Vec<_> = load_values.iter()
            .filter(|(_, l)| (*l as f64) < avg_load * (1.0 - self.rebalance_threshold))
            .collect();
        // Only suggest migrations between compatible processor types.
        for (over_id, _) in overloaded {
            let over_type = processor_types.get(over_id);
            for (under_id, _) in &underloaded {
                let under_type = processor_types.get(under_id);
                // Check type compatibility (same enum variant).
                if let (Some(ot), Some(ut)) = (over_type, under_type) {
                    if std::mem::discriminant(ot) == std::mem::discriminant(ut) {
                        suggestions.push((*over_id, *under_id));
                    }
                }
            }
        }
        suggestions
    }
/// Get load statistics.
pub fn get_stats(&self) -> LoadBalancerStats {
let loads = self.loads.read();
let metrics = self.metrics.read();
let total_load: u64 = loads.values().map(|l| l.load(Ordering::Relaxed)).sum();
let processor_count = loads.len();
let avg_load = if processor_count > 0 {
total_load as f64 / processor_count as f64
} else {
0.0
};
let total_utilization: f64 = metrics.values().map(|m| m.utilization).sum();
let avg_utilization = if processor_count > 0 {
total_utilization / processor_count as f64
} else {
0.0
};
let total_power: f64 = metrics.values().map(|m| m.power_watts).sum();
let migrations = self.migration_history.read().len();
LoadBalancerStats {
total_load,
avg_load,
processor_count,
avg_utilization,
total_power_watts: total_power,
total_migrations: migrations,
strategy: *self.strategy.read(),
}
}
/// Clean up old migration history.
pub fn cleanup_history(&self, max_age: Duration) {
let cutoff = Instant::now() - max_age;
self.migration_history.write().retain(|r| r.timestamp > cutoff);
}
}
impl Default for LoadBalancer {
    fn default() -> Self {
        // A default balancer is identical to `LoadBalancer::new()`.
        Self::new()
    }
}
/// Load balancer statistics.
///
/// Point-in-time snapshot produced by `LoadBalancer::get_stats`.
#[derive(Clone, Debug)]
pub struct LoadBalancerStats {
    /// Total tasks across all processors.
    pub total_load: u64,
    /// Average load per processor (0.0 when no processors are registered).
    pub avg_load: f64,
    /// Number of registered processors.
    pub processor_count: usize,
    /// Average utilization (0.0 - 1.0).
    pub avg_utilization: f64,
    /// Total power consumption (watts).
    pub total_power_watts: f64,
    /// Total migrations performed since the last history cleanup.
    pub total_migrations: usize,
    /// Current balancing strategy.
    pub strategy: BalancingStrategy,
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, GpuVariant, Operation, Precision};
    use crate::task::TaskStatus;

    /// Builds a minimal 1024³ fp32 MatMul task with the given priority.
    fn create_test_task(priority: TaskPriority) -> Task {
        Task {
            id: TaskId::new(),
            operation: Operation::MatMul {
                m: 1024,
                n: 1024,
                k: 1024,
                precision: Precision::Fp32,
            },
            priority,
            dependencies: vec![],
            status: TaskStatus::Pending,
            deadline: None,
        }
    }

    #[test]
    fn test_load_tracking() {
        let balancer = LoadBalancer::new();
        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));
        assert_eq!(balancer.get_load(ProcessorId(0)), 0);
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(1));
        assert_eq!(balancer.get_load(ProcessorId(0)), 2);
        assert_eq!(balancer.get_load(ProcessorId(1)), 1);
        balancer.decrement_load(ProcessorId(0));
        assert_eq!(balancer.get_load(ProcessorId(0)), 1);
    }

    #[test]
    fn test_should_steal_same_type() {
        let balancer = LoadBalancer::new();
        // Register two CPUs (same discriminant => stealing permitted).
        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));
        // Give processor 0 high load.
        for _ in 0..10 {
            balancer.increment_load(ProcessorId(0));
        }
        balancer.increment_load(ProcessorId(1));
        // Should steal between same types, but only from the loaded side.
        assert!(balancer.should_steal(ProcessorId(0), ProcessorId(1)));
        assert!(!balancer.should_steal(ProcessorId(1), ProcessorId(0)));
    }

    #[test]
    fn test_should_not_steal_different_types() {
        let balancer = LoadBalancer::new();
        // Register CPU and GPU (different discriminants).
        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(
            ProcessorId(1),
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) }),
        );
        // Give CPU high load.
        for _ in 0..10 {
            balancer.increment_load(ProcessorId(0));
        }
        // Should NOT steal between different types.
        assert!(!balancer.should_steal(ProcessorId(0), ProcessorId(1)));
    }

    #[test]
    fn test_can_execute() {
        let balancer = LoadBalancer::new();
        let matmul = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };
        let flash_attention = Operation::FlashAttention {
            batch: 32,
            seq_len: 2048,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };
        let cpu = ProcessorType::Cpu(CpuVariant::default());
        let gpu = ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) });
        let lpu = ProcessorType::Lpu;
        // MatMul can run on all.
        assert!(balancer.can_execute(&matmul, &cpu));
        assert!(balancer.can_execute(&matmul, &gpu));
        assert!(balancer.can_execute(&matmul, &lpu));
        // FlashAttention only on GPU/TPU/LPU — the CPU table excludes it.
        assert!(!balancer.can_execute(&flash_attention, &cpu));
        assert!(balancer.can_execute(&flash_attention, &gpu));
    }

    #[test]
    fn test_strategy_affects_scoring() {
        let balancer = LoadBalancer::new();
        let cpu_id = ProcessorId(0);
        let npu_id = ProcessorId(1);
        balancer.register_processor(cpu_id, ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(npu_id, ProcessorType::Npu(crate::processor::NpuVariant::AppleNeuralEngine { cores: 16 }));
        let task = create_test_task(TaskPriority::Normal);
        // Energy strategy should prefer NPU (3.0 efficiency bonus vs 1.5).
        balancer.set_strategy(BalancingStrategy::Energy);
        let assignment = TaskAssignment::new();
        let result = balancer.maybe_rebalance(&task, cpu_id, &assignment);
        // NPU should be preferred for energy efficiency.
        assert_eq!(result, npu_id);
    }

    #[test]
    fn test_stats() {
        let balancer = LoadBalancer::new();
        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(1));
        let stats = balancer.get_stats();
        assert_eq!(stats.total_load, 3);
        assert_eq!(stats.processor_count, 2);
        assert!((stats.avg_load - 1.5).abs() < 0.01);
    }
}

View file

@ -0,0 +1,559 @@
//! Heterogeneous scheduler for multi-processor task assignment.
//!
//! Features:
//! - Optimal task-to-processor assignment
//! - Work stealing for load balancing
//! - Pipeline parallelism across processor types
//! - Dynamic rebalancing based on actual throughput
mod load_balancer;
mod work_queue;
pub use load_balancer::LoadBalancer;
pub use work_queue::WorkQueue;
use crate::device::DeviceRegistry;
use crate::error::ComputeError;
use crate::processor::{Operation, Processor, ProcessorId, ProcessorType};
use crate::task::{Task, TaskId, TaskPriority};
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
/// Heterogeneous scheduler that manages tasks across all processor types.
///
/// Produces staged [`Schedule`]s (dependency-respecting parallel waves),
/// assigns tasks to processors via the [`LoadBalancer`], and executes
/// stages with one spawned task per operation.
pub struct HeterogeneousScheduler {
    /// Device registry.
    device_registry: Arc<DeviceRegistry>,
    /// Per-processor-type task queues.
    queues: RwLock<HashMap<ProcessorType, WorkQueue>>,
    /// Load balancer.
    load_balancer: LoadBalancer,
    /// Active schedules, keyed by the id minted in `schedule()`.
    active_schedules: RwLock<HashMap<ScheduleId, Schedule>>,
}
impl HeterogeneousScheduler {
/// Creates a new heterogeneous scheduler.
pub fn new(device_registry: Arc<DeviceRegistry>) -> Self {
Self {
device_registry,
queues: RwLock::new(HashMap::new()),
load_balancer: LoadBalancer::new(),
active_schedules: RwLock::new(HashMap::new()),
}
}
    /// Schedule a set of tasks for execution.
    ///
    /// Builds the dependency graph, assigns each task to a processor,
    /// groups tasks into parallel stages, and stores the result in
    /// `active_schedules`. An empty input yields an empty schedule.
    ///
    /// # Errors
    /// Fails when no processor can run an operation or when a circular
    /// dependency is detected while staging.
    pub async fn schedule(&self, tasks: Vec<Task>) -> Result<ScheduleResult, ComputeError> {
        if tasks.is_empty() {
            return Ok(ScheduleResult {
                schedule: Schedule::empty(),
                estimated_makespan: Duration::ZERO,
                processor_utilization: HashMap::new(),
            });
        }
        // 1. Build dependency graph
        let deps = self.build_dependency_graph(&tasks);
        // 2. Assign tasks to optimal processors
        let assignment = self.assign_tasks(&tasks, &deps).await?;
        // 3. Create execution schedule with stages
        let schedule = self.create_schedule(&tasks, &assignment, &deps)?;
        // 4. Estimate metrics
        let makespan = self.estimate_makespan(&schedule);
        let utilization = self.estimate_utilization(&schedule);
        // 5. Store active schedule
        self.active_schedules.write().insert(schedule.id, schedule.clone());
        Ok(ScheduleResult {
            schedule,
            estimated_makespan: makespan,
            processor_utilization: utilization,
        })
    }
    /// Execute a schedule.
    ///
    /// Stages run sequentially; tasks within a stage run concurrently,
    /// each on its own spawned tokio task. Per-task failures (operation
    /// errors or join errors) are captured as
    /// [`TaskExecutionResult::Failed`] rather than aborting the run.
    ///
    /// # Errors
    /// Fails only when the schedule itself is inconsistent (missing task,
    /// missing assignment, or unknown processor).
    pub async fn execute(&self, schedule: &Schedule) -> Result<ExecutionResult, ComputeError> {
        let mut results = HashMap::new();
        let start = std::time::Instant::now();
        // Execute stages in order.
        for stage in &schedule.stages {
            // Execute all tasks in this stage in parallel.
            let mut handles = Vec::new();
            for task_id in &stage.tasks {
                let task = schedule.tasks.get(task_id)
                    .ok_or_else(|| ComputeError::Internal(format!("Task not found: {:?}", task_id)))?;
                let processor_id = schedule.assignment.get(task_id)
                    .ok_or_else(|| ComputeError::Internal(format!("No assignment for task: {:?}", task_id)))?;
                let processor = self.device_registry.get_processor(processor_id)?;
                // Clone so the spawned future owns its task outright.
                let task_clone = task.clone();
                handles.push(tokio::spawn(async move {
                    processor.execute(task_clone.operation).await
                }));
            }
            // Wait for all tasks in the stage; handles[i] belongs to stage.tasks[i].
            for (i, handle) in handles.into_iter().enumerate() {
                let task_id = stage.tasks[i];
                match handle.await {
                    Ok(Ok(result)) => {
                        results.insert(task_id, TaskExecutionResult::Success(result));
                    }
                    Ok(Err(e)) => {
                        // The operation itself failed.
                        results.insert(task_id, TaskExecutionResult::Failed(e.to_string()));
                    }
                    Err(e) => {
                        // The spawned task panicked or was cancelled.
                        results.insert(task_id, TaskExecutionResult::Failed(e.to_string()));
                    }
                }
            }
        }
        let total_time = start.elapsed();
        Ok(ExecutionResult {
            results,
            total_time,
            actual_utilization: self.measure_utilization(),
        })
    }
    /// Assign tasks to optimal processors.
    ///
    /// Visits tasks in dependency/priority order, picks the best-scoring
    /// processor for each, then gives the load balancer a chance to
    /// override the choice.
    ///
    /// # Errors
    /// Fails when some task's operation has no capable processor.
    async fn assign_tasks(
        &self,
        tasks: &[Task],
        deps: &DependencyGraph,
    ) -> Result<TaskAssignment, ComputeError> {
        let mut assignment = TaskAssignment::new();
        // Sort tasks by priority and dependencies (topological sort).
        let sorted_tasks = self.topological_sort(tasks, deps);
        for task in sorted_tasks {
            // Find best processor for this task.
            let best_processor = self.find_best_processor(&task).await?;
            // Check if we should rebalance.
            let final_processor = self.load_balancer
                .maybe_rebalance(&task, best_processor, &assignment);
            assignment.assign(task.id, final_processor);
        }
        Ok(assignment)
    }
/// Find the best processor for a task.
async fn find_best_processor(&self, task: &Task) -> Result<ProcessorId, ComputeError> {
let mut best_score = f64::NEG_INFINITY;
let mut best_processor = None;
// Get all available processors
let processors = self.device_registry.all_processors();
for processor in processors {
if !processor.can_execute(&task.operation) {
continue;
}
// Calculate score based on multiple factors
let exec_time = processor.estimate_time(&task.operation);
let energy = processor.estimate_energy(&task.operation);
let load = processor.utilization();
// Score = 1 / (time * (1 + load) * energy_factor)
let time_factor = exec_time.as_secs_f64().max(0.001);
let load_factor = 1.0 + load;
let energy_factor = 1.0 + (energy / 1000.0); // Normalize energy
let score = 1.0 / (time_factor * load_factor * energy_factor);
if score > best_score {
best_score = score;
best_processor = Some(processor.id());
}
}
best_processor.ok_or_else(|| {
ComputeError::NoSuitableProcessor(format!("{:?}", task.operation.op_type()))
})
}
/// Build dependency graph from tasks.
fn build_dependency_graph(&self, tasks: &[Task]) -> DependencyGraph {
let mut graph = DependencyGraph::new();
for task in tasks {
graph.add_node(task.id);
for dep in &task.dependencies {
graph.add_edge(*dep, task.id);
}
}
graph
}
/// Topological sort of tasks respecting dependencies.
fn topological_sort(&self, tasks: &[Task], deps: &DependencyGraph) -> Vec<Task> {
let mut sorted = Vec::new();
let mut visited = std::collections::HashSet::new();
let task_map: HashMap<TaskId, Task> = tasks.iter()
.map(|t| (t.id, t.clone()))
.collect();
fn visit(
task_id: TaskId,
task_map: &HashMap<TaskId, Task>,
deps: &DependencyGraph,
visited: &mut std::collections::HashSet<TaskId>,
sorted: &mut Vec<Task>,
) {
if visited.contains(&task_id) {
return;
}
visited.insert(task_id);
// Visit dependencies first
if let Some(task_deps) = deps.dependencies.get(&task_id) {
for dep in task_deps {
visit(*dep, task_map, deps, visited, sorted);
}
}
if let Some(task) = task_map.get(&task_id) {
sorted.push(task.clone());
}
}
for task in tasks {
visit(task.id, &task_map, deps, &mut visited, &mut sorted);
}
// Sort by priority within dependency constraints
sorted.sort_by(|a, b| b.priority.cmp(&a.priority));
sorted
}
    /// Create execution schedule with parallel stages.
    ///
    /// Repeatedly sweeps the task list, placing every task whose declared
    /// `dependencies` are all in earlier stages into the next stage
    /// (O(stages * tasks) sweep).
    ///
    /// NOTE(review): the `deps` parameter is currently unused — stage
    /// construction reads `task.dependencies` directly; confirm whether the
    /// graph was meant to drive this instead.
    ///
    /// # Errors
    /// Returns `ComputeError::SchedulingFailed` when a sweep makes no
    /// progress, i.e. the remaining tasks form a dependency cycle (or depend
    /// on tasks not in the batch).
    fn create_schedule(
        &self,
        tasks: &[Task],
        assignment: &TaskAssignment,
        deps: &DependencyGraph,
    ) -> Result<Schedule, ComputeError> {
        let mut stages = Vec::new();
        let mut scheduled = std::collections::HashSet::new();
        let task_map: HashMap<TaskId, Task> = tasks.iter()
            .map(|t| (t.id, t.clone()))
            .collect();
        while scheduled.len() < tasks.len() {
            let mut stage_tasks = Vec::new();
            for task in tasks {
                if scheduled.contains(&task.id) {
                    continue;
                }
                // Check if all dependencies are satisfied by earlier stages.
                let deps_satisfied = task.dependencies.iter()
                    .all(|dep| scheduled.contains(dep));
                if deps_satisfied {
                    stage_tasks.push(task.id);
                }
            }
            if stage_tasks.is_empty() {
                // No progress in a full sweep => unsatisfiable dependencies.
                return Err(ComputeError::SchedulingFailed(
                    "Circular dependency detected".to_string()
                ));
            }
            for task_id in &stage_tasks {
                scheduled.insert(*task_id);
            }
            stages.push(ScheduleStage {
                stage_id: stages.len(),
                tasks: stage_tasks,
            });
        }
        Ok(Schedule {
            id: ScheduleId::new(),
            tasks: task_map,
            assignment: assignment.clone(),
            stages,
        })
    }
/// Estimate makespan (total execution time).
fn estimate_makespan(&self, schedule: &Schedule) -> Duration {
let mut total = Duration::ZERO;
for stage in &schedule.stages {
let mut max_stage_time = Duration::ZERO;
for task_id in &stage.tasks {
if let (Some(task), Some(proc_id)) = (
schedule.tasks.get(task_id),
schedule.assignment.get(task_id),
) {
if let Ok(processor) = self.device_registry.get_processor(proc_id) {
let time = processor.estimate_time(&task.operation);
max_stage_time = max_stage_time.max(time);
}
}
}
total += max_stage_time;
}
total
}
    /// Estimate processor utilization.
    ///
    /// For each processor type, sums the estimated busy time of its
    /// assigned tasks and divides by the overall makespan, clamped to 1.0.
    /// Returns an empty map for an empty schedule.
    fn estimate_utilization(&self, schedule: &Schedule) -> HashMap<ProcessorType, f64> {
        let mut work_time: HashMap<ProcessorType, Duration> = HashMap::new();
        let makespan = self.estimate_makespan(schedule);
        for task_id in schedule.assignment.assignments.keys() {
            if let (Some(task), Some(proc_id)) = (
                schedule.tasks.get(task_id),
                schedule.assignment.get(task_id),
            ) {
                if let Ok(processor) = self.device_registry.get_processor(proc_id) {
                    let proc_type = processor.processor_type();
                    let time = processor.estimate_time(&task.operation);
                    *work_time.entry(proc_type).or_default() += time;
                }
            }
        }
        work_time
            .into_iter()
            .map(|(proc_type, time)| {
                // Guard against a zero makespan (e.g. all estimates zero).
                let utilization = if makespan.as_secs_f64() > 0.0 {
                    time.as_secs_f64() / makespan.as_secs_f64()
                } else {
                    0.0
                };
                (proc_type, utilization.min(1.0))
            })
            .collect()
    }
/// Measure actual current utilization.
fn measure_utilization(&self) -> HashMap<ProcessorType, f64> {
let mut utilization = HashMap::new();
for processor in self.device_registry.all_processors() {
let proc_type = processor.processor_type();
let util = processor.utilization();
utilization
.entry(proc_type)
.and_modify(|u| *u = (*u + util) / 2.0)
.or_insert(util);
}
utilization
}
}
/// Schedule identifier.
///
/// A random 64-bit id; collisions are possible in principle but
/// negligibly likely for the expected number of concurrent schedules.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ScheduleId(pub u64);

impl ScheduleId {
    /// Creates a new schedule ID from a thread-local RNG.
    pub fn new() -> Self {
        use rand::Rng;
        ScheduleId(rand::thread_rng().gen())
    }
}

impl Default for ScheduleId {
    fn default() -> Self {
        Self::new()
    }
}
/// Task-to-processor assignment.
#[derive(Clone, Debug, Default)]
pub struct TaskAssignment {
    /// Map from task ID to processor ID.
    pub assignments: HashMap<TaskId, ProcessorId>,
}

impl TaskAssignment {
    /// Creates a new empty assignment.
    pub fn new() -> Self {
        Self {
            assignments: HashMap::new(),
        }
    }

    /// Assigns a task to a processor, replacing any previous assignment.
    pub fn assign(&mut self, task_id: TaskId, processor_id: ProcessorId) {
        self.assignments.insert(task_id, processor_id);
    }

    /// Gets the assigned processor for a task, or `None` if unassigned.
    pub fn get(&self, task_id: &TaskId) -> Option<ProcessorId> {
        self.assignments.get(task_id).copied()
    }
}
/// Dependency graph for tasks.
///
/// Maintains both directions of every edge so forward (who do I wait on?)
/// and reverse (who waits on me?) lookups are O(1).
#[derive(Clone, Debug, Default)]
pub struct DependencyGraph {
    /// Dependencies: task -> list of tasks it depends on.
    pub dependencies: HashMap<TaskId, Vec<TaskId>>,
    /// Dependents: task -> list of tasks that depend on it.
    pub dependents: HashMap<TaskId, Vec<TaskId>>,
}

impl DependencyGraph {
    /// Creates a new empty dependency graph.
    pub fn new() -> Self {
        Self {
            dependencies: HashMap::new(),
            dependents: HashMap::new(),
        }
    }

    /// Adds a node (task) to the graph.
    ///
    /// Idempotent: existing adjacency lists are left untouched.
    pub fn add_node(&mut self, task_id: TaskId) {
        self.dependencies.entry(task_id).or_default();
        self.dependents.entry(task_id).or_default();
    }

    /// Adds a dependency edge: `to` depends on `from`.
    ///
    /// (The previous doc comment stated the direction backwards.) Both
    /// endpoints are also registered as nodes, so the two maps stay
    /// consistent even when callers add edges without calling
    /// [`DependencyGraph::add_node`] first.
    pub fn add_edge(&mut self, from: TaskId, to: TaskId) {
        self.add_node(from);
        self.add_node(to);
        self.dependencies.entry(to).or_default().push(from);
        self.dependents.entry(from).or_default().push(to);
    }
}
/// Execution schedule.
///
/// Stages are executed in order; tasks inside a stage have no mutual
/// dependencies and may run in parallel.
#[derive(Clone, Debug)]
pub struct Schedule {
    /// Schedule ID.
    pub id: ScheduleId,
    /// All tasks, keyed by task id.
    pub tasks: HashMap<TaskId, Task>,
    /// Task assignments.
    pub assignment: TaskAssignment,
    /// Execution stages (tasks within a stage can run in parallel).
    pub stages: Vec<ScheduleStage>,
}

impl Schedule {
    /// Creates an empty schedule (fresh random id, no tasks or stages).
    pub fn empty() -> Self {
        Self {
            id: ScheduleId::new(),
            tasks: HashMap::new(),
            assignment: TaskAssignment::new(),
            stages: Vec::new(),
        }
    }
}
/// A stage of parallel tasks.
#[derive(Clone, Debug)]
pub struct ScheduleStage {
    /// Stage index (position within `Schedule::stages`).
    pub stage_id: usize,
    /// Tasks in this stage (no mutual dependencies; can run in parallel).
    pub tasks: Vec<TaskId>,
}
/// Result of scheduling, returned by `HeterogeneousScheduler::schedule`.
#[derive(Clone, Debug)]
pub struct ScheduleResult {
    /// The schedule.
    pub schedule: Schedule,
    /// Estimated total execution time (sum of slowest task per stage).
    pub estimated_makespan: Duration,
    /// Estimated processor utilization by type (0.0 - 1.0).
    pub processor_utilization: HashMap<ProcessorType, f64>,
}
/// Result of execution, returned by `HeterogeneousScheduler::execute`.
#[derive(Clone, Debug)]
pub struct ExecutionResult {
    /// Results per task (success or failure; failures do not abort the run).
    pub results: HashMap<TaskId, TaskExecutionResult>,
    /// Total wall-clock execution time.
    pub total_time: Duration,
    /// Actual processor utilization measured after the run.
    pub actual_utilization: HashMap<ProcessorType, f64>,
}
/// Result of a single task execution.
#[derive(Clone, Debug)]
pub enum TaskExecutionResult {
    /// Task completed successfully.
    Success(crate::processor::OperationResult),
    /// Task failed; carries the operation error or join/panic message.
    Failed(String),
}
#[cfg(test)]
mod tests {
    use super::*;

    // The previous version carried an unused `create_test_task` helper and
    // unused `Precision`/`TaskStatus` imports, which produced dead-code
    // warnings; only what the tests actually exercise is kept.

    #[test]
    fn test_dependency_graph() {
        let mut graph = DependencyGraph::new();
        graph.add_node(TaskId(1));
        graph.add_node(TaskId(2));
        graph.add_node(TaskId(3));
        graph.add_edge(TaskId(1), TaskId(2)); // 2 depends on 1
        graph.add_edge(TaskId(1), TaskId(3)); // 3 depends on 1
        graph.add_edge(TaskId(2), TaskId(3)); // 3 depends on 2
        assert_eq!(graph.dependencies[&TaskId(2)], vec![TaskId(1)]);
        assert_eq!(graph.dependencies[&TaskId(3)], vec![TaskId(1), TaskId(2)]);
    }

    #[test]
    fn test_task_assignment() {
        let mut assignment = TaskAssignment::new();
        assignment.assign(TaskId(1), ProcessorId(0));
        assignment.assign(TaskId(2), ProcessorId(1));
        assert_eq!(assignment.get(&TaskId(1)), Some(ProcessorId(0)));
        assert_eq!(assignment.get(&TaskId(2)), Some(ProcessorId(1)));
        assert_eq!(assignment.get(&TaskId(3)), None);
    }
}

View file

@ -0,0 +1,271 @@
//! Work queue with thread-safe task management.
use crate::processor::ProcessorType;
use crate::task::{Task, TaskId, TaskPriority};
use crossbeam_channel::{bounded, Receiver, Sender, TryRecvError};
use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
/// Work queue for a specific processor type.
///
/// Backed by a bounded crossbeam channel; `size` and `processed` are
/// relaxed atomic counters maintained alongside channel operations and
/// should be treated as approximate under concurrency.
pub struct WorkQueue {
    /// Task sender (for producers).
    sender: Sender<Task>,
    /// Task receiver (for consumers).
    receiver: Receiver<Task>,
    /// Processor type this queue is for.
    processor_type: ProcessorType,
    /// Current queue size (approximate).
    size: AtomicU64,
    /// Total tasks popped from this queue.
    processed: AtomicU64,
}
impl WorkQueue {
/// Creates a new work queue for a processor type.
pub fn new(processor_type: ProcessorType, capacity: usize) -> Self {
let (sender, receiver) = bounded(capacity.max(1024));
Self {
sender,
receiver,
processor_type,
size: AtomicU64::new(0),
processed: AtomicU64::new(0),
}
}
/// Push a task to the queue.
pub fn push(&self, task: Task) {
if self.sender.try_send(task).is_ok() {
self.size.fetch_add(1, Ordering::Relaxed);
}
}
/// Pop a task from the queue (ignores worker_id for compatibility).
pub fn pop(&self, _worker_id: usize) -> Option<Task> {
self.pop_any()
}
/// Pop any task from the queue.
pub fn pop_any(&self) -> Option<Task> {
match self.receiver.try_recv() {
Ok(task) => {
self.size.fetch_sub(1, Ordering::Relaxed);
self.processed.fetch_add(1, Ordering::Relaxed);
Some(task)
}
Err(TryRecvError::Empty) | Err(TryRecvError::Disconnected) => None,
}
}
/// Pop from global queue (alias for pop_any).
pub fn pop_global(&self) -> Option<Task> {
self.pop_any()
}
/// Steal a batch of tasks from another queue.
pub fn steal_batch_from(&self, other: &WorkQueue, max_tasks: usize) -> Vec<Task> {
let mut stolen = Vec::new();
while stolen.len() < max_tasks {
if let Some(task) = other.pop_any() {
stolen.push(task);
} else {
break;
}
}
// Push stolen tasks to this queue
for task in &stolen {
// Tasks are already accounted for in `other`, just push to self
if self.sender.try_send(task.clone()).is_ok() {
self.size.fetch_add(1, Ordering::Relaxed);
}
}
stolen
}
/// Get current queue size.
pub fn len(&self) -> usize {
self.size.load(Ordering::Relaxed) as usize
}
/// Check if queue is empty.
pub fn is_empty(&self) -> bool {
self.len() == 0
}
/// Get number of tasks processed.
pub fn processed_count(&self) -> u64 {
self.processed.load(Ordering::Relaxed)
}
/// Get processor type for this queue.
pub fn processor_type(&self) -> ProcessorType {
self.processor_type.clone()
}
/// Get utilization estimate (0.0 - 1.0).
pub fn utilization(&self) -> f64 {
let size = self.size.load(Ordering::Relaxed) as f64;
let capacity = self.sender.capacity().unwrap_or(1024) as f64;
(size / capacity).min(1.0)
}
/// Get a stealer for cross-queue work stealing.
pub fn get_stealer(&self) -> QueueStealer {
QueueStealer {
receiver: self.receiver.clone(),
}
}
}
/// Stealer handle for cross-queue work stealing.
///
/// NOTE(review): stealing through this handle bypasses the owning
/// `WorkQueue`'s `size`/`processed` counters, so those metrics drift after
/// a steal — confirm whether any caller relies on them being exact.
#[derive(Clone)]
pub struct QueueStealer {
    receiver: Receiver<Task>,
}

impl QueueStealer {
    /// Try to steal a task.
    ///
    /// Returns `None` when the source queue is empty or disconnected.
    pub fn steal(&self) -> Option<Task> {
        self.receiver.try_recv().ok()
    }
}
/// Priority queue wrapper for tasks.
///
/// Maintains one bounded [`WorkQueue`] per priority level; `pop` always
/// drains more urgent levels first.
pub struct PriorityWorkQueue {
    /// Queues by priority level.
    queues: HashMap<TaskPriority, WorkQueue>,
    /// Processor type.
    processor_type: ProcessorType,
}

impl PriorityWorkQueue {
    /// Priority levels ordered from most to least urgent; `pop` scans in
    /// this order.
    const PRIORITY_ORDER: [TaskPriority; 4] = [
        TaskPriority::Critical,
        TaskPriority::High,
        TaskPriority::Normal,
        TaskPriority::Background,
    ];

    /// Creates a new priority work queue with one inner queue per level.
    pub fn new(processor_type: ProcessorType, capacity_per_priority: usize) -> Self {
        let queues = Self::PRIORITY_ORDER
            .into_iter()
            .map(|p| (p, WorkQueue::new(processor_type.clone(), capacity_per_priority)))
            .collect();
        Self {
            queues,
            processor_type,
        }
    }

    /// Push a task onto the queue matching its priority.
    pub fn push(&self, task: Task) {
        if let Some(queue) = self.queues.get(&task.priority) {
            queue.push(task);
        }
    }

    /// Pop the highest-priority task available
    /// (Critical > High > Normal > Background).
    pub fn pop(&self, worker_id: usize) -> Option<Task> {
        Self::PRIORITY_ORDER
            .into_iter()
            .filter_map(|priority| self.queues.get(&priority))
            .find_map(|queue| queue.pop(worker_id))
    }

    /// Get total queue size across all priority levels.
    pub fn len(&self) -> usize {
        self.queues.values().map(WorkQueue::len).sum()
    }

    /// Check if all queues are empty.
    pub fn is_empty(&self) -> bool {
        self.queues.values().all(WorkQueue::is_empty)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, Operation, Precision};
    use crate::task::TaskStatus;

    /// Builds a pending 1024x1024x1024 FP32 MatMul task with a fixed id.
    fn create_test_task(id: u64, priority: TaskPriority) -> Task {
        Task {
            id: TaskId(id),
            operation: Operation::MatMul {
                m: 1024,
                n: 1024,
                k: 1024,
                precision: Precision::Fp32,
            },
            priority,
            dependencies: vec![],
            status: TaskStatus::Pending,
            deadline: None,
        }
    }

    #[test]
    fn test_work_queue_basic() {
        let queue = WorkQueue::new(ProcessorType::Cpu(CpuVariant::default()), 100);
        assert!(queue.is_empty());

        for id in [1, 2] {
            queue.push(create_test_task(id, TaskPriority::Normal));
        }
        assert_eq!(queue.len(), 2);

        // Pops shrink the queue one at a time until it is empty again.
        assert!(queue.pop(0).is_some());
        assert_eq!(queue.len(), 1);
        assert!(queue.pop(0).is_some());
        assert!(queue.is_empty());
    }

    #[test]
    fn test_priority_queue() {
        let queue = PriorityWorkQueue::new(ProcessorType::Cpu(CpuVariant::default()), 100);
        queue.push(create_test_task(1, TaskPriority::Background));
        queue.push(create_test_task(2, TaskPriority::Critical));
        queue.push(create_test_task(3, TaskPriority::Normal));

        // Critical drains first, and keeps its priority tag.
        let first = queue.pop(0).unwrap();
        assert_eq!(first.id, TaskId(2));
        assert_eq!(first.priority, TaskPriority::Critical);

        // Then Normal, then Background.
        assert_eq!(queue.pop(0).unwrap().id, TaskId(3));
        assert_eq!(queue.pop(0).unwrap().id, TaskId(1));
    }
}

View file

@ -0,0 +1,543 @@
//! Task definitions and decomposition.
use crate::error::ComputeError;
use crate::processor::{Operation, OperationType, Precision, ProcessorType};
use crate::{ComputeJob, JobType};
use serde::{Deserialize, Serialize};
use std::time::Duration;
/// Unique task identifier.
///
/// Thin newtype over a `u64`. Values produced by `TaskId::new` are
/// random, so uniqueness is probabilistic rather than guaranteed.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TaskId(pub u64);
impl TaskId {
/// Creates a new task ID.
pub fn new() -> Self {
use rand::Rng;
TaskId(rand::thread_rng().gen())
}
}
impl Default for TaskId {
fn default() -> Self {
Self::new()
}
}
impl std::fmt::Display for TaskId {
    /// Renders as `task_<raw u64>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("task_{}", self.0))
    }
}
/// Task priority levels.
///
/// Discriminants are assigned so the derived `Ord` ranks
/// `Background < Normal < High < Critical`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum TaskPriority {
    /// Background, can be preempted.
    Background = 0,
    /// Normal priority.
    Normal = 1,
    /// High priority.
    High = 2,
    /// Critical, must complete.
    Critical = 3,
}
impl Default for TaskPriority {
fn default() -> Self {
TaskPriority::Normal
}
}
/// Task execution status.
///
/// Variant comments suggest a lifecycle of Pending -> Queued ->
/// Running -> Completed/Failed/Cancelled.
// NOTE(review): state transitions are driven outside this module —
// confirm against the scheduler.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum TaskStatus {
    /// Waiting to be scheduled.
    Pending,
    /// Queued for execution.
    Queued,
    /// Currently executing.
    Running,
    /// Completed successfully.
    Completed,
    /// Failed.
    Failed,
    /// Cancelled.
    Cancelled,
}
/// A schedulable task.
///
/// The smallest unit the scheduler places on a work queue; built
/// directly or via `TaskDecomposer` from a `ComputeJob`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Task {
    /// Task ID.
    pub id: TaskId,
    /// Operation to execute.
    pub operation: Operation,
    /// Priority level.
    pub priority: TaskPriority,
    /// Dependencies (tasks that must complete first).
    pub dependencies: Vec<TaskId>,
    /// Current status.
    pub status: TaskStatus,
    /// Deadline (optional).
    // NOTE(review): the unit (unix timestamp? milliseconds?) is not
    // specified here — confirm against the scheduler's deadline checks.
    pub deadline: Option<u64>,
}
impl Task {
/// Creates a new task.
pub fn new(operation: Operation) -> Self {
Self {
id: TaskId::new(),
operation,
priority: TaskPriority::Normal,
dependencies: Vec::new(),
status: TaskStatus::Pending,
deadline: None,
}
}
/// Sets the priority.
pub fn with_priority(mut self, priority: TaskPriority) -> Self {
self.priority = priority;
self
}
/// Adds dependencies.
pub fn with_dependencies(mut self, deps: Vec<TaskId>) -> Self {
self.dependencies = deps;
self
}
/// Sets deadline.
pub fn with_deadline(mut self, deadline: u64) -> Self {
self.deadline = Some(deadline);
self
}
/// Checks if task is compatible with a processor type.
pub fn is_compatible_with(&self, proc_type: ProcessorType) -> bool {
// Check based on operation type
let op_type = self.operation.op_type();
match proc_type {
ProcessorType::Cpu(_) => {
// CPUs can do most things, but slowly
true
}
ProcessorType::Gpu(_) => {
// GPUs are good for parallel operations
matches!(
op_type,
OperationType::MatMul
| OperationType::Conv2d
| OperationType::SelfAttention
| OperationType::FlashAttention
| OperationType::Embedding
| OperationType::Add
| OperationType::Mul
| OperationType::Softmax
)
}
ProcessorType::Tpu(_) => {
// TPUs are good for large matrix ops
matches!(
op_type,
OperationType::MatMul
| OperationType::Conv2d
| OperationType::SelfAttention
| OperationType::FlashAttention
)
}
ProcessorType::Lpu => {
// LPUs are good for sequential inference
matches!(
op_type,
OperationType::MatMul
| OperationType::SelfAttention
| OperationType::KVCache
| OperationType::Sampling
)
}
ProcessorType::Npu(_) => {
// NPUs are good for inference
matches!(
op_type,
OperationType::MatMul
| OperationType::Conv2d
| OperationType::Add
| OperationType::Softmax
)
}
_ => true, // Default to compatible
}
}
}
/// Result of task execution.
///
/// Unlike `Task`, this carries no serde derives — results stay
/// in-process as produced by an executor.
#[derive(Clone, Debug)]
pub struct TaskResult {
    /// Task ID.
    pub task_id: TaskId,
    /// Output data (raw bytes; format depends on the operation).
    pub output: Vec<u8>,
    /// Execution duration.
    pub duration: Duration,
    /// Energy consumed (Joules).
    pub energy: f64,
}
/// Compute task for job execution.
///
/// Pairs a raw `Task` with scheduling hints: hard resource
/// requirements plus preferred/fallback processor placement.
#[derive(Clone, Debug)]
pub struct ComputeTask {
    /// Task.
    pub task: Task,
    /// Resource requirements.
    pub requirements: TaskRequirements,
    /// Preferred processor type (scheduler tries this first).
    pub preferred_processor: Option<ProcessorType>,
    /// Fallback processor type (used when the preferred one is unavailable).
    pub fallback_processor: Option<ProcessorType>,
}
/// Task resource requirements.
///
/// `Default` yields the least restrictive profile: zero minimums and
/// no latency or precision constraint.
#[derive(Clone, Debug, Default)]
pub struct TaskRequirements {
    /// Minimum memory (bytes).
    pub min_memory: u64,
    /// Minimum TFLOPS.
    pub min_tflops: f64,
    /// Maximum latency (ms); `None` means no latency bound.
    pub max_latency_ms: Option<u32>,
    /// Requires specific precision; `None` means any precision is fine.
    pub precision: Option<Precision>,
}
/// Decomposed workload.
///
/// Flat list of tasks produced from a job, with aggregate cost
/// estimates for scheduling/budgeting.
#[derive(Clone, Debug)]
pub struct DecomposedWorkload {
    /// All tasks (dependency edges live inside each `Task`).
    pub tasks: Vec<Task>,
    /// Total estimated FLOPS.
    pub estimated_flops: f64,
    /// Total estimated memory (bytes).
    pub estimated_memory: u64,
}
/// Task decomposer that breaks jobs into schedulable tasks.
pub struct TaskDecomposer {
    /// Default batch size for inference.
    // NOTE(review): currently unused — `decompose_inference` takes the
    // batch size from the job itself. Confirm whether this field should
    // act as a fallback or be removed.
    inference_batch_size: usize,
    /// Default precision applied to generated compute ops (FP16 by default).
    default_precision: Precision,
}
impl TaskDecomposer {
    /// Creates a new task decomposer with default settings
    /// (inference batch size 32, FP16 precision).
    pub fn new() -> Self {
        Self {
            inference_batch_size: 32,
            default_precision: Precision::Fp16,
        }
    }

    /// Decomposes a job into schedulable tasks.
    ///
    /// Training and inference jobs become a dependency chain of
    /// pipeline-stage tasks; container, serverless, and WASM jobs are
    /// modelled as a single generic task.
    pub fn decompose(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        match &job.job_type {
            JobType::Training { .. } => self.decompose_training(job),
            JobType::Inference { .. } => self.decompose_inference(job),
            JobType::Container { .. } => self.decompose_container(job),
            JobType::Serverless { .. } => self.decompose_serverless(job),
            JobType::Wasm { .. } => self.decompose_wasm(job),
        }
    }

    /// Decompose a training job into load -> preprocess -> forward ->
    /// backward -> optimizer-step, chained via dependencies.
    ///
    /// NOTE(review): `epochs` is intentionally not bound (it previously
    /// produced an unused-variable warning) — only a single pipeline
    /// pass is modelled regardless of epoch count. Confirm whether
    /// per-epoch expansion is intended.
    fn decompose_training(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        let mut tasks = Vec::new();
        if let JobType::Training { batch_size, .. } = &job.job_type {
            let batch = *batch_size as usize;

            // Data loading task (placeholder 100 MB payload).
            let load = Task::new(Operation::DataLoad {
                bytes: 1024 * 1024 * 100, // 100MB
                async_: true,
            })
            .with_priority(TaskPriority::High);
            let data_load_id = load.id;
            tasks.push(load);

            // Preprocessing task, gated on the data load.
            let preprocess = Task::new(Operation::DataPreprocess {
                batch,
                transforms: vec!["normalize".to_string(), "augment".to_string()],
            })
            .with_dependencies(vec![data_load_id])
            .with_priority(TaskPriority::High);
            let preprocess_id = preprocess.id;
            tasks.push(preprocess);

            // Forward pass (simplified as a single 4096x4096 MatMul).
            let forward = Task::new(Operation::MatMul {
                m: batch,
                n: 4096,
                k: 4096,
                precision: self.default_precision,
            })
            .with_dependencies(vec![preprocess_id])
            .with_priority(TaskPriority::Critical);
            let forward_id = forward.id;
            tasks.push(forward);

            // Backward pass mirroring the forward op.
            let backward = Task::new(Operation::Backward {
                forward_op: Box::new(Operation::MatMul {
                    m: batch,
                    n: 4096,
                    k: 4096,
                    precision: self.default_precision,
                }),
            })
            .with_dependencies(vec![forward_id])
            .with_priority(TaskPriority::Critical);
            let backward_id = backward.id;
            tasks.push(backward);

            // Optimizer step (placeholder 1M-parameter AdamW update).
            tasks.push(
                Task::new(Operation::OptimizerStep {
                    parameters: 1_000_000,
                    optimizer: "adamw".to_string(),
                    precision: self.default_precision,
                })
                .with_dependencies(vec![backward_id])
                .with_priority(TaskPriority::High),
            );
        }
        Ok(tasks)
    }

    /// Decompose an inference job into tokenize -> embed -> attention
    /// -> sample -> detokenize, chained via dependencies.
    ///
    /// Stage sizes (seq_len 512, vocab 32000, 32 heads, head_dim 128)
    /// are placeholder defaults; only the batch size comes from the job.
    fn decompose_inference(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        let mut tasks = Vec::new();
        if let JobType::Inference { batch_size, .. } = &job.job_type {
            let batch = *batch_size as usize;

            // Tokenization (CPU optimal).
            let tokenize = Task::new(Operation::Tokenization {
                text_bytes: 4096,
                vocab_size: 32000,
            })
            .with_priority(TaskPriority::High);
            let token_id = tokenize.id;
            tasks.push(tokenize);

            // Embedding (GPU optimal).
            let embed = Task::new(Operation::Embedding {
                batch,
                seq_len: 512,
                vocab_size: 32000,
                embed_dim: 4096,
                precision: self.default_precision,
            })
            .with_dependencies(vec![token_id])
            .with_priority(TaskPriority::Critical);
            let embed_id = embed.id;
            tasks.push(embed);

            // Self-attention (TPU/GPU optimal).
            let attention = Task::new(Operation::SelfAttention {
                batch,
                seq_len: 512,
                num_heads: 32,
                head_dim: 128,
                precision: self.default_precision,
            })
            .with_dependencies(vec![embed_id])
            .with_priority(TaskPriority::Critical);
            let attention_id = attention.id;
            tasks.push(attention);

            // Sampling (LPU optimal).
            let sample = Task::new(Operation::Sampling {
                batch,
                vocab_size: 32000,
                temperature: 0.7,
            })
            .with_dependencies(vec![attention_id])
            .with_priority(TaskPriority::High);
            let sample_id = sample.id;
            tasks.push(sample);

            // Detokenization (CPU optimal).
            tasks.push(
                Task::new(Operation::Detokenization {
                    tokens: 256,
                    vocab_size: 32000,
                })
                .with_dependencies(vec![sample_id])
                .with_priority(TaskPriority::Normal),
            );
        }
        Ok(tasks)
    }

    /// Decompose container job: a single generic task with a 1 GFLOP /
    /// 1 GiB placeholder cost profile.
    fn decompose_container(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        Ok(vec![Task::new(Operation::Generic {
            op_type: OperationType::DataLoad,
            flops: 1e9,
            memory: 1024 * 1024 * 1024,
        })
        .with_priority(TaskPriority::Normal)])
    }

    /// Decompose serverless function: a single lightweight task,
    /// High priority to keep invocation latency low.
    fn decompose_serverless(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        Ok(vec![Task::new(Operation::Generic {
            op_type: OperationType::DataPreprocess,
            flops: 1e6,
            memory: 256 * 1024 * 1024,
        })
        .with_priority(TaskPriority::High)])
    }

    /// Decompose WASM job: a single small task (16 MiB placeholder
    /// memory budget).
    fn decompose_wasm(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        Ok(vec![Task::new(Operation::Generic {
            op_type: OperationType::DataPreprocess,
            flops: 1e6,
            memory: 16 * 1024 * 1024,
        })
        .with_priority(TaskPriority::Normal)])
    }
}
impl Default for TaskDecomposer {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_task_creation() {
        let task = Task::new(Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        })
        .with_priority(TaskPriority::High);

        // Builder sets priority; everything else keeps `new` defaults.
        assert_eq!(task.priority, TaskPriority::High);
        assert!(task.dependencies.is_empty());
        assert_eq!(task.status, TaskStatus::Pending);
    }

    #[test]
    fn test_task_dependencies() {
        let upstream = Task::new(Operation::DataLoad {
            bytes: 1000,
            async_: true,
        });
        let upstream_id = upstream.id;

        let downstream = Task::new(Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        })
        .with_dependencies(vec![upstream_id]);

        assert_eq!(downstream.dependencies, vec![upstream_id]);
    }

    #[test]
    fn test_task_compatibility() {
        let matmul = Task::new(Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        });

        // MatMul runs on GPUs and TPUs.
        let gpu = ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda {
            compute_capability: (8, 0),
        });
        assert!(matmul.is_compatible_with(gpu));
        let tpu = ProcessorType::Tpu(crate::processor::TpuVersion::V5p);
        assert!(matmul.is_compatible_with(tpu));

        // DataLoad runs on CPUs.
        let load = Task::new(Operation::DataLoad {
            bytes: 1000,
            async_: true,
        });
        let cpu = ProcessorType::Cpu(crate::processor::CpuVariant::default());
        assert!(load.is_compatible_with(cpu));
    }

    #[test]
    fn test_task_decomposer() {
        let job = ComputeJob {
            id: crate::JobId::new(),
            owner: [0u8; 32],
            job_type: JobType::Inference {
                model_cid: "model".to_string(),
                input_format: "json".to_string(),
                batch_size: 1,
            },
            resources: crate::ResourceRequirements::default(),
            input_cid: None,
            max_budget: 1_000_000,
            priority: crate::JobPriority::Normal,
            created_at: 0,
            deadline: None,
        };

        let tasks = TaskDecomposer::new().decompose(&job).unwrap();
        assert!(!tasks.is_empty());
        // Every task after the first depends on something, i.e. the
        // stages form a dependency chain.
        assert!(tasks.iter().skip(1).all(|t| !t.dependencies.is_empty()));
    }
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,906 @@
# Phase 11: Synor Compute L2 - Full-Stack Compute Platform
> **Mission**: Build a decentralized compute platform capable of AI/ML training, inference, OS hosting, and general-purpose high-performance computing.
---
## Executive Summary
Synor Compute L2 extends beyond the current WASM-only Synor VM to provide:
- **GPU Compute**: AI/ML training and inference with CUDA/ROCm support
- **Container Orchestration**: Docker-compatible workloads with Kubernetes-style scheduling
- **Persistent VMs**: Long-running virtual machines for OS hosting
- **Serverless Functions**: Short-lived compute for API backends and event processing
- **Edge Compute**: Low-latency compute at network edge nodes
---
## Architecture Overview
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ SYNOR COMPUTE L2 │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
│ │ APPLICATION LAYER │ │
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
│ │ AI/ML │ Serverless │ Containers │ Persistent │ Edge │ │
│ │ Training │ Functions │ (Docker) │ VMs (Linux) │ Compute │ │
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
│ │ ORCHESTRATION LAYER │ │
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
│ │ Job │ Resource │ Network │ Storage │ Health │ │
│ │ Scheduler │ Manager │ Fabric │ Orchestrator│ Monitor │ │
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
│ │ COMPUTE RUNTIME LAYER │ │
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
│ │ GPU │ Container │ MicroVM │ WASM │ Native │ │
│ │ Runtime │ Runtime │ Runtime │ Runtime │ Runtime │ │
│ │ (CUDA/ROCm)│ (containerd)│ (Firecracker)│ (Wasmtime) │ (gVisor) │ │
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
│ │ INFRASTRUCTURE LAYER │ │
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
│ │ Node │ Network │ Distributed │ Consensus │ Billing │ │
│ │ Registry │ Overlay │ Storage │ (PoS+PoW) │ Metering │ │
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
│ │ SYNOR L1 BLOCKCHAIN (GHOSTDAG + DAG-RIDER) │ │
│ └─────────────────────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
---
## Milestone 1: GPU Compute Foundation (AI/ML Training & Inference)
### 1.1 GPU Node Registration
```rust
// synor-compute/src/gpu/node.rs
/// GPU node capabilities
pub struct GpuNode {
/// Unique node ID
pub node_id: NodeId,
/// GPU specifications
pub gpus: Vec<GpuSpec>,
/// Total VRAM available (bytes)
pub total_vram: u64,
/// Available VRAM (bytes)
pub available_vram: u64,
/// CUDA compute capability (e.g., 8.6 for RTX 3090)
pub cuda_capability: Option<(u8, u8)>,
/// ROCm version (for AMD)
pub rocm_version: Option<String>,
/// Network bandwidth (Gbps)
pub bandwidth_gbps: u32,
/// Geographic region
pub region: Region,
/// Stake amount (for PoS validation)
pub stake: u64,
}
pub struct GpuSpec {
pub model: String, // "NVIDIA RTX 4090"
pub vram_gb: u32, // 24
pub tensor_cores: u32, // 512
pub cuda_cores: u32, // 16384
pub memory_bandwidth: u32, // 1008 GB/s
pub fp32_tflops: f32, // 82.6
pub fp16_tflops: f32, // 165.2
pub int8_tops: f32, // 330.4
}
```
### 1.2 AI/ML Job Specification
```rust
// synor-compute/src/ai/job.rs
/// AI/ML training job specification
pub struct TrainingJob {
/// Job ID
pub job_id: JobId,
/// Owner address
pub owner: Address,
/// Framework (PyTorch, TensorFlow, JAX)
pub framework: MlFramework,
/// Model specification
pub model: ModelSpec,
/// Dataset reference (Synor Storage CID)
pub dataset_cid: Cid,
/// Training configuration
pub config: TrainingConfig,
/// Resource requirements
pub resources: GpuResources,
/// Maximum budget (SYNOR tokens)
pub max_budget: u64,
/// Checkpoint interval (steps)
pub checkpoint_interval: u64,
}
pub struct GpuResources {
pub min_gpus: u32,
pub max_gpus: u32,
pub min_vram_per_gpu: u64,
pub cuda_capability_min: Option<(u8, u8)>,
pub distributed: bool, // Multi-node training
pub priority: JobPriority,
}
pub enum MlFramework {
PyTorch { version: String },
TensorFlow { version: String },
JAX { version: String },
ONNX,
Custom { image: String },
}
pub struct TrainingConfig {
pub epochs: u32,
pub batch_size: u32,
pub learning_rate: f32,
pub optimizer: String,
pub mixed_precision: bool,
pub gradient_accumulation: u32,
pub distributed_strategy: DistributedStrategy,
}
pub enum DistributedStrategy {
DataParallel,
ModelParallel,
PipelineParallel,
ZeRO { stage: u8 }, // DeepSpeed ZeRO stages 1-3
FSDP, // Fully Sharded Data Parallel
}
```
### 1.3 Inference Service
```rust
// synor-compute/src/ai/inference.rs
/// Inference endpoint specification
pub struct InferenceEndpoint {
/// Endpoint ID
pub endpoint_id: EndpointId,
/// Model reference (Synor Storage CID)
pub model_cid: Cid,
/// Model format
pub format: ModelFormat,
/// Scaling configuration
pub scaling: AutoscaleConfig,
/// GPU requirements per replica
pub gpu_per_replica: GpuResources,
/// Request timeout
pub timeout_ms: u32,
/// Max batch size for batching inference
pub max_batch_size: u32,
/// Batching timeout
pub batch_timeout_ms: u32,
}
pub enum ModelFormat {
PyTorch,
ONNX,
TensorRT,
Triton,
vLLM, // For LLM serving
TGI, // Text Generation Inference
Custom,
}
pub struct AutoscaleConfig {
pub min_replicas: u32,
pub max_replicas: u32,
pub target_gpu_utilization: f32,
pub scale_up_threshold: f32,
pub scale_down_threshold: f32,
pub cooldown_seconds: u32,
}
```
### 1.4 Pricing Model for GPU Compute
| Resource | Unit | Price (SYNOR/unit) |
|----------|------|-------------------|
| GPU (RTX 4090 equivalent) | hour | 0.50 |
| GPU (A100 80GB equivalent) | hour | 2.00 |
| GPU (H100 equivalent) | hour | 4.00 |
| VRAM | GB/hour | 0.01 |
| Network egress | GB | 0.05 |
| Storage (hot, NVMe) | GB/month | 0.10 |
| Inference requests | 1M tokens | 0.10 |
---
## Milestone 2: Container Orchestration (Docker/Kubernetes-Compatible)
### 2.1 Container Runtime
```rust
// synor-compute/src/container/runtime.rs
/// Container specification (OCI-compatible)
pub struct ContainerSpec {
/// Image reference
pub image: ImageRef,
/// Resource limits
pub resources: ContainerResources,
/// Environment variables
pub env: HashMap<String, String>,
/// Volume mounts
pub volumes: Vec<VolumeMount>,
/// Network configuration
pub network: NetworkConfig,
/// Security context
pub security: SecurityContext,
/// Health check
pub health_check: Option<HealthCheck>,
}
pub struct ContainerResources {
pub cpu_cores: f32, // 0.5, 1.0, 2.0, etc.
pub memory_mb: u64,
pub gpu: Option<GpuAllocation>,
pub ephemeral_storage_gb: u32,
pub network_bandwidth_mbps: u32,
}
pub struct GpuAllocation {
pub count: u32,
pub vram_mb: u64,
pub shared: bool, // Allow GPU sharing via MPS/MIG
}
```
### 2.2 Service Mesh & Networking
```rust
// synor-compute/src/network/mesh.rs
/// Service definition for container orchestration
pub struct Service {
pub service_id: ServiceId,
pub name: String,
pub containers: Vec<ContainerSpec>,
pub replicas: ReplicaConfig,
pub load_balancer: LoadBalancerConfig,
pub service_mesh: ServiceMeshConfig,
}
pub struct ServiceMeshConfig {
pub mtls_enabled: bool,
pub traffic_policy: TrafficPolicy,
pub circuit_breaker: CircuitBreakerConfig,
pub retry_policy: RetryPolicy,
pub rate_limit: Option<RateLimitConfig>,
}
pub struct LoadBalancerConfig {
pub algorithm: LoadBalancerAlgorithm,
pub health_check: HealthCheck,
pub sticky_sessions: bool,
pub ssl_termination: SslTermination,
}
pub enum LoadBalancerAlgorithm {
RoundRobin,
LeastConnections,
WeightedRoundRobin { weights: Vec<u32> },
IPHash,
Random,
}
```
### 2.3 Container Pricing
| Resource | Unit | Price (SYNOR/unit) |
|----------|------|-------------------|
| CPU | core/hour | 0.02 |
| Memory | GB/hour | 0.005 |
| Ephemeral storage | GB/hour | 0.001 |
| Network ingress | GB | FREE |
| Network egress | GB | 0.05 |
| Load balancer | hour | 0.01 |
| Static IP | month | 2.00 |
---
## Milestone 3: Persistent Virtual Machines (OS Hosting)
### 3.1 MicroVM Architecture (Firecracker-based)
```rust
// synor-compute/src/vm/microvm.rs
/// Virtual machine specification
pub struct VmSpec {
/// VM ID
pub vm_id: VmId,
/// Owner address
pub owner: Address,
/// VM size
pub size: VmSize,
/// Boot image
pub image: VmImage,
/// Persistent volumes
pub volumes: Vec<PersistentVolume>,
/// Network configuration
pub network: VmNetworkConfig,
/// SSH keys for access
pub ssh_keys: Vec<SshPublicKey>,
/// Cloud-init user data
pub user_data: Option<String>,
}
pub struct VmSize {
pub vcpus: u32,
pub memory_gb: u32,
pub gpu: Option<GpuPassthrough>,
pub network_bandwidth_gbps: u32,
}
pub struct GpuPassthrough {
pub count: u32,
pub model: GpuModel,
pub vram_gb: u32,
}
pub enum VmImage {
/// Pre-built images
Marketplace { image_id: String, version: String },
/// Custom image from Synor Storage
Custom { cid: Cid, format: ImageFormat },
/// Standard OS images
Ubuntu { version: String },
Debian { version: String },
AlmaLinux { version: String },
Windows { version: String, license: WindowsLicense },
}
pub struct PersistentVolume {
pub volume_id: VolumeId,
pub size_gb: u32,
pub volume_type: VolumeType,
pub mount_path: String,
pub encrypted: bool,
}
pub enum VolumeType {
/// High-performance NVMe SSD
NvmeSsd { iops: u32, throughput_mbps: u32 },
/// Standard SSD
Ssd,
/// HDD for archival
Hdd,
/// Distributed storage (Synor Storage L2)
Distributed { replication: u8 },
}
```
### 3.2 VM Lifecycle Management
```rust
// synor-compute/src/vm/lifecycle.rs
pub enum VmState {
Pending,
Provisioning,
Running,
Stopping,
Stopped,
Hibernating,
Hibernated,
Migrating,
Failed,
Terminated,
}
pub struct VmManager {
/// Active VMs
vms: HashMap<VmId, VmInstance>,
/// Node assignments
node_assignments: HashMap<VmId, NodeId>,
/// Live migration coordinator
migration_coordinator: MigrationCoordinator,
}
impl VmManager {
/// Start a new VM
pub async fn create(&self, spec: VmSpec) -> Result<VmId, VmError>;
/// Stop a VM (preserves state)
pub async fn stop(&self, vm_id: &VmId) -> Result<(), VmError>;
/// Start a stopped VM
pub async fn start(&self, vm_id: &VmId) -> Result<(), VmError>;
/// Hibernate VM to storage (saves memory state)
pub async fn hibernate(&self, vm_id: &VmId) -> Result<(), VmError>;
/// Live migrate VM to another node
pub async fn migrate(&self, vm_id: &VmId, target_node: NodeId) -> Result<(), VmError>;
/// Resize VM (requires restart)
pub async fn resize(&self, vm_id: &VmId, new_size: VmSize) -> Result<(), VmError>;
/// Snapshot VM state
pub async fn snapshot(&self, vm_id: &VmId) -> Result<SnapshotId, VmError>;
/// Terminate and delete VM
pub async fn terminate(&self, vm_id: &VmId) -> Result<(), VmError>;
}
```
### 3.3 VM Pricing
| VM Type | vCPUs | Memory | Storage | GPU | Price (SYNOR/month) |
|---------|-------|--------|---------|-----|---------------------|
| micro | 1 | 1 GB | 20 GB SSD | - | 5 |
| small | 2 | 4 GB | 50 GB SSD | - | 15 |
| medium | 4 | 8 GB | 100 GB SSD | - | 30 |
| large | 8 | 32 GB | 200 GB SSD | - | 80 |
| xlarge | 16 | 64 GB | 500 GB NVMe | - | 200 |
| gpu-small | 8 | 32 GB | 200 GB NVMe | 1x RTX 4090 | 400 |
| gpu-medium | 16 | 64 GB | 500 GB NVMe | 2x RTX 4090 | 750 |
| gpu-large | 32 | 128 GB | 1 TB NVMe | 4x A100 80GB | 2500 |
| gpu-xlarge | 64 | 256 GB | 2 TB NVMe | 8x H100 | 8000 |
---
## Milestone 4: Serverless Functions (FaaS)
### 4.1 Function Specification
```rust
// synor-compute/src/serverless/function.rs
/// Serverless function definition
pub struct Function {
pub function_id: FunctionId,
pub owner: Address,
pub name: String,
pub runtime: FunctionRuntime,
pub handler: String,
pub code: FunctionCode,
pub resources: FunctionResources,
pub triggers: Vec<FunctionTrigger>,
pub environment: HashMap<String, String>,
pub timeout_ms: u32,
pub concurrency: ConcurrencyConfig,
}
pub enum FunctionRuntime {
Node20,
Node22,
Python311,
Python312,
Rust,
Go122,
Java21,
Dotnet8,
Ruby33,
Custom { image: String },
}
pub struct FunctionCode {
/// Source code CID in Synor Storage
pub cid: Cid,
/// Entry point file
pub entry_point: String,
/// Dependencies (package.json, requirements.txt, etc.)
pub dependencies: Option<Cid>,
}
pub struct FunctionResources {
pub memory_mb: u32, // 128, 256, 512, 1024, 2048, 4096, 8192
pub cpu_allocation: f32, // Proportional to memory
pub ephemeral_storage_mb: u32,
pub gpu: Option<GpuAllocation>,
}
pub enum FunctionTrigger {
/// HTTP endpoint
Http { path: String, methods: Vec<HttpMethod> },
/// Scheduled execution (cron)
Schedule { cron: String },
/// Event from message queue
Queue { queue_name: String },
/// Storage events
Storage { bucket: String, events: Vec<StorageEvent> },
/// Blockchain events
Blockchain { contract: Address, events: Vec<String> },
/// Webhook
Webhook { url: String },
}
```
### 4.2 Cold Start Optimization
```rust
// synor-compute/src/serverless/warmup.rs
/// Function warmup strategies
pub struct WarmupConfig {
/// Minimum warm instances
pub min_instances: u32,
/// Provisioned concurrency
pub provisioned_concurrency: u32,
/// Warmup schedule
pub warmup_schedule: Option<String>,
/// Snapshot-based cold start (SnapStart)
pub snapstart_enabled: bool,
}
pub struct ColdStartOptimizer {
/// Pre-warmed function pools
pools: HashMap<FunctionRuntime, WarmPool>,
/// Snapshot cache
snapshots: LruCache<FunctionId, FunctionSnapshot>,
/// Prediction model for scaling
predictor: ScalingPredictor,
}
impl ColdStartOptimizer {
/// Get a warm instance or create one
pub async fn get_instance(&self, function: &Function) -> Result<FunctionInstance, Error> {
// Try snapshot restore first (< 100ms)
if let Some(snapshot) = self.snapshots.get(&function.function_id) {
return self.restore_from_snapshot(snapshot).await;
}
// Try warm pool (< 50ms)
if let Some(instance) = self.pools.get(&function.runtime).and_then(|pool| pool.get_warm()) {
    return Ok(instance);
}
// Cold start (1-5s depending on runtime)
self.cold_start(function).await
}
}
```
### 4.3 Serverless Pricing
| Resource | Unit | Price (SYNOR) |
|----------|------|---------------|
| Invocations | 1M requests | 0.20 |
| Duration | GB-second | 0.00001 |
| Provisioned concurrency | GB-hour | 0.01 |
| HTTP Gateway | 1M requests | 0.10 |
| Event bridge | 1M events | 0.50 |
---
## Milestone 5: Edge Compute
### 5.1 Edge Node Architecture
```rust
// synor-compute/src/edge/node.rs
/// Edge compute node
pub struct EdgeNode {
pub node_id: NodeId,
pub location: GeoLocation,
pub capabilities: EdgeCapabilities,
pub latency_zones: Vec<LatencyZone>,
pub resources: EdgeResources,
}
pub struct EdgeCapabilities {
pub wasm_runtime: bool,
pub container_runtime: bool,
pub gpu_inference: bool,
pub video_transcoding: bool,
pub cdn_cache: bool,
}
pub struct EdgeResources {
pub cpu_cores: u32,
pub memory_gb: u32,
pub storage_gb: u32,
pub gpu: Option<EdgeGpu>,
pub bandwidth_gbps: u32,
}
/// Edge function for low-latency compute
pub struct EdgeFunction {
pub function_id: FunctionId,
pub code: WasmModule,
pub memory_limit: u32,
pub timeout_ms: u32,
pub allowed_regions: Vec<Region>,
}
```
### 5.2 Edge Use Cases
```rust
// synor-compute/src/edge/usecases.rs
/// CDN with compute at edge
pub struct EdgeCdn {
/// Origin servers
origins: Vec<Origin>,
/// Cache rules
cache_rules: Vec<CacheRule>,
/// Edge workers for request/response transformation
workers: Vec<EdgeWorker>,
}
/// Real-time inference at edge
pub struct EdgeInference {
/// Model optimized for edge (quantized, pruned)
model_id: ModelId,
/// Inference runtime (TensorRT, ONNX Runtime)
runtime: EdgeInferenceRuntime,
/// Max batch size
max_batch: u32,
/// Target latency
target_latency_ms: u32,
}
/// Video processing at edge
pub struct EdgeVideoProcessor {
/// Transcoding profiles
profiles: Vec<TranscodingProfile>,
/// Real-time streaming
live_streaming: bool,
/// Adaptive bitrate
abr_enabled: bool,
}
```
### 5.3 Edge Pricing
| Resource | Unit | Price (SYNOR) |
|----------|------|---------------|
| Edge function invocations | 1M | 0.50 |
| Edge function duration | GB-second | 0.00002 |
| Edge bandwidth | GB | 0.08 |
| Edge cache storage | GB/month | 0.02 |
| Video transcoding | minute | 0.02 |
---
## Milestone 6: Node Provider Economics
### 6.1 Provider Registration
```rust
// synor-compute/src/provider/registration.rs
/// Compute provider registration.
///
/// Submitted by a node operator to join the network as a compute provider.
/// Binds an owner address to staked funds, declared hardware and network
/// capabilities, a geographic location, and an SLA commitment.
pub struct ProviderRegistration {
/// Unique identifier assigned to this provider.
pub provider_id: ProviderId,
/// On-chain address of the provider's owner.
pub owner: Address,
/// Stake required to become a provider (token units — presumably base SYNOR units; confirm).
pub stake: u64,
/// Declared hardware specifications (see [`HardwareManifest`]).
pub hardware: HardwareManifest,
/// Declared network connectivity.
pub network: NetworkManifest,
/// Geographic location of the node.
pub location: GeoLocation,
/// Availability SLA commitment (see [`SlaCommitment`]).
pub sla: SlaCommitment,
}
/// Declared hardware inventory of a provider node.
pub struct HardwareManifest {
/// CPUs installed on the node.
pub cpus: Vec<CpuSpec>,
/// Total system memory, in gigabytes.
pub memory_total_gb: u64,
/// GPUs installed on the node (empty for CPU-only providers).
pub gpus: Vec<GpuSpec>,
/// Storage devices attached to the node.
pub storage: Vec<StorageSpec>,
pub verified: bool, // Hardware attestation passed
}
/// Service-level commitment made by a provider at registration time.
///
/// Violations are penalized at `penalty_rate` (see the slashing table in
/// this document's provider-economics section).
pub struct SlaCommitment {
pub uptime_percent: f32, // 99.9, 99.99, etc.
/// Committed maximum response time, in milliseconds.
pub response_time_ms: u32,
/// Committed data durability (presumably a fraction such as 0.999999 — confirm units).
pub data_durability: f32,
pub penalty_rate: f32, // Penalty for SLA violation
}
```
### 6.2 Provider Revenue Model
| Revenue Source | Provider Share | Protocol Share |
|----------------|----------------|----------------|
| Compute fees | 85% | 15% |
| Storage fees | 80% | 20% |
| Network fees | 75% | 25% |
| SLA bonuses | 100% | 0% |
| Staking rewards | 100% | 0% |
### 6.3 Slashing Conditions
| Violation | Penalty |
|-----------|---------|
| Downtime > committed SLA | 1% stake per hour |
| Data loss | 10% stake + compensation |
| Malicious behavior | 100% stake |
| False hardware attestation | 50% stake |
---
## Implementation Timeline
### Phase 11.1: Foundation (Weeks 1-4)
- [ ] Node registration and hardware attestation
- [ ] Basic job scheduler
- [ ] WASM runtime integration (existing)
- [ ] Container runtime (containerd)
- [ ] Network overlay (WireGuard mesh)
### Phase 11.2: GPU Compute (Weeks 5-8)
- [ ] GPU node registration
- [ ] NVIDIA driver integration
- [ ] CUDA runtime support
- [ ] Basic ML job execution
- [ ] Model storage integration
### Phase 11.3: Container Orchestration (Weeks 9-12)
- [ ] OCI image support
- [ ] Service deployment
- [ ] Load balancing
- [ ] Auto-scaling
- [ ] Service mesh (mTLS)
### Phase 11.4: Persistent VMs (Weeks 13-16)
- [ ] MicroVM runtime (Firecracker)
- [ ] VM lifecycle management
- [ ] Persistent storage
- [ ] Live migration
- [ ] Snapshot/restore
### Phase 11.5: Serverless (Weeks 17-20)
- [ ] Function deployment
- [ ] Cold start optimization
- [ ] Event triggers
- [ ] API gateway
- [ ] Monitoring/logging
### Phase 11.6: Edge Compute (Weeks 21-24)
- [ ] Edge node registration
- [ ] Edge function runtime
- [ ] CDN integration
- [ ] Edge inference
- [ ] Global anycast
---
## Security Considerations
### Isolation Levels
| Workload Type | Isolation Technology | Security Level |
|---------------|---------------------|----------------|
| WASM | Wasmtime sandbox | High |
| Serverless | gVisor + seccomp | High |
| Containers | gVisor or Kata | Medium-High |
| VMs | Firecracker MicroVM | High |
| GPU | NVIDIA MIG/MPS | Medium |
### Network Security
- All inter-node traffic encrypted (WireGuard)
- mTLS for service-to-service communication
- Network policies for workload isolation
- DDoS protection at edge
### Data Security
- Encryption at rest (AES-256)
- Encryption in transit (TLS 1.3)
- Confidential computing support (AMD SEV, Intel SGX)
- Secure key management (HSM integration)
---
## API Examples
### Deploy AI Training Job
```bash
synor compute train create \
--framework pytorch \
--model-config ./model.yaml \
--dataset synor://datasets/imagenet \
--gpus 8 \
--gpu-type h100 \
--distributed ddp \
--epochs 100 \
--checkpoint-interval 1000 \
--max-budget 1000
```
### Deploy Inference Endpoint
```bash
synor compute inference deploy \
--model synor://models/llama-70b \
--format vllm \
--min-replicas 2 \
--max-replicas 10 \
--gpu-per-replica 2 \
--target-utilization 0.7
```
### Create Persistent VM
```bash
synor compute vm create \
--name my-dev-server \
--image ubuntu:22.04 \
--size gpu-small \
--volume 100gb:nvme:/data \
--ssh-key ~/.ssh/id_ed25519.pub \
--region us-east
```
### Deploy Container Service
```bash
synor compute service deploy \
--name my-api \
--image my-registry/my-api:latest \
--replicas 3 \
--cpu 2 \
--memory 4gb \
--port 8080 \
--health-check /health \
--autoscale 2-10
```
### Deploy Serverless Function
```bash
synor compute function deploy \
--name process-image \
--runtime python312 \
--handler main.handler \
--code ./function \
--memory 1024 \
--timeout 30000 \
--trigger http:/api/process
```
---
## Comparison with Existing Synor VM
| Feature | Current Synor VM | Synor Compute L2 |
|---------|------------------|------------------|
| Runtime | WASM only | WASM, Container, MicroVM |
| Timeout | 30 seconds | Unlimited (VMs) |
| Memory | 16 MB max | Up to 256 GB |
| GPU | ❌ | ✅ Full CUDA/ROCm |
| Networking | ❌ | ✅ Full TCP/UDP |
| File I/O | ❌ | ✅ Persistent volumes |
| Threading | ❌ | ✅ Multi-threaded |
| AI/ML | ❌ | ✅ Training + Inference |
| OS Hosting | ❌ | ✅ Full Linux/Windows |
---
## Next Steps
1. Implement GPU node registration and attestation
2. Build a basic job scheduler with resource allocation
3. Integrate containerd for container workloads
4. Add Firecracker for MicroVM support
5. Implement the serverless function runtime
6. Deploy edge nodes and CDN integration
This plan transforms Synor from a smart contract platform into a full-stack decentralized cloud provider capable of competing with AWS/GCP/Azure while maintaining decentralization and censorship resistance.