diff --git a/Cargo.toml b/Cargo.toml index a9b8da5..af65ff4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ members = [ "crates/synor-storage", "crates/synor-hosting", "crates/synor-database", + "crates/synor-compute", "crates/synor-governance", "crates/synor-rpc", "crates/synor-vm", diff --git a/crates/synor-compute/Cargo.toml b/crates/synor-compute/Cargo.toml new file mode 100644 index 0000000..07d0b3a --- /dev/null +++ b/crates/synor-compute/Cargo.toml @@ -0,0 +1,51 @@ +[package] +name = "synor-compute" +version.workspace = true +edition.workspace = true +description = "Heterogeneous multi-processor compute platform for Synor blockchain" +license.workspace = true + +[dependencies] +# Internal crates +synor-types = { path = "../synor-types" } +synor-crypto = { path = "../synor-crypto" } +synor-storage = { path = "../synor-storage" } + +# Serialization +serde.workspace = true +serde_json.workspace = true +borsh.workspace = true +bincode = "1.3" + +# Async runtime +tokio = { workspace = true, features = ["sync", "rt-multi-thread", "time", "macros"] } +async-trait = "0.1" +futures = "0.3" + +# Concurrency +parking_lot.workspace = true +crossbeam-deque = "0.8" +crossbeam-channel = "0.5" +dashmap = "5.5" + +# Utilities +thiserror.workspace = true +tracing.workspace = true +hex.workspace = true + +# Hashing +blake3.workspace = true + +# Data structures +indexmap = "2.2" +priority-queue = "2.0" + +# Time +chrono = { version = "0.4", features = ["serde"] } + +# Random +rand = "0.8" + +[dev-dependencies] +tempfile.workspace = true +tokio-test = "0.4" diff --git a/crates/synor-compute/src/device/mod.rs b/crates/synor-compute/src/device/mod.rs new file mode 100644 index 0000000..cb56bbd --- /dev/null +++ b/crates/synor-compute/src/device/mod.rs @@ -0,0 +1,377 @@ +//! Device registry and management. +//! +//! Supports all device types: +//! - Data center servers +//! - Desktop workstations +//! - Laptops +//! - Mobile devices (iOS, Android) +//! 
- Browsers (WebGPU, WASM) +//! - IoT devices + +use crate::error::ComputeError; +use crate::processor::{GenericProcessor, Processor, ProcessorCapabilities, ProcessorId, ProcessorType}; +use crate::{NodeId, ProcessorInfo}; +use parking_lot::RwLock; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::sync::Arc; + +/// Unique device identifier. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct DeviceId(pub [u8; 32]); + +impl DeviceId { + /// Creates a new random device ID. + pub fn new() -> Self { + use rand::Rng; + let mut bytes = [0u8; 32]; + rand::thread_rng().fill(&mut bytes); + DeviceId(bytes) + } + + /// Creates from bytes. + pub fn from_bytes(bytes: [u8; 32]) -> Self { + DeviceId(bytes) + } +} + +impl Default for DeviceId { + fn default() -> Self { + Self::new() + } +} + +impl std::fmt::Display for DeviceId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "dev_{}", hex::encode(&self.0[..8])) + } +} + +/// Device type classification. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum DeviceType { + /// Data center server. + DataCenter, + /// Desktop workstation. + Desktop, + /// Laptop. + Laptop, + /// Mobile phone. + Mobile, + /// Tablet. + Tablet, + /// IoT device. + IoT, + /// Browser (WebGPU/WASM). + Browser, + /// Edge server. + Edge, +} + +impl DeviceType { + /// Returns typical reliability score (0-100). + pub fn reliability(&self) -> u32 { + match self { + DeviceType::DataCenter => 99, + DeviceType::Edge => 95, + DeviceType::Desktop => 80, + DeviceType::Laptop => 60, + DeviceType::Mobile => 40, + DeviceType::Tablet => 50, + DeviceType::IoT => 70, + DeviceType::Browser => 30, + } + } + + /// Returns typical availability hours per day. 
+    pub fn availability_hours(&self) -> f32 {
+        match self {
+            DeviceType::DataCenter => 24.0,
+            DeviceType::Edge => 24.0,
+            DeviceType::Desktop => 8.0,
+            DeviceType::Laptop => 6.0,
+            DeviceType::Mobile => 4.0,
+            DeviceType::Tablet => 4.0,
+            DeviceType::IoT => 24.0,
+            DeviceType::Browser => 2.0,
+        }
+    }
+}
+
+/// Device capabilities.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct DeviceCapabilities {
+    /// Device type.
+    pub device_type: DeviceType,
+    /// Available processors.
+    pub processors: Vec<ProcessorType>,
+    /// Total memory (GB).
+    pub memory_gb: f32,
+    /// Network bandwidth (Mbps).
+    pub bandwidth_mbps: f32,
+    /// Storage available (GB).
+    pub storage_gb: f32,
+    /// Battery powered.
+    pub battery_powered: bool,
+    /// Supports background execution.
+    pub background_execution: bool,
+}
+
+/// Device information.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct DeviceInfo {
+    /// Device ID.
+    pub id: DeviceId,
+    /// Device type.
+    pub device_type: DeviceType,
+    /// Owner address.
+    pub owner: [u8; 32],
+    /// Capabilities.
+    pub capabilities: DeviceCapabilities,
+    /// Current status.
+    pub status: DeviceStatus,
+    /// Reputation score (0-100).
+    pub reputation: u32,
+    /// Total earnings (atomic SYNOR).
+    pub earnings: u64,
+    /// Geographic region.
+    pub region: String,
+}
+
+/// Device status.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
+pub enum DeviceStatus {
+    /// Online and available.
+    Online,
+    /// Online but busy.
+    Busy,
+    /// Idle but available.
+    Idle,
+    /// On battery (reduced capacity).
+    OnBattery,
+    /// Offline.
+    Offline,
+    /// Maintenance.
+    Maintenance,
+}
+
+/// Device registry managing all devices and processors.
+pub struct DeviceRegistry {
+    /// Registered devices.
+    devices: RwLock<HashMap<DeviceId, DeviceInfo>>,
+    /// Node to device mapping.
+    node_devices: RwLock<HashMap<NodeId, Vec<DeviceId>>>,
+    /// All processors (across all nodes).
+    processors: RwLock<HashMap<ProcessorId, Arc<dyn Processor>>>,
+    /// Processor to node mapping.
+    processor_nodes: RwLock<HashMap<ProcessorId, NodeId>>,
+    /// Next processor ID.
+    next_processor_id: std::sync::atomic::AtomicU64,
+}
+
+impl DeviceRegistry {
+    /// Creates a new device registry.
+    pub fn new() -> Self {
+        Self {
+            devices: RwLock::new(HashMap::new()),
+            node_devices: RwLock::new(HashMap::new()),
+            processors: RwLock::new(HashMap::new()),
+            processor_nodes: RwLock::new(HashMap::new()),
+            next_processor_id: std::sync::atomic::AtomicU64::new(0),
+        }
+    }
+
+    /// Registers a device.
+    pub fn register_device(&self, device: DeviceInfo) -> Result<DeviceId, ComputeError> {
+        let id = device.id;
+        self.devices.write().insert(id, device);
+        Ok(id)
+    }
+
+    /// Unregisters a device.
+    pub fn unregister_device(&self, device_id: DeviceId) -> Result<(), ComputeError> {
+        self.devices.write().remove(&device_id);
+        Ok(())
+    }
+
+    /// Gets a device by ID.
+    pub fn get_device(&self, device_id: DeviceId) -> Option<DeviceInfo> {
+        self.devices.read().get(&device_id).cloned()
+    }
+
+    /// Registers a processor for a node.
+    pub fn register_processor(
+        &self,
+        node_id: NodeId,
+        info: ProcessorInfo,
+    ) -> Result<(), ComputeError> {
+        let processor_id = info.id;
+
+        // Create a generic processor from the info
+        let processor: Arc<dyn Processor> = Arc::new(GenericProcessor::new(
+            processor_id,
+            info.processor_type,
+            info.capabilities,
+        ));
+
+        self.processors.write().insert(processor_id, processor);
+        self.processor_nodes.write().insert(processor_id, node_id);
+
+        Ok(())
+    }
+
+    /// Unregisters all processors for a node.
+    pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
+        let mut processors = self.processors.write();
+        let mut processor_nodes = self.processor_nodes.write();
+
+        // Find and remove all processors for this node
+        let to_remove: Vec<_> = processor_nodes
+            .iter()
+            .filter(|(_, n)| **n == node_id)
+            .map(|(p, _)| *p)
+            .collect();
+
+        for proc_id in to_remove {
+            processors.remove(&proc_id);
+            processor_nodes.remove(&proc_id);
+        }
+
+        Ok(())
+    }
+
+    /// Gets a processor by ID.
+    pub fn get_processor(&self, processor_id: ProcessorId) -> Result<Arc<dyn Processor>, ComputeError> {
+        self.processors
+            .read()
+            .get(&processor_id)
+            .cloned()
+            .ok_or(ComputeError::ProcessorNotFound(processor_id))
+    }
+
+    /// Gets all processors.
+    pub fn all_processors(&self) -> Vec<Arc<dyn Processor>> {
+        self.processors.read().values().cloned().collect()
+    }
+
+    /// Gets processors of a specific type.
+    pub fn processors_by_type(&self, proc_type: ProcessorType) -> Vec<Arc<dyn Processor>> {
+        self.processors
+            .read()
+            .values()
+            .filter(|p| p.processor_type() == proc_type)
+            .cloned()
+            .collect()
+    }
+
+    /// Gets the next processor ID.
+    pub fn next_processor_id(&self) -> ProcessorId {
+        ProcessorId(self.next_processor_id.fetch_add(1, std::sync::atomic::Ordering::SeqCst))
+    }
+
+    /// Gets total number of devices.
+    pub fn device_count(&self) -> usize {
+        self.devices.read().len()
+    }
+
+    /// Gets total number of processors.
+    pub fn processor_count(&self) -> usize {
+        self.processors.read().len()
+    }
+
+    /// Gets devices by type.
+    pub fn devices_by_type(&self, device_type: DeviceType) -> Vec<DeviceInfo> {
+        self.devices
+            .read()
+            .values()
+            .filter(|d| d.device_type == device_type)
+            .cloned()
+            .collect()
+    }
+
+    /// Gets online devices.
+    pub fn online_devices(&self) -> Vec<DeviceInfo> {
+        self.devices
+            .read()
+            .values()
+            .filter(|d| d.status == DeviceStatus::Online || d.status == DeviceStatus::Idle)
+            .cloned()
+            .collect()
+    }
+
+    /// Updates device status.
+ pub fn update_device_status( + &self, + device_id: DeviceId, + status: DeviceStatus, + ) -> Result<(), ComputeError> { + if let Some(device) = self.devices.write().get_mut(&device_id) { + device.status = status; + Ok(()) + } else { + Err(ComputeError::Internal(format!("Device not found: {}", device_id))) + } + } +} + +impl Default for DeviceRegistry { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::processor::{CpuVariant, AvxSupport}; + + #[test] + fn test_device_id() { + let id1 = DeviceId::new(); + let id2 = DeviceId::new(); + assert_ne!(id1.0, id2.0); + } + + #[test] + fn test_device_registry() { + let registry = DeviceRegistry::new(); + + let device = DeviceInfo { + id: DeviceId::new(), + device_type: DeviceType::Desktop, + owner: [1u8; 32], + capabilities: DeviceCapabilities { + device_type: DeviceType::Desktop, + processors: vec![ProcessorType::Cpu(CpuVariant::X86_64 { + avx: AvxSupport::Avx512, + })], + memory_gb: 64.0, + bandwidth_mbps: 1000.0, + storage_gb: 1000.0, + battery_powered: false, + background_execution: true, + }, + status: DeviceStatus::Online, + reputation: 100, + earnings: 0, + region: "us-east".to_string(), + }; + + let device_id = device.id; + registry.register_device(device).unwrap(); + + assert_eq!(registry.device_count(), 1); + assert!(registry.get_device(device_id).is_some()); + + registry.unregister_device(device_id).unwrap(); + assert_eq!(registry.device_count(), 0); + } + + #[test] + fn test_device_type_properties() { + assert_eq!(DeviceType::DataCenter.reliability(), 99); + assert_eq!(DeviceType::Mobile.reliability(), 40); + assert_eq!(DeviceType::DataCenter.availability_hours(), 24.0); + assert_eq!(DeviceType::Browser.availability_hours(), 2.0); + } +} diff --git a/crates/synor-compute/src/error.rs b/crates/synor-compute/src/error.rs new file mode 100644 index 0000000..33a34ee --- /dev/null +++ b/crates/synor-compute/src/error.rs @@ -0,0 +1,92 @@ +//! 
Error types for Synor Compute. + +use crate::{JobId, NodeId, ProcessorId, ProcessorType}; +use thiserror::Error; + +/// Compute errors. +#[derive(Debug, Error)] +pub enum ComputeError { + /// Job not found. + #[error("Job not found: {0}")] + JobNotFound(JobId), + + /// Node not found. + #[error("Node not found: {0}")] + NodeNotFound(NodeId), + + /// Processor not found. + #[error("Processor not found: {0}")] + ProcessorNotFound(ProcessorId), + + /// No suitable processor for operation. + #[error("No suitable processor for operation: {0}")] + NoSuitableProcessor(String), + + /// Insufficient resources. + #[error("Insufficient resources: {0}")] + InsufficientResources(String), + + /// Task execution failed. + #[error("Task execution failed: {0}")] + TaskExecutionFailed(String), + + /// Scheduling failed. + #[error("Scheduling failed: {0}")] + SchedulingFailed(String), + + /// Memory allocation failed. + #[error("Memory allocation failed: {0}")] + MemoryAllocationFailed(String), + + /// Data transfer failed. + #[error("Data transfer failed: {0}")] + DataTransferFailed(String), + + /// Processor type not supported. + #[error("Processor type not supported: {0:?}")] + ProcessorTypeNotSupported(ProcessorType), + + /// Operation not supported on processor. + #[error("Operation not supported on {0:?}: {1}")] + OperationNotSupported(ProcessorType, String), + + /// Timeout. + #[error("Operation timed out after {0}ms")] + Timeout(u64), + + /// Budget exceeded. + #[error("Budget exceeded: required {required}, available {available}")] + BudgetExceeded { required: u64, available: u64 }, + + /// Node already registered. + #[error("Node already registered: {0}")] + NodeAlreadyRegistered(NodeId), + + /// Invalid configuration. + #[error("Invalid configuration: {0}")] + InvalidConfiguration(String), + + /// Serialization error. + #[error("Serialization error: {0}")] + Serialization(String), + + /// Network error. 
+ #[error("Network error: {0}")] + Network(String), + + /// Internal error. + #[error("Internal error: {0}")] + Internal(String), +} + +impl From for ComputeError { + fn from(err: bincode::Error) -> Self { + ComputeError::Serialization(err.to_string()) + } +} + +impl From for ComputeError { + fn from(err: serde_json::Error) -> Self { + ComputeError::Serialization(err.to_string()) + } +} diff --git a/crates/synor-compute/src/lib.rs b/crates/synor-compute/src/lib.rs new file mode 100644 index 0000000..6baef4c --- /dev/null +++ b/crates/synor-compute/src/lib.rs @@ -0,0 +1,631 @@ +//! Synor Compute L2 - Heterogeneous Multi-Processor Compute Platform +//! +//! Provides decentralized compute services with: +//! +//! - **Heterogeneous Scheduling**: CPU + GPU + TPU + NPU + LPU working simultaneously +//! - **Consumer Device Mesh**: Mobile, browser, desktop devices contributing compute +//! - **90% Cost Reduction**: Zero margins, spot markets, electricity arbitrage +//! - **10x Speed**: Caching, speculative execution, optimal processor assignment +//! +//! # Architecture +//! +//! ```text +//! ┌─────────────────────────────────────────────────────────────────────────────┐ +//! │ SYNOR COMPUTE L2 │ +//! ├─────────────────────────────────────────────────────────────────────────────┤ +//! │ │ +//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │ +//! │ │ TASK DECOMPOSER │ │ +//! │ │ Analyzes workload → Identifies subtasks → Maps to optimal processors │ │ +//! │ └─────────────────────────────────────────────────────────────────────────┘ │ +//! │ │ │ +//! │ ▼ │ +//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │ +//! │ │ HETEROGENEOUS SCHEDULER │ │ +//! │ │ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ │ +//! │ │ │ CPU │ │ GPU │ │ TPU │ │ NPU │ │ LPU │ │Custom│ │ │ +//! │ │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │ │ +//! │ │ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ │ │ +//! 
│ └─────────────────────────────────────────────────────────────────────────┘ │ +//! │ │ +//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │ +//! │ │ UNIFIED MEMORY FABRIC │ │ +//! │ │ Zero-copy data sharing │ Automatic placement │ Cache coherency │ │ +//! │ └─────────────────────────────────────────────────────────────────────────┘ │ +//! │ │ +//! └─────────────────────────────────────────────────────────────────────────────┘ +//! ``` +//! +//! # Pricing +//! +//! | Resource | Unit | Price (SYNOR) | +//! |----------|------|---------------| +//! | GPU (consumer) | hour | 0.10 | +//! | GPU (datacenter) | hour | 0.50-4.00 | +//! | CPU | core/hour | 0.02 | +//! | Memory | GB/hour | 0.005 | +//! | Inference | 1M tokens | 0.10 | + +#![allow(dead_code)] + +pub mod device; +pub mod error; +pub mod market; +pub mod memory; +pub mod processor; +pub mod scheduler; +pub mod task; + +pub use device::{ + DeviceCapabilities, DeviceId, DeviceInfo, DeviceRegistry, DeviceStatus, DeviceType, +}; +pub use error::ComputeError; +pub use market::{ + Auction, AuctionId, CloudComparison, CpuTier as MarketCpuTier, GpuTier as MarketGpuTier, + MarketStats, Order, OrderBook, OrderId, OrderSide, OrderType, PricingEngine, ProviderListing, + ResourceType, SpotMarket, Trade, +}; +pub use memory::{MemoryManager, TensorHandle, TransferPath, UnifiedMemory}; +pub use processor::{ + ComputeThroughput, CpuVariant, GpuVariant, NpuVariant, Operation, OperationType, Processor, + ProcessorCapabilities, ProcessorId, ProcessorType, TpuVersion, +}; +pub use scheduler::{ + HeterogeneousScheduler, LoadBalancer, Schedule, ScheduleResult, TaskAssignment, WorkQueue, +}; +pub use task::{ + ComputeTask, DecomposedWorkload, Task, TaskDecomposer, TaskId, TaskPriority, TaskResult, + TaskStatus, +}; + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::sync::Arc; + +use parking_lot::RwLock; + +/// Compute node identifier. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub struct NodeId(pub u64);
+
+impl std::fmt::Display for NodeId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "node_{}", self.0)
+    }
+}
+
+/// Job identifier.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub struct JobId(pub [u8; 32]);
+
+impl JobId {
+    /// Creates a new job ID.
+    pub fn new() -> Self {
+        use rand::Rng;
+        let mut bytes = [0u8; 32];
+        rand::thread_rng().fill(&mut bytes);
+        JobId(bytes)
+    }
+
+    /// Creates from bytes.
+    pub fn from_bytes(bytes: [u8; 32]) -> Self {
+        JobId(bytes)
+    }
+}
+
+impl Default for JobId {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl std::fmt::Display for JobId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "job_{}", hex::encode(&self.0[..8]))
+    }
+}
+
+/// Compute job specification.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ComputeJob {
+    /// Job ID.
+    pub id: JobId,
+    /// Owner address.
+    pub owner: [u8; 32],
+    /// Job type.
+    pub job_type: JobType,
+    /// Resource requirements.
+    pub resources: ResourceRequirements,
+    /// Input data reference (CID).
+    pub input_cid: Option<String>,
+    /// Maximum budget (in atomic SYNOR).
+    pub max_budget: u64,
+    /// Priority level.
+    pub priority: JobPriority,
+    /// Created timestamp.
+    pub created_at: u64,
+    /// Deadline (optional).
+    pub deadline: Option<u64>,
+}
+
+/// Job type classification.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub enum JobType {
+    /// AI/ML training job.
+    Training {
+        framework: MlFramework,
+        model_cid: String,
+        dataset_cid: String,
+        epochs: u32,
+        batch_size: u32,
+    },
+    /// AI/ML inference job.
+    Inference {
+        model_cid: String,
+        input_format: String,
+        batch_size: u32,
+    },
+    /// Container workload.
+    Container {
+        image: String,
+        command: Vec<String>,
+        env: HashMap<String, String>,
+    },
+    /// Serverless function.
+    Serverless {
+        runtime: FunctionRuntime,
+        code_cid: String,
+        handler: String,
+    },
+    /// General compute (WASM).
+    Wasm {
+        module_cid: String,
+        entrypoint: String,
+    },
+}
+
+/// ML framework specification.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub enum MlFramework {
+    PyTorch { version: String },
+    TensorFlow { version: String },
+    JAX { version: String },
+    ONNX,
+}
+
+/// Function runtime.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub enum FunctionRuntime {
+    Node20,
+    Python312,
+    Rust,
+    Go,
+    Custom { image: String },
+}
+
+/// Job priority levels.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
+pub enum JobPriority {
+    /// Background job, can be preempted.
+    Background = 0,
+    /// Normal priority.
+    Normal = 1,
+    /// High priority, faster scheduling.
+    High = 2,
+    /// Critical, guaranteed resources.
+    Critical = 3,
+}
+
+impl Default for JobPriority {
+    fn default() -> Self {
+        JobPriority::Normal
+    }
+}
+
+/// Resource requirements for a job.
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
+pub struct ResourceRequirements {
+    /// Minimum CPU cores.
+    pub min_cpu_cores: f32,
+    /// Minimum memory (GB).
+    pub min_memory_gb: f32,
+    /// GPU requirements.
+    pub gpu: Option<GpuRequirements>,
+    /// Preferred processor types (in priority order).
+    pub preferred_processors: Vec<ProcessorType>,
+    /// Maximum latency (ms) - for inference.
+    pub max_latency_ms: Option<u64>,
+    /// Requires distributed execution.
+    pub distributed: bool,
+}
+
+/// GPU resource requirements.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct GpuRequirements {
+    /// Minimum number of GPUs.
+    pub min_count: u32,
+    /// Maximum number of GPUs.
+    pub max_count: u32,
+    /// Minimum VRAM per GPU (GB).
+    pub min_vram_gb: u32,
+    /// Minimum compute capability.
+    pub min_compute_capability: Option<(u8, u8)>,
+    /// Allow GPU sharing (MPS/MIG).
+    pub allow_sharing: bool,
+}
+
+/// Job execution status.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub enum JobStatus {
+    /// Queued, waiting for resources.
+    Queued,
+    /// Resources allocated, starting.
+    Starting,
+    /// Running.
+    Running {
+        progress: f32,
+        assigned_nodes: Vec<NodeId>,
+    },
+    /// Completed successfully.
+    Completed {
+        result_cid: String,
+        duration_ms: u64,
+        cost: u64,
+    },
+    /// Failed.
+    Failed { error: String },
+    /// Cancelled by user.
+    Cancelled,
+}
+
+/// Compute node registration.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ComputeNode {
+    /// Node ID.
+    pub id: NodeId,
+    /// Owner address.
+    pub owner: [u8; 32],
+    /// Available processors.
+    pub processors: Vec<ProcessorInfo>,
+    /// Total memory (GB).
+    pub total_memory_gb: f32,
+    /// Available memory (GB).
+    pub available_memory_gb: f32,
+    /// Network bandwidth (Gbps).
+    pub bandwidth_gbps: f32,
+    /// Geographic region.
+    pub region: String,
+    /// Stake amount (for PoS).
+    pub stake: u64,
+    /// Reputation score (0-100).
+    pub reputation: u32,
+    /// Current status.
+    pub status: NodeStatus,
+}
+
+/// Processor information on a node.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ProcessorInfo {
+    /// Processor ID (local to node).
+    pub id: ProcessorId,
+    /// Processor type.
+    pub processor_type: ProcessorType,
+    /// Capabilities.
+    pub capabilities: ProcessorCapabilities,
+    /// Current utilization (0.0 - 1.0).
+    pub utilization: f32,
+    /// Current temperature (Celsius).
+    pub temperature: Option<f32>,
+}
+
+/// Node status.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
+pub enum NodeStatus {
+    /// Online and accepting jobs.
+    Online,
+    /// Online but not accepting new jobs.
+    Draining,
+    /// Offline.
+    Offline,
+    /// Maintenance mode.
+    Maintenance,
+}
+
+/// Compute cluster manager.
+pub struct ComputeCluster {
+    /// Registered nodes.
+    nodes: RwLock<HashMap<NodeId, ComputeNode>>,
+    /// Device registry.
+    device_registry: Arc<DeviceRegistry>,
+    /// Heterogeneous scheduler.
+    scheduler: Arc<HeterogeneousScheduler>,
+    /// Spot market.
+    spot_market: Arc<SpotMarket>,
+    /// Memory manager.
+    memory_manager: Arc<MemoryManager>,
+    /// Active jobs.
+    jobs: RwLock<HashMap<JobId, ComputeJob>>,
+}
+
+impl ComputeCluster {
+    /// Creates a new compute cluster.
+    pub fn new() -> Self {
+        let device_registry = Arc::new(DeviceRegistry::new());
+        let scheduler = Arc::new(HeterogeneousScheduler::new(device_registry.clone()));
+        let spot_market = Arc::new(SpotMarket::new());
+        let memory_manager = Arc::new(MemoryManager::new());
+
+        Self {
+            nodes: RwLock::new(HashMap::new()),
+            device_registry,
+            scheduler,
+            spot_market,
+            memory_manager,
+            jobs: RwLock::new(HashMap::new()),
+        }
+    }
+
+    /// Registers a compute node.
+    pub fn register_node(&self, node: ComputeNode) -> Result<(), ComputeError> {
+        let id = node.id;
+
+        // Register processors with device registry
+        for proc in &node.processors {
+            self.device_registry.register_processor(id, proc.clone())?;
+        }
+
+        self.nodes.write().insert(id, node);
+        Ok(())
+    }
+
+    /// Unregisters a compute node.
+    pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
+        self.device_registry.unregister_node(node_id)?;
+        self.nodes.write().remove(&node_id);
+        Ok(())
+    }
+
+    /// Submits a job for execution.
+    pub async fn submit_job(&self, job: ComputeJob) -> Result<JobId, ComputeError> {
+        let job_id = job.id;
+
+        // Decompose job into tasks
+        let tasks = self.decompose_job(&job)?;
+
+        // Schedule tasks
+        let schedule = self.scheduler.schedule(tasks).await?;
+
+        // Store job
+        self.jobs.write().insert(job_id, job);
+
+        // Execute schedule (async)
+        tokio::spawn({
+            let scheduler = self.scheduler.clone();
+            async move {
+                let _ = scheduler.execute(&schedule.schedule).await;
+            }
+        });
+
+        Ok(job_id)
+    }
+
+    /// Gets job status.
+    pub fn get_job_status(&self, job_id: &JobId) -> Option<JobStatus> {
+        self.jobs.read().get(job_id).map(|_| JobStatus::Queued)
+    }
+
+    /// Cancels a job.
+    pub fn cancel_job(&self, job_id: &JobId) -> Result<(), ComputeError> {
+        if self.jobs.write().remove(job_id).is_some() {
+            Ok(())
+        } else {
+            Err(ComputeError::JobNotFound(*job_id))
+        }
+    }
+
+    /// Gets cluster statistics.
+    pub fn stats(&self) -> ClusterStats {
+        let nodes = self.nodes.read();
+        let jobs = self.jobs.read();
+
+        let total_nodes = nodes.len();
+        let online_nodes = nodes.values().filter(|n| n.status == NodeStatus::Online).count();
+
+        let total_gpus: usize = nodes
+            .values()
+            .flat_map(|n| &n.processors)
+            .filter(|p| matches!(p.processor_type, ProcessorType::Gpu(_)))
+            .count();
+
+        let total_memory: f32 = nodes.values().map(|n| n.total_memory_gb).sum();
+
+        ClusterStats {
+            total_nodes,
+            online_nodes,
+            total_gpus,
+            total_memory_gb: total_memory,
+            active_jobs: jobs.len(),
+            queued_jobs: jobs.values().filter(|_| true).count(), // Simplified
+        }
+    }
+
+    /// Decomposes a job into schedulable tasks.
+    fn decompose_job(&self, job: &ComputeJob) -> Result<Vec<ComputeTask>, ComputeError> {
+        let decomposer = TaskDecomposer::new();
+        decomposer.decompose(job)
+    }
+}
+
+impl Default for ComputeCluster {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Cluster statistics.
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
+pub struct ClusterStats {
+    /// Total registered nodes.
+    pub total_nodes: usize,
+    /// Online nodes.
+    pub online_nodes: usize,
+    /// Total GPUs across cluster.
+    pub total_gpus: usize,
+    /// Total memory (GB).
+    pub total_memory_gb: f32,
+    /// Active jobs.
+    pub active_jobs: usize,
+    /// Queued jobs.
+    pub queued_jobs: usize,
+}
+
+/// Pricing calculator for compute operations.
+#[derive(Clone, Debug)]
+pub struct ComputePricing {
+    /// GPU cost per hour by type.
+    pub gpu_hourly: HashMap<GpuTier, u64>,
+    /// CPU cost per core-hour.
+    pub cpu_core_hour: u64,
+    /// Memory cost per GB-hour.
+    pub memory_gb_hour: u64,
+    /// Network egress per GB.
+    pub network_egress_gb: u64,
+    /// Inference per million tokens.
+ pub inference_per_million_tokens: u64, +} + +/// GPU pricing tiers. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum GpuTier { + /// Consumer GPUs (RTX 30xx, 40xx). + Consumer, + /// Professional GPUs (RTX A series). + Professional, + /// Data center GPUs (A100). + DataCenter, + /// Latest generation (H100). + Premium, +} + +impl Default for ComputePricing { + fn default() -> Self { + let mut gpu_hourly = HashMap::new(); + gpu_hourly.insert(GpuTier::Consumer, 100_000_000); // 0.10 SYNOR + gpu_hourly.insert(GpuTier::Professional, 300_000_000); // 0.30 SYNOR + gpu_hourly.insert(GpuTier::DataCenter, 2_000_000_000); // 2.00 SYNOR + gpu_hourly.insert(GpuTier::Premium, 4_000_000_000); // 4.00 SYNOR + + Self { + gpu_hourly, + cpu_core_hour: 20_000_000, // 0.02 SYNOR + memory_gb_hour: 5_000_000, // 0.005 SYNOR + network_egress_gb: 50_000_000, // 0.05 SYNOR + inference_per_million_tokens: 100_000_000, // 0.10 SYNOR + } + } +} + +impl ComputePricing { + /// Estimates cost for a job. 
+ pub fn estimate(&self, job: &ComputeJob, duration_hours: f32) -> u64 { + let mut cost = 0u64; + + // CPU cost + cost += (self.cpu_core_hour as f32 * job.resources.min_cpu_cores * duration_hours) as u64; + + // Memory cost + cost += (self.memory_gb_hour as f32 * job.resources.min_memory_gb * duration_hours) as u64; + + // GPU cost + if let Some(gpu) = &job.resources.gpu { + let tier = GpuTier::Consumer; // Simplified + let gpu_cost = self.gpu_hourly.get(&tier).unwrap_or(&100_000_000); + cost += (*gpu_cost as f32 * gpu.min_count as f32 * duration_hours) as u64; + } + + cost + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_job_id() { + let id1 = JobId::new(); + let id2 = JobId::new(); + assert_ne!(id1.0, id2.0); + } + + #[test] + fn test_compute_cluster() { + let cluster = ComputeCluster::new(); + let stats = cluster.stats(); + assert_eq!(stats.total_nodes, 0); + } + + #[test] + fn test_pricing() { + let pricing = ComputePricing::default(); + + let job = ComputeJob { + id: JobId::new(), + owner: [0u8; 32], + job_type: JobType::Inference { + model_cid: "model123".to_string(), + input_format: "json".to_string(), + batch_size: 32, + }, + resources: ResourceRequirements { + min_cpu_cores: 4.0, + min_memory_gb: 16.0, + gpu: Some(GpuRequirements { + min_count: 1, + max_count: 1, + min_vram_gb: 16, + min_compute_capability: None, + allow_sharing: false, + }), + ..Default::default() + }, + input_cid: None, + max_budget: 1_000_000_000, + priority: JobPriority::Normal, + created_at: 0, + deadline: None, + }; + + let cost = pricing.estimate(&job, 1.0); + assert!(cost > 0); + } + + #[test] + fn test_node_registration() { + let cluster = ComputeCluster::new(); + + let node = ComputeNode { + id: NodeId(1), + owner: [1u8; 32], + processors: vec![ProcessorInfo { + id: ProcessorId(0), + processor_type: ProcessorType::Cpu(CpuVariant::X86_64 { + avx: processor::AvxSupport::Avx512, + }), + capabilities: ProcessorCapabilities::default(), + utilization: 0.0, + 
temperature: Some(45.0), + }], + total_memory_gb: 64.0, + available_memory_gb: 60.0, + bandwidth_gbps: 10.0, + region: "us-east".to_string(), + stake: 1000, + reputation: 100, + status: NodeStatus::Online, + }; + + cluster.register_node(node).unwrap(); + assert_eq!(cluster.stats().total_nodes, 1); + } +} diff --git a/crates/synor-compute/src/market/mod.rs b/crates/synor-compute/src/market/mod.rs new file mode 100644 index 0000000..d7300b2 --- /dev/null +++ b/crates/synor-compute/src/market/mod.rs @@ -0,0 +1,1151 @@ +//! Spot market and pricing engine for compute resources. +//! +//! Implements real-time pricing based on supply/demand, geographic arbitrage, +//! and auction-based compute allocation for 90% cost reduction vs cloud providers. + +use crate::error::ComputeError; +use crate::processor::ProcessorType; +use crate::{NodeId, ProcessorId}; +use parking_lot::RwLock; +use serde::{Deserialize, Serialize}; +use std::collections::{BinaryHeap, HashMap}; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + +/// Unique order identifier. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct OrderId(pub u64); + +impl OrderId { + /// Creates a new order ID. + pub fn new() -> Self { + use rand::Rng; + OrderId(rand::thread_rng().gen()) + } +} + +impl Default for OrderId { + fn default() -> Self { + Self::new() + } +} + +/// Order side (buy or sell). +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum OrderSide { + /// Buying compute resources. + Buy, + /// Selling compute resources. + Sell, +} + +/// Order type. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum OrderType { + /// Market order - execute at best available price. + Market, + /// Limit order - execute only at specified price or better. + Limit, + /// Fill or kill - execute entirely or cancel. + FillOrKill, + /// Immediate or cancel - execute as much as possible immediately. 
+    ImmediateOrCancel,
+}
+
+/// Resource type being traded.
+#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum ResourceType {
+    /// GPU compute hours.
+    GpuHours(GpuTier),
+    /// CPU compute hours.
+    CpuHours(CpuTier),
+    /// TPU compute hours.
+    TpuHours,
+    /// NPU compute hours.
+    NpuHours,
+    /// LPU inference credits.
+    LpuCredits,
+    /// Memory GB-hours.
+    MemoryGbHours,
+    /// Network bandwidth GB.
+    NetworkGb,
+    /// Storage GB-hours.
+    StorageGbHours,
+}
+
+/// GPU performance tier.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum GpuTier {
+    /// Entry level (RTX 3060, etc.).
+    Entry,
+    /// Mid-range (RTX 4080, A4000, etc.).
+    Mid,
+    /// High-end (RTX 4090, A6000, etc.).
+    High,
+    /// Data center (A100, H100, etc.).
+    DataCenter,
+    /// Ultra (H100 SXM, B200, etc.).
+    Ultra,
+}
+
+/// CPU performance tier.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum CpuTier {
+    /// Mobile/ARM.
+    Mobile,
+    /// Desktop.
+    Desktop,
+    /// Workstation.
+    Workstation,
+    /// Server.
+    Server,
+}
+
+/// A market order for compute resources.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct Order {
+    /// Unique order ID.
+    pub id: OrderId,
+    /// Node placing the order.
+    pub node_id: NodeId,
+    /// Buy or sell.
+    pub side: OrderSide,
+    /// Order type.
+    pub order_type: OrderType,
+    /// Resource being traded.
+    pub resource: ResourceType,
+    /// Quantity in resource units.
+    pub quantity: f64,
+    /// Price per unit in Synor credits.
+    pub price: f64,
+    /// Remaining unfilled quantity.
+    pub remaining: f64,
+    /// Timestamp.
+    pub timestamp: u64,
+    /// Expiration (None = good till cancelled).
+    pub expires_at: Option<u64>,
+    /// Geographic region preference.
+    pub region: Option<String>,
+}
+
+impl Order {
+    /// Creates a new order.
+ pub fn new( + node_id: NodeId, + side: OrderSide, + order_type: OrderType, + resource: ResourceType, + quantity: f64, + price: f64, + ) -> Self { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(); + + Self { + id: OrderId::new(), + node_id, + side, + order_type, + resource, + quantity, + price, + remaining: quantity, + timestamp: now, + expires_at: None, + region: None, + } + } + + /// Sets expiration time. + pub fn with_expiration(mut self, expires_at: u64) -> Self { + self.expires_at = Some(expires_at); + self + } + + /// Sets region preference. + pub fn with_region(mut self, region: String) -> Self { + self.region = Some(region); + self + } + + /// Checks if order is expired. + pub fn is_expired(&self) -> bool { + if let Some(expires) = self.expires_at { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(); + now >= expires + } else { + false + } + } + + /// Checks if order is fully filled. + pub fn is_filled(&self) -> bool { + self.remaining <= 0.0 + } +} + +/// Order wrapper for priority queue (bid side - max heap). +#[derive(Clone, Debug)] +struct BidOrder(Arc>); + +impl PartialEq for BidOrder { + fn eq(&self, other: &Self) -> bool { + let a = self.0.read(); + let b = other.0.read(); + a.price == b.price && a.timestamp == b.timestamp + } +} + +impl Eq for BidOrder {} + +impl PartialOrd for BidOrder { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for BidOrder { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + let a = self.0.read(); + let b = other.0.read(); + // Higher price first, then earlier timestamp + match a.price.partial_cmp(&b.price) { + Some(std::cmp::Ordering::Equal) => b.timestamp.cmp(&a.timestamp), + Some(ord) => ord, + None => std::cmp::Ordering::Equal, + } + } +} + +/// Order wrapper for priority queue (ask side - min heap). 
#[derive(Clone, Debug)]
struct AskOrder(Arc<RwLock<Order>>);

impl PartialEq for AskOrder {
    fn eq(&self, other: &Self) -> bool {
        let (lhs, rhs) = (self.0.read(), other.0.read());
        lhs.price == rhs.price && lhs.timestamp == rhs.timestamp
    }
}

impl Eq for AskOrder {}

impl PartialOrd for AskOrder {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for AskOrder {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        use std::cmp::Ordering;
        let (lhs, rhs) = (self.0.read(), other.0.read());
        // Min-heap via reversed price comparison: the *lowest* ask surfaces
        // first; price ties go to the earlier timestamp.
        match rhs.price.partial_cmp(&lhs.price) {
            Some(Ordering::Equal) => rhs.timestamp.cmp(&lhs.timestamp),
            Some(order) => order,
            // NaN prices are treated as equal rather than panicking.
            None => Ordering::Equal,
        }
    }
}

/// Order book for a single resource type, matched with price-time priority.
pub struct OrderBook {
    /// Resource type this book handles.
    resource: ResourceType,
    /// Resting buy orders (max-heap: best bid on top).
    bids: RwLock<BinaryHeap<BidOrder>>,
    /// Resting sell orders (min-heap: best ask on top).
    asks: RwLock<BinaryHeap<AskOrder>>,
    /// Lookup of every resting order by id.
    orders: RwLock<HashMap<OrderId, Arc<RwLock<Order>>>>,
    /// Price of the most recent trade, if any.
    last_price: RwLock<Option<f64>>,
    /// Chronological record of executed trades.
    trades: RwLock<Vec<Trade>>,
}

/// A completed trade.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Trade {
    /// Buyer order ID.
    pub buyer_order: OrderId,
    /// Seller order ID.
    pub seller_order: OrderId,
    /// Buyer node.
    pub buyer: NodeId,
    /// Seller node.
    pub seller: NodeId,
    /// Trade price.
    pub price: f64,
    /// Trade quantity.
    pub quantity: f64,
    /// Timestamp.
    pub timestamp: u64,
}

impl OrderBook {
    /// Creates an empty book for the given resource type.
    pub fn new(resource: ResourceType) -> Self {
        Self {
            resource,
            bids: RwLock::new(BinaryHeap::new()),
            asks: RwLock::new(BinaryHeap::new()),
            orders: RwLock::new(HashMap::new()),
            last_price: RwLock::new(None),
            trades: RwLock::new(Vec::new()),
        }
    }

    /// Submits an order and attempts to match.
+ pub fn submit(&self, order: Order) -> Result, ComputeError> { + let order = Arc::new(RwLock::new(order)); + let mut trades = Vec::new(); + + // Try to match order + match order.read().side { + OrderSide::Buy => { + trades = self.match_buy(&order); + } + OrderSide::Sell => { + trades = self.match_sell(&order); + } + } + + // Add remaining to book if not filled + let (is_filled, is_ioc, id, side) = { + let guard = order.read(); + ( + guard.is_filled(), + matches!( + guard.order_type, + OrderType::ImmediateOrCancel | OrderType::FillOrKill + ), + guard.id, + guard.side, + ) + }; + + if !is_filled && !is_ioc { + self.orders.write().insert(id, order.clone()); + + match side { + OrderSide::Buy => { + self.bids.write().push(BidOrder(order)); + } + OrderSide::Sell => { + self.asks.write().push(AskOrder(order)); + } + } + } + + // Record trades + if !trades.is_empty() { + self.trades.write().extend(trades.clone()); + *self.last_price.write() = Some(trades.last().unwrap().price); + } + + Ok(trades) + } + + /// Matches a buy order against asks. 
+ fn match_buy(&self, buy: &Arc>) -> Vec { + let mut trades = Vec::new(); + let mut asks = self.asks.write(); + + while !buy.read().is_filled() { + // Get best ask + let best_ask = match asks.peek() { + Some(ask) => ask.clone(), + None => break, + }; + + let ask_order = best_ask.0.read(); + + // Check price compatibility + if buy.read().price < ask_order.price + && !matches!(buy.read().order_type, OrderType::Market) + { + break; + } + + // Remove from heap for modification + drop(ask_order); + let ask = asks.pop().unwrap().0; + + // Calculate trade + let mut buy_guard = buy.write(); + let mut ask_guard = ask.write(); + + let trade_qty = buy_guard.remaining.min(ask_guard.remaining); + let trade_price = ask_guard.price; // Use ask price + + buy_guard.remaining -= trade_qty; + ask_guard.remaining -= trade_qty; + + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(); + + trades.push(Trade { + buyer_order: buy_guard.id, + seller_order: ask_guard.id, + buyer: buy_guard.node_id, + seller: ask_guard.node_id, + price: trade_price, + quantity: trade_qty, + timestamp: now, + }); + + // Put ask back if not filled + drop(buy_guard); + drop(ask_guard); + + if !ask.read().is_filled() { + asks.push(AskOrder(ask)); + } + } + + trades + } + + /// Matches a sell order against bids. 
+ fn match_sell(&self, sell: &Arc>) -> Vec { + let mut trades = Vec::new(); + let mut bids = self.bids.write(); + + while !sell.read().is_filled() { + // Get best bid + let best_bid = match bids.peek() { + Some(bid) => bid.clone(), + None => break, + }; + + let bid_order = best_bid.0.read(); + + // Check price compatibility + if sell.read().price > bid_order.price + && !matches!(sell.read().order_type, OrderType::Market) + { + break; + } + + // Remove from heap for modification + drop(bid_order); + let bid = bids.pop().unwrap().0; + + // Calculate trade + let mut sell_guard = sell.write(); + let mut bid_guard = bid.write(); + + let trade_qty = sell_guard.remaining.min(bid_guard.remaining); + let trade_price = bid_guard.price; // Use bid price + + sell_guard.remaining -= trade_qty; + bid_guard.remaining -= trade_qty; + + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(); + + trades.push(Trade { + buyer_order: bid_guard.id, + seller_order: sell_guard.id, + buyer: bid_guard.node_id, + seller: sell_guard.node_id, + price: trade_price, + quantity: trade_qty, + timestamp: now, + }); + + // Put bid back if not filled + drop(sell_guard); + drop(bid_guard); + + if !bid.read().is_filled() { + bids.push(BidOrder(bid)); + } + } + + trades + } + + /// Gets the best bid price. + pub fn best_bid(&self) -> Option { + self.bids.read().peek().map(|b| b.0.read().price) + } + + /// Gets the best ask price. + pub fn best_ask(&self) -> Option { + self.asks.read().peek().map(|a| a.0.read().price) + } + + /// Gets the bid-ask spread. + pub fn spread(&self) -> Option { + match (self.best_bid(), self.best_ask()) { + (Some(bid), Some(ask)) => Some(ask - bid), + _ => None, + } + } + + /// Gets the mid price. + pub fn mid_price(&self) -> Option { + match (self.best_bid(), self.best_ask()) { + (Some(bid), Some(ask)) => Some((bid + ask) / 2.0), + _ => self.last_price.read().clone(), + } + } + + /// Gets last traded price. 
+ pub fn last_price(&self) -> Option { + *self.last_price.read() + } +} + +/// Regional electricity pricing for geographic arbitrage. +#[derive(Clone, Debug)] +pub struct RegionalPricing { + /// Region identifier. + pub region: String, + /// Electricity cost in USD per kWh. + pub electricity_cost: f64, + /// Carbon intensity (gCO2/kWh). + pub carbon_intensity: f64, + /// Current grid load factor (0.0-1.0). + pub grid_load: f64, + /// Renewable energy percentage. + pub renewable_pct: f64, +} + +impl RegionalPricing { + /// Calculates effective compute cost multiplier. + pub fn cost_multiplier(&self) -> f64 { + // Base on electricity cost, with grid load adjustment + let base = self.electricity_cost / 0.10; // Normalized to $0.10/kWh baseline + let load_adj = 1.0 + (self.grid_load - 0.5) * 0.2; // ±10% based on load + base * load_adj + } +} + +/// Default regional pricing data. +pub fn default_regional_pricing() -> Vec { + vec![ + RegionalPricing { + region: "us-west".to_string(), + electricity_cost: 0.12, + carbon_intensity: 200.0, + grid_load: 0.6, + renewable_pct: 0.35, + }, + RegionalPricing { + region: "us-east".to_string(), + electricity_cost: 0.11, + carbon_intensity: 350.0, + grid_load: 0.7, + renewable_pct: 0.15, + }, + RegionalPricing { + region: "eu-west".to_string(), + electricity_cost: 0.25, + carbon_intensity: 150.0, + grid_load: 0.5, + renewable_pct: 0.45, + }, + RegionalPricing { + region: "eu-north".to_string(), + electricity_cost: 0.08, + carbon_intensity: 50.0, + grid_load: 0.4, + renewable_pct: 0.90, + }, + RegionalPricing { + region: "asia-east".to_string(), + electricity_cost: 0.10, + carbon_intensity: 500.0, + grid_load: 0.8, + renewable_pct: 0.20, + }, + RegionalPricing { + region: "asia-south".to_string(), + electricity_cost: 0.07, + carbon_intensity: 600.0, + grid_load: 0.6, + renewable_pct: 0.10, + }, + ] +} + +/// Pricing engine for compute resources. +pub struct PricingEngine { + /// Base prices per resource type. 
+ base_prices: HashMap, + /// Regional pricing data. + regions: Vec, + /// Supply/demand factors. + supply_demand: RwLock>, + /// Time-of-day factors. + time_factors: Vec, // 24 hourly factors +} + +impl PricingEngine { + /// Creates a new pricing engine. + pub fn new() -> Self { + let mut base_prices = HashMap::new(); + + // Base prices in Synor credits per unit + // Designed to be ~90% cheaper than AWS/GCP/Azure + base_prices.insert(ResourceType::GpuHours(GpuTier::Entry), 0.05); + base_prices.insert(ResourceType::GpuHours(GpuTier::Mid), 0.15); + base_prices.insert(ResourceType::GpuHours(GpuTier::High), 0.30); + base_prices.insert(ResourceType::GpuHours(GpuTier::DataCenter), 0.80); + base_prices.insert(ResourceType::GpuHours(GpuTier::Ultra), 1.50); + base_prices.insert(ResourceType::CpuHours(CpuTier::Mobile), 0.001); + base_prices.insert(ResourceType::CpuHours(CpuTier::Desktop), 0.005); + base_prices.insert(ResourceType::CpuHours(CpuTier::Workstation), 0.015); + base_prices.insert(ResourceType::CpuHours(CpuTier::Server), 0.03); + base_prices.insert(ResourceType::TpuHours, 1.00); + base_prices.insert(ResourceType::NpuHours, 0.10); + base_prices.insert(ResourceType::LpuCredits, 0.50); + base_prices.insert(ResourceType::MemoryGbHours, 0.001); + base_prices.insert(ResourceType::NetworkGb, 0.01); + base_prices.insert(ResourceType::StorageGbHours, 0.0001); + + // Time-of-day factors (0 = midnight UTC) + // Lower prices during off-peak hours + let time_factors = vec![ + 0.7, 0.6, 0.5, 0.5, 0.5, 0.6, // 00:00 - 05:00 (off-peak) + 0.8, 0.9, 1.0, 1.0, 1.0, 1.0, // 06:00 - 11:00 (ramp up) + 1.0, 1.1, 1.2, 1.2, 1.1, 1.0, // 12:00 - 17:00 (peak) + 0.9, 0.9, 0.8, 0.8, 0.7, 0.7, // 18:00 - 23:00 (wind down) + ]; + + Self { + base_prices, + regions: default_regional_pricing(), + supply_demand: RwLock::new(HashMap::new()), + time_factors, + } + } + + /// Gets the current spot price for a resource. 
+ pub fn spot_price(&self, resource: &ResourceType, region: Option<&str>) -> f64 { + let base = self.base_prices.get(resource).copied().unwrap_or(0.1); + + // Apply supply/demand factor + let sd_factor = self + .supply_demand + .read() + .get(resource) + .copied() + .unwrap_or(1.0); + + // Apply time-of-day factor + let hour = chrono::Utc::now().hour() as usize; + let time_factor = self.time_factors.get(hour).copied().unwrap_or(1.0); + + // Apply regional factor + let region_factor = region + .and_then(|r| self.regions.iter().find(|p| p.region == r)) + .map(|p| p.cost_multiplier()) + .unwrap_or(1.0); + + base * sd_factor * time_factor * region_factor + } + + /// Updates supply/demand factor for a resource. + pub fn update_supply_demand(&self, resource: ResourceType, supply: f64, demand: f64) { + // Factor increases when demand > supply + let factor = if supply > 0.0 { + (demand / supply).sqrt().clamp(0.5, 2.0) + } else if demand > 0.0 { + 2.0 // Max factor when no supply + } else { + 1.0 + }; + + self.supply_demand.write().insert(resource, factor); + } + + /// Gets cheapest region for a resource. + pub fn cheapest_region(&self, resource: &ResourceType) -> &str { + self.regions + .iter() + .min_by(|a, b| { + let cost_a = self.spot_price(resource, Some(&a.region)); + let cost_b = self.spot_price(resource, Some(&b.region)); + cost_a.partial_cmp(&cost_b).unwrap() + }) + .map(|r| r.region.as_str()) + .unwrap_or("us-west") + } + + /// Gets greenest region for a resource. + pub fn greenest_region(&self) -> &str { + self.regions + .iter() + .max_by(|a, b| { + a.renewable_pct + .partial_cmp(&b.renewable_pct) + .unwrap() + }) + .map(|r| r.region.as_str()) + .unwrap_or("eu-north") + } + + /// Compares price to cloud providers. 
+ pub fn compare_to_cloud(&self, resource: &ResourceType, region: Option<&str>) -> CloudComparison { + let our_price = self.spot_price(resource, region); + + // Approximate cloud provider prices (USD/hour for GPU) + let (aws_price, gcp_price, azure_price) = match resource { + ResourceType::GpuHours(GpuTier::DataCenter) => (3.06, 2.95, 3.10), // A100 equivalents + ResourceType::GpuHours(GpuTier::Ultra) => (5.00, 4.50, 5.20), // H100 equivalents + ResourceType::GpuHours(GpuTier::High) => (1.50, 1.40, 1.60), // T4/A10 equivalents + ResourceType::CpuHours(CpuTier::Server) => (0.40, 0.35, 0.42), + _ => (1.0, 1.0, 1.0), + }; + + CloudComparison { + synor_price: our_price, + aws_price, + gcp_price, + azure_price, + aws_savings: ((aws_price - our_price) / aws_price * 100.0).max(0.0), + gcp_savings: ((gcp_price - our_price) / gcp_price * 100.0).max(0.0), + azure_savings: ((azure_price - our_price) / azure_price * 100.0).max(0.0), + } + } +} + +impl Default for PricingEngine { + fn default() -> Self { + Self::new() + } +} + +/// Comparison with cloud provider prices. +#[derive(Clone, Debug)] +pub struct CloudComparison { + /// Our spot price. + pub synor_price: f64, + /// AWS price. + pub aws_price: f64, + /// GCP price. + pub gcp_price: f64, + /// Azure price. + pub azure_price: f64, + /// Savings vs AWS (percentage). + pub aws_savings: f64, + /// Savings vs GCP (percentage). + pub gcp_savings: f64, + /// Savings vs Azure (percentage). + pub azure_savings: f64, +} + +/// Spot market for compute resources. +pub struct SpotMarket { + /// Order books per resource type. + order_books: HashMap, + /// Pricing engine. + pricing: PricingEngine, + /// Provider registry (node -> resources offered). + providers: RwLock>>, + /// Active auctions. + auctions: RwLock>, +} + +/// Unique auction identifier. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct AuctionId(pub u64); + +impl AuctionId { + /// Creates a new auction ID. 
+ pub fn new() -> Self { + use rand::Rng; + AuctionId(rand::thread_rng().gen()) + } +} + +impl Default for AuctionId { + fn default() -> Self { + Self::new() + } +} + +/// A provider's resource listing. +#[derive(Clone, Debug)] +pub struct ProviderListing { + /// Resource type offered. + pub resource: ResourceType, + /// Available quantity. + pub quantity: f64, + /// Minimum price accepted. + pub min_price: f64, + /// Region. + pub region: String, + /// Processor types available. + pub processors: Vec, +} + +/// An auction for compute resources. +#[derive(Clone, Debug)] +pub struct Auction { + /// Auction ID. + pub id: AuctionId, + /// Resource being auctioned. + pub resource: ResourceType, + /// Quantity needed. + pub quantity: f64, + /// Maximum price buyer will pay. + pub max_price: f64, + /// Current winning bid. + pub winning_bid: Option<(NodeId, f64)>, + /// All bids. + pub bids: Vec<(NodeId, f64)>, + /// Start time. + pub started: Instant, + /// Duration. + pub duration: Duration, + /// Whether auction is closed. + pub closed: bool, +} + +impl Auction { + /// Creates a new auction. + pub fn new(resource: ResourceType, quantity: f64, max_price: f64, duration: Duration) -> Self { + Self { + id: AuctionId::new(), + resource, + quantity, + max_price, + winning_bid: None, + bids: Vec::new(), + started: Instant::now(), + duration, + closed: false, + } + } + + /// Submits a bid. + pub fn bid(&mut self, node: NodeId, price: f64) -> bool { + if self.closed || price > self.max_price { + return false; + } + + self.bids.push((node, price)); + + // Update winning bid (lowest price wins) + if self.winning_bid.is_none() || price < self.winning_bid.unwrap().1 { + self.winning_bid = Some((node, price)); + } + + true + } + + /// Checks if auction has ended. + pub fn is_ended(&self) -> bool { + self.closed || self.started.elapsed() >= self.duration + } + + /// Closes the auction. 
+ pub fn close(&mut self) -> Option<(NodeId, f64)> { + self.closed = true; + self.winning_bid + } +} + +impl SpotMarket { + /// Creates a new spot market. + pub fn new() -> Self { + let mut order_books = HashMap::new(); + + // Create order books for common resources + for tier in [ + GpuTier::Entry, + GpuTier::Mid, + GpuTier::High, + GpuTier::DataCenter, + GpuTier::Ultra, + ] { + order_books.insert( + ResourceType::GpuHours(tier), + OrderBook::new(ResourceType::GpuHours(tier)), + ); + } + + for tier in [ + CpuTier::Mobile, + CpuTier::Desktop, + CpuTier::Workstation, + CpuTier::Server, + ] { + order_books.insert( + ResourceType::CpuHours(tier), + OrderBook::new(ResourceType::CpuHours(tier)), + ); + } + + order_books.insert(ResourceType::TpuHours, OrderBook::new(ResourceType::TpuHours)); + order_books.insert(ResourceType::NpuHours, OrderBook::new(ResourceType::NpuHours)); + order_books.insert(ResourceType::LpuCredits, OrderBook::new(ResourceType::LpuCredits)); + + Self { + order_books, + pricing: PricingEngine::new(), + providers: RwLock::new(HashMap::new()), + auctions: RwLock::new(HashMap::new()), + } + } + + /// Registers a compute provider. + pub fn register_provider(&self, node_id: NodeId, listings: Vec) { + self.providers.write().insert(node_id, listings); + } + + /// Submits an order. + pub fn submit_order(&self, order: Order) -> Result, ComputeError> { + let book = self.order_books.get(&order.resource).ok_or_else(|| { + ComputeError::Internal(format!("No order book for resource: {:?}", order.resource)) + })?; + + book.submit(order) + } + + /// Gets spot price for a resource. + pub fn spot_price(&self, resource: &ResourceType, region: Option<&str>) -> f64 { + // Check if there's a market price + if let Some(book) = self.order_books.get(resource) { + if let Some(mid) = book.mid_price() { + return mid; + } + } + + // Fall back to pricing engine + self.pricing.spot_price(resource, region) + } + + /// Starts an auction for compute resources. 
+ pub fn start_auction( + &self, + resource: ResourceType, + quantity: f64, + max_price: f64, + duration: Duration, + ) -> AuctionId { + let auction = Auction::new(resource, quantity, max_price, duration); + let id = auction.id; + self.auctions.write().insert(id, auction); + id + } + + /// Submits a bid to an auction. + pub fn bid_auction(&self, auction_id: AuctionId, node: NodeId, price: f64) -> bool { + if let Some(auction) = self.auctions.write().get_mut(&auction_id) { + auction.bid(node, price) + } else { + false + } + } + + /// Closes an auction and returns the winner. + pub fn close_auction(&self, auction_id: AuctionId) -> Option<(NodeId, f64)> { + if let Some(auction) = self.auctions.write().get_mut(&auction_id) { + auction.close() + } else { + None + } + } + + /// Gets pricing comparison with cloud providers. + pub fn compare_to_cloud(&self, resource: &ResourceType) -> CloudComparison { + self.pricing.compare_to_cloud(resource, None) + } + + /// Gets the cheapest region for a resource. + pub fn cheapest_region(&self, resource: &ResourceType) -> &str { + self.pricing.cheapest_region(resource) + } + + /// Gets all provider listings for a resource. + pub fn find_providers(&self, resource: &ResourceType) -> Vec<(NodeId, ProviderListing)> { + self.providers + .read() + .iter() + .flat_map(|(node, listings)| { + listings + .iter() + .filter(|l| &l.resource == resource) + .map(|l| (*node, l.clone())) + }) + .collect() + } + + /// Gets market stats for a resource. + pub fn market_stats(&self, resource: &ResourceType) -> Option { + let book = self.order_books.get(resource)?; + + Some(MarketStats { + best_bid: book.best_bid(), + best_ask: book.best_ask(), + spread: book.spread(), + mid_price: book.mid_price(), + last_price: book.last_price(), + }) + } +} + +impl Default for SpotMarket { + fn default() -> Self { + Self::new() + } +} + +/// Market statistics for a resource. +#[derive(Clone, Debug)] +pub struct MarketStats { + /// Best bid price. 
+ pub best_bid: Option, + /// Best ask price. + pub best_ask: Option, + /// Bid-ask spread. + pub spread: Option, + /// Mid price. + pub mid_price: Option, + /// Last traded price. + pub last_price: Option, +} + +use chrono::Timelike; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_order_creation() { + let order = Order::new( + NodeId(1), + OrderSide::Buy, + OrderType::Limit, + ResourceType::GpuHours(GpuTier::DataCenter), + 10.0, + 0.80, + ); + + assert_eq!(order.remaining, 10.0); + assert!(!order.is_filled()); + } + + #[test] + fn test_order_book_matching() { + let book = OrderBook::new(ResourceType::GpuHours(GpuTier::High)); + + // Add a sell order + let sell = Order::new( + NodeId(1), + OrderSide::Sell, + OrderType::Limit, + ResourceType::GpuHours(GpuTier::High), + 5.0, + 0.25, + ); + book.submit(sell).unwrap(); + + // Add a matching buy order + let buy = Order::new( + NodeId(2), + OrderSide::Buy, + OrderType::Limit, + ResourceType::GpuHours(GpuTier::High), + 3.0, + 0.30, + ); + let trades = book.submit(buy).unwrap(); + + assert_eq!(trades.len(), 1); + assert_eq!(trades[0].quantity, 3.0); + assert_eq!(trades[0].price, 0.25); // Uses ask price + } + + #[test] + fn test_pricing_engine() { + let engine = PricingEngine::new(); + + let price = engine.spot_price(&ResourceType::GpuHours(GpuTier::DataCenter), Some("eu-north")); + assert!(price > 0.0); + + // eu-north should be cheaper (low electricity cost) + let eu_price = engine.spot_price(&ResourceType::GpuHours(GpuTier::DataCenter), Some("eu-north")); + let eu_west_price = engine.spot_price(&ResourceType::GpuHours(GpuTier::DataCenter), Some("eu-west")); + + // eu-north has cheaper electricity + assert!(eu_price < eu_west_price); + } + + #[test] + fn test_cloud_comparison() { + let engine = PricingEngine::new(); + + let comparison = engine.compare_to_cloud(&ResourceType::GpuHours(GpuTier::DataCenter), None); + + // Should show significant savings + assert!(comparison.aws_savings > 50.0); + 
assert!(comparison.gcp_savings > 50.0); + assert!(comparison.azure_savings > 50.0); + } + + #[test] + fn test_auction() { + let mut auction = Auction::new( + ResourceType::GpuHours(GpuTier::Ultra), + 100.0, + 2.0, + Duration::from_secs(60), + ); + + // Submit bids + assert!(auction.bid(NodeId(1), 1.8)); + assert!(auction.bid(NodeId(2), 1.5)); + assert!(auction.bid(NodeId(3), 1.7)); + + // Price too high + assert!(!auction.bid(NodeId(4), 2.5)); + + // Lowest bid wins + let winner = auction.close(); + assert_eq!(winner, Some((NodeId(2), 1.5))); + } + + #[test] + fn test_spot_market() { + let market = SpotMarket::new(); + + // Register a provider + market.register_provider( + NodeId(1), + vec![ProviderListing { + resource: ResourceType::GpuHours(GpuTier::High), + quantity: 100.0, + min_price: 0.20, + region: "us-west".to_string(), + processors: vec![], + }], + ); + + // Get providers + let providers = market.find_providers(&ResourceType::GpuHours(GpuTier::High)); + assert_eq!(providers.len(), 1); + + // Get spot price + let price = market.spot_price(&ResourceType::GpuHours(GpuTier::High), None); + assert!(price > 0.0); + } +} diff --git a/crates/synor-compute/src/memory/mod.rs b/crates/synor-compute/src/memory/mod.rs new file mode 100644 index 0000000..ef0dd6b --- /dev/null +++ b/crates/synor-compute/src/memory/mod.rs @@ -0,0 +1,370 @@ +//! Unified memory management for heterogeneous compute. + +use crate::error::ComputeError; +use crate::processor::ProcessorType; +use parking_lot::RwLock; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::sync::Arc; + +/// Tensor handle for memory management. +#[derive(Clone, Debug)] +pub struct TensorHandle { + /// Unique ID. + pub id: TensorId, + /// Shape. + pub shape: Vec, + /// Data type. + pub dtype: DataType, + /// Size in bytes. + pub size_bytes: u64, + /// Current locations. + pub locations: Vec, +} + +impl TensorHandle { + /// Creates a new tensor handle. 
+ pub fn new(shape: Vec, dtype: DataType) -> Self { + let size_bytes = shape.iter().product::() as u64 * dtype.size_bytes() as u64; + Self { + id: TensorId::new(), + shape, + dtype, + size_bytes, + locations: Vec::new(), + } + } + + /// Gets the number of elements. + pub fn numel(&self) -> usize { + self.shape.iter().product() + } +} + +/// Tensor identifier. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct TensorId(pub u64); + +impl TensorId { + /// Creates a new tensor ID. + pub fn new() -> Self { + use rand::Rng; + TensorId(rand::thread_rng().gen()) + } +} + +impl Default for TensorId { + fn default() -> Self { + Self::new() + } +} + +/// Data types for tensors. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum DataType { + Float64, + Float32, + Float16, + BFloat16, + Int64, + Int32, + Int16, + Int8, + UInt8, + Bool, +} + +impl DataType { + /// Returns size in bytes. + pub fn size_bytes(&self) -> usize { + match self { + DataType::Float64 | DataType::Int64 => 8, + DataType::Float32 | DataType::Int32 => 4, + DataType::Float16 | DataType::BFloat16 | DataType::Int16 => 2, + DataType::Int8 | DataType::UInt8 | DataType::Bool => 1, + } + } +} + +/// Data transfer path between processors. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum TransferPath { + /// Direct GPU-to-GPU via NVLink. + NvLink, + /// Direct GPU-to-GPU via PCIe P2P. + PciePeerToPeer, + /// Through CPU memory. + CpuMediated, + /// Unified memory (Apple Silicon). + UnifiedMemory, + /// Network transfer. + Network, + /// Same memory space (no transfer needed). + SameMemory, +} + +impl TransferPath { + /// Returns approximate bandwidth in GB/s. 
+ pub fn bandwidth_gbps(&self) -> f64 { + match self { + TransferPath::NvLink => 900.0, // NVLink 4.0 + TransferPath::PciePeerToPeer => 64.0, // PCIe 5.0 x16 + TransferPath::CpuMediated => 50.0, // DDR5 + TransferPath::UnifiedMemory => 400.0, // Apple unified + TransferPath::Network => 10.0, // 100Gbps network + TransferPath::SameMemory => f64::INFINITY, + } + } + + /// Estimates transfer time for given bytes. + pub fn estimate_transfer_time(&self, bytes: u64) -> std::time::Duration { + if matches!(self, TransferPath::SameMemory) { + return std::time::Duration::ZERO; + } + + let bytes_f64 = bytes as f64; + let bandwidth = self.bandwidth_gbps() * 1e9; // Convert to bytes/s + let seconds = bytes_f64 / bandwidth; + std::time::Duration::from_secs_f64(seconds) + } +} + +/// Unified memory manager. +pub struct MemoryManager { + /// Allocated tensors. + tensors: RwLock>, + /// Memory usage per processor type. + usage: RwLock>, + /// Memory limits per processor type. + limits: HashMap, +} + +impl MemoryManager { + /// Creates a new memory manager. + pub fn new() -> Self { + Self { + tensors: RwLock::new(HashMap::new()), + usage: RwLock::new(HashMap::new()), + limits: HashMap::new(), + } + } + + /// Sets memory limit for a processor type. + pub fn set_limit(&mut self, proc_type: ProcessorType, limit_bytes: u64) { + self.limits.insert(proc_type, limit_bytes); + } + + /// Allocates a tensor. + pub fn allocate(&self, shape: Vec, dtype: DataType) -> Result { + let handle = TensorHandle::new(shape, dtype); + self.tensors.write().insert(handle.id, handle.clone()); + Ok(handle) + } + + /// Frees a tensor. 
+ pub fn free(&self, tensor_id: TensorId) -> Result<(), ComputeError> { + if let Some(handle) = self.tensors.write().remove(&tensor_id) { + // Update usage for all locations + let mut usage = self.usage.write(); + for loc in &handle.locations { + if let Some(u) = usage.get_mut(loc) { + *u = u.saturating_sub(handle.size_bytes); + } + } + } + Ok(()) + } + + /// Gets a tensor handle. + pub fn get(&self, tensor_id: TensorId) -> Option { + self.tensors.read().get(&tensor_id).cloned() + } + + /// Ensures tensor is on specified processor. + pub fn ensure_on( + &self, + tensor_id: TensorId, + target: ProcessorType, + ) -> Result { + let mut tensors = self.tensors.write(); + + if let Some(handle) = tensors.get_mut(&tensor_id) { + // Check if already on target + if handle.locations.contains(&target) { + return Ok(TransferPath::SameMemory); + } + + // Determine transfer path + let path = if handle.locations.is_empty() { + // New tensor, allocate on target + TransferPath::SameMemory + } else { + // Find best transfer path from existing location + self.find_best_path(&handle.locations[0], &target) + }; + + // Record new location + handle.locations.push(target.clone()); + + // Update usage + let mut usage = self.usage.write(); + *usage.entry(target).or_default() += handle.size_bytes; + + Ok(path) + } else { + Err(ComputeError::Internal("Tensor not found".to_string())) + } + } + + /// Finds best transfer path between processors. + fn find_best_path(&self, from: &ProcessorType, to: &ProcessorType) -> TransferPath { + // Check for unified memory (Apple Silicon) + if self.shares_memory(from, to) { + return TransferPath::UnifiedMemory; + } + + // Check for NVLink between NVIDIA GPUs + if matches!(from, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. })) + && matches!(to, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. 
})) + { + return TransferPath::NvLink; + } + + // Check for PCIe P2P between GPUs + if from.is_gpu() && to.is_gpu() { + return TransferPath::PciePeerToPeer; + } + + // Default to CPU-mediated transfer + TransferPath::CpuMediated + } + + /// Checks if two processor types share memory. + fn shares_memory(&self, a: &ProcessorType, b: &ProcessorType) -> bool { + use crate::processor::{CpuVariant, GpuVariant, NpuVariant}; + + match (a, b) { + // Apple Silicon unified memory + (ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) + | (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. })) + | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. })) + | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true, + // Same type + _ if a == b => true, + _ => false, + } + } + + /// Gets current memory usage for a processor type. + pub fn usage(&self, proc_type: ProcessorType) -> u64 { + self.usage.read().get(&proc_type).copied().unwrap_or(0) + } + + /// Gets available memory for a processor type. + pub fn available(&self, proc_type: ProcessorType) -> u64 { + let limit = self.limits.get(&proc_type).copied().unwrap_or(u64::MAX); + let used = self.usage(proc_type); + limit.saturating_sub(used) + } + + /// Gets total allocated tensors. + pub fn tensor_count(&self) -> usize { + self.tensors.read().len() + } +} + +impl Default for MemoryManager { + fn default() -> Self { + Self::new() + } +} + +/// Unified memory abstraction for zero-copy sharing. +pub struct UnifiedMemory { + /// Base pointer (in unified address space). + pub base: u64, + /// Size in bytes. + pub size: u64, + /// Accessible from these processor types. + pub accessible_from: Vec, +} + +impl UnifiedMemory { + /// Creates new unified memory region. 
+ pub fn new(size: u64) -> Self { + Self { + base: 0, // Would be actual pointer in real implementation + size, + accessible_from: Vec::new(), + } + } + + /// Checks if accessible from processor type. + pub fn is_accessible_from(&self, proc_type: &ProcessorType) -> bool { + self.accessible_from.contains(proc_type) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tensor_handle() { + let handle = TensorHandle::new(vec![1024, 1024], DataType::Float32); + assert_eq!(handle.numel(), 1024 * 1024); + assert_eq!(handle.size_bytes, 1024 * 1024 * 4); + } + + #[test] + fn test_data_type_sizes() { + assert_eq!(DataType::Float64.size_bytes(), 8); + assert_eq!(DataType::Float32.size_bytes(), 4); + assert_eq!(DataType::Float16.size_bytes(), 2); + assert_eq!(DataType::Int8.size_bytes(), 1); + } + + #[test] + fn test_transfer_path_bandwidth() { + assert!(TransferPath::NvLink.bandwidth_gbps() > TransferPath::PciePeerToPeer.bandwidth_gbps()); + assert!(TransferPath::SameMemory.bandwidth_gbps().is_infinite()); + } + + #[test] + fn test_memory_manager() { + let manager = MemoryManager::new(); + + let handle = manager.allocate(vec![1024, 1024], DataType::Float32).unwrap(); + assert_eq!(manager.tensor_count(), 1); + + manager.free(handle.id).unwrap(); + assert_eq!(manager.tensor_count(), 0); + } + + #[test] + fn test_ensure_on() { + let manager = MemoryManager::new(); + + let handle = manager.allocate(vec![1024], DataType::Float32).unwrap(); + + // First ensure should allocate + let path = manager.ensure_on( + handle.id, + ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { + compute_capability: (8, 0), + }), + ).unwrap(); + + assert_eq!(path, TransferPath::SameMemory); + + // Second ensure to same location should be same memory + let path = manager.ensure_on( + handle.id, + ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { + compute_capability: (8, 0), + }), + ).unwrap(); + + assert_eq!(path, TransferPath::SameMemory); + } +} diff --git 
a/crates/synor-compute/src/processor/capabilities.rs b/crates/synor-compute/src/processor/capabilities.rs new file mode 100644 index 0000000..bedb6aa --- /dev/null +++ b/crates/synor-compute/src/processor/capabilities.rs @@ -0,0 +1,547 @@ +//! Processor capability definitions. + +use super::operation::OperationType; +use super::types::PowerTier; +use serde::{Deserialize, Serialize}; +use std::collections::HashSet; + +/// Detailed processor capabilities. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ProcessorCapabilities { + /// Compute throughput. + pub compute: ComputeThroughput, + /// Memory specifications. + pub memory: MemorySpecs, + /// Supported operations. + pub operations: HashSet, + /// Power characteristics. + pub power: PowerCharacteristics, + /// Optimal workload characteristics. + pub optimal_for: Vec, +} + +impl Default for ProcessorCapabilities { + fn default() -> Self { + Self { + compute: ComputeThroughput::default(), + memory: MemorySpecs::default(), + operations: Self::default_operations(), + power: PowerCharacteristics::default(), + optimal_for: vec![], + } + } +} + +impl ProcessorCapabilities { + /// Default operations supported by most processors. + fn default_operations() -> HashSet { + [ + OperationType::MatMul, + OperationType::Add, + OperationType::Mul, + OperationType::ReLU, + OperationType::Softmax, + OperationType::DataLoad, + OperationType::DataPreprocess, + ] + .into_iter() + .collect() + } + + /// Creates CPU capabilities. 
+ pub fn cpu(cores: u32, clock_ghz: f32, avx512: bool) -> Self { + let flops_per_cycle = if avx512 { 64.0 } else { 32.0 }; // FP32 ops per cycle with AVX + let fp32_tflops = (cores as f64 * clock_ghz as f64 * flops_per_cycle) / 1000.0; + + Self { + compute: ComputeThroughput { + fp64_tflops: fp32_tflops / 2.0, + fp32_tflops, + fp16_tflops: fp32_tflops * 2.0, + bf16_tflops: fp32_tflops * 2.0, + int8_tops: fp32_tflops * 4.0, + int4_tops: fp32_tflops * 8.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 64 * 1024 * 1024 * 1024, // 64 GB typical + bandwidth_gbps: 200, // DDR5 + type_: MemoryType::Ddr5, + }, + operations: Self::cpu_operations(), + power: PowerCharacteristics { + tdp_watts: 125, + efficiency: 0.8, + power_tier: PowerTier::Medium, + }, + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::MemoryBound, + WorkloadCharacteristic::SmallBatch, + ], + } + } + + /// Operations typically supported by CPUs. + fn cpu_operations() -> HashSet { + [ + // Matrix operations (slow but supported) + OperationType::MatMul, + OperationType::Conv2d, + OperationType::BatchNorm, + OperationType::LayerNorm, + // Element-wise + OperationType::Add, + OperationType::Mul, + OperationType::ReLU, + OperationType::GeLU, + OperationType::Softmax, + // Data operations (optimal) + OperationType::DataLoad, + OperationType::DataPreprocess, + OperationType::Tokenization, + OperationType::Detokenization, + // Memory operations + OperationType::Transpose, + OperationType::Reshape, + OperationType::Concat, + OperationType::Split, + // I/O + OperationType::Checkpoint, + ] + .into_iter() + .collect() + } + + /// Creates NVIDIA GPU capabilities. 
+ pub fn nvidia_gpu( + cuda_cores: u32, + tensor_cores: u32, + vram_gb: u32, + bandwidth_gbps: u32, + compute_capability: (u8, u8), + ) -> Self { + // Approximate TFLOPS based on cores and typical clocks + let base_clock_ghz = 1.5; + let fp32_tflops = (cuda_cores as f64 * base_clock_ghz * 2.0) / 1000.0; + let tensor_multiplier = if compute_capability.0 >= 8 { 4.0 } else { 2.0 }; + + Self { + compute: ComputeThroughput { + fp64_tflops: fp32_tflops / 2.0, + fp32_tflops, + fp16_tflops: fp32_tflops * tensor_multiplier, + bf16_tflops: fp32_tflops * tensor_multiplier, + int8_tops: fp32_tflops * tensor_multiplier * 2.0, + int4_tops: fp32_tflops * tensor_multiplier * 4.0, + sparsity_speedup: if compute_capability.0 >= 8 { 2.0 } else { 1.0 }, + }, + memory: MemorySpecs { + capacity_bytes: vram_gb as u64 * 1024 * 1024 * 1024, + bandwidth_gbps, + type_: if compute_capability.0 >= 9 { + MemoryType::Hbm3 + } else { + MemoryType::Hbm2e + }, + }, + operations: Self::gpu_operations(compute_capability), + power: PowerCharacteristics { + tdp_watts: if compute_capability.0 >= 9 { 700 } else { 350 }, + efficiency: 0.9, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + WorkloadCharacteristic::ComputeBound, + ], + } + } + + /// Operations supported by GPUs. 
+ fn gpu_operations(compute_capability: (u8, u8)) -> HashSet { + let mut ops: HashSet = [ + // Matrix operations (optimal) + OperationType::MatMul, + OperationType::Conv2d, + OperationType::Conv3d, + OperationType::DepthwiseConv, + OperationType::BatchNorm, + OperationType::LayerNorm, + // Attention + OperationType::SelfAttention, + OperationType::CrossAttention, + // Element-wise + OperationType::Add, + OperationType::Mul, + OperationType::ReLU, + OperationType::GeLU, + OperationType::SiLU, + OperationType::Softmax, + // Reduction + OperationType::Sum, + OperationType::Mean, + OperationType::Max, + OperationType::ArgMax, + // Memory operations + OperationType::Transpose, + OperationType::Reshape, + OperationType::Concat, + OperationType::Split, + OperationType::Gather, + OperationType::Scatter, + // LLM specific + OperationType::Embedding, + OperationType::RoPE, + OperationType::KVCache, + OperationType::TopK, + OperationType::Sampling, + ] + .into_iter() + .collect(); + + // FlashAttention for newer GPUs + if compute_capability.0 >= 8 { + ops.insert(OperationType::FlashAttention); + } + + ops + } + + /// Creates TPU capabilities. 
+ pub fn tpu(version: super::TpuVersion) -> Self { + let (bf16_tflops, memory_gb, bandwidth_gbps) = match version { + super::TpuVersion::V5p => (918.0, 95, 4800), + super::TpuVersion::V5e => (197.0, 16, 1600), + super::TpuVersion::V4 => (275.0, 32, 2400), + super::TpuVersion::V4i => (138.0, 32, 1200), + super::TpuVersion::V3 => (123.0, 16, 900), + super::TpuVersion::V2 => (46.0, 8, 600), + super::TpuVersion::Edge => (4.0, 0, 0), // Uses host memory + }; + + Self { + compute: ComputeThroughput { + fp64_tflops: 0.0, // TPUs don't support FP64 + fp32_tflops: bf16_tflops / 2.0, + fp16_tflops: bf16_tflops, + bf16_tflops, + int8_tops: bf16_tflops * 2.0, + int4_tops: bf16_tflops * 4.0, + sparsity_speedup: 2.0, + }, + memory: MemorySpecs { + capacity_bytes: memory_gb as u64 * 1024 * 1024 * 1024, + bandwidth_gbps, + type_: MemoryType::Hbm2e, + }, + operations: Self::tpu_operations(), + power: PowerCharacteristics { + tdp_watts: if matches!(version, super::TpuVersion::Edge) { + 2 + } else { + 400 + }, + efficiency: 0.95, + power_tier: if matches!(version, super::TpuVersion::Edge) { + PowerTier::UltraLow + } else { + PowerTier::High + }, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::ComputeBound, + WorkloadCharacteristic::FixedShape, + WorkloadCharacteristic::LargeBatch, + ], + } + } + + /// Operations supported by TPUs. 
+ fn tpu_operations() -> HashSet { + [ + // Matrix operations (optimal) + OperationType::MatMul, + OperationType::Conv2d, + OperationType::BatchNorm, + OperationType::LayerNorm, + // Attention + OperationType::SelfAttention, + OperationType::CrossAttention, + OperationType::FlashAttention, + // Element-wise + OperationType::Add, + OperationType::Mul, + OperationType::ReLU, + OperationType::GeLU, + OperationType::SiLU, + OperationType::Softmax, + // Reduction + OperationType::Sum, + OperationType::Mean, + OperationType::Max, + // LLM specific + OperationType::Embedding, + OperationType::RoPE, + OperationType::KVCache, + ] + .into_iter() + .collect() + } + + /// Creates LPU (Groq) capabilities. + pub fn lpu() -> Self { + Self { + compute: ComputeThroughput { + fp64_tflops: 0.0, + fp32_tflops: 0.0, + fp16_tflops: 188.0, + bf16_tflops: 188.0, + int8_tops: 750.0, + int4_tops: 1500.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 230 * 1024 * 1024 * 1024, // 230 GB SRAM! + bandwidth_gbps: 80_000, // 80 TB/s internal + type_: MemoryType::Sram, + }, + operations: Self::lpu_operations(), + power: PowerCharacteristics { + tdp_watts: 300, + efficiency: 0.98, // Very efficient for inference + power_tier: PowerTier::Medium, + }, + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::SmallBatch, + WorkloadCharacteristic::VariableLength, + WorkloadCharacteristic::LowLatency, + ], + } + } + + /// Operations supported by Groq LPU. + fn lpu_operations() -> HashSet { + [ + // Optimized for inference + OperationType::MatMul, + OperationType::LayerNorm, + OperationType::SelfAttention, + OperationType::Add, + OperationType::Mul, + OperationType::ReLU, + OperationType::GeLU, + OperationType::SiLU, + OperationType::Softmax, + OperationType::Embedding, + OperationType::RoPE, + OperationType::KVCache, + OperationType::TopK, + OperationType::Sampling, + ] + .into_iter() + .collect() + } + + /// Creates Apple Neural Engine capabilities. 
+ pub fn apple_neural_engine(cores: u32) -> Self { + let int8_tops = match cores { + 16 => 18.0, // M3 + 32 => 35.0, // M3 Max + _ => cores as f64 * 1.1, + }; + + Self { + compute: ComputeThroughput { + fp64_tflops: 0.0, + fp32_tflops: int8_tops / 4.0, + fp16_tflops: int8_tops / 2.0, + bf16_tflops: int8_tops / 2.0, + int8_tops, + int4_tops: int8_tops * 2.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 0, // Uses unified memory + bandwidth_gbps: 400, + type_: MemoryType::Unified, + }, + operations: Self::npu_operations(), + power: PowerCharacteristics { + tdp_watts: 15, + efficiency: 0.95, + power_tier: PowerTier::UltraLow, + }, + optimal_for: vec![ + WorkloadCharacteristic::LowPower, + WorkloadCharacteristic::LowLatency, + WorkloadCharacteristic::SmallBatch, + ], + } + } + + /// Operations supported by NPUs. + fn npu_operations() -> HashSet { + [ + // Inference optimized + OperationType::MatMul, + OperationType::Conv2d, + OperationType::DepthwiseConv, + OperationType::BatchNorm, + OperationType::LayerNorm, + OperationType::Add, + OperationType::Mul, + OperationType::ReLU, + OperationType::Softmax, + OperationType::Embedding, + ] + .into_iter() + .collect() + } +} + +/// Compute throughput metrics. +#[derive(Clone, Debug, Default, Serialize, Deserialize)] +pub struct ComputeThroughput { + /// FP64 TFLOPS. + pub fp64_tflops: f64, + /// FP32 TFLOPS. + pub fp32_tflops: f64, + /// FP16 TFLOPS. + pub fp16_tflops: f64, + /// BF16 TFLOPS. + pub bf16_tflops: f64, + /// INT8 TOPS. + pub int8_tops: f64, + /// INT4 TOPS. + pub int4_tops: f64, + /// Speedup for sparse operations. + pub sparsity_speedup: f64, +} + +/// Memory specifications. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct MemorySpecs { + /// Total capacity (bytes). + pub capacity_bytes: u64, + /// Bandwidth (GB/s). + pub bandwidth_gbps: u32, + /// Memory type. 
+ pub type_: MemoryType, +} + +impl Default for MemorySpecs { + fn default() -> Self { + Self { + capacity_bytes: 16 * 1024 * 1024 * 1024, // 16 GB + bandwidth_gbps: 500, + type_: MemoryType::Ddr5, + } + } +} + +/// Memory types. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum MemoryType { + /// DDR4 RAM. + Ddr4, + /// DDR5 RAM. + Ddr5, + /// GDDR6/6X video memory. + Gddr6, + /// HBM2. + Hbm2, + /// HBM2e. + Hbm2e, + /// HBM3. + Hbm3, + /// SRAM (on-chip). + Sram, + /// Unified memory (Apple Silicon). + Unified, + /// LPDDR (mobile). + Lpddr, +} + +/// Power characteristics. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct PowerCharacteristics { + /// TDP in watts. + pub tdp_watts: u32, + /// Efficiency factor (0.0 - 1.0). + pub efficiency: f64, + /// Power tier. + pub power_tier: PowerTier, +} + +impl Default for PowerCharacteristics { + fn default() -> Self { + Self { + tdp_watts: 100, + efficiency: 0.8, + power_tier: PowerTier::Medium, + } + } +} + +/// Workload characteristics for processor matching. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum WorkloadCharacteristic { + /// High parallelism (GPU, TPU). + HighlyParallel, + /// Sequential dependencies (CPU, LPU). + Sequential, + /// Memory bandwidth bound (GPU). + MemoryBound, + /// Compute bound (TPU). + ComputeBound, + /// Low latency required (NPU, edge). + LowLatency, + /// Low power required (NPU, mobile). + LowPower, + /// Large batch sizes (GPU, TPU). + LargeBatch, + /// Small batch sizes (CPU, LPU). + SmallBatch, + /// Variable length sequences (LPU). + VariableLength, + /// Fixed tensor shapes (TPU). 
+ FixedShape, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cpu_capabilities() { + let caps = ProcessorCapabilities::cpu(32, 3.5, true); + assert!(caps.compute.fp32_tflops > 0.0); + assert!(caps.operations.contains(&OperationType::DataLoad)); + assert!(caps.operations.contains(&OperationType::Tokenization)); + } + + #[test] + fn test_gpu_capabilities() { + let caps = ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1000, (8, 9)); + assert!(caps.compute.fp16_tflops > caps.compute.fp32_tflops); + assert!(caps.operations.contains(&OperationType::FlashAttention)); + } + + #[test] + fn test_tpu_capabilities() { + let caps = ProcessorCapabilities::tpu(super::super::TpuVersion::V5p); + assert!(caps.compute.bf16_tflops > 900.0); + assert!(!caps.operations.contains(&OperationType::DataLoad)); // TPUs don't do I/O + } + + #[test] + fn test_lpu_capabilities() { + let caps = ProcessorCapabilities::lpu(); + assert!(caps.memory.bandwidth_gbps > 10000); // Very high internal bandwidth + assert!(caps.optimal_for.contains(&WorkloadCharacteristic::Sequential)); + } +} diff --git a/crates/synor-compute/src/processor/mod.rs b/crates/synor-compute/src/processor/mod.rs new file mode 100644 index 0000000..3bca36d --- /dev/null +++ b/crates/synor-compute/src/processor/mod.rs @@ -0,0 +1,339 @@ +//! Processor abstractions for heterogeneous compute. +//! +//! Supports all processor types: +//! - CPU (x86_64, ARM64, RISC-V) +//! - GPU (NVIDIA CUDA, AMD ROCm, Intel OneAPI, Apple Metal) +//! - TPU (Google TPU v2-v5) +//! - NPU (Apple Neural Engine, Qualcomm Hexagon, Intel VPU) +//! - LPU (Groq Language Processing Unit) +//! - FPGA (Xilinx, Intel/Altera) +//! - DSP (Digital Signal Processors) +//! 
- Custom accelerators + +mod capabilities; +mod operation; +mod profiles; +mod types; + +pub use capabilities::{ComputeThroughput, MemorySpecs, MemoryType, ProcessorCapabilities}; +pub use operation::{Operation, OperationType}; +pub use profiles::ProcessorProfiles; +pub use types::*; + +use crate::error::ComputeError; +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; +use std::time::Duration; + +/// Unique processor identifier (within a node). +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct ProcessorId(pub u64); + +impl std::fmt::Display for ProcessorId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "proc_{}", self.0) + } +} + +/// Unified abstraction for any processor type. +#[async_trait] +pub trait Processor: Send + Sync { + /// Get processor ID. + fn id(&self) -> ProcessorId; + + /// Get processor type. + fn processor_type(&self) -> ProcessorType; + + /// Get capabilities. + fn capabilities(&self) -> &ProcessorCapabilities; + + /// Check if processor can execute operation. + fn can_execute(&self, op: &Operation) -> bool; + + /// Estimate execution time for operation. + fn estimate_time(&self, op: &Operation) -> Duration; + + /// Estimate energy consumption for operation (Joules). + fn estimate_energy(&self, op: &Operation) -> f64; + + /// Execute operation. + async fn execute(&self, op: Operation) -> Result; + + /// Current utilization (0.0 - 1.0). + fn utilization(&self) -> f64; + + /// Available memory (bytes). + fn available_memory(&self) -> u64; + + /// Check if this processor shares memory with another type. + fn shares_memory_with(&self, other: &ProcessorType) -> bool { + // By default, processors don't share memory + // Override for unified memory architectures (Apple Silicon, AMD APUs) + self.processor_type() == *other + } +} + +/// Result of an operation execution. +#[derive(Clone, Debug)] +pub struct OperationResult { + /// Output data. 
+ pub output: Vec, + /// Execution time. + pub duration: Duration, + /// Energy consumed (Joules). + pub energy: f64, + /// Peak memory used (bytes). + pub peak_memory: u64, +} + +/// Generic processor implementation for simulation/testing. +pub struct GenericProcessor { + id: ProcessorId, + processor_type: ProcessorType, + capabilities: ProcessorCapabilities, + utilization: std::sync::atomic::AtomicU64, + available_memory: std::sync::atomic::AtomicU64, +} + +impl GenericProcessor { + /// Creates a new generic processor. + pub fn new( + id: ProcessorId, + processor_type: ProcessorType, + capabilities: ProcessorCapabilities, + ) -> Self { + let available_memory = capabilities.memory.capacity_bytes; + Self { + id, + processor_type, + capabilities, + utilization: std::sync::atomic::AtomicU64::new(0), + available_memory: std::sync::atomic::AtomicU64::new(available_memory), + } + } + + /// Creates a CPU processor. + pub fn cpu(id: ProcessorId, variant: CpuVariant) -> Self { + Self::new( + id, + ProcessorType::Cpu(variant), + ProcessorProfiles::cpu_default(), + ) + } + + /// Creates an NVIDIA GPU processor. + pub fn nvidia_gpu(id: ProcessorId, compute_capability: (u8, u8)) -> Self { + let capabilities = match compute_capability { + (9, 0) => ProcessorProfiles::nvidia_h100(), + (8, 9) => ProcessorProfiles::nvidia_rtx_4090(), + (8, 6) => ProcessorProfiles::nvidia_rtx_3090(), + _ => ProcessorProfiles::nvidia_default(), + }; + Self::new( + id, + ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability }), + capabilities, + ) + } + + /// Creates a TPU processor. + pub fn tpu(id: ProcessorId, version: TpuVersion) -> Self { + let capabilities = match version { + TpuVersion::V5p => ProcessorProfiles::google_tpu_v5p(), + TpuVersion::V4 => ProcessorProfiles::google_tpu_v4(), + _ => ProcessorProfiles::google_tpu_default(), + }; + Self::new(id, ProcessorType::Tpu(version), capabilities) + } + + /// Creates a Groq LPU processor. 
+ pub fn lpu(id: ProcessorId) -> Self { + Self::new(id, ProcessorType::Lpu, ProcessorProfiles::groq_lpu()) + } + + /// Creates an Apple Neural Engine processor. + pub fn apple_neural_engine(id: ProcessorId, cores: u32) -> Self { + Self::new( + id, + ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores }), + ProcessorProfiles::apple_neural_engine(cores), + ) + } +} + +#[async_trait] +impl Processor for GenericProcessor { + fn id(&self) -> ProcessorId { + self.id + } + + fn processor_type(&self) -> ProcessorType { + self.processor_type.clone() + } + + fn capabilities(&self) -> &ProcessorCapabilities { + &self.capabilities + } + + fn can_execute(&self, op: &Operation) -> bool { + self.capabilities.operations.contains(&op.op_type()) + } + + fn estimate_time(&self, op: &Operation) -> Duration { + // Estimate based on FLOPS and operation complexity + let flops_needed = op.estimated_flops(); + let throughput = match op.precision() { + Precision::Fp32 => self.capabilities.compute.fp32_tflops, + Precision::Fp16 => self.capabilities.compute.fp16_tflops, + Precision::Bf16 => self.capabilities.compute.bf16_tflops, + Precision::Int8 => self.capabilities.compute.int8_tops, + Precision::Int4 => self.capabilities.compute.int4_tops, + Precision::Fp64 => self.capabilities.compute.fp64_tflops, + }; + + if throughput > 0.0 { + let tflops = throughput; + let flops_per_second = tflops * 1e12; + let seconds = flops_needed / flops_per_second; + Duration::from_secs_f64(seconds) + } else { + Duration::from_secs(1) // Fallback + } + } + + fn estimate_energy(&self, op: &Operation) -> f64 { + // Estimate based on TDP and execution time + let duration = self.estimate_time(op); + let watts = self.capabilities.power.tdp_watts as f64; + let efficiency = self.capabilities.power.efficiency; + watts * duration.as_secs_f64() * efficiency + } + + async fn execute(&self, op: Operation) -> Result { + // Check if we can execute + if !self.can_execute(&op) { + return 
Err(ComputeError::OperationNotSupported( + self.processor_type.clone(), + format!("{:?}", op.op_type()), + )); + } + + // Simulate execution + let duration = self.estimate_time(&op); + let energy = self.estimate_energy(&op); + + // Update utilization + self.utilization + .store(50, std::sync::atomic::Ordering::Relaxed); + + // Simulate work + tokio::time::sleep(Duration::from_micros(100)).await; + + // Reset utilization + self.utilization + .store(0, std::sync::atomic::Ordering::Relaxed); + + Ok(OperationResult { + output: vec![], + duration, + energy, + peak_memory: op.estimated_memory(), + }) + } + + fn utilization(&self) -> f64 { + self.utilization.load(std::sync::atomic::Ordering::Relaxed) as f64 / 100.0 + } + + fn available_memory(&self) -> u64 { + self.available_memory + .load(std::sync::atomic::Ordering::Relaxed) + } + + fn shares_memory_with(&self, other: &ProcessorType) -> bool { + match (&self.processor_type, other) { + // Apple Silicon has unified memory + (ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) + | (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. })) + | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. })) + | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true, + // Same type always shares + (a, b) if a == b => true, + _ => false, + } + } +} + +/// Precision for operations. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum Precision { + Fp64, + Fp32, + Fp16, + Bf16, + Int8, + Int4, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_processor_creation() { + let cpu = GenericProcessor::cpu( + ProcessorId(0), + CpuVariant::X86_64 { + avx: AvxSupport::Avx512, + }, + ); + + assert_eq!(cpu.id(), ProcessorId(0)); + assert!(matches!(cpu.processor_type(), ProcessorType::Cpu(_))); + } + + #[test] + fn test_gpu_creation() { + let gpu = GenericProcessor::nvidia_gpu(ProcessorId(1), (9, 0)); + + assert_eq!(gpu.id(), ProcessorId(1)); + assert!(matches!( + gpu.processor_type(), + ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. }) + )); + } + + #[test] + fn test_unified_memory() { + let apple_cpu = GenericProcessor::new( + ProcessorId(0), + ProcessorType::Cpu(CpuVariant::Arm64 { sve: false }), + ProcessorCapabilities::default(), + ); + + assert!(apple_cpu.shares_memory_with(&ProcessorType::Gpu(GpuVariant::AppleMetal))); + } + + #[tokio::test] + async fn test_operation_execution() { + let cpu = GenericProcessor::cpu( + ProcessorId(0), + CpuVariant::X86_64 { + avx: AvxSupport::Avx512, + }, + ); + + let op = Operation::MatMul { + m: 1024, + n: 1024, + k: 1024, + precision: Precision::Fp32, + }; + + // CPU might not support all ops depending on capabilities + // This is testing the infrastructure + let result = cpu.execute(op).await; + // Result depends on capabilities + assert!(result.is_ok() || result.is_err()); + } +} diff --git a/crates/synor-compute/src/processor/operation.rs b/crates/synor-compute/src/processor/operation.rs new file mode 100644 index 0000000..41d52b3 --- /dev/null +++ b/crates/synor-compute/src/processor/operation.rs @@ -0,0 +1,543 @@ +//! Operation definitions for heterogeneous compute. + +use super::Precision; +use serde::{Deserialize, Serialize}; + +/// Operation types for processor matching. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum OperationType { + // Matrix operations + MatMul, + Conv2d, + Conv3d, + DepthwiseConv, + BatchNorm, + LayerNorm, + + // Attention operations + SelfAttention, + CrossAttention, + FlashAttention, + + // Element-wise operations + Add, + Mul, + ReLU, + GeLU, + SiLU, + Softmax, + + // Reduction operations + Sum, + Mean, + Max, + ArgMax, + + // Data movement + Transpose, + Reshape, + Concat, + Split, + Gather, + Scatter, + + // LLM specific + Embedding, + RoPE, // Rotary Position Embedding + KVCache, + TopK, + Sampling, + + // I/O operations + DataLoad, + DataPreprocess, + Tokenization, + Detokenization, + Checkpoint, + + // Distributed operations + AllReduce, + AllGather, + ReduceScatter, + + // Training specific + Backward, + OptimizerStep, + GradientClip, +} + +/// Concrete operation with parameters. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub enum Operation { + /// Matrix multiplication. + MatMul { + m: usize, + n: usize, + k: usize, + precision: Precision, + }, + + /// 2D Convolution. + Conv2d { + batch: usize, + in_channels: usize, + out_channels: usize, + height: usize, + width: usize, + kernel_size: usize, + precision: Precision, + }, + + /// Batch normalization. + BatchNorm { + batch: usize, + channels: usize, + spatial: usize, + precision: Precision, + }, + + /// Layer normalization. + LayerNorm { + batch: usize, + seq_len: usize, + hidden: usize, + precision: Precision, + }, + + /// Self-attention. + SelfAttention { + batch: usize, + seq_len: usize, + num_heads: usize, + head_dim: usize, + precision: Precision, + }, + + /// Flash attention (fused, memory efficient). + FlashAttention { + batch: usize, + seq_len: usize, + num_heads: usize, + head_dim: usize, + precision: Precision, + }, + + /// Element-wise addition. + Add { + elements: usize, + precision: Precision, + }, + + /// Element-wise multiplication. 
+ Mul { + elements: usize, + precision: Precision, + }, + + /// ReLU activation. + ReLU { elements: usize }, + + /// GeLU activation. + GeLU { elements: usize }, + + /// SiLU (Swish) activation. + SiLU { elements: usize }, + + /// Softmax. + Softmax { + batch: usize, + seq_len: usize, + precision: Precision, + }, + + /// Embedding lookup. + Embedding { + batch: usize, + seq_len: usize, + vocab_size: usize, + embed_dim: usize, + precision: Precision, + }, + + /// Rotary Position Embedding. + RoPE { + batch: usize, + seq_len: usize, + head_dim: usize, + precision: Precision, + }, + + /// KV Cache update. + KVCache { + batch: usize, + seq_len: usize, + num_heads: usize, + head_dim: usize, + precision: Precision, + }, + + /// Top-K sampling. + TopK { + batch: usize, + vocab_size: usize, + k: usize, + }, + + /// Token sampling. + Sampling { + batch: usize, + vocab_size: usize, + temperature: f32, + }, + + /// Data loading from storage. + DataLoad { + bytes: usize, + async_: bool, + }, + + /// Data preprocessing. + DataPreprocess { + batch: usize, + transforms: Vec, + }, + + /// Tokenization. + Tokenization { + text_bytes: usize, + vocab_size: usize, + }, + + /// Detokenization. + Detokenization { + tokens: usize, + vocab_size: usize, + }, + + /// Checkpoint save. + Checkpoint { + bytes: usize, + async_: bool, + }, + + /// All-reduce across devices. + AllReduce { + elements: usize, + precision: Precision, + devices: usize, + }, + + /// Backward pass for a layer. + Backward { + forward_op: Box, + }, + + /// Optimizer step. + OptimizerStep { + parameters: usize, + optimizer: String, + precision: Precision, + }, + + /// Transpose. + Transpose { + shape: Vec, + axes: Vec, + }, + + /// Reshape. + Reshape { + from: Vec, + to: Vec, + }, + + /// Concatenate tensors. + Concat { + shapes: Vec>, + axis: usize, + }, + + /// Generic operation. + Generic { + op_type: OperationType, + flops: f64, + memory: u64, + }, +} + +impl Operation { + /// Returns the operation type. 
+ pub fn op_type(&self) -> OperationType { + match self { + Operation::MatMul { .. } => OperationType::MatMul, + Operation::Conv2d { .. } => OperationType::Conv2d, + Operation::BatchNorm { .. } => OperationType::BatchNorm, + Operation::LayerNorm { .. } => OperationType::LayerNorm, + Operation::SelfAttention { .. } => OperationType::SelfAttention, + Operation::FlashAttention { .. } => OperationType::FlashAttention, + Operation::Add { .. } => OperationType::Add, + Operation::Mul { .. } => OperationType::Mul, + Operation::ReLU { .. } => OperationType::ReLU, + Operation::GeLU { .. } => OperationType::GeLU, + Operation::SiLU { .. } => OperationType::SiLU, + Operation::Softmax { .. } => OperationType::Softmax, + Operation::Embedding { .. } => OperationType::Embedding, + Operation::RoPE { .. } => OperationType::RoPE, + Operation::KVCache { .. } => OperationType::KVCache, + Operation::TopK { .. } => OperationType::TopK, + Operation::Sampling { .. } => OperationType::Sampling, + Operation::DataLoad { .. } => OperationType::DataLoad, + Operation::DataPreprocess { .. } => OperationType::DataPreprocess, + Operation::Tokenization { .. } => OperationType::Tokenization, + Operation::Detokenization { .. } => OperationType::Detokenization, + Operation::Checkpoint { .. } => OperationType::Checkpoint, + Operation::AllReduce { .. } => OperationType::AllReduce, + Operation::Backward { .. } => OperationType::Backward, + Operation::OptimizerStep { .. } => OperationType::OptimizerStep, + Operation::Transpose { .. } => OperationType::Transpose, + Operation::Reshape { .. } => OperationType::Reshape, + Operation::Concat { .. } => OperationType::Concat, + Operation::Generic { op_type, .. } => *op_type, + } + } + + /// Returns the precision used. + pub fn precision(&self) -> Precision { + match self { + Operation::MatMul { precision, .. } + | Operation::Conv2d { precision, .. } + | Operation::BatchNorm { precision, .. } + | Operation::LayerNorm { precision, .. 
} + | Operation::SelfAttention { precision, .. } + | Operation::FlashAttention { precision, .. } + | Operation::Add { precision, .. } + | Operation::Mul { precision, .. } + | Operation::Softmax { precision, .. } + | Operation::Embedding { precision, .. } + | Operation::RoPE { precision, .. } + | Operation::KVCache { precision, .. } + | Operation::AllReduce { precision, .. } + | Operation::OptimizerStep { precision, .. } => *precision, + Operation::Backward { forward_op } => forward_op.precision(), + _ => Precision::Fp32, // Default + } + } + + /// Estimates FLOPS for the operation. + pub fn estimated_flops(&self) -> f64 { + match self { + // MatMul: 2 * M * N * K (multiply-add) + Operation::MatMul { m, n, k, .. } => 2.0 * (*m as f64) * (*n as f64) * (*k as f64), + + // Conv2d: 2 * batch * out * H * W * in * K * K + Operation::Conv2d { + batch, + in_channels, + out_channels, + height, + width, + kernel_size, + .. + } => { + 2.0 * (*batch as f64) + * (*out_channels as f64) + * (*height as f64) + * (*width as f64) + * (*in_channels as f64) + * (*kernel_size as f64) + * (*kernel_size as f64) + } + + // Self-attention: 4 * batch * seq * seq * head_dim * heads + Operation::SelfAttention { + batch, + seq_len, + num_heads, + head_dim, + .. + } + | Operation::FlashAttention { + batch, + seq_len, + num_heads, + head_dim, + .. + } => { + 4.0 * (*batch as f64) + * (*seq_len as f64) + * (*seq_len as f64) + * (*head_dim as f64) + * (*num_heads as f64) + } + + // Element-wise: 1 FLOP per element + Operation::Add { elements, .. } + | Operation::Mul { elements, .. } + | Operation::ReLU { elements } + | Operation::GeLU { elements } + | Operation::SiLU { elements } => *elements as f64, + + // Softmax: ~5 ops per element (exp, sum, div) + Operation::Softmax { + batch, seq_len, .. + } => 5.0 * (*batch as f64) * (*seq_len as f64), + + // Embedding: just lookup, minimal FLOPS + Operation::Embedding { + batch, + seq_len, + embed_dim, + .. 
+ } => (*batch as f64) * (*seq_len as f64) * (*embed_dim as f64) * 0.1, + + // Backward: ~2x forward + Operation::Backward { forward_op } => forward_op.estimated_flops() * 2.0, + + // Generic + Operation::Generic { flops, .. } => *flops, + + // I/O operations: minimal compute + _ => 1000.0, + } + } + + /// Estimates memory usage (bytes). + pub fn estimated_memory(&self) -> u64 { + let precision_bytes = match self.precision() { + Precision::Fp64 => 8, + Precision::Fp32 => 4, + Precision::Fp16 | Precision::Bf16 => 2, + Precision::Int8 => 1, + Precision::Int4 => 1, // Rounded up + }; + + match self { + Operation::MatMul { m, n, k, .. } => { + // Input A (m×k) + Input B (k×n) + Output (m×n) + ((*m * *k) + (*k * *n) + (*m * *n)) as u64 * precision_bytes + } + + Operation::SelfAttention { + batch, + seq_len, + num_heads, + head_dim, + .. + } => { + // Q, K, V, Output, intermediate attention + 5 * (*batch as u64) + * (*seq_len as u64) + * (*num_heads as u64) + * (*head_dim as u64) + * precision_bytes + } + + Operation::FlashAttention { + batch, + seq_len, + num_heads, + head_dim, + .. + } => { + // FlashAttention uses much less memory + 2 * (*batch as u64) + * (*seq_len as u64) + * (*num_heads as u64) + * (*head_dim as u64) + * precision_bytes + } + + Operation::KVCache { + batch, + seq_len, + num_heads, + head_dim, + .. + } => { + // K and V caches + 2 * (*batch as u64) + * (*seq_len as u64) + * (*num_heads as u64) + * (*head_dim as u64) + * precision_bytes + } + + Operation::Generic { memory, .. } => *memory, + + _ => 1024 * 1024, // 1 MB default + } + } + + /// Creates the backward operation for this operation. + pub fn backward(&self) -> Option { + match self { + Operation::MatMul { .. } + | Operation::Conv2d { .. } + | Operation::SelfAttention { .. } + | Operation::FlashAttention { .. } + | Operation::LayerNorm { .. } + | Operation::BatchNorm { .. 
} => Some(Operation::Backward { + forward_op: Box::new(self.clone()), + }), + _ => None, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_matmul_flops() { + let op = Operation::MatMul { + m: 1024, + n: 1024, + k: 1024, + precision: Precision::Fp32, + }; + + let flops = op.estimated_flops(); + // 2 * 1024^3 = ~2.1 billion FLOPS + assert!(flops > 2e9 && flops < 2.2e9); + } + + #[test] + fn test_attention_memory() { + let regular = Operation::SelfAttention { + batch: 1, + seq_len: 4096, + num_heads: 32, + head_dim: 128, + precision: Precision::Fp16, + }; + + let flash = Operation::FlashAttention { + batch: 1, + seq_len: 4096, + num_heads: 32, + head_dim: 128, + precision: Precision::Fp16, + }; + + // FlashAttention should use less memory + assert!(flash.estimated_memory() < regular.estimated_memory()); + } + + #[test] + fn test_backward_creation() { + let forward = Operation::MatMul { + m: 1024, + n: 1024, + k: 1024, + precision: Precision::Fp32, + }; + + let backward = forward.backward(); + assert!(backward.is_some()); + + if let Some(Operation::Backward { forward_op }) = backward { + assert!(matches!(*forward_op, Operation::MatMul { .. })); + } + } +} diff --git a/crates/synor-compute/src/processor/profiles.rs b/crates/synor-compute/src/processor/profiles.rs new file mode 100644 index 0000000..f61be69 --- /dev/null +++ b/crates/synor-compute/src/processor/profiles.rs @@ -0,0 +1,513 @@ +//! Pre-defined processor profiles for common hardware. + +use super::capabilities::{ + ComputeThroughput, MemorySpecs, MemoryType, PowerCharacteristics, ProcessorCapabilities, + WorkloadCharacteristic, +}; +use super::operation::OperationType; +use super::types::PowerTier; +use super::TpuVersion; +use std::collections::HashSet; + +/// Pre-defined processor profiles. 
+pub struct ProcessorProfiles; + +impl ProcessorProfiles { + // ═══════════════════════════════════════════════════════════════ + // CPU PROFILES + // ═══════════════════════════════════════════════════════════════ + + /// Default CPU profile. + pub fn cpu_default() -> ProcessorCapabilities { + ProcessorCapabilities::cpu(8, 3.5, false) + } + + /// AMD EPYC 9654 (96 cores). + pub fn amd_epyc_9654() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 2.7, + fp32_tflops: 5.4, + fp16_tflops: 10.8, + bf16_tflops: 10.8, + int8_tops: 21.6, + int4_tops: 43.2, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 6 * 1024 * 1024 * 1024 * 1024, // 6 TB max + bandwidth_gbps: 460, + type_: MemoryType::Ddr5, + }, + operations: ProcessorCapabilities::cpu(96, 2.4, false) + .operations, + power: PowerCharacteristics { + tdp_watts: 360, + efficiency: 0.85, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::MemoryBound, + ], + } + } + + /// Intel Xeon w9-3595X (56 cores). + pub fn intel_xeon_w9_3595x() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 3.2, + fp32_tflops: 6.4, + fp16_tflops: 12.8, + bf16_tflops: 12.8, + int8_tops: 25.6, + int4_tops: 51.2, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 4 * 1024 * 1024 * 1024 * 1024, // 4 TB max + bandwidth_gbps: 307, + type_: MemoryType::Ddr5, + }, + operations: ProcessorCapabilities::cpu(56, 2.9, true) + .operations, + power: PowerCharacteristics { + tdp_watts: 350, + efficiency: 0.80, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::MemoryBound, + ], + } + } + + /// Apple M3 Max CPU cores. 
+ pub fn apple_m3_max_cpu() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 0.3, + fp32_tflops: 0.6, + fp16_tflops: 1.2, + bf16_tflops: 1.2, + int8_tops: 2.4, + int4_tops: 4.8, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 128 * 1024 * 1024 * 1024, // 128 GB unified + bandwidth_gbps: 400, + type_: MemoryType::Unified, + }, + operations: ProcessorCapabilities::cpu(16, 4.0, false) + .operations, + power: PowerCharacteristics { + tdp_watts: 40, + efficiency: 0.95, + power_tier: PowerTier::Low, + }, + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::LowPower, + ], + } + } + + // ═══════════════════════════════════════════════════════════════ + // NVIDIA GPU PROFILES + // ═══════════════════════════════════════════════════════════════ + + /// Default NVIDIA GPU profile. + pub fn nvidia_default() -> ProcessorCapabilities { + ProcessorCapabilities::nvidia_gpu(8192, 256, 12, 600, (8, 0)) + } + + /// NVIDIA H100 SXM (80GB). + pub fn nvidia_h100() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 67.0, + fp32_tflops: 67.0, + fp16_tflops: 1979.0, // With sparsity + bf16_tflops: 1979.0, + int8_tops: 3958.0, + int4_tops: 7916.0, + sparsity_speedup: 2.0, + }, + memory: MemorySpecs { + capacity_bytes: 80 * 1024 * 1024 * 1024, + bandwidth_gbps: 3350, + type_: MemoryType::Hbm3, + }, + operations: ProcessorCapabilities::nvidia_gpu(16896, 528, 80, 3350, (9, 0)) + .operations, + power: PowerCharacteristics { + tdp_watts: 700, + efficiency: 0.90, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + WorkloadCharacteristic::ComputeBound, + ], + } + } + + /// NVIDIA A100 (80GB). 
+ pub fn nvidia_a100() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 19.5, + fp32_tflops: 19.5, + fp16_tflops: 624.0, // With sparsity + bf16_tflops: 624.0, + int8_tops: 1248.0, + int4_tops: 2496.0, + sparsity_speedup: 2.0, + }, + memory: MemorySpecs { + capacity_bytes: 80 * 1024 * 1024 * 1024, + bandwidth_gbps: 2039, + type_: MemoryType::Hbm2e, + }, + operations: ProcessorCapabilities::nvidia_gpu(6912, 432, 80, 2039, (8, 0)) + .operations, + power: PowerCharacteristics { + tdp_watts: 400, + efficiency: 0.88, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + WorkloadCharacteristic::ComputeBound, + ], + } + } + + /// NVIDIA RTX 4090. + pub fn nvidia_rtx_4090() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 1.3, + fp32_tflops: 82.6, + fp16_tflops: 330.4, // With sparsity + bf16_tflops: 330.4, + int8_tops: 660.8, + int4_tops: 1321.6, + sparsity_speedup: 2.0, + }, + memory: MemorySpecs { + capacity_bytes: 24 * 1024 * 1024 * 1024, + bandwidth_gbps: 1008, + type_: MemoryType::Gddr6, + }, + operations: ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1008, (8, 9)) + .operations, + power: PowerCharacteristics { + tdp_watts: 450, + efficiency: 0.85, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + ], + } + } + + /// NVIDIA RTX 3090. 
+ pub fn nvidia_rtx_3090() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 0.6, + fp32_tflops: 35.6, + fp16_tflops: 71.2, + bf16_tflops: 71.2, + int8_tops: 142.4, + int4_tops: 284.8, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 24 * 1024 * 1024 * 1024, + bandwidth_gbps: 936, + type_: MemoryType::Gddr6, + }, + operations: ProcessorCapabilities::nvidia_gpu(10496, 328, 24, 936, (8, 6)) + .operations, + power: PowerCharacteristics { + tdp_watts: 350, + efficiency: 0.82, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + ], + } + } + + // ═══════════════════════════════════════════════════════════════ + // AMD GPU PROFILES + // ═══════════════════════════════════════════════════════════════ + + /// AMD MI300X. + pub fn amd_mi300x() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 163.4, + fp32_tflops: 163.4, + fp16_tflops: 1307.0, + bf16_tflops: 1307.0, + int8_tops: 2614.0, + int4_tops: 5228.0, + sparsity_speedup: 2.0, + }, + memory: MemorySpecs { + capacity_bytes: 192 * 1024 * 1024 * 1024, // 192 GB HBM3 + bandwidth_gbps: 5300, + type_: MemoryType::Hbm3, + }, + operations: { + let mut ops = ProcessorCapabilities::nvidia_gpu(16384, 512, 80, 5300, (9, 0)) + .operations; + ops.remove(&OperationType::FlashAttention); // Different implementation + ops + }, + power: PowerCharacteristics { + tdp_watts: 750, + efficiency: 0.88, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + WorkloadCharacteristic::MemoryBound, // High memory bandwidth + ], + } + } + + /// AMD RX 7900 XTX. 
+ pub fn amd_rx_7900_xtx() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 1.9, + fp32_tflops: 61.0, + fp16_tflops: 122.0, + bf16_tflops: 122.0, + int8_tops: 244.0, + int4_tops: 488.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 24 * 1024 * 1024 * 1024, + bandwidth_gbps: 960, + type_: MemoryType::Gddr6, + }, + operations: { + let mut ops = ProcessorCapabilities::nvidia_gpu(6144, 0, 24, 960, (8, 0)) + .operations; + ops.remove(&OperationType::FlashAttention); + ops + }, + power: PowerCharacteristics { + tdp_watts: 355, + efficiency: 0.80, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + ], + } + } + + // ═══════════════════════════════════════════════════════════════ + // GOOGLE TPU PROFILES + // ═══════════════════════════════════════════════════════════════ + + /// Default TPU profile. + pub fn google_tpu_default() -> ProcessorCapabilities { + ProcessorCapabilities::tpu(TpuVersion::V4) + } + + /// Google TPU v5p. + pub fn google_tpu_v5p() -> ProcessorCapabilities { + ProcessorCapabilities::tpu(TpuVersion::V5p) + } + + /// Google TPU v4. + pub fn google_tpu_v4() -> ProcessorCapabilities { + ProcessorCapabilities::tpu(TpuVersion::V4) + } + + /// Google Edge TPU. 
+ pub fn google_edge_tpu() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 0.0, + fp32_tflops: 0.0, + fp16_tflops: 0.0, + bf16_tflops: 0.0, + int8_tops: 4.0, + int4_tops: 8.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 0, // Uses host memory + bandwidth_gbps: 0, + type_: MemoryType::Unified, + }, + operations: { + let mut ops = HashSet::new(); + ops.insert(OperationType::MatMul); + ops.insert(OperationType::Conv2d); + ops.insert(OperationType::DepthwiseConv); + ops.insert(OperationType::Add); + ops.insert(OperationType::Mul); + ops.insert(OperationType::ReLU); + ops.insert(OperationType::Softmax); + ops + }, + power: PowerCharacteristics { + tdp_watts: 2, + efficiency: 0.95, + power_tier: PowerTier::UltraLow, + }, + optimal_for: vec![ + WorkloadCharacteristic::LowPower, + WorkloadCharacteristic::LowLatency, + WorkloadCharacteristic::SmallBatch, + ], + } + } + + // ═══════════════════════════════════════════════════════════════ + // GROQ LPU PROFILE + // ═══════════════════════════════════════════════════════════════ + + /// Groq LPU. + pub fn groq_lpu() -> ProcessorCapabilities { + ProcessorCapabilities::lpu() + } + + // ═══════════════════════════════════════════════════════════════ + // APPLE NEURAL ENGINE PROFILES + // ═══════════════════════════════════════════════════════════════ + + /// Apple Neural Engine (generic). + pub fn apple_neural_engine(cores: u32) -> ProcessorCapabilities { + ProcessorCapabilities::apple_neural_engine(cores) + } + + /// Apple M3 Neural Engine (16 cores). + pub fn apple_m3_neural_engine() -> ProcessorCapabilities { + ProcessorCapabilities::apple_neural_engine(16) + } + + /// Apple M3 Max Neural Engine (16 cores). + pub fn apple_m3_max_neural_engine() -> ProcessorCapabilities { + ProcessorCapabilities::apple_neural_engine(16) // Same as M3 + } + + /// Apple A17 Pro Neural Engine (35 TOPS). 
+ pub fn apple_a17_pro_neural_engine() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 0.0, + fp32_tflops: 4.4, + fp16_tflops: 8.8, + bf16_tflops: 8.8, + int8_tops: 35.0, + int4_tops: 70.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 0, // Uses unified memory + bandwidth_gbps: 200, + type_: MemoryType::Unified, + }, + operations: ProcessorCapabilities::apple_neural_engine(16) + .operations, + power: PowerCharacteristics { + tdp_watts: 8, + efficiency: 0.98, + power_tier: PowerTier::UltraLow, + }, + optimal_for: vec![ + WorkloadCharacteristic::LowPower, + WorkloadCharacteristic::LowLatency, + WorkloadCharacteristic::SmallBatch, + ], + } + } + + // ═══════════════════════════════════════════════════════════════ + // QUALCOMM NPU PROFILES + // ═══════════════════════════════════════════════════════════════ + + /// Qualcomm Hexagon NPU (Snapdragon 8 Gen 3). + pub fn qualcomm_hexagon_8g3() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 0.0, + fp32_tflops: 3.0, + fp16_tflops: 6.0, + bf16_tflops: 6.0, + int8_tops: 73.0, // 73 TOPS + int4_tops: 146.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 0, // Uses system memory + bandwidth_gbps: 77, + type_: MemoryType::Lpddr, + }, + operations: ProcessorCapabilities::apple_neural_engine(16) + .operations, + power: PowerCharacteristics { + tdp_watts: 10, + efficiency: 0.95, + power_tier: PowerTier::UltraLow, + }, + optimal_for: vec![ + WorkloadCharacteristic::LowPower, + WorkloadCharacteristic::LowLatency, + WorkloadCharacteristic::SmallBatch, + ], + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_h100_profile() { + let h100 = ProcessorProfiles::nvidia_h100(); + assert!(h100.compute.fp16_tflops > 1000.0); + assert_eq!(h100.memory.capacity_bytes, 80 * 1024 * 1024 * 1024); + } + + #[test] + fn test_tpu_v5p_profile() { + let tpu = 
ProcessorProfiles::google_tpu_v5p(); + assert!(tpu.compute.bf16_tflops > 900.0); + } + + #[test] + fn test_groq_lpu_profile() { + let lpu = ProcessorProfiles::groq_lpu(); + assert!(lpu.memory.bandwidth_gbps > 50000); // Very high internal bandwidth + } + + #[test] + fn test_apple_ane_profile() { + let ane = ProcessorProfiles::apple_m3_neural_engine(); + assert!(ane.power.tdp_watts < 20); + assert!(ane.optimal_for.contains(&WorkloadCharacteristic::LowPower)); + } +} diff --git a/crates/synor-compute/src/processor/types.rs b/crates/synor-compute/src/processor/types.rs new file mode 100644 index 0000000..7e9ac3e --- /dev/null +++ b/crates/synor-compute/src/processor/types.rs @@ -0,0 +1,367 @@ +//! Processor type definitions. + +use serde::{Deserialize, Serialize}; + +/// All supported processor types. +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum ProcessorType { + /// Central Processing Unit. + Cpu(CpuVariant), + /// Graphics Processing Unit. + Gpu(GpuVariant), + /// Tensor Processing Unit (Google). + Tpu(TpuVersion), + /// Neural Processing Unit (various vendors). + Npu(NpuVariant), + /// Language Processing Unit (Groq). + Lpu, + /// Field Programmable Gate Array. + Fpga(FpgaVendor), + /// Digital Signal Processor. + Dsp(DspVariant), + /// WebGPU (browser). + WebGpu, + /// WebAssembly runtime. + Wasm, + /// Custom/Unknown accelerator. + Custom { + vendor: String, + model: String, + }, +} + +impl Default for ProcessorType { + fn default() -> Self { + ProcessorType::Cpu(CpuVariant::default()) + } +} + +/// CPU architecture variants. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum CpuVariant { + /// x86-64 architecture. + X86_64 { avx: AvxSupport }, + /// ARM 64-bit architecture. + Arm64 { sve: bool }, + /// RISC-V architecture. 
+ RiscV { vector: bool }, +} + +impl Default for CpuVariant { + fn default() -> Self { + CpuVariant::X86_64 { + avx: AvxSupport::Avx2, + } + } +} + +/// AVX instruction set support levels. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)] +pub enum AvxSupport { + /// No AVX. + None, + /// AVX (Sandy Bridge+). + Avx, + /// AVX2 (Haswell+). + Avx2, + /// AVX-512 (Skylake-X+). + Avx512, + /// AVX10 (future). + Avx10, +} + +/// GPU vendor variants. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum GpuVariant { + /// NVIDIA CUDA GPU. + NvidiaCuda { + /// Compute capability (major, minor). + compute_capability: (u8, u8), + }, + /// AMD ROCm GPU. + AmdRocm { + /// GFX version (e.g., 1100 for RDNA3). + gfx_version: u32, + }, + /// Intel OneAPI GPU. + IntelOneApi, + /// Apple Metal GPU. + AppleMetal, + /// Qualcomm Adreno GPU. + QualcommAdreno { + /// Adreno model number. + model: u32, + }, + /// ARM Mali GPU. + ArmMali { + /// Mali generation (e.g., G710). + model: u32, + }, + /// IMG PowerVR GPU. + ImgPowerVr, +} + +/// Google TPU versions. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum TpuVersion { + /// TPU v2. + V2, + /// TPU v3. + V3, + /// TPU v4. + V4, + /// TPU v4i (inference). + V4i, + /// TPU v5e (efficiency). + V5e, + /// TPU v5p (performance). + V5p, + /// Edge TPU. + Edge, +} + +/// NPU (Neural Processing Unit) variants. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum NpuVariant { + /// Apple Neural Engine. + AppleNeuralEngine { + /// Number of cores. + cores: u32, + }, + /// Qualcomm Hexagon DSP/NPU. + QualcommHexagon { + /// Version number. + version: u32, + }, + /// Intel VPU (Movidius). + IntelVpu, + /// Huawei Ascend. + HuaweiAscend { + /// Model (310, 910, etc.). + model: u32, + }, + /// Google Edge TPU. + GoogleEdgeTpu, + /// Samsung NPU. + SamsungNpu, + /// MediaTek APU. 
+ MediaTekApu { + /// Version. + version: u32, + }, + /// Custom NPU. + Custom { + /// TOPS (Tera Operations Per Second). + tops: u32, + }, +} + +/// FPGA vendors. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum FpgaVendor { + /// Xilinx (AMD). + Xilinx, + /// Intel (Altera). + Intel, + /// Lattice. + Lattice, + /// Microchip. + Microchip, +} + +/// DSP (Digital Signal Processor) variants. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum DspVariant { + /// Texas Instruments DSP. + TexasInstruments, + /// Analog Devices DSP. + AnalogDevices, + /// Qualcomm Hexagon DSP. + QualcommHexagon, + /// Custom DSP. + Custom, +} + +impl ProcessorType { + /// Returns whether this processor type supports CUDA. + pub fn supports_cuda(&self) -> bool { + matches!(self, ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. })) + } + + /// Returns whether this processor type supports ROCm. + pub fn supports_rocm(&self) -> bool { + matches!(self, ProcessorType::Gpu(GpuVariant::AmdRocm { .. })) + } + + /// Returns whether this processor type supports Metal. + pub fn supports_metal(&self) -> bool { + matches!(self, ProcessorType::Gpu(GpuVariant::AppleMetal)) + } + + /// Returns whether this processor type is a GPU. + pub fn is_gpu(&self) -> bool { + matches!(self, ProcessorType::Gpu(_)) + } + + /// Returns whether this processor type is a CPU. + pub fn is_cpu(&self) -> bool { + matches!(self, ProcessorType::Cpu(_)) + } + + /// Returns whether this processor type is suitable for parallel workloads. + pub fn is_parallel(&self) -> bool { + matches!( + self, + ProcessorType::Gpu(_) | ProcessorType::Tpu(_) | ProcessorType::Fpga(_) + ) + } + + /// Returns whether this processor type is suitable for sequential workloads. + pub fn is_sequential(&self) -> bool { + matches!(self, ProcessorType::Cpu(_) | ProcessorType::Lpu) + } + + /// Returns whether this processor type is power-efficient. 
+ pub fn is_low_power(&self) -> bool { + matches!( + self, + ProcessorType::Npu(_) | ProcessorType::Tpu(TpuVersion::Edge) | ProcessorType::Wasm + ) + } + + /// Returns the typical power consumption tier. + pub fn power_tier(&self) -> PowerTier { + match self { + ProcessorType::Npu(_) | ProcessorType::Wasm => PowerTier::UltraLow, + ProcessorType::Cpu(CpuVariant::Arm64 { .. }) => PowerTier::Low, + ProcessorType::Cpu(_) => PowerTier::Medium, + ProcessorType::Gpu(GpuVariant::AppleMetal) => PowerTier::Medium, + ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability }) + if compute_capability.0 >= 8 => + { + PowerTier::High + } + ProcessorType::Gpu(_) => PowerTier::Medium, + ProcessorType::Tpu(TpuVersion::Edge) => PowerTier::UltraLow, + ProcessorType::Tpu(_) => PowerTier::High, + ProcessorType::Lpu => PowerTier::Medium, + ProcessorType::Fpga(_) => PowerTier::Medium, + ProcessorType::Dsp(_) => PowerTier::Low, + ProcessorType::WebGpu => PowerTier::Low, + ProcessorType::Custom { .. } => PowerTier::Medium, + } + } +} + +/// Power consumption tiers. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub enum PowerTier { + /// < 5W (mobile, IoT). + UltraLow, + /// 5-30W (laptop, tablet). + Low, + /// 30-150W (desktop, workstation). + Medium, + /// > 150W (server, data center). + High, +} + +/// Device class for routing decisions. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum DeviceClass { + /// Data center equipment. + DataCenter, + /// Desktop/workstation. + Desktop, + /// Laptop. + Laptop, + /// Mobile phone. + Mobile, + /// Tablet. + Tablet, + /// IoT device. + IoT, + /// Browser (WebGPU/WASM). + Browser, + /// Edge server. + Edge, +} + +impl DeviceClass { + /// Returns typical available compute hours per day. 
+ pub fn typical_availability_hours(&self) -> f32 { + match self { + DeviceClass::DataCenter => 24.0, + DeviceClass::Desktop => 8.0, + DeviceClass::Laptop => 6.0, + DeviceClass::Mobile => 4.0, + DeviceClass::Tablet => 4.0, + DeviceClass::IoT => 24.0, + DeviceClass::Browser => 2.0, + DeviceClass::Edge => 24.0, + } + } + + /// Returns reliability score (0-100). + pub fn reliability_score(&self) -> u32 { + match self { + DeviceClass::DataCenter => 99, + DeviceClass::Edge => 95, + DeviceClass::Desktop => 80, + DeviceClass::Laptop => 60, + DeviceClass::Mobile => 40, + DeviceClass::Tablet => 50, + DeviceClass::IoT => 70, + DeviceClass::Browser => 30, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_processor_type_properties() { + let nvidia = ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + }); + assert!(nvidia.supports_cuda()); + assert!(nvidia.is_gpu()); + assert!(nvidia.is_parallel()); + + let cpu = ProcessorType::Cpu(CpuVariant::X86_64 { + avx: AvxSupport::Avx512, + }); + assert!(cpu.is_cpu()); + assert!(cpu.is_sequential()); + + let lpu = ProcessorType::Lpu; + assert!(lpu.is_sequential()); + + let npu = ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 }); + assert!(npu.is_low_power()); + } + + #[test] + fn test_power_tiers() { + let h100 = ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + }); + assert_eq!(h100.power_tier(), PowerTier::High); + + let npu = ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 }); + assert_eq!(npu.power_tier(), PowerTier::UltraLow); + + let arm = ProcessorType::Cpu(CpuVariant::Arm64 { sve: false }); + assert_eq!(arm.power_tier(), PowerTier::Low); + } + + #[test] + fn test_device_class() { + assert_eq!(DeviceClass::DataCenter.typical_availability_hours(), 24.0); + assert_eq!(DeviceClass::Mobile.typical_availability_hours(), 4.0); + assert_eq!(DeviceClass::DataCenter.reliability_score(), 99); + 
assert_eq!(DeviceClass::Browser.reliability_score(), 30); + } +} diff --git a/crates/synor-compute/src/scheduler/load_balancer.rs b/crates/synor-compute/src/scheduler/load_balancer.rs new file mode 100644 index 0000000..17a695e --- /dev/null +++ b/crates/synor-compute/src/scheduler/load_balancer.rs @@ -0,0 +1,810 @@ +//! Load balancer with work stealing for heterogeneous compute. +//! +//! Supports: +//! - Cross-processor-type work migration +//! - Energy-aware balancing +//! - Latency-aware scheduling +//! - Real-time utilization metrics + +use crate::device::{DeviceInfo, DeviceRegistry}; +use crate::processor::{Operation, OperationType, ProcessorId, ProcessorType}; +use crate::task::{Task, TaskId, TaskPriority}; +use super::TaskAssignment; +use parking_lot::RwLock; +use std::collections::HashMap; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +/// Balancing strategy for the load balancer. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum BalancingStrategy { + /// Optimize for speed (minimize execution time). + Speed, + /// Optimize for energy efficiency. + Energy, + /// Balance speed and energy. + Balanced, + /// Optimize for cost (spot pricing). + Cost, + /// Optimize for latency (inference workloads). + Latency, +} + +impl Default for BalancingStrategy { + fn default() -> Self { + BalancingStrategy::Balanced + } +} + +/// Real-time processor metrics. +#[derive(Clone, Debug, Default)] +pub struct ProcessorMetrics { + /// Current utilization (0.0 - 1.0). + pub utilization: f64, + /// Queue depth (pending tasks). + pub queue_depth: u64, + /// Average task completion time (ms). + pub avg_completion_ms: f64, + /// Tasks completed in last minute. + pub throughput_per_min: u64, + /// Current power draw (watts). + pub power_watts: f64, + /// Temperature (celsius). + pub temperature: f64, + /// Last updated timestamp. 
+ pub last_updated: Option, +} + +/// Load balancer for heterogeneous compute environments. +pub struct LoadBalancer { + /// Device registry for processor info. + device_registry: Option>, + /// Current load per processor (task count). + loads: RwLock>, + /// Real-time metrics per processor. + metrics: RwLock>, + /// Processor type mapping. + processor_types: RwLock>, + /// Work stealing threshold (0.0 - 1.0). + steal_threshold: f64, + /// Rebalance threshold (0.0 - 1.0). + rebalance_threshold: f64, + /// Current balancing strategy. + strategy: RwLock, + /// Migration history (to prevent thrashing). + migration_history: RwLock>, +} + +/// Record of a task migration. +#[derive(Clone, Debug)] +struct MigrationRecord { + task_id: TaskId, + from: ProcessorId, + to: ProcessorId, + timestamp: Instant, +} + +impl LoadBalancer { + /// Creates a new load balancer. + pub fn new() -> Self { + Self { + device_registry: None, + loads: RwLock::new(HashMap::new()), + metrics: RwLock::new(HashMap::new()), + processor_types: RwLock::new(HashMap::new()), + steal_threshold: 0.3, + rebalance_threshold: 0.2, + strategy: RwLock::new(BalancingStrategy::default()), + migration_history: RwLock::new(Vec::new()), + } + } + + /// Creates a load balancer with device registry. + pub fn with_registry(device_registry: Arc) -> Self { + Self { + device_registry: Some(device_registry), + loads: RwLock::new(HashMap::new()), + metrics: RwLock::new(HashMap::new()), + processor_types: RwLock::new(HashMap::new()), + steal_threshold: 0.3, + rebalance_threshold: 0.2, + strategy: RwLock::new(BalancingStrategy::default()), + migration_history: RwLock::new(Vec::new()), + } + } + + /// Sets the balancing strategy. + pub fn set_strategy(&self, strategy: BalancingStrategy) { + *self.strategy.write() = strategy; + } + + /// Gets the current strategy. + pub fn strategy(&self) -> BalancingStrategy { + *self.strategy.read() + } + + /// Register a processor with its type. 
+ pub fn register_processor(&self, processor_id: ProcessorId, processor_type: ProcessorType) { + self.loads.write().insert(processor_id, AtomicU64::new(0)); + self.metrics.write().insert(processor_id, ProcessorMetrics::default()); + self.processor_types.write().insert(processor_id, processor_type); + } + + /// Unregister a processor. + pub fn unregister_processor(&self, processor_id: ProcessorId) { + self.loads.write().remove(&processor_id); + self.metrics.write().remove(&processor_id); + self.processor_types.write().remove(&processor_id); + } + + /// Update real-time metrics for a processor. + pub fn update_metrics(&self, processor_id: ProcessorId, metrics: ProcessorMetrics) { + if let Some(existing) = self.metrics.write().get_mut(&processor_id) { + *existing = ProcessorMetrics { + last_updated: Some(Instant::now()), + ..metrics + }; + } + } + + /// Get current load for a processor. + pub fn get_load(&self, processor_id: ProcessorId) -> u64 { + self.loads.read() + .get(&processor_id) + .map(|l| l.load(Ordering::Relaxed)) + .unwrap_or(0) + } + + /// Increment load for a processor. + pub fn increment_load(&self, processor_id: ProcessorId) { + if let Some(load) = self.loads.read().get(&processor_id) { + load.fetch_add(1, Ordering::Relaxed); + } + } + + /// Decrement load for a processor. + pub fn decrement_load(&self, processor_id: ProcessorId) { + if let Some(load) = self.loads.read().get(&processor_id) { + load.fetch_sub(1, Ordering::Relaxed); + } + } + + /// Check if an operation can run on a processor type. 
+    pub fn can_execute(&self, op: &Operation, processor_type: &ProcessorType) -> bool {
+        // Static capability table: which operation kinds each processor family
+        // supports. This gates scoring; incompatible pairs score NEG_INFINITY.
+        let op_type = op.op_type();
+
+        match processor_type {
+            // CPUs can handle most sequential operations
+            ProcessorType::Cpu(_) => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::Conv2d
+                    | OperationType::Conv3d
+                    | OperationType::DepthwiseConv
+                    | OperationType::BatchNorm
+                    | OperationType::LayerNorm
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::GeLU
+                    | OperationType::SiLU
+                    | OperationType::Softmax
+                    | OperationType::Sum
+                    | OperationType::Mean
+                    | OperationType::Max
+                    | OperationType::ArgMax
+                    | OperationType::Embedding
+                    | OperationType::TopK
+                    | OperationType::Sampling
+                    | OperationType::Tokenization
+                    | OperationType::Detokenization
+                    | OperationType::DataLoad
+                    | OperationType::DataPreprocess
+                    | OperationType::Transpose
+                    | OperationType::Reshape
+                    | OperationType::Concat
+                    | OperationType::Split
+            ),
+
+            // GPUs excel at parallel operations
+            ProcessorType::Gpu(_) => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::Conv2d
+                    | OperationType::Conv3d
+                    | OperationType::DepthwiseConv
+                    | OperationType::BatchNorm
+                    | OperationType::LayerNorm
+                    | OperationType::SelfAttention
+                    | OperationType::CrossAttention
+                    | OperationType::FlashAttention
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::GeLU
+                    | OperationType::SiLU
+                    | OperationType::Softmax
+                    | OperationType::Sum
+                    | OperationType::Mean
+                    | OperationType::Max
+                    | OperationType::ArgMax
+                    | OperationType::Embedding
+                    | OperationType::RoPE
+                    | OperationType::KVCache
+                    | OperationType::TopK
+                    | OperationType::Sampling
+                    | OperationType::Transpose
+                    | OperationType::Reshape
+                    | OperationType::Concat
+                    | OperationType::Split
+                    | OperationType::Gather
+                    | OperationType::Scatter
+                    | OperationType::AllReduce
+                    | OperationType::AllGather
+                    | OperationType::ReduceScatter
+                    | OperationType::Backward
+                    | OperationType::OptimizerStep
+                    | OperationType::GradientClip
+            ),
+
+            // TPUs optimized for ML
+            ProcessorType::Tpu(_) => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::Conv2d
+                    | OperationType::BatchNorm
+                    | OperationType::LayerNorm
+                    | OperationType::SelfAttention
+                    | OperationType::CrossAttention
+                    | OperationType::FlashAttention
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::GeLU
+                    | OperationType::SiLU
+                    | OperationType::Softmax
+                    | OperationType::Sum
+                    | OperationType::Mean
+                    | OperationType::Embedding
+                    | OperationType::RoPE
+                    | OperationType::KVCache
+                    | OperationType::AllReduce
+                    | OperationType::AllGather
+                    | OperationType::ReduceScatter
+                    | OperationType::Backward
+                    | OperationType::OptimizerStep
+            ),
+
+            // NPUs for neural network inference
+            ProcessorType::Npu(_) => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::Conv2d
+                    | OperationType::DepthwiseConv
+                    | OperationType::BatchNorm
+                    | OperationType::LayerNorm
+                    | OperationType::SelfAttention
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::GeLU
+                    | OperationType::SiLU
+                    | OperationType::Softmax
+                    | OperationType::Sum
+                    | OperationType::Mean
+            ),
+
+            // LPUs for sequential inference (optimized for LLMs)
+            ProcessorType::Lpu => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::LayerNorm
+                    | OperationType::SelfAttention
+                    | OperationType::FlashAttention
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::GeLU
+                    | OperationType::SiLU
+                    | OperationType::Softmax
+                    | OperationType::Embedding
+                    | OperationType::RoPE
+                    | OperationType::KVCache
+                    | OperationType::TopK
+                    | OperationType::Sampling
+            ),
+
+            // FPGAs can be programmed for anything
+            ProcessorType::Fpga(_) => true,
+
+            // DSPs for signal processing
+            ProcessorType::Dsp(_) => matches!(
+                op_type,
+                OperationType::Conv2d
+                    | OperationType::DepthwiseConv
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::Sum
+                    | OperationType::Mean
+                    | OperationType::Max
+            ),
+
+            // WebGPU has limited operations
+            ProcessorType::WebGpu => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::Conv2d
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::Softmax
+                    | OperationType::Sum
+                    | OperationType::Transpose
+                    | OperationType::Reshape
+            ),
+
+            // WASM for portable compute
+            ProcessorType::Wasm => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::Softmax
+                    | OperationType::Sum
+                    | OperationType::Mean
+                    | OperationType::Tokenization
+                    | OperationType::Detokenization
+            ),
+
+            // Custom processors - assume they can handle anything
+            ProcessorType::Custom { .. } => true,
+        }
+    }
+
+    /// Calculate a score for assigning a task to a processor.
+    ///
+    /// Higher is better; `NEG_INFINITY` means the processor cannot run the
+    /// task's operation at all.
+    fn calculate_score(
+        &self,
+        task: &Task,
+        processor_id: ProcessorId,
+        processor_type: &ProcessorType,
+    ) -> f64 {
+        let strategy = *self.strategy.read();
+        let load = self.get_load(processor_id);
+        let metrics = self.metrics.read();
+        let proc_metrics = metrics.get(&processor_id);
+
+        // Base score from compatibility
+        if !self.can_execute(&task.operation, processor_type) {
+            return f64::NEG_INFINITY;
+        }
+
+        // Get utilization and metrics; when no metrics were reported yet,
+        // fall back to rough defaults derived from the raw load counter.
+        let utilization = proc_metrics.map(|m| m.utilization).unwrap_or(load as f64 / 100.0);
+        let power = proc_metrics.map(|m| m.power_watts).unwrap_or(100.0);
+        let avg_completion = proc_metrics.map(|m| m.avg_completion_ms).unwrap_or(100.0);
+
+        // Calculate score based on strategy
+        match strategy {
+            BalancingStrategy::Speed => {
+                // Prioritize low utilization and fast completion
+                let speed_score = 1.0 / (avg_completion.max(1.0) * (1.0 + utilization));
+
+                // Bonus for powerful processor types
+                let type_bonus = match processor_type {
+                    ProcessorType::Gpu(_) => 2.0,
+                    ProcessorType::Tpu(_) => 2.5,
+                    ProcessorType::Lpu => 3.0, // Fastest for inference
+                    ProcessorType::Npu(_) => 1.5,
+                    _ => 1.0,
+                };
+
+                speed_score * type_bonus
+            }
+
+            BalancingStrategy::Energy => {
+                // Prioritize low power consumption
+                let energy_score = 1.0 / power.max(1.0);
+
+                // Bonus for efficient processor types
+                let efficiency_bonus = match processor_type {
+                    ProcessorType::Npu(_) => 3.0, // Most efficient
+                    ProcessorType::Lpu => 2.0,
+                    ProcessorType::Cpu(_) => 1.5,
+                    ProcessorType::Wasm => 2.0, // Low overhead
+                    _ => 1.0,
+                };
+
+                energy_score * efficiency_bonus * (1.0 - utilization * 0.5)
+            }
+
+            BalancingStrategy::Balanced => {
+                // Balance speed and energy
+                let speed = 1.0 / avg_completion.max(1.0);
+                let efficiency = 1.0 / power.max(1.0);
+                let load_factor = 1.0 - utilization;
+
+                (speed * 0.4 + efficiency * 0.3 + load_factor * 0.3)
+            }
+
+            BalancingStrategy::Cost => {
+                // Prioritize cheaper resources (consumer devices)
+                let cost_factor = match processor_type {
+                    ProcessorType::Wasm => 0.1, // Cheapest (browser)
+                    ProcessorType::WebGpu => 0.15,
+                    ProcessorType::Cpu(_) => 0.2,
+                    ProcessorType::Npu(_) => 0.3, // Mobile NPUs
+                    ProcessorType::Gpu(_) => 0.5,
+                    ProcessorType::Lpu => 0.8,
+                    ProcessorType::Tpu(_) => 1.0, // Most expensive
+                    _ => 0.5,
+                };
+
+                (1.0 - cost_factor) * (1.0 - utilization)
+            }
+
+            BalancingStrategy::Latency => {
+                // Prioritize low latency for inference
+                let latency_score = 1.0 / avg_completion.max(0.1);
+
+                // Bonus for low-latency processors
+                let latency_bonus = match processor_type {
+                    ProcessorType::Lpu => 5.0, // Designed for low latency
+                    ProcessorType::Npu(_) => 3.0,
+                    ProcessorType::Gpu(_) => 2.0,
+                    ProcessorType::Tpu(_) => 1.5,
+                    _ => 1.0,
+                };
+
+                // Priority boost for critical tasks
+                let priority_boost = match task.priority {
+                    TaskPriority::Critical => 2.0,
+                    TaskPriority::High => 1.5,
+                    TaskPriority::Normal => 1.0,
+                    TaskPriority::Background => 0.5,
+                };
+
+                latency_score * latency_bonus * priority_boost * (1.0 - utilization)
+            }
+        }
+    }
+
+    /// Maybe rebalance a task to a different processor.
+    ///
+    /// Returns the processor the task should actually run on: either the
+    /// suggested one, or a strictly better alternative found by scoring.
+    // NOTE(review): `current_assignment` is never read in this body — confirm
+    // whether assignment-aware scoring was intended here.
+    pub fn maybe_rebalance(
+        &self,
+        task: &Task,
+        suggested_processor: ProcessorId,
+        current_assignment: &TaskAssignment,
+    ) -> ProcessorId {
+        // Get all registered processors
+        let processor_types = self.processor_types.read();
+
+        // If we don't have processor info, use suggested
+        let suggested_type = match processor_types.get(&suggested_processor) {
+            Some(t) => t.clone(),
+            None => return suggested_processor,
+        };
+
+        // Calculate score for suggested processor
+        let suggested_score = self.calculate_score(task, suggested_processor, &suggested_type);
+
+        // Find best alternative
+        let mut best_processor = suggested_processor;
+        let mut best_score = suggested_score;
+
+        for (proc_id, proc_type) in processor_types.iter() {
+            if *proc_id == suggested_processor {
+                continue;
+            }
+
+            let score = self.calculate_score(task, *proc_id, proc_type);
+
+            // Only switch if significantly better (prevents thrashing)
+            if score > best_score * (1.0 + self.rebalance_threshold) {
+                best_score = score;
+                best_processor = *proc_id;
+            }
+        }
+
+        // Record migration if different
+        if best_processor != suggested_processor {
+            self.migration_history.write().push(MigrationRecord {
+                task_id: task.id,
+                from: suggested_processor,
+                to: best_processor,
+                timestamp: Instant::now(),
+            });
+        }
+
+        best_processor
+    }
+
+    /// Check if work stealing should happen between two processors.
+    pub fn should_steal(&self, from: ProcessorId, to: ProcessorId) -> bool {
+        let from_load = self.get_load(from) as f64;
+        let to_load = self.get_load(to) as f64;
+
+        // Nothing to steal from an idle processor.
+        if from_load == 0.0 {
+            return false;
+        }
+
+        // Check if processor types are compatible for the queued work
+        let processor_types = self.processor_types.read();
+        let from_type = processor_types.get(&from);
+        let to_type = processor_types.get(&to);
+
+        // Only steal between same processor types by default
+        // (cross-type stealing requires operation compatibility check)
+        match (from_type, to_type) {
+            (Some(ft), Some(tt)) if std::mem::discriminant(ft) == std::mem::discriminant(tt) => {
+                // Relative imbalance: fraction of `from`'s load above `to`'s.
+                let diff = (from_load - to_load) / from_load;
+                diff > self.steal_threshold
+            }
+            _ => false,
+        }
+    }
+
+    /// Get rebalancing suggestions based on current load.
+    ///
+    /// Returns `(overloaded, underloaded)` processor pairs of the same type
+    /// whose loads deviate from the mean by more than `rebalance_threshold`.
+    pub fn get_rebalance_suggestions(&self) -> Vec<(ProcessorId, ProcessorId)> {
+        let mut suggestions = Vec::new();
+        let loads = self.loads.read();
+
+        let load_values: Vec<_> = loads.iter()
+            .map(|(id, load)| (*id, load.load(Ordering::Relaxed)))
+            .collect();
+
+        if load_values.is_empty() {
+            return suggestions;
+        }
+
+        // FIX: `.sum::()` was missing its type parameter and did not compile.
+        let avg_load: f64 = load_values.iter().map(|(_, l)| *l as f64).sum::<f64>()
+            / load_values.len() as f64;
+
+        let processor_types = self.processor_types.read();
+
+        let overloaded: Vec<_> = load_values.iter()
+            .filter(|(_, l)| *l as f64 > avg_load * (1.0 + self.rebalance_threshold))
+            .collect();
+
+        let underloaded: Vec<_> = load_values.iter()
+            .filter(|(_, l)| (*l as f64) < avg_load * (1.0 - self.rebalance_threshold))
+            .collect();
+
+        // Only suggest migrations between compatible processor types
+        for (over_id, _) in overloaded {
+            let over_type = processor_types.get(over_id);
+
+            for (under_id, _) in &underloaded {
+                let under_type = processor_types.get(under_id);
+
+                // Check type compatibility
+                if let (Some(ot), Some(ut)) = (over_type, under_type) {
+                    if std::mem::discriminant(ot) == std::mem::discriminant(ut) {
+                        suggestions.push((*over_id, *under_id));
+                    }
+                }
+            }
+        }
+
+        suggestions
+    }
+
+    /// Get load statistics.
+    pub fn get_stats(&self) -> LoadBalancerStats {
+        let loads = self.loads.read();
+        let metrics = self.metrics.read();
+
+        let total_load: u64 = loads.values().map(|l| l.load(Ordering::Relaxed)).sum();
+        let processor_count = loads.len();
+        let avg_load = if processor_count > 0 {
+            total_load as f64 / processor_count as f64
+        } else {
+            0.0
+        };
+
+        let total_utilization: f64 = metrics.values().map(|m| m.utilization).sum();
+        let avg_utilization = if processor_count > 0 {
+            total_utilization / processor_count as f64
+        } else {
+            0.0
+        };
+
+        let total_power: f64 = metrics.values().map(|m| m.power_watts).sum();
+        let migrations = self.migration_history.read().len();
+
+        LoadBalancerStats {
+            total_load,
+            avg_load,
+            processor_count,
+            avg_utilization,
+            total_power_watts: total_power,
+            total_migrations: migrations,
+            strategy: *self.strategy.read(),
+        }
+    }
+
+    /// Clean up old migration history.
+    pub fn cleanup_history(&self, max_age: Duration) {
+        // FIX: `Instant::now() - max_age` panics if `max_age` exceeds the
+        // platform clock's epoch/uptime; `checked_sub` makes this safe. When the
+        // cutoff is not representable, no record can be older than it, so
+        // retaining everything is the correct result.
+        if let Some(cutoff) = Instant::now().checked_sub(max_age) {
+            self.migration_history.write().retain(|r| r.timestamp > cutoff);
+        }
+    }
+}
+
+impl Default for LoadBalancer {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Load balancer statistics.
+#[derive(Clone, Debug)]
+pub struct LoadBalancerStats {
+    /// Total tasks across all processors.
+    pub total_load: u64,
+    /// Average load per processor.
+    pub avg_load: f64,
+    /// Number of registered processors.
+    pub processor_count: usize,
+    /// Average utilization (0.0 - 1.0).
+    pub avg_utilization: f64,
+    /// Total power consumption (watts).
+    pub total_power_watts: f64,
+    /// Total migrations performed.
+    pub total_migrations: usize,
+    /// Current balancing strategy.
+    pub strategy: BalancingStrategy,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::processor::{CpuVariant, GpuVariant, Operation, Precision};
+    use crate::task::TaskStatus;
+
+    fn create_test_task(priority: TaskPriority) -> Task {
+        Task {
+            id: TaskId::new(),
+            operation: Operation::MatMul {
+                m: 1024,
+                n: 1024,
+                k: 1024,
+                precision: Precision::Fp32,
+            },
+            priority,
+            dependencies: vec![],
+            status: TaskStatus::Pending,
+            deadline: None,
+        }
+    }
+
+    #[test]
+    fn test_load_tracking() {
+        let balancer = LoadBalancer::new();
+
+        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
+        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));
+
+        assert_eq!(balancer.get_load(ProcessorId(0)), 0);
+
+        balancer.increment_load(ProcessorId(0));
+        balancer.increment_load(ProcessorId(0));
+        balancer.increment_load(ProcessorId(1));
+
+        assert_eq!(balancer.get_load(ProcessorId(0)), 2);
+        assert_eq!(balancer.get_load(ProcessorId(1)), 1);
+
+        balancer.decrement_load(ProcessorId(0));
+        assert_eq!(balancer.get_load(ProcessorId(0)), 1);
+    }
+
+    #[test]
+    fn test_should_steal_same_type() {
+        let balancer = LoadBalancer::new();
+
+        // Register two CPUs
+        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
+        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));
+
+        // Give processor 0 high load
+        for _ in 0..10 {
+            balancer.increment_load(ProcessorId(0));
+        }
+        balancer.increment_load(ProcessorId(1));
+
+        // Should steal between same types
+        assert!(balancer.should_steal(ProcessorId(0), ProcessorId(1)));
+        assert!(!balancer.should_steal(ProcessorId(1), ProcessorId(0)));
+    }
+
+    #[test]
+    fn test_should_not_steal_different_types() {
+        let balancer = LoadBalancer::new();
+
+        // Register CPU and GPU
+        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
+        balancer.register_processor(
+            ProcessorId(1),
+            ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) }),
+        );
+
+        // Give CPU high load
+        for _ in 0..10 {
+            balancer.increment_load(ProcessorId(0));
+        }
+
+        // Should NOT steal between different types
+        assert!(!balancer.should_steal(ProcessorId(0), ProcessorId(1)));
+    }
+
+    #[test]
+    fn test_can_execute() {
+        let balancer = LoadBalancer::new();
+
+        let matmul = Operation::MatMul {
+            m: 1024,
+            n: 1024,
+            k: 1024,
+            precision: Precision::Fp32,
+        };
+
+        let flash_attention = Operation::FlashAttention {
+            batch: 32,
+            seq_len: 2048,
+            num_heads: 32,
+            head_dim: 128,
+            precision: Precision::Fp16,
+        };
+
+        let cpu = ProcessorType::Cpu(CpuVariant::default());
+        let gpu = ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) });
+        let lpu = ProcessorType::Lpu;
+
+        // MatMul can run on all
+        assert!(balancer.can_execute(&matmul, &cpu));
+        assert!(balancer.can_execute(&matmul, &gpu));
+        assert!(balancer.can_execute(&matmul, &lpu));
+
+        // FlashAttention only on GPU/TPU/LPU
+        assert!(!balancer.can_execute(&flash_attention, &cpu));
+        assert!(balancer.can_execute(&flash_attention, &gpu));
+    }
+
+    #[test]
+    fn test_strategy_affects_scoring() {
+        let balancer = LoadBalancer::new();
+
+        let cpu_id = ProcessorId(0);
+        let npu_id = ProcessorId(1);
+
+        balancer.register_processor(cpu_id, ProcessorType::Cpu(CpuVariant::default()));
+        balancer.register_processor(npu_id, ProcessorType::Npu(crate::processor::NpuVariant::AppleNeuralEngine { cores: 16 }));
+
+        let task = create_test_task(TaskPriority::Normal);
+
+        // Energy strategy should prefer NPU
+        balancer.set_strategy(BalancingStrategy::Energy);
+        let assignment = TaskAssignment::new();
+        let result = balancer.maybe_rebalance(&task, cpu_id, &assignment);
+
+        // NPU should be preferred for energy efficiency
+        assert_eq!(result, npu_id);
+    }
+
+    #[test]
+    fn test_stats() {
+        let balancer = LoadBalancer::new();
+
+        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
+        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));
+
+        balancer.increment_load(ProcessorId(0));
+        balancer.increment_load(ProcessorId(0));
+        balancer.increment_load(ProcessorId(1));
+
+        let stats = balancer.get_stats();
+        assert_eq!(stats.total_load, 3);
+        assert_eq!(stats.processor_count, 2);
+        assert!((stats.avg_load - 1.5).abs() < 0.01);
+    }
+}
diff --git a/crates/synor-compute/src/scheduler/mod.rs b/crates/synor-compute/src/scheduler/mod.rs
new file mode 100644
index 0000000..aaf6b5e
--- /dev/null
+++ b/crates/synor-compute/src/scheduler/mod.rs
@@ -0,0 +1,559 @@
+//! Heterogeneous scheduler for multi-processor task assignment.
+//!
+//! Features:
+//! - Optimal task-to-processor assignment
+//! - Work stealing for load balancing
+//! - Pipeline parallelism across processor types
+//! - Dynamic rebalancing based on actual throughput
+
+mod load_balancer;
+mod work_queue;
+
+pub use load_balancer::LoadBalancer;
+pub use work_queue::WorkQueue;
+
+use crate::device::DeviceRegistry;
+use crate::error::ComputeError;
+use crate::processor::{Operation, Processor, ProcessorId, ProcessorType};
+use crate::task::{Task, TaskId, TaskPriority};
+use parking_lot::RwLock;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::Duration;
+
+/// Heterogeneous scheduler that manages tasks across all processor types.
+// NOTE(review): the generic parameters below were reconstructed from usage —
+// verify against the original source.
+pub struct HeterogeneousScheduler {
+    /// Device registry.
+    device_registry: Arc<DeviceRegistry>,
+    /// Per-processor-type task queues.
+    queues: RwLock<HashMap<ProcessorType, WorkQueue>>,
+    /// Load balancer.
+    load_balancer: LoadBalancer,
+    /// Active schedules.
+    active_schedules: RwLock<HashMap<ScheduleId, Schedule>>,
+}
+
+impl HeterogeneousScheduler {
+    /// Creates a new heterogeneous scheduler.
+    pub fn new(device_registry: Arc<DeviceRegistry>) -> Self {
+        Self {
+            device_registry,
+            queues: RwLock::new(HashMap::new()),
+            load_balancer: LoadBalancer::new(),
+            active_schedules: RwLock::new(HashMap::new()),
+        }
+    }
+
+    /// Schedule a set of tasks for execution.
+    ///
+    /// Builds a dependency graph, assigns each task to its best processor,
+    /// groups tasks into parallel stages, and returns the schedule together
+    /// with an estimated makespan and per-processor-type utilization.
+    pub async fn schedule(&self, tasks: Vec<Task>) -> Result<ScheduleResult, ComputeError> {
+        // Empty input is a valid no-op schedule, not an error.
+        if tasks.is_empty() {
+            return Ok(ScheduleResult {
+                schedule: Schedule::empty(),
+                estimated_makespan: Duration::ZERO,
+                processor_utilization: HashMap::new(),
+            });
+        }
+
+        // 1. Build dependency graph
+        let deps = self.build_dependency_graph(&tasks);
+
+        // 2. Assign tasks to optimal processors
+        let assignment = self.assign_tasks(&tasks, &deps).await?;
+
+        // 3. Create execution schedule with stages
+        let schedule = self.create_schedule(&tasks, &assignment, &deps)?;
+
+        // 4. Estimate metrics
+        let makespan = self.estimate_makespan(&schedule);
+        let utilization = self.estimate_utilization(&schedule);
+
+        // 5. Store active schedule
+        self.active_schedules.write().insert(schedule.id, schedule.clone());
+
+        Ok(ScheduleResult {
+            schedule,
+            estimated_makespan: makespan,
+            processor_utilization: utilization,
+        })
+    }
+
+    /// Execute a schedule.
+ pub async fn execute(&self, schedule: &Schedule) -> Result { + let mut results = HashMap::new(); + let start = std::time::Instant::now(); + + // Execute stages in order + for stage in &schedule.stages { + // Execute all tasks in this stage in parallel + let mut handles = Vec::new(); + + for task_id in &stage.tasks { + let task = schedule.tasks.get(task_id) + .ok_or_else(|| ComputeError::Internal(format!("Task not found: {:?}", task_id)))?; + let processor_id = schedule.assignment.get(task_id) + .ok_or_else(|| ComputeError::Internal(format!("No assignment for task: {:?}", task_id)))?; + + let processor = self.device_registry.get_processor(processor_id)?; + let task_clone = task.clone(); + + handles.push(tokio::spawn(async move { + processor.execute(task_clone.operation).await + })); + } + + // Wait for all tasks in stage + for (i, handle) in handles.into_iter().enumerate() { + let task_id = stage.tasks[i]; + match handle.await { + Ok(Ok(result)) => { + results.insert(task_id, TaskExecutionResult::Success(result)); + } + Ok(Err(e)) => { + results.insert(task_id, TaskExecutionResult::Failed(e.to_string())); + } + Err(e) => { + results.insert(task_id, TaskExecutionResult::Failed(e.to_string())); + } + } + } + } + + let total_time = start.elapsed(); + + Ok(ExecutionResult { + results, + total_time, + actual_utilization: self.measure_utilization(), + }) + } + + /// Assign tasks to optimal processors. 
+ async fn assign_tasks( + &self, + tasks: &[Task], + deps: &DependencyGraph, + ) -> Result { + let mut assignment = TaskAssignment::new(); + + // Sort tasks by priority and dependencies (topological sort) + let sorted_tasks = self.topological_sort(tasks, deps); + + for task in sorted_tasks { + // Find best processor for this task + let best_processor = self.find_best_processor(&task).await?; + + // Check if we should rebalance + let final_processor = self.load_balancer + .maybe_rebalance(&task, best_processor, &assignment); + + assignment.assign(task.id, final_processor); + } + + Ok(assignment) + } + + /// Find the best processor for a task. + async fn find_best_processor(&self, task: &Task) -> Result { + let mut best_score = f64::NEG_INFINITY; + let mut best_processor = None; + + // Get all available processors + let processors = self.device_registry.all_processors(); + + for processor in processors { + if !processor.can_execute(&task.operation) { + continue; + } + + // Calculate score based on multiple factors + let exec_time = processor.estimate_time(&task.operation); + let energy = processor.estimate_energy(&task.operation); + let load = processor.utilization(); + + // Score = 1 / (time * (1 + load) * energy_factor) + let time_factor = exec_time.as_secs_f64().max(0.001); + let load_factor = 1.0 + load; + let energy_factor = 1.0 + (energy / 1000.0); // Normalize energy + + let score = 1.0 / (time_factor * load_factor * energy_factor); + + if score > best_score { + best_score = score; + best_processor = Some(processor.id()); + } + } + + best_processor.ok_or_else(|| { + ComputeError::NoSuitableProcessor(format!("{:?}", task.operation.op_type())) + }) + } + + /// Build dependency graph from tasks. 
+    fn build_dependency_graph(&self, tasks: &[Task]) -> DependencyGraph {
+        let mut graph = DependencyGraph::new();
+
+        for task in tasks {
+            graph.add_node(task.id);
+            for dep in &task.dependencies {
+                graph.add_edge(*dep, task.id);
+            }
+        }
+
+        graph
+    }
+
+    /// Topological sort of tasks respecting dependencies.
+    ///
+    /// Uses Kahn's algorithm, always emitting the highest-priority task among
+    /// those whose dependencies have already been emitted. The previous
+    /// implementation ran a DFS post-order and then globally re-sorted by
+    /// priority, which destroyed the topological order it had just computed;
+    /// this version honors priority *within* dependency constraints.
+    // NOTE(review): assumes TaskPriority's Ord ranks more-urgent priorities as
+    // greater — the same assumption the previous descending sort made. Confirm
+    // against the TaskPriority declaration.
+    fn topological_sort(&self, tasks: &[Task], deps: &DependencyGraph) -> Vec<Task> {
+        let task_map: HashMap<TaskId, Task> = tasks.iter()
+            .map(|t| (t.id, t.clone()))
+            .collect();
+
+        // Count only dependencies that are part of this task set; external
+        // (already-completed) dependencies cannot block ordering here.
+        let mut pending: HashMap<TaskId, usize> = tasks.iter()
+            .map(|t| {
+                let n = t.dependencies.iter()
+                    .filter(|d| task_map.contains_key(d))
+                    .count();
+                (t.id, n)
+            })
+            .collect();
+
+        let mut ready: Vec<TaskId> = tasks.iter()
+            .filter(|t| pending[&t.id] == 0)
+            .map(|t| t.id)
+            .collect();
+
+        let mut sorted = Vec::with_capacity(tasks.len());
+
+        while !ready.is_empty() {
+            // Pick the highest-priority ready task.
+            let best = ready.iter()
+                .enumerate()
+                .max_by_key(|(_, id)| task_map[id].priority)
+                .map(|(i, _)| i)
+                .expect("ready is non-empty");
+            let id = ready.swap_remove(best);
+
+            if let Some(task) = task_map.get(&id) {
+                sorted.push(task.clone());
+            }
+
+            // Unblock dependents of the emitted task.
+            if let Some(dependents) = deps.dependents.get(&id) {
+                for dependent in dependents {
+                    if let Some(count) = pending.get_mut(dependent) {
+                        if *count > 0 {
+                            *count -= 1;
+                            if *count == 0 {
+                                ready.push(*dependent);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // Cycle fallback: append anything not emitted so callers still see
+        // every task (create_schedule reports the cycle explicitly).
+        if sorted.len() < tasks.len() {
+            let emitted: std::collections::HashSet<TaskId> =
+                sorted.iter().map(|t| t.id).collect();
+            sorted.extend(tasks.iter().filter(|t| !emitted.contains(&t.id)).cloned());
+        }
+
+        sorted
+    }
+
+    /// Create execution schedule with parallel stages.
+ fn create_schedule( + &self, + tasks: &[Task], + assignment: &TaskAssignment, + deps: &DependencyGraph, + ) -> Result { + let mut stages = Vec::new(); + let mut scheduled = std::collections::HashSet::new(); + let task_map: HashMap = tasks.iter() + .map(|t| (t.id, t.clone())) + .collect(); + + while scheduled.len() < tasks.len() { + let mut stage_tasks = Vec::new(); + + for task in tasks { + if scheduled.contains(&task.id) { + continue; + } + + // Check if all dependencies are satisfied + let deps_satisfied = task.dependencies.iter() + .all(|dep| scheduled.contains(dep)); + + if deps_satisfied { + stage_tasks.push(task.id); + } + } + + if stage_tasks.is_empty() { + return Err(ComputeError::SchedulingFailed( + "Circular dependency detected".to_string() + )); + } + + for task_id in &stage_tasks { + scheduled.insert(*task_id); + } + + stages.push(ScheduleStage { + stage_id: stages.len(), + tasks: stage_tasks, + }); + } + + Ok(Schedule { + id: ScheduleId::new(), + tasks: task_map, + assignment: assignment.clone(), + stages, + }) + } + + /// Estimate makespan (total execution time). + fn estimate_makespan(&self, schedule: &Schedule) -> Duration { + let mut total = Duration::ZERO; + + for stage in &schedule.stages { + let mut max_stage_time = Duration::ZERO; + + for task_id in &stage.tasks { + if let (Some(task), Some(proc_id)) = ( + schedule.tasks.get(task_id), + schedule.assignment.get(task_id), + ) { + if let Ok(processor) = self.device_registry.get_processor(proc_id) { + let time = processor.estimate_time(&task.operation); + max_stage_time = max_stage_time.max(time); + } + } + } + + total += max_stage_time; + } + + total + } + + /// Estimate processor utilization. 
+ fn estimate_utilization(&self, schedule: &Schedule) -> HashMap { + let mut work_time: HashMap = HashMap::new(); + let makespan = self.estimate_makespan(schedule); + + for task_id in schedule.assignment.assignments.keys() { + if let (Some(task), Some(proc_id)) = ( + schedule.tasks.get(task_id), + schedule.assignment.get(task_id), + ) { + if let Ok(processor) = self.device_registry.get_processor(proc_id) { + let proc_type = processor.processor_type(); + let time = processor.estimate_time(&task.operation); + *work_time.entry(proc_type).or_default() += time; + } + } + } + + work_time + .into_iter() + .map(|(proc_type, time)| { + let utilization = if makespan.as_secs_f64() > 0.0 { + time.as_secs_f64() / makespan.as_secs_f64() + } else { + 0.0 + }; + (proc_type, utilization.min(1.0)) + }) + .collect() + } + + /// Measure actual current utilization. + fn measure_utilization(&self) -> HashMap { + let mut utilization = HashMap::new(); + + for processor in self.device_registry.all_processors() { + let proc_type = processor.processor_type(); + let util = processor.utilization(); + utilization + .entry(proc_type) + .and_modify(|u| *u = (*u + util) / 2.0) + .or_insert(util); + } + + utilization + } +} + +/// Schedule identifier. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct ScheduleId(pub u64); + +impl ScheduleId { + /// Creates a new schedule ID. + pub fn new() -> Self { + use rand::Rng; + ScheduleId(rand::thread_rng().gen()) + } +} + +impl Default for ScheduleId { + fn default() -> Self { + Self::new() + } +} + +/// Task-to-processor assignment. +#[derive(Clone, Debug, Default)] +pub struct TaskAssignment { + /// Map from task ID to processor ID. + pub assignments: HashMap, +} + +impl TaskAssignment { + /// Creates a new empty assignment. + pub fn new() -> Self { + Self { + assignments: HashMap::new(), + } + } + + /// Assigns a task to a processor. 
+ pub fn assign(&mut self, task_id: TaskId, processor_id: ProcessorId) { + self.assignments.insert(task_id, processor_id); + } + + /// Gets the assigned processor for a task. + pub fn get(&self, task_id: &TaskId) -> Option { + self.assignments.get(task_id).copied() + } +} + +/// Dependency graph for tasks. +#[derive(Clone, Debug, Default)] +pub struct DependencyGraph { + /// Dependencies: task -> list of tasks it depends on. + pub dependencies: HashMap>, + /// Dependents: task -> list of tasks that depend on it. + pub dependents: HashMap>, +} + +impl DependencyGraph { + /// Creates a new empty dependency graph. + pub fn new() -> Self { + Self { + dependencies: HashMap::new(), + dependents: HashMap::new(), + } + } + + /// Adds a node (task) to the graph. + pub fn add_node(&mut self, task_id: TaskId) { + self.dependencies.entry(task_id).or_default(); + self.dependents.entry(task_id).or_default(); + } + + /// Adds a dependency edge (from depends on to). + pub fn add_edge(&mut self, from: TaskId, to: TaskId) { + self.dependencies.entry(to).or_default().push(from); + self.dependents.entry(from).or_default().push(to); + } +} + +/// Execution schedule. +#[derive(Clone, Debug)] +pub struct Schedule { + /// Schedule ID. + pub id: ScheduleId, + /// All tasks. + pub tasks: HashMap, + /// Task assignments. + pub assignment: TaskAssignment, + /// Execution stages (tasks within a stage can run in parallel). + pub stages: Vec, +} + +impl Schedule { + /// Creates an empty schedule. + pub fn empty() -> Self { + Self { + id: ScheduleId::new(), + tasks: HashMap::new(), + assignment: TaskAssignment::new(), + stages: Vec::new(), + } + } +} + +/// A stage of parallel tasks. +#[derive(Clone, Debug)] +pub struct ScheduleStage { + /// Stage index. + pub stage_id: usize, + /// Tasks in this stage (can run in parallel). + pub tasks: Vec, +} + +/// Result of scheduling. +#[derive(Clone, Debug)] +pub struct ScheduleResult { + /// The schedule. 
+ pub schedule: Schedule, + /// Estimated total execution time. + pub estimated_makespan: Duration, + /// Estimated processor utilization by type. + pub processor_utilization: HashMap, +} + +/// Result of execution. +#[derive(Clone, Debug)] +pub struct ExecutionResult { + /// Results per task. + pub results: HashMap, + /// Total execution time. + pub total_time: Duration, + /// Actual processor utilization. + pub actual_utilization: HashMap, +} + +/// Result of a single task execution. +#[derive(Clone, Debug)] +pub enum TaskExecutionResult { + /// Task completed successfully. + Success(crate::processor::OperationResult), + /// Task failed. + Failed(String), +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::processor::Precision; + use crate::task::TaskStatus; + + fn create_test_task(id: u64, op: Operation, deps: Vec) -> Task { + Task { + id: TaskId(id), + operation: op, + priority: TaskPriority::Normal, + dependencies: deps, + status: TaskStatus::Pending, + deadline: None, + } + } + + #[test] + fn test_dependency_graph() { + let mut graph = DependencyGraph::new(); + + graph.add_node(TaskId(1)); + graph.add_node(TaskId(2)); + graph.add_node(TaskId(3)); + + graph.add_edge(TaskId(1), TaskId(2)); // 2 depends on 1 + graph.add_edge(TaskId(1), TaskId(3)); // 3 depends on 1 + graph.add_edge(TaskId(2), TaskId(3)); // 3 depends on 2 + + assert_eq!(graph.dependencies[&TaskId(2)], vec![TaskId(1)]); + assert_eq!(graph.dependencies[&TaskId(3)], vec![TaskId(1), TaskId(2)]); + } + + #[test] + fn test_task_assignment() { + let mut assignment = TaskAssignment::new(); + + assignment.assign(TaskId(1), ProcessorId(0)); + assignment.assign(TaskId(2), ProcessorId(1)); + + assert_eq!(assignment.get(&TaskId(1)), Some(ProcessorId(0))); + assert_eq!(assignment.get(&TaskId(2)), Some(ProcessorId(1))); + assert_eq!(assignment.get(&TaskId(3)), None); + } +} diff --git a/crates/synor-compute/src/scheduler/work_queue.rs b/crates/synor-compute/src/scheduler/work_queue.rs new file mode 
100644 index 0000000..fba13d1
--- /dev/null
+++ b/crates/synor-compute/src/scheduler/work_queue.rs
@@ -0,0 +1,271 @@
+//! Work queue with thread-safe task management.
+
+use crate::processor::ProcessorType;
+use crate::task::{Task, TaskId, TaskPriority};
+use crossbeam_channel::{bounded, Receiver, Sender, TryRecvError};
+use std::collections::HashMap;
+use std::sync::atomic::{AtomicU64, Ordering};
+
+/// Work queue for a specific processor type.
+pub struct WorkQueue {
+    /// Task sender (for producers).
+    sender: Sender<Task>,
+    /// Task receiver (for consumers).
+    receiver: Receiver<Task>,
+    /// Processor type this queue is for.
+    processor_type: ProcessorType,
+    /// Current queue size.
+    size: AtomicU64,
+    /// Total tasks processed.
+    processed: AtomicU64,
+}
+
+impl WorkQueue {
+    /// Creates a new work queue for a processor type.
+    ///
+    /// NOTE(review): the effective channel capacity is `capacity.max(1024)`,
+    /// i.e. at least 1024 regardless of the requested value — confirm intent
+    /// (tests pass 100 and actually get 1024).
+    pub fn new(processor_type: ProcessorType, capacity: usize) -> Self {
+        let (sender, receiver) = bounded(capacity.max(1024));
+
+        Self {
+            sender,
+            receiver,
+            processor_type,
+            size: AtomicU64::new(0),
+            processed: AtomicU64::new(0),
+        }
+    }
+
+    /// Push a task to the queue.
+    ///
+    /// NOTE(review): if the channel is full the task is silently dropped —
+    /// the `try_send` error is ignored. Confirm this lossy behavior is intended.
+    pub fn push(&self, task: Task) {
+        if self.sender.try_send(task).is_ok() {
+            self.size.fetch_add(1, Ordering::Relaxed);
+        }
+    }
+
+    /// Pop a task from the queue (ignores worker_id for compatibility).
+    pub fn pop(&self, _worker_id: usize) -> Option<Task> {
+        self.pop_any()
+    }
+
+    /// Pop any task from the queue.
+    ///
+    /// Decrements `size` and increments `processed` on a successful receive.
+    pub fn pop_any(&self) -> Option<Task> {
+        match self.receiver.try_recv() {
+            Ok(task) => {
+                self.size.fetch_sub(1, Ordering::Relaxed);
+                self.processed.fetch_add(1, Ordering::Relaxed);
+                Some(task)
+            }
+            Err(TryRecvError::Empty) | Err(TryRecvError::Disconnected) => None,
+        }
+    }
+
+    /// Pop from global queue (alias for pop_any).
+    pub fn pop_global(&self) -> Option<Task> {
+        self.pop_any()
+    }
+
+    /// Steal a batch of tasks from another queue.
+    ///
+    /// NOTE(review): the stolen tasks are both re-enqueued locally AND
+    /// returned to the caller; if the caller also executes the returned
+    /// batch, each task runs twice. Popping from `other` also bumps its
+    /// `processed` counter even though nothing was executed. Verify against
+    /// call sites.
+    pub fn steal_batch_from(&self, other: &WorkQueue, max_tasks: usize) -> Vec<Task> {
+        let mut stolen = Vec::new();
+
+        while stolen.len() < max_tasks {
+            if let Some(task) = other.pop_any() {
+                stolen.push(task);
+            } else {
+                break;
+            }
+        }
+
+        // Push stolen tasks to this queue
+        for task in &stolen {
+            // Tasks are already accounted for in `other`, just push to self
+            if self.sender.try_send(task.clone()).is_ok() {
+                self.size.fetch_add(1, Ordering::Relaxed);
+            }
+        }
+
+        stolen
+    }
+
+    /// Get current queue size.
+    pub fn len(&self) -> usize {
+        self.size.load(Ordering::Relaxed) as usize
+    }
+
+    /// Check if queue is empty.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Get number of tasks processed.
+    pub fn processed_count(&self) -> u64 {
+        self.processed.load(Ordering::Relaxed)
+    }
+
+    /// Get processor type for this queue.
+    pub fn processor_type(&self) -> ProcessorType {
+        self.processor_type.clone()
+    }
+
+    /// Get utilization estimate (0.0 - 1.0).
+    pub fn utilization(&self) -> f64 {
+        let size = self.size.load(Ordering::Relaxed) as f64;
+        let capacity = self.sender.capacity().unwrap_or(1024) as f64;
+        (size / capacity).min(1.0)
+    }
+
+    /// Get a stealer for cross-queue work stealing.
+    pub fn get_stealer(&self) -> QueueStealer {
+        QueueStealer {
+            receiver: self.receiver.clone(),
+        }
+    }
+}
+
+/// Stealer handle for cross-queue work stealing.
+#[derive(Clone)]
+pub struct QueueStealer {
+    receiver: Receiver<Task>,
+}
+
+impl QueueStealer {
+    /// Try to steal a task.
+    ///
+    /// NOTE(review): this receives directly from the channel and bypasses
+    /// the owning queue's `size`/`processed` counters, which will drift —
+    /// confirm that is acceptable.
+    pub fn steal(&self) -> Option<Task> {
+        self.receiver.try_recv().ok()
+    }
+}
+
+/// Priority queue wrapper for tasks.
+pub struct PriorityWorkQueue {
+    /// Queues by priority level.
+    queues: HashMap<TaskPriority, WorkQueue>,
+    /// Processor type.
+    processor_type: ProcessorType,
+}
+
+impl PriorityWorkQueue {
+    /// Creates a new priority work queue.
+    pub fn new(processor_type: ProcessorType, capacity_per_priority: usize) -> Self {
+        let mut queues = HashMap::new();
+
+        // One bounded queue per priority level.
+        for priority in [
+            TaskPriority::Critical,
+            TaskPriority::High,
+            TaskPriority::Normal,
+            TaskPriority::Background,
+        ] {
+            queues.insert(priority, WorkQueue::new(processor_type.clone(), capacity_per_priority));
+        }
+
+        Self {
+            queues,
+            processor_type,
+        }
+    }
+
+    /// Push a task with its priority.
+    pub fn push(&self, task: Task) {
+        let priority = task.priority;
+        if let Some(queue) = self.queues.get(&priority) {
+            queue.push(task);
+        }
+    }
+
+    /// Pop highest priority task available.
+    pub fn pop(&self, worker_id: usize) -> Option<Task> {
+        // Try priorities in order: Critical > High > Normal > Background
+        for priority in [
+            TaskPriority::Critical,
+            TaskPriority::High,
+            TaskPriority::Normal,
+            TaskPriority::Background,
+        ] {
+            if let Some(queue) = self.queues.get(&priority) {
+                if let Some(task) = queue.pop(worker_id) {
+                    return Some(task);
+                }
+            }
+        }
+        None
+    }
+
+    /// Get total queue size.
+    pub fn len(&self) -> usize {
+        self.queues.values().map(|q| q.len()).sum()
+    }
+
+    /// Check if all queues are empty.
+    pub fn is_empty(&self) -> bool {
+        self.queues.values().all(|q| q.is_empty())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::processor::{CpuVariant, Operation, Precision};
+    use crate::task::TaskStatus;
+
+    // Builds a minimal pending MatMul task with a fixed id and the given priority.
+    fn create_test_task(id: u64, priority: TaskPriority) -> Task {
+        Task {
+            id: TaskId(id),
+            operation: Operation::MatMul {
+                m: 1024,
+                n: 1024,
+                k: 1024,
+                precision: Precision::Fp32,
+            },
+            priority,
+            dependencies: vec![],
+            status: TaskStatus::Pending,
+            deadline: None,
+        }
+    }
+
+    // FIFO push/pop and the size counter on a single-priority queue.
+    #[test]
+    fn test_work_queue_basic() {
+        let queue = WorkQueue::new(
+            ProcessorType::Cpu(CpuVariant::default()),
+            100,
+        );
+
+        assert!(queue.is_empty());
+
+        queue.push(create_test_task(1, TaskPriority::Normal));
+        queue.push(create_test_task(2, TaskPriority::Normal));
+
+        assert_eq!(queue.len(), 2);
+
+        let task1 = queue.pop(0);
+        assert!(task1.is_some());
+        assert_eq!(queue.len(), 1);
+
+        let task2 = queue.pop(0);
+        assert!(task2.is_some());
+        assert!(queue.is_empty());
+    }
+
+    // Pop order must follow priority regardless of insertion order.
+    #[test]
+    fn test_priority_queue() {
+        let queue = PriorityWorkQueue::new(
+            ProcessorType::Cpu(CpuVariant::default()),
+            100,
+        );
+
+        queue.push(create_test_task(1, TaskPriority::Background));
+        queue.push(create_test_task(2, TaskPriority::Critical));
+        queue.push(create_test_task(3, TaskPriority::Normal));
+
+        // Should get Critical first
+        let task = queue.pop(0).unwrap();
+        assert_eq!(task.id, TaskId(2));
+        assert_eq!(task.priority, TaskPriority::Critical);
+
+        // Then Normal
+        let task = queue.pop(0).unwrap();
+        assert_eq!(task.id, TaskId(3));
+
+        // Then Background
+        let task = queue.pop(0).unwrap();
+        assert_eq!(task.id, TaskId(1));
+    }
+}
diff --git a/crates/synor-compute/src/task/mod.rs b/crates/synor-compute/src/task/mod.rs
new file mode 100644
index 0000000..a51b3f9
--- /dev/null
+++ b/crates/synor-compute/src/task/mod.rs
@@ -0,0 +1,543 @@
+//! Task definitions and decomposition.
+
+use crate::error::ComputeError;
+use crate::processor::{Operation, OperationType, Precision, ProcessorType};
+use crate::{ComputeJob, JobType};
+use serde::{Deserialize, Serialize};
+use std::time::Duration;
+
+/// Unique task identifier.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub struct TaskId(pub u64);
+
+impl TaskId {
+    /// Creates a new task ID from a random u64.
+    pub fn new() -> Self {
+        use rand::Rng;
+        TaskId(rand::thread_rng().gen())
+    }
+}
+
+impl Default for TaskId {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl std::fmt::Display for TaskId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "task_{}", self.0)
+    }
+}
+
+/// Task priority levels.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub enum TaskPriority {
+    /// Background, can be preempted.
+    Background = 0,
+    /// Normal priority.
+    Normal = 1,
+    /// High priority.
+    High = 2,
+    /// Critical, must complete.
+    Critical = 3,
+}
+
+impl Default for TaskPriority {
+    fn default() -> Self {
+        TaskPriority::Normal
+    }
+}
+
+/// Task execution status.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
+pub enum TaskStatus {
+    /// Waiting to be scheduled.
+    Pending,
+    /// Queued for execution.
+    Queued,
+    /// Currently executing.
+    Running,
+    /// Completed successfully.
+    Completed,
+    /// Failed.
+    Failed,
+    /// Cancelled.
+    Cancelled,
+}
+
+/// A schedulable task.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct Task {
+    /// Task ID.
+    pub id: TaskId,
+    /// Operation to execute.
+    pub operation: Operation,
+    /// Priority level.
+    pub priority: TaskPriority,
+    /// Dependencies (tasks that must complete first).
+    pub dependencies: Vec<TaskId>,
+    /// Current status.
+    pub status: TaskStatus,
+    /// Deadline (optional).
+    /// NOTE(review): units/epoch of this u64 are unspecified — confirm.
+    pub deadline: Option<u64>,
+}
+
+impl Task {
+    /// Creates a new task.
+    pub fn new(operation: Operation) -> Self {
+        Self {
+            id: TaskId::new(),
+            operation,
+            priority: TaskPriority::Normal,
+            dependencies: Vec::new(),
+            status: TaskStatus::Pending,
+            deadline: None,
+        }
+    }
+
+    /// Sets the priority.
+    pub fn with_priority(mut self, priority: TaskPriority) -> Self {
+        self.priority = priority;
+        self
+    }
+
+    /// Adds dependencies.
+    pub fn with_dependencies(mut self, deps: Vec<TaskId>) -> Self {
+        self.dependencies = deps;
+        self
+    }
+
+    /// Sets deadline.
+    pub fn with_deadline(mut self, deadline: u64) -> Self {
+        self.deadline = Some(deadline);
+        self
+    }
+
+    /// Checks if task is compatible with a processor type.
+    pub fn is_compatible_with(&self, proc_type: ProcessorType) -> bool {
+        // Check based on operation type
+        let op_type = self.operation.op_type();
+
+        match proc_type {
+            ProcessorType::Cpu(_) => {
+                // CPUs can do most things, but slowly
+                true
+            }
+            ProcessorType::Gpu(_) => {
+                // GPUs are good for parallel operations
+                matches!(
+                    op_type,
+                    OperationType::MatMul
+                        | OperationType::Conv2d
+                        | OperationType::SelfAttention
+                        | OperationType::FlashAttention
+                        | OperationType::Embedding
+                        | OperationType::Add
+                        | OperationType::Mul
+                        | OperationType::Softmax
+                )
+            }
+            ProcessorType::Tpu(_) => {
+                // TPUs are good for large matrix ops
+                matches!(
+                    op_type,
+                    OperationType::MatMul
+                        | OperationType::Conv2d
+                        | OperationType::SelfAttention
+                        | OperationType::FlashAttention
+                )
+            }
+            ProcessorType::Lpu => {
+                // LPUs are good for sequential inference
+                matches!(
+                    op_type,
+                    OperationType::MatMul
+                        | OperationType::SelfAttention
+                        | OperationType::KVCache
+                        | OperationType::Sampling
+                )
+            }
+            ProcessorType::Npu(_) => {
+                // NPUs are good for inference
+                matches!(
+                    op_type,
+                    OperationType::MatMul
+                        | OperationType::Conv2d
+                        | OperationType::Add
+                        | OperationType::Softmax
+                )
+            }
+            _ => true, // Default to compatible
+        }
+    }
+}
+
+/// Result of task execution.
+#[derive(Clone, Debug)]
+pub struct TaskResult {
+    /// Task ID.
+    pub task_id: TaskId,
+    /// Output data.
+    pub output: Vec<u8>,
+    /// Execution duration.
+    pub duration: Duration,
+    /// Energy consumed (Joules).
+    pub energy: f64,
+}
+
+/// Compute task for job execution.
+#[derive(Clone, Debug)]
+pub struct ComputeTask {
+    /// Task.
+    pub task: Task,
+    /// Resource requirements.
+    pub requirements: TaskRequirements,
+    /// Preferred processor type.
+    pub preferred_processor: Option<ProcessorType>,
+    /// Fallback processor type.
+    pub fallback_processor: Option<ProcessorType>,
+}
+
+/// Task resource requirements.
+#[derive(Clone, Debug, Default)]
+pub struct TaskRequirements {
+    /// Minimum memory (bytes).
+    pub min_memory: u64,
+    /// Minimum TFLOPS.
+    pub min_tflops: f64,
+    /// Maximum latency (ms).
+    pub max_latency_ms: Option<u64>,
+    /// Requires specific precision.
+    pub precision: Option<Precision>,
+}
+
+/// Decomposed workload.
+#[derive(Clone, Debug)]
+pub struct DecomposedWorkload {
+    /// All tasks.
+    /// NOTE(review): the element type was lost in transit; ComputeTask
+    /// (task + requirements) fits this aggregate best — confirm vs Task.
+    pub tasks: Vec<ComputeTask>,
+    /// Total estimated FLOPS.
+    pub estimated_flops: f64,
+    /// Total estimated memory.
+    pub estimated_memory: u64,
+}
+
+/// Task decomposer that breaks jobs into schedulable tasks.
+pub struct TaskDecomposer {
+    /// Default batch size for inference.
+    /// NOTE(review): currently unread — inference uses the job's own
+    /// batch_size; confirm whether this field is still needed.
+    inference_batch_size: usize,
+    /// Default precision.
+    default_precision: Precision,
+}
+
+impl TaskDecomposer {
+    /// Creates a new task decomposer.
+    pub fn new() -> Self {
+        Self {
+            inference_batch_size: 32,
+            default_precision: Precision::Fp16,
+        }
+    }
+
+    /// Decomposes a job into tasks.
+    pub fn decompose(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
+        match &job.job_type {
+            JobType::Training { .. } => self.decompose_training(job),
+            JobType::Inference { .. } => self.decompose_inference(job),
+            JobType::Container { .. } => self.decompose_container(job),
+            JobType::Serverless { .. } => self.decompose_serverless(job),
+            JobType::Wasm { .. } => self.decompose_wasm(job),
+        }
+    }
+
+    /// Decompose training job.
+    fn decompose_training(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
+        let mut tasks = Vec::new();
+
+        // NOTE(review): `epochs` is not used — only a single
+        // load/preprocess/forward/backward/step chain is emitted. Confirm
+        // whether per-epoch expansion is expected here or downstream.
+        if let JobType::Training { batch_size, .. } = &job.job_type {
+            // Data loading task
+            tasks.push(
+                Task::new(Operation::DataLoad {
+                    bytes: 1024 * 1024 * 100, // 100MB
+                    async_: true,
+                })
+                .with_priority(TaskPriority::High),
+            );
+
+            let data_load_id = tasks[0].id;
+
+            // Preprocessing task
+            tasks.push(
+                Task::new(Operation::DataPreprocess {
+                    batch: *batch_size as usize,
+                    transforms: vec!["normalize".to_string(), "augment".to_string()],
+                })
+                .with_dependencies(vec![data_load_id])
+                .with_priority(TaskPriority::High),
+            );
+
+            let preprocess_id = tasks[1].id;
+
+            // Forward pass (simplified as MatMul)
+            tasks.push(
+                Task::new(Operation::MatMul {
+                    m: *batch_size as usize,
+                    n: 4096,
+                    k: 4096,
+                    precision: self.default_precision,
+                })
+                .with_dependencies(vec![preprocess_id])
+                .with_priority(TaskPriority::Critical),
+            );
+
+            let forward_id = tasks[2].id;
+
+            // Backward pass
+            tasks.push(
+                Task::new(Operation::Backward {
+                    forward_op: Box::new(Operation::MatMul {
+                        m: *batch_size as usize,
+                        n: 4096,
+                        k: 4096,
+                        precision: self.default_precision,
+                    }),
+                })
+                .with_dependencies(vec![forward_id])
+                .with_priority(TaskPriority::Critical),
+            );
+
+            let backward_id = tasks[3].id;
+
+            // Optimizer step
+            tasks.push(
+                Task::new(Operation::OptimizerStep {
+                    parameters: 1_000_000,
+                    optimizer: "adamw".to_string(),
+                    precision: self.default_precision,
+                })
+                .with_dependencies(vec![backward_id])
+                .with_priority(TaskPriority::High),
+            );
+        }
+
+        Ok(tasks)
+    }
+
+    /// Decompose inference job.
+    fn decompose_inference(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
+        let mut tasks = Vec::new();
+
+        if let JobType::Inference { batch_size, .. } = &job.job_type {
+            // Tokenization (CPU optimal)
+            tasks.push(
+                Task::new(Operation::Tokenization {
+                    text_bytes: 4096,
+                    vocab_size: 32000,
+                })
+                .with_priority(TaskPriority::High),
+            );
+
+            let token_id = tasks[0].id;
+
+            // Embedding (GPU optimal)
+            tasks.push(
+                Task::new(Operation::Embedding {
+                    batch: *batch_size as usize,
+                    seq_len: 512,
+                    vocab_size: 32000,
+                    embed_dim: 4096,
+                    precision: self.default_precision,
+                })
+                .with_dependencies(vec![token_id])
+                .with_priority(TaskPriority::Critical),
+            );
+
+            let embed_id = tasks[1].id;
+
+            // Self-attention (TPU/GPU optimal)
+            tasks.push(
+                Task::new(Operation::SelfAttention {
+                    batch: *batch_size as usize,
+                    seq_len: 512,
+                    num_heads: 32,
+                    head_dim: 128,
+                    precision: self.default_precision,
+                })
+                .with_dependencies(vec![embed_id])
+                .with_priority(TaskPriority::Critical),
+            );
+
+            let attention_id = tasks[2].id;
+
+            // Sampling (LPU optimal)
+            tasks.push(
+                Task::new(Operation::Sampling {
+                    batch: *batch_size as usize,
+                    vocab_size: 32000,
+                    temperature: 0.7,
+                })
+                .with_dependencies(vec![attention_id])
+                .with_priority(TaskPriority::High),
+            );
+
+            let sample_id = tasks[3].id;
+
+            // Detokenization (CPU optimal)
+            tasks.push(
+                Task::new(Operation::Detokenization {
+                    tokens: 256,
+                    vocab_size: 32000,
+                })
+                .with_dependencies(vec![sample_id])
+                .with_priority(TaskPriority::Normal),
+            );
+        }
+
+        Ok(tasks)
+    }
+
+    /// Decompose container job.
+    fn decompose_container(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
+        // Container jobs are typically a single task
+        Ok(vec![Task::new(Operation::Generic {
+            op_type: OperationType::DataLoad,
+            flops: 1e9,
+            memory: 1024 * 1024 * 1024,
+        })
+        .with_priority(TaskPriority::Normal)])
+    }
+
+    /// Decompose serverless function.
+    fn decompose_serverless(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
+        // Serverless is typically a single task
+        Ok(vec![Task::new(Operation::Generic {
+            op_type: OperationType::DataPreprocess,
+            flops: 1e6,
+            memory: 256 * 1024 * 1024,
+        })
+        .with_priority(TaskPriority::High)])
+    }
+
+    /// Decompose WASM job.
+    fn decompose_wasm(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
+        // WASM is typically a single task
+        Ok(vec![Task::new(Operation::Generic {
+            op_type: OperationType::DataPreprocess,
+            flops: 1e6,
+            memory: 16 * 1024 * 1024,
+        })
+        .with_priority(TaskPriority::Normal)])
+    }
+}
+
+impl Default for TaskDecomposer {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_task_creation() {
+        let task = Task::new(Operation::MatMul {
+            m: 1024,
+            n: 1024,
+            k: 1024,
+            precision: Precision::Fp32,
+        })
+        .with_priority(TaskPriority::High);
+
+        assert_eq!(task.priority, TaskPriority::High);
+        assert!(task.dependencies.is_empty());
+        assert_eq!(task.status, TaskStatus::Pending);
+    }
+
+    #[test]
+    fn test_task_dependencies() {
+        let task1 = Task::new(Operation::DataLoad {
+            bytes: 1000,
+            async_: true,
+        });
+        let task1_id = task1.id;
+
+        let task2 = Task::new(Operation::MatMul {
+            m: 1024,
+            n: 1024,
+            k: 1024,
+            precision: Precision::Fp32,
+        })
+        .with_dependencies(vec![task1_id]);
+
+        assert_eq!(task2.dependencies, vec![task1_id]);
+    }
+
+    #[test]
+    fn test_task_compatibility() {
+        let matmul_task = Task::new(Operation::MatMul {
+            m: 1024,
+            n: 1024,
+            k: 1024,
+            precision: Precision::Fp32,
+        });
+
+        // MatMul should be compatible with GPU and TPU
+        assert!(matmul_task.is_compatible_with(ProcessorType::Gpu(
+            crate::processor::GpuVariant::NvidiaCuda {
+                compute_capability: (8, 0)
+            }
+        )));
+        assert!(matmul_task.is_compatible_with(ProcessorType::Tpu(
+            crate::processor::TpuVersion::V5p
+        )));
+
+        let data_load_task = Task::new(Operation::DataLoad {
+            bytes: 1000,
+            async_: true,
+        });
+
+
// DataLoad should be compatible with CPU
+        assert!(data_load_task.is_compatible_with(ProcessorType::Cpu(
+            crate::processor::CpuVariant::default()
+        )));
+    }
+
+    // End-to-end: an inference job must decompose into a non-empty,
+    // dependency-chained task list.
+    #[test]
+    fn test_task_decomposer() {
+        let decomposer = TaskDecomposer::new();
+
+        let job = ComputeJob {
+            id: crate::JobId::new(),
+            owner: [0u8; 32],
+            job_type: JobType::Inference {
+                model_cid: "model".to_string(),
+                input_format: "json".to_string(),
+                batch_size: 1,
+            },
+            resources: crate::ResourceRequirements::default(),
+            input_cid: None,
+            max_budget: 1_000_000,
+            priority: crate::JobPriority::Normal,
+            created_at: 0,
+            deadline: None,
+        };
+
+        let tasks = decomposer.decompose(&job).unwrap();
+        assert!(!tasks.is_empty());
+
+        // Check dependencies form a chain
+        for (i, task) in tasks.iter().enumerate() {
+            if i > 0 {
+                assert!(!task.dependencies.is_empty());
+            }
+        }
+    }
+}
diff --git a/docs/PLAN/PHASE11-Synor-Compute-L2-Part2-HyperEfficiency.md b/docs/PLAN/PHASE11-Synor-Compute-L2-Part2-HyperEfficiency.md
new file mode 100644
index 0000000..e3127bd
--- /dev/null
+++ b/docs/PLAN/PHASE11-Synor-Compute-L2-Part2-HyperEfficiency.md
@@ -0,0 +1,1584 @@
+# Phase 11 Part 2: Hyper-Efficient Distributed Compute
+
+> **Goal**: 90% cost reduction vs AWS/GCP/Azure + 10x speed improvement through innovative architecture
+
+---
+
+## Executive Summary
+
+Traditional cloud providers have structural inefficiencies:
+- **30-60% profit margins** built into pricing
+- **Centralized data centers** with high real estate/cooling costs
+- **Idle capacity** that customers pay for but don't use
+- **Geographic lock-in** preventing arbitrage on electricity costs
+- **Billions of idle consumer devices** completely untapped
+
+Synor Compute eliminates these inefficiencies through:
+1. **Protocol-only overhead** (no corporate margin)
+2. **Distributed infrastructure** (homes, offices, edge locations)
+3. **Real-time spot markets** (fill idle capacity instantly)
+4.
**Global electricity arbitrage** (route to cheapest regions) +5. **Consumer device mesh** (phones, browsers, desktops) + +--- + +## Part 1: Cost Reduction Architecture + +### 1.1 Zero-Margin Protocol Design + +```rust +// synor-compute/src/economics/pricing.rs + +/// Dynamic pricing engine with near-zero overhead +pub struct DynamicPricingEngine { + /// Base cost = provider's actual cost (electricity + depreciation) + base_cost_calculator: BaseCostCalculator, + /// Protocol fee (only fee in system) + protocol_fee_percent: f32, // 5-10% for network sustainability + /// Real-time supply/demand + market_state: MarketState, + /// Geographic cost map + geo_costs: GeoCostMap, +} + +impl DynamicPricingEngine { + /// Calculate price for compute job + pub fn calculate_price(&self, job: &ComputeJob) -> Price { + // 1. Calculate provider's actual cost + let base_cost = self.base_cost_calculator.compute_cost( + job.resources(), + job.duration_estimate(), + job.provider_location(), + ); + + // 2. Apply supply/demand multiplier (0.5x to 2x) + let demand_multiplier = self.market_state.demand_multiplier( + job.resource_type(), + job.urgency(), + ); + + // 3. 
Add minimal protocol fee
+        let protocol_fee = base_cost * self.protocol_fee_percent;
+
+        Price {
+            base: base_cost,
+            demand_adjustment: base_cost * (demand_multiplier - 1.0),
+            protocol_fee,
+            total: base_cost * demand_multiplier + protocol_fee,
+        }
+    }
+}
+
+/// Provider's actual operating cost
+pub struct BaseCostCalculator {
+    /// Electricity cost by region ($/kWh)
+    electricity_rates: HashMap<Region, f64>,
+    /// Hardware depreciation rates
+    depreciation: HardwareDepreciation,
+    /// Cooling efficiency (PUE)
+    pue_by_climate: HashMap<Climate, f64>,
+}
+
+impl BaseCostCalculator {
+    pub fn compute_cost(
+        &self,
+        resources: &Resources,
+        duration: Duration,
+        location: &GeoLocation,
+    ) -> f64 {
+        let region = location.region();
+        let electricity_rate = self.electricity_rates.get(&region).unwrap_or(&0.10);
+        let pue = self.pue_by_climate.get(&location.climate()).unwrap_or(&1.5);
+
+        // Power consumption
+        let power_kw = resources.estimated_power_kw();
+        let energy_kwh = power_kw * duration.as_hours() * pue;
+        let electricity_cost = energy_kwh * electricity_rate;
+
+        // Hardware depreciation
+        let depreciation_cost = self.depreciation.cost_per_hour(resources)
+            * duration.as_hours();
+
+        // Network cost (minimal for most jobs)
+        let network_cost = resources.network_gb() * 0.01; // $0.01/GB
+
+        electricity_cost + depreciation_cost + network_cost
+    }
+}
+```
+
+### 1.2 Geographic Electricity Arbitrage
+
+```rust
+// synor-compute/src/scheduler/geo_arbitrage.rs
+
+/// Routes compute to cheapest electricity regions
+pub struct GeoArbitrageScheduler {
+    /// Real-time electricity prices by region
+    electricity_prices: Arc<RwLock<HashMap<Region, ElectricityPrice>>>,
+    /// Provider locations and capabilities
+    providers: ProviderRegistry,
+    /// Latency requirements
+    latency_constraints: LatencyConstraints,
+}
+
+/// Real-time electricity pricing
+pub struct ElectricityPrice {
+    pub region: Region,
+    pub price_per_kwh: f64,
+    pub carbon_intensity: f64, // gCO2/kWh
+    pub renewable_percent: f64,
+    pub timestamp: Timestamp,
+    pub
forecast_24h: Vec, // Predicted prices +} + +impl GeoArbitrageScheduler { + /// Find cheapest region for job + pub async fn find_optimal_region( + &self, + job: &ComputeJob, + ) -> Result { + let prices = self.electricity_prices.read(); + + // Get regions with available capacity + let available_regions = self.providers + .regions_with_capacity(job.resources()) + .await?; + + // Filter by latency requirements + let viable_regions: Vec<_> = available_regions + .into_iter() + .filter(|r| self.meets_latency_requirements(r, job)) + .collect(); + + // Sort by total cost (electricity + network) + let mut scored: Vec<_> = viable_regions + .iter() + .map(|region| { + let electricity = prices.get(region).map(|p| p.price_per_kwh).unwrap_or(0.15); + let network_cost = self.network_cost_to_user(region, job.user_location()); + let total_score = electricity * job.estimated_kwh() + network_cost; + (region, total_score) + }) + .collect(); + + scored.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); + + Ok(SchedulingDecision { + region: scored[0].0.clone(), + estimated_cost: scored[0].1, + alternatives: scored[1..].iter().map(|(r, c)| (*r, *c)).collect(), + }) + } +} + +/// Electricity price feeds from multiple sources +pub struct ElectricityPriceFeed { + sources: Vec>, +} + +#[async_trait] +pub trait ElectricityDataSource: Send + Sync { + async fn get_prices(&self, regions: &[Region]) -> Result, Error>; +} + +// Implementations for various markets: +// - US: PJM, CAISO, ERCOT, MISO, etc. 
+// - Europe: EPEX SPOT, Nord Pool +// - Asia: JEPX (Japan), KPX (Korea) +``` + +### 1.3 Spot Market for Idle Capacity + +```rust +// synor-compute/src/market/spot.rs + +/// Real-time spot market for compute resources +pub struct SpotMarket { + /// Order book for each resource type + order_books: HashMap, + /// Matching engine + matcher: MatchingEngine, + /// Price discovery + price_discovery: PriceDiscovery, +} + +/// Compute resource order +pub struct SpotOrder { + pub order_id: OrderId, + pub order_type: OrderType, + pub resource_type: ResourceType, + pub quantity: ResourceQuantity, + pub price_limit: Option, // None = market order + pub duration: Duration, + pub preemptible: bool, // Can be interrupted + pub constraints: JobConstraints, +} + +pub enum OrderType { + /// Provider offering compute + Ask { + provider_id: ProviderId, + available_until: Timestamp, + interruptible: bool, + }, + /// User requesting compute + Bid { + user_id: UserId, + deadline: Option, + priority: Priority, + }, +} + +impl SpotMarket { + /// Submit order to market + pub async fn submit_order(&self, order: SpotOrder) -> Result { + let book = self.order_books.get_mut(&order.resource_type)?; + + match order.order_type { + OrderType::Ask { .. } => { + // Provider offering capacity + book.add_ask(order.clone()); + + // Try to match with existing bids + let matches = self.matcher.match_asks(&book); + self.execute_matches(matches).await + } + OrderType::Bid { .. 
} => { + // User requesting capacity + if let Some(price_limit) = order.price_limit { + // Limit order - add to book + book.add_bid(order.clone()); + } + + // Try to match immediately + let matches = self.matcher.match_bids(&book, &order); + self.execute_matches(matches).await + } + } + } + + /// Get current spot price for resource + pub fn spot_price(&self, resource: &ResourceType) -> SpotPrice { + let book = &self.order_books[resource]; + SpotPrice { + bid: book.best_bid(), + ask: book.best_ask(), + last_trade: book.last_trade_price(), + volume_24h: book.volume_24h(), + } + } +} + +/// Preemptible compute (like AWS Spot Instances) +pub struct PreemptibleCompute { + /// Discount vs on-demand (typically 70-90%) + discount_percent: f32, + /// Warning time before preemption + warning_seconds: u32, + /// Checkpoint strategy + checkpoint: CheckpointStrategy, +} + +impl PreemptibleCompute { + /// Price at 10-30% of on-demand + pub fn calculate_price(&self, on_demand_price: f64) -> f64 { + on_demand_price * (1.0 - self.discount_percent as f64 / 100.0) + } + + /// Handle preemption gracefully + pub async fn preempt(&self, job: &mut ComputeJob) -> Result<(), Error> { + // 1. Send warning to job + job.send_preemption_warning(self.warning_seconds).await?; + + // 2. Trigger checkpoint if configured + if let CheckpointStrategy::Auto = self.checkpoint { + job.checkpoint().await?; + } + + // 3. Migrate or terminate + if let Some(new_capacity) = self.find_alternative_capacity(job).await? 
{ + job.migrate_to(new_capacity).await + } else { + job.terminate_gracefully().await + } + } +} +``` + +### 1.4 Cost Comparison Calculator + +```rust +// synor-compute/src/economics/comparison.rs + +/// Compare Synor vs traditional cloud pricing +pub struct CostComparison { + synor_pricing: DynamicPricingEngine, + aws_pricing: AwsPricingData, + gcp_pricing: GcpPricingData, + azure_pricing: AzurePricingData, +} + +impl CostComparison { + pub fn compare(&self, workload: &Workload) -> ComparisonResult { + let synor_cost = self.synor_pricing.calculate_total(workload); + let aws_cost = self.aws_pricing.calculate_total(workload); + let gcp_cost = self.gcp_pricing.calculate_total(workload); + let azure_cost = self.azure_pricing.calculate_total(workload); + + let min_cloud = aws_cost.min(gcp_cost).min(azure_cost); + let savings_percent = ((min_cloud - synor_cost) / min_cloud) * 100.0; + + ComparisonResult { + synor: synor_cost, + aws: aws_cost, + gcp: gcp_cost, + azure: azure_cost, + savings_vs_cheapest: savings_percent, + savings_breakdown: self.breakdown_savings(workload), + } + } + + fn breakdown_savings(&self, workload: &Workload) -> SavingsBreakdown { + SavingsBreakdown { + // No cloud margin + margin_elimination: 35.0, // ~35% of cloud pricing is margin + // Distributed infrastructure + infrastructure_savings: 15.0, // No data center overhead + // Spot/preemptible usage + spot_savings: 20.0, // If using preemptible + // Geographic arbitrage + geo_arbitrage: 10.0, // Routing to cheap electricity + // Consumer device usage (if applicable) + consumer_devices: 10.0, // Free compute from devices + // Total + total: 90.0, + } + } +} +``` + +--- + +## Part 2: 10x Speed Architecture + +### 2.1 Intelligent Caching Layer + +```rust +// synor-compute/src/acceleration/cache.rs + +/// Multi-tier caching for inference acceleration +pub struct InferenceCache { + /// L1: Hot cache in GPU memory + gpu_cache: GpuCache, + /// L2: Warm cache in system memory + memory_cache: MemoryCache, 
+ /// L3: Cold cache on NVMe + nvme_cache: NvmeCache, + /// L4: Distributed cache across nodes + distributed_cache: DistributedCache, + /// Semantic cache for similar queries + semantic_cache: SemanticCache, +} + +impl InferenceCache { + /// Check all cache tiers for result + pub async fn get(&self, request: &InferenceRequest) -> Option { + // 1. Exact match in hot cache (sub-ms) + if let Some(result) = self.gpu_cache.get(&request.hash()).await { + return Some(CachedResult::exact(result)); + } + + // 2. Exact match in memory cache (~1ms) + if let Some(result) = self.memory_cache.get(&request.hash()).await { + // Promote to GPU cache + self.gpu_cache.insert(&request.hash(), &result).await; + return Some(CachedResult::exact(result)); + } + + // 3. Semantic similarity search (~5ms) + if let Some((similar_req, result, similarity)) = + self.semantic_cache.find_similar(request, 0.95).await + { + // If >95% similar, reuse result + return Some(CachedResult::semantic(result, similarity)); + } + + // 4. 
Check distributed cache (~10-50ms) + if let Some(result) = self.distributed_cache.get(&request.hash()).await { + self.memory_cache.insert(&request.hash(), &result).await; + return Some(CachedResult::exact(result)); + } + + None + } +} + +/// Semantic cache using embeddings +pub struct SemanticCache { + /// Embedding model for queries + embedder: EmbeddingModel, + /// Vector index for similarity search + index: VectorIndex, + /// Cached results + results: HashMap, +} + +impl SemanticCache { + /// Find semantically similar cached query + pub async fn find_similar( + &self, + request: &InferenceRequest, + min_similarity: f32, + ) -> Option<(InferenceRequest, InferenceResult, f32)> { + // Embed the query + let embedding = self.embedder.embed(&request.input).await?; + + // Search for similar + let results = self.index.search(&embedding, 1, min_similarity).await; + + results.first().map(|r| { + let cached_result = self.results.get(&r.id).unwrap(); + (r.request.clone(), cached_result.clone(), r.similarity) + }) + } +} +``` + +### 2.2 Speculative Execution + +```rust +// synor-compute/src/acceleration/speculative.rs + +/// Speculative execution for predictable workloads +pub struct SpeculativeExecutor { + /// Prediction model for next likely requests + predictor: RequestPredictor, + /// Pre-computed results + precomputed: PrecomputedResults, + /// Background speculation workers + workers: Vec, +} + +impl SpeculativeExecutor { + /// Predict and pre-execute likely next requests + pub async fn speculate(&self, context: &UserContext) -> Vec { + // 1. Predict likely next requests + let predictions = self.predictor.predict_next(context, 5).await; + + // 2. Execute in background if not cached + let mut futures = Vec::new(); + for (request, probability) in predictions { + if probability > 0.3 && !self.is_cached(&request).await { + futures.push(self.execute_speculative(request, probability)); + } + } + + // 3. 
Store results for instant retrieval + let results = join_all(futures).await; + for result in &results { + self.precomputed.store(result).await; + } + + results + } + + /// Check if speculative result is available + pub async fn get_speculative(&self, request: &InferenceRequest) -> Option { + self.precomputed.get(&request.hash()).await + } +} + +/// Request pattern predictor using ML +pub struct RequestPredictor { + /// Sequence model for request patterns + model: SequenceModel, + /// User behavior history + history: UserHistoryStore, +} + +impl RequestPredictor { + pub async fn predict_next( + &self, + context: &UserContext, + count: usize, + ) -> Vec<(InferenceRequest, f32)> { + // Get user's recent request history + let history = self.history.get_recent(&context.user_id, 10).await; + + // Predict next likely requests + let predictions = self.model.predict(&history, count).await; + + predictions + .into_iter() + .map(|(req, prob)| (req, prob)) + .collect() + } +} +``` + +### 2.3 Model Optimization Pipeline + +```rust +// synor-compute/src/acceleration/optimization.rs + +/// Automatic model optimization for faster inference +pub struct ModelOptimizer { + /// Quantization engine + quantizer: Quantizer, + /// Pruning engine + pruner: Pruner, + /// Distillation engine + distiller: Distiller, + /// Compilation (TensorRT, etc.) + compiler: ModelCompiler, +} + +impl ModelOptimizer { + /// Optimize model for target hardware + pub async fn optimize( + &self, + model: &Model, + target: &HardwareTarget, + constraints: &OptimizationConstraints, + ) -> Result { + let mut optimized = model.clone(); + + // 1. Quantization (FP32 → FP16 → INT8 → INT4) + if constraints.allow_quantization { + optimized = self.quantizer.quantize( + &optimized, + constraints.min_precision, + constraints.max_accuracy_loss, + ).await?; + } + + // 2. 
Pruning (remove unimportant weights) + if constraints.allow_pruning { + optimized = self.pruner.prune( + &optimized, + constraints.max_sparsity, + constraints.max_accuracy_loss, + ).await?; + } + + // 3. Compile for target hardware + optimized = self.compiler.compile(&optimized, target).await?; + + Ok(optimized) + } +} + +/// Quantization levels +pub enum QuantizationLevel { + FP32, // Full precision (baseline) + FP16, // Half precision (~2x speedup) + BF16, // Brain float 16 (better range) + INT8, // 8-bit integer (~4x speedup) + INT4, // 4-bit integer (~8x speedup) + FP8, // 8-bit float (H100+) + Mixed, // Dynamic mixed precision +} + +/// Hardware-specific compilation +pub struct ModelCompiler { + /// TensorRT for NVIDIA + tensorrt: TensorRtCompiler, + /// ROCm MIGraphX for AMD + migraphx: MiGraphXCompiler, + /// OpenVINO for Intel + openvino: OpenVinoCompiler, + /// Core ML for Apple + coreml: CoreMlCompiler, +} + +impl ModelCompiler { + pub async fn compile( + &self, + model: &Model, + target: &HardwareTarget, + ) -> Result { + match target.vendor { + Vendor::Nvidia => self.tensorrt.compile(model, target).await, + Vendor::Amd => self.migraphx.compile(model, target).await, + Vendor::Intel => self.openvino.compile(model, target).await, + Vendor::Apple => self.coreml.compile(model, target).await, + _ => Ok(model.clone().into()), + } + } +} +``` + +### 2.4 Continuous Batching + +```rust +// synor-compute/src/acceleration/batching.rs + +/// Continuous batching for maximum GPU utilization +pub struct ContinuousBatcher { + /// Request queue + queue: RequestQueue, + /// Active batches + active_batches: Vec, + /// Batching configuration + config: BatchConfig, +} + +pub struct BatchConfig { + /// Maximum batch size + pub max_batch_size: usize, + /// Maximum wait time for batching + pub max_wait_ms: u64, + /// Enable dynamic batching + pub dynamic: bool, + /// Enable iteration-level batching (for LLMs) + pub iteration_level: bool, +} + +impl ContinuousBatcher { + /// 
Process requests with continuous batching
+    pub async fn process(&self) -> Result<(), Error> {
+        loop {
+            // 1. Collect requests up to batch size or timeout
+            let requests = self.queue.collect_batch(
+                self.config.max_batch_size,
+                self.config.max_wait_ms,
+            ).await;
+
+            if requests.is_empty() {
+                continue;
+            }
+
+            // 2. Create batch
+            let batch = self.create_batch(requests)?;
+
+            // 3. Execute batch
+            let results = self.execute_batch(batch).await?;
+
+            // 4. Dispatch results to individual requests
+            self.dispatch_results(results).await;
+        }
+    }
+
+    /// Iteration-level batching for LLMs (vLLM-style)
+    pub async fn process_iterative(&self) -> Result<(), Error> {
+        let mut active_sequences: Vec<ActiveSequence> = Vec::new();
+
+        loop {
+            // 1. Add new requests to active sequences
+            while active_sequences.len() < self.config.max_batch_size {
+                if let Some(req) = self.queue.try_pop() {
+                    active_sequences.push(ActiveSequence::new(req));
+                } else {
+                    break;
+                }
+            }
+
+            if active_sequences.is_empty() {
+                tokio::time::sleep(Duration::from_millis(1)).await;
+                continue;
+            }
+
+            // 2. Run one iteration for all active sequences
+            let next_tokens = self.run_iteration(&active_sequences).await?;
+
+            // 3. Update sequences and remove completed ones
+            let mut completed = Vec::new();
+            for (i, (seq, token)) in active_sequences.iter_mut()
+                .zip(next_tokens.iter())
+                .enumerate()
+            {
+                seq.append_token(*token);
+                if seq.is_complete() {
+                    completed.push(i);
+                }
+            }
+
+            // 4.
Return completed sequences + for i in completed.into_iter().rev() { + let seq = active_sequences.remove(i); + seq.complete().await; + } + } + } +} +``` + +### 2.5 Speed Comparison + +| Optimization | Speedup Factor | Notes | +|--------------|----------------|-------| +| Semantic caching | 100-1000x | Cache hits are instant | +| Speculative execution | 2-5x | For predictable workloads | +| INT8 quantization | 2-4x | Minimal accuracy loss | +| INT4 quantization | 4-8x | For LLMs with good quality | +| TensorRT compilation | 2-5x | Hardware-specific optimization | +| Continuous batching | 3-10x | Maximum GPU utilization | +| KV cache optimization | 2-3x | For LLM inference | +| **Combined** | **10-50x** | Achievable with all optimizations | + +--- + +## Part 3: Consumer Device Mesh Network + +### 3.1 Universal Device Support + +```rust +// synor-compute/src/mesh/device.rs + +/// Any device that can contribute compute +pub enum DeviceType { + /// Data center GPU (NVIDIA A100, H100) + DataCenterGpu { + model: GpuModel, + vram_gb: u32, + tensor_cores: u32, + }, + /// Consumer GPU (RTX 3090, 4090) + ConsumerGpu { + model: GpuModel, + vram_gb: u32, + }, + /// Mobile device (phone, tablet) + Mobile { + platform: MobilePlatform, + chip: MobileChip, + gpu: MobileGpu, + }, + /// Desktop/Laptop CPU + Cpu { + vendor: CpuVendor, + cores: u32, + threads: u32, + avx_support: AvxSupport, + }, + /// Browser (WebGPU/WebAssembly) + Browser { + runtime: BrowserRuntime, + gpu_available: bool, + wasm_simd: bool, + }, + /// Apple Silicon (M1, M2, M3) + AppleSilicon { + chip: AppleChip, + gpu_cores: u32, + neural_engine_cores: u32, + unified_memory_gb: u32, + }, + /// TPU (if accessible) + Tpu { + version: TpuVersion, + chips: u32, + }, + /// Custom accelerator (Groq LPU, Cerebras, etc.) 
+ CustomAccelerator { + vendor: String, + model: String, + tops: f32, // Tera operations per second + }, +} + +pub enum MobilePlatform { + Ios, + Android, +} + +pub enum MobileChip { + // Apple + A15Bionic, + A16Bionic, + A17Pro, + // Qualcomm + Snapdragon8Gen1, + Snapdragon8Gen2, + Snapdragon8Gen3, + // Samsung + Exynos2200, + Exynos2400, + // Google + Tensor, + TensorG2, + TensorG3, + // MediaTek + Dimensity9000, + Dimensity9300, +} + +pub enum MobileGpu { + // Apple + AppleGpu { cores: u32 }, + // Qualcomm + Adreno { model: u32 }, + // ARM + MaliG { model: u32 }, + // IMG + PowerVR { model: String }, +} +``` + +### 3.2 Device Capability Registry + +```rust +// synor-compute/src/mesh/registry.rs + +/// Central registry of all contributing devices +pub struct DeviceRegistry { + /// All registered devices + devices: HashMap, + /// Devices by capability + by_capability: CapabilityIndex, + /// Devices by location + by_location: GeoIndex, + /// Device reputation scores + reputation: ReputationStore, +} + +/// Detailed device capabilities +pub struct DeviceInfo { + pub device_id: DeviceId, + pub device_type: DeviceType, + pub owner: Address, + /// Compute capabilities + pub compute: ComputeCapabilities, + /// Network capabilities + pub network: NetworkCapabilities, + /// Availability schedule + pub availability: AvailabilitySchedule, + /// Current status + pub status: DeviceStatus, + /// Reputation score (0-100) + pub reputation: u32, +} + +pub struct ComputeCapabilities { + /// FLOPS (single precision) + pub fp32_gflops: f64, + /// FLOPS (half precision) + pub fp16_gflops: f64, + /// Integer operations + pub int8_tops: f64, + /// Memory bandwidth (GB/s) + pub memory_bandwidth: f64, + /// Available memory (GB) + pub memory_gb: f64, + /// Supported frameworks + pub frameworks: Vec, + /// Supported model formats + pub model_formats: Vec, +} + +pub struct NetworkCapabilities { + /// Download speed (Mbps) + pub download_mbps: f64, + /// Upload speed (Mbps) + pub 
upload_mbps: f64, + /// Latency to nearest edge (ms) + pub edge_latency_ms: u32, + /// NAT type + pub nat_type: NatType, +} + +/// When device is available +pub struct AvailabilitySchedule { + /// Always available + pub always: bool, + /// Available hours (UTC) + pub hours: Option>, + /// Available only when idle + pub idle_only: bool, + /// Available only when charging (mobile) + pub charging_only: bool, + /// Minimum battery level (mobile) + pub min_battery: Option, +} +``` + +### 3.3 Mobile SDK + +```rust +// synor-compute/src/sdk/mobile.rs + +/// Mobile SDK for contributing compute +pub struct SynorMobileSDK { + /// Device identifier + device_id: DeviceId, + /// User wallet + wallet: Wallet, + /// Local inference runtime + runtime: MobileInferenceRuntime, + /// Task queue + tasks: TaskQueue, + /// Earnings tracker + earnings: EarningsTracker, +} + +impl SynorMobileSDK { + /// Initialize SDK + pub async fn init(config: MobileConfig) -> Result { + // 1. Generate or load device ID + let device_id = Self::get_or_create_device_id().await?; + + // 2. Initialize wallet + let wallet = Wallet::load_or_create(&config.keystore_path).await?; + + // 3. Detect device capabilities + let capabilities = Self::detect_capabilities().await?; + + // 4. Initialize inference runtime + let runtime = MobileInferenceRuntime::new(&capabilities)?; + + // 5. Register with network + Self::register_device(&device_id, &capabilities).await?; + + Ok(Self { + device_id, + wallet, + runtime, + tasks: TaskQueue::new(), + earnings: EarningsTracker::new(), + }) + } + + /// Start contributing compute + pub async fn start_contributing(&self, settings: ContributionSettings) -> Result<(), Error> { + loop { + // 1. Check if we should be active + if !self.should_be_active(&settings).await { + tokio::time::sleep(Duration::from_secs(60)).await; + continue; + } + + // 2. Get available tasks + let task = self.get_next_task().await?; + + // 3. 
Execute task + let result = self.execute_task(&task).await?; + + // 4. Submit result and earn rewards + let reward = self.submit_result(&task, &result).await?; + self.earnings.add(reward); + } + } + + /// Check contribution conditions + async fn should_be_active(&self, settings: &ContributionSettings) -> bool { + // Check battery + if let Some(min_battery) = settings.min_battery { + if Self::battery_level() < min_battery { + return false; + } + } + + // Check if charging + if settings.charging_only && !Self::is_charging() { + return false; + } + + // Check if idle + if settings.idle_only && !Self::is_idle() { + return false; + } + + // Check thermal state + if Self::thermal_state() == ThermalState::Critical { + return false; + } + + true + } +} + +/// Mobile inference runtime +pub struct MobileInferenceRuntime { + /// Core ML for iOS + #[cfg(target_os = "ios")] + coreml: CoreMlRuntime, + /// NNAPI/GPU delegate for Android + #[cfg(target_os = "android")] + tflite: TfLiteRuntime, + /// Metal for Apple GPUs + #[cfg(any(target_os = "ios", target_os = "macos"))] + metal: MetalRuntime, + /// OpenCL for Android GPUs + #[cfg(target_os = "android")] + opencl: OpenClRuntime, +} +``` + +### 3.4 Browser SDK (WebGPU + WASM) + +```typescript +// synor-compute/sdk/browser/src/index.ts + +/** + * Browser SDK for contributing compute via WebGPU/WASM + */ +export class SynorBrowserSDK { + private deviceId: string; + private wallet: BrowserWallet; + private runtime: BrowserRuntime; + private webgpu: GPUDevice | null; + private worker: Worker; + + /** + * Initialize SDK in browser + */ + static async init(config: BrowserConfig): Promise { + const sdk = new SynorBrowserSDK(); + + // 1. Check WebGPU support + if (navigator.gpu) { + const adapter = await navigator.gpu.requestAdapter(); + if (adapter) { + sdk.webgpu = await adapter.requestDevice(); + console.log('WebGPU available'); + } + } + + // 2. Initialize WASM runtime + sdk.runtime = await BrowserRuntime.init(); + + // 3. 
Create/load wallet + sdk.wallet = await BrowserWallet.loadOrCreate(); + + // 4. Generate device ID + sdk.deviceId = await sdk.generateDeviceId(); + + // 5. Start worker thread + sdk.worker = new Worker(new URL('./worker.ts', import.meta.url)); + + return sdk; + } + + /** + * Start contributing compute + */ + async startContributing(settings: ContributionSettings): Promise { + // Register capabilities + const capabilities = await this.detectCapabilities(); + await this.registerDevice(capabilities); + + // Start task loop in worker + this.worker.postMessage({ + type: 'start', + settings, + capabilities, + }); + + // Listen for results + this.worker.onmessage = async (event) => { + if (event.data.type === 'task_complete') { + await this.submitResult(event.data.taskId, event.data.result); + } else if (event.data.type === 'earnings_update') { + this.onEarningsUpdate?.(event.data.earnings); + } + }; + } + + /** + * Detect browser compute capabilities + */ + private async detectCapabilities(): Promise { + return { + // WebGPU + webgpu: { + available: !!this.webgpu, + maxBufferSize: this.webgpu?.limits.maxBufferSize, + maxComputeWorkgroupSizeX: this.webgpu?.limits.maxComputeWorkgroupSizeX, + }, + // WASM + wasm: { + simd: await this.checkWasmSimd(), + threads: await this.checkWasmThreads(), + memory64: await this.checkWasmMemory64(), + }, + // Hardware + hardwareConcurrency: navigator.hardwareConcurrency, + deviceMemory: (navigator as any).deviceMemory, + // Network + connection: (navigator as any).connection, + }; + } + + /** + * Execute inference task using WebGPU + */ + private async executeWithWebGPU(task: InferenceTask): Promise { + // Load model if not cached + if (!this.modelCache.has(task.modelId)) { + const model = await this.loadModel(task.modelId); + this.modelCache.set(task.modelId, model); + } + + const model = this.modelCache.get(task.modelId)!; + + // Create input buffer + const inputBuffer = this.webgpu!.createBuffer({ + size: task.input.byteLength, + 
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST, + }); + this.webgpu!.queue.writeBuffer(inputBuffer, 0, task.input); + + // Execute compute shader + const commandEncoder = this.webgpu!.createCommandEncoder(); + const passEncoder = commandEncoder.beginComputePass(); + passEncoder.setPipeline(model.pipeline); + passEncoder.setBindGroup(0, model.bindGroup); + passEncoder.dispatchWorkgroups( + Math.ceil(task.input.length / 256) + ); + passEncoder.end(); + + // Read results + const outputBuffer = this.webgpu!.createBuffer({ + size: model.outputSize, + usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST, + }); + commandEncoder.copyBufferToBuffer( + model.outputBuffer, 0, + outputBuffer, 0, + model.outputSize + ); + + this.webgpu!.queue.submit([commandEncoder.finish()]); + + await outputBuffer.mapAsync(GPUMapMode.READ); + const result = new Float32Array(outputBuffer.getMappedRange()); + outputBuffer.unmap(); + + return { output: result }; + } +} +``` + +### 3.5 Desktop App SDK + +```rust +// synor-compute/src/sdk/desktop.rs + +/// Desktop SDK for contributing compute +pub struct SynorDesktopSDK { + device_id: DeviceId, + wallet: Wallet, + /// GPU runtime (CUDA, ROCm, Metal, etc.) + gpu_runtime: Option, + /// CPU runtime + cpu_runtime: CpuRuntime, + /// Task scheduler + scheduler: LocalScheduler, + /// System monitor + monitor: SystemMonitor, +} + +impl SynorDesktopSDK { + pub async fn init() -> Result { + // 1. Detect all available compute resources + let gpus = GpuDetector::detect_all().await?; + let cpus = CpuDetector::detect().await?; + + // 2. Initialize runtimes + let gpu_runtime = if !gpus.is_empty() { + Some(GpuRuntime::init(&gpus).await?) + } else { + None + }; + + let cpu_runtime = CpuRuntime::init(&cpus).await?; + + // 3. 
Start system monitor + let monitor = SystemMonitor::new(); + + Ok(Self { + device_id: DeviceId::generate(), + wallet: Wallet::load_or_create().await?, + gpu_runtime, + cpu_runtime, + scheduler: LocalScheduler::new(), + monitor, + }) + } + + /// Configure resource sharing + pub fn configure(&mut self, config: DesktopContributionConfig) { + // How much GPU to share (0-100%) + self.scheduler.set_gpu_limit(config.gpu_share_percent); + + // How much CPU to share + self.scheduler.set_cpu_limit(config.cpu_share_percent); + + // How much memory to share + self.scheduler.set_memory_limit(config.memory_share_percent); + + // Only run when idle + self.scheduler.set_idle_only(config.idle_only); + + // Power mode preferences + self.scheduler.set_power_mode(config.power_mode); + } +} + +/// GPU detection for all platforms +pub struct GpuDetector; + +impl GpuDetector { + pub async fn detect_all() -> Result, Error> { + let mut gpus = Vec::new(); + + // NVIDIA (CUDA) + #[cfg(feature = "cuda")] + gpus.extend(Self::detect_nvidia().await?); + + // AMD (ROCm) + #[cfg(feature = "rocm")] + gpus.extend(Self::detect_amd().await?); + + // Intel (OneAPI) + #[cfg(feature = "oneapi")] + gpus.extend(Self::detect_intel().await?); + + // Apple (Metal) + #[cfg(target_os = "macos")] + gpus.extend(Self::detect_apple().await?); + + Ok(gpus) + } + + #[cfg(feature = "cuda")] + async fn detect_nvidia() -> Result, Error> { + use nvml_wrapper::Nvml; + + let nvml = Nvml::init()?; + let device_count = nvml.device_count()?; + + let mut gpus = Vec::new(); + for i in 0..device_count { + let device = nvml.device_by_index(i)?; + gpus.push(GpuInfo { + vendor: GpuVendor::Nvidia, + name: device.name()?, + vram_bytes: device.memory_info()?.total, + compute_capability: device.cuda_compute_capability()?, + driver_version: nvml.sys_driver_version()?, + }); + } + + Ok(gpus) + } +} +``` + +### 3.6 Contribution Rewards Model + +```rust +// synor-compute/src/economics/rewards.rs + +/// Reward calculation for device 
contributors +pub struct RewardCalculator { + /// Base reward rates + base_rates: BaseRates, + /// Reputation multiplier + reputation_multiplier: ReputationMultiplier, + /// Uptime bonuses + uptime_bonus: UptimeBonus, +} + +pub struct BaseRates { + /// Per TFLOP-second (GPU) + pub gpu_tflops: f64, // 0.000001 SYNOR/TFLOP-s + /// Per GFLOP-second (CPU) + pub cpu_gflops: f64, // 0.00000001 SYNOR/GFLOP-s + /// Per GB transferred + pub bandwidth_gb: f64, // 0.001 SYNOR/GB + /// Per hour of availability + pub availability_hour: f64, // 0.0001 SYNOR/hour +} + +impl RewardCalculator { + pub fn calculate_reward(&self, contribution: &Contribution) -> Reward { + let base = match contribution.resource_type { + ResourceType::Gpu => { + contribution.tflops * contribution.duration.as_secs_f64() + * self.base_rates.gpu_tflops + } + ResourceType::Cpu => { + contribution.gflops * contribution.duration.as_secs_f64() + * self.base_rates.cpu_gflops + } + ResourceType::Bandwidth => { + contribution.gb_transferred * self.base_rates.bandwidth_gb + } + }; + + // Apply reputation multiplier (0.5x to 2x) + let reputation_mult = self.reputation_multiplier.get(contribution.reputation); + + // Apply uptime bonus (up to 20% extra) + let uptime_mult = self.uptime_bonus.get(contribution.uptime_percent); + + Reward { + base, + reputation_bonus: base * (reputation_mult - 1.0), + uptime_bonus: base * (uptime_mult - 1.0), + total: base * reputation_mult * uptime_mult, + } + } +} + +/// Expected monthly earnings by device type +pub struct EarningsEstimator; + +impl EarningsEstimator { + pub fn estimate_monthly(device: &DeviceType, hours_per_day: f64) -> MonthlyEarnings { + let hourly = match device { + DeviceType::DataCenterGpu { .. } => 0.50, // $0.50/hour + DeviceType::ConsumerGpu { .. } => 0.10, // $0.10/hour + DeviceType::AppleSilicon { .. } => 0.05, // $0.05/hour + DeviceType::Cpu { .. } => 0.01, // $0.01/hour + DeviceType::Mobile { .. } => 0.005, // $0.005/hour + DeviceType::Browser { .. 
} => 0.002, // $0.002/hour + _ => 0.01, + }; + + let daily = hourly * hours_per_day; + let monthly = daily * 30.0; + + MonthlyEarnings { + low: monthly * 0.5, // 50% utilization + medium: monthly * 0.7, // 70% utilization + high: monthly, // 100% utilization + } + } +} +``` + +--- + +## Part 4: Task Distribution Algorithm + +### 4.1 Optimal Task Router + +```rust +// synor-compute/src/scheduler/router.rs + +/// Routes tasks to optimal devices +pub struct TaskRouter { + /// Device registry + devices: Arc, + /// Cost optimizer + cost_optimizer: CostOptimizer, + /// Latency optimizer + latency_optimizer: LatencyOptimizer, + /// Load balancer + load_balancer: LoadBalancer, +} + +impl TaskRouter { + /// Find optimal device(s) for task + pub async fn route(&self, task: &ComputeTask) -> Result { + // 1. Filter devices that can handle this task + let capable_devices = self.devices + .find_capable(&task.requirements) + .await?; + + // 2. Score each device + let mut scored: Vec<(DeviceId, RoutingScore)> = Vec::new(); + + for device in capable_devices { + let score = self.score_device(&device, task).await?; + scored.push((device.device_id, score)); + } + + // 3. Sort by composite score + scored.sort_by(|a, b| b.1.composite.partial_cmp(&a.1.composite).unwrap()); + + // 4. 
Select best device(s) + let selected = if task.distributed { + // Select multiple devices for distributed task + self.select_distributed(&scored, task) + } else { + // Single best device + vec![scored[0].0.clone()] + }; + + Ok(RoutingDecision { + devices: selected, + estimated_cost: scored[0].1.cost, + estimated_latency: scored[0].1.latency, + estimated_duration: scored[0].1.duration, + }) + } + + async fn score_device(&self, device: &DeviceInfo, task: &ComputeTask) -> Result { + // Cost score (lower is better) + let cost = self.cost_optimizer.estimate_cost(device, task); + let cost_score = 1.0 / (1.0 + cost); + + // Latency score (lower is better) + let latency = self.latency_optimizer.estimate_latency(device, task); + let latency_score = 1.0 / (1.0 + latency.as_millis() as f64 / 1000.0); + + // Capability score (higher compute = better) + let capability_score = device.compute.fp16_gflops / task.requirements.min_gflops; + + // Reputation score + let reputation_score = device.reputation as f64 / 100.0; + + // Load score (less loaded = better) + let load = self.load_balancer.current_load(&device.device_id).await?; + let load_score = 1.0 - load; + + // Composite score with weights + let composite = + cost_score * 0.3 + + latency_score * 0.2 + + capability_score * 0.2 + + reputation_score * 0.15 + + load_score * 0.15; + + Ok(RoutingScore { + cost, + latency, + duration: self.estimate_duration(device, task), + composite, + }) + } +} +``` + +### 4.2 Distributed Task Sharding + +```rust +// synor-compute/src/scheduler/sharding.rs + +/// Shard large tasks across multiple devices +pub struct TaskSharder { + /// Sharding strategies + strategies: HashMap>, +} + +#[async_trait] +pub trait ShardingStrategy: Send + Sync { + /// Shard task into subtasks + async fn shard(&self, task: &ComputeTask, devices: &[DeviceInfo]) -> Result, Error>; + + /// Aggregate results from shards + async fn aggregate(&self, results: Vec) -> Result; +} + +/// Data parallel sharding (same model, 
different data) +pub struct DataParallelSharder; + +#[async_trait] +impl ShardingStrategy for DataParallelSharder { + async fn shard(&self, task: &ComputeTask, devices: &[DeviceInfo]) -> Result, Error> { + let data_size = task.input_data.len(); + let num_shards = devices.len(); + let shard_size = data_size / num_shards; + + let mut shards = Vec::new(); + for (i, device) in devices.iter().enumerate() { + let start = i * shard_size; + let end = if i == num_shards - 1 { data_size } else { start + shard_size }; + + shards.push(Shard { + shard_id: i as u32, + device_id: device.device_id.clone(), + model: task.model.clone(), + data_range: start..end, + }); + } + + Ok(shards) + } + + async fn aggregate(&self, results: Vec) -> Result { + // Concatenate results in order + let mut aggregated = Vec::new(); + for result in results.into_iter().sorted_by_key(|r| r.shard_id) { + aggregated.extend(result.output); + } + Ok(TaskResult { output: aggregated }) + } +} + +/// Model parallel sharding (different model layers on different devices) +pub struct ModelParallelSharder { + /// Layer assignments + layer_assignments: Vec<(usize, usize)>, // (start_layer, end_layer) +} + +#[async_trait] +impl ShardingStrategy for ModelParallelSharder { + async fn shard(&self, task: &ComputeTask, devices: &[DeviceInfo]) -> Result, Error> { + // Assign model layers to devices based on memory + let total_layers = task.model.num_layers(); + let mut assignments = Vec::new(); + let mut current_layer = 0; + + for device in devices { + let layers_for_device = self.calculate_layers_for_device( + device, + &task.model, + total_layers - current_layer, + ); + + assignments.push(Shard { + shard_id: assignments.len() as u32, + device_id: device.device_id.clone(), + model_layers: current_layer..(current_layer + layers_for_device), + data: task.input_data.clone(), + }); + + current_layer += layers_for_device; + } + + Ok(assignments) + } + + async fn aggregate(&self, results: Vec) -> Result { + // Pipeline results 
through layers + // Last shard result is the final output + Ok(results.into_iter().last().unwrap().into()) + } +} +``` + +--- + +## Summary: Achieving 90% Cost Reduction + 10x Speed + +### Cost Reduction Breakdown + +| Factor | Savings | How | +|--------|---------|-----| +| Zero cloud margin | 35% | Protocol-only, no corporate overhead | +| Distributed infra | 15% | No data center costs | +| Spot market | 20% | Fill idle capacity at discount | +| Geo arbitrage | 10% | Route to cheap electricity | +| Consumer devices | 10% | Free idle compute | +| **Total** | **90%** | Combined savings | + +### Speed Improvement Breakdown + +| Optimization | Speedup | How | +|--------------|---------|-----| +| Semantic caching | 10-100x | Reuse similar results | +| Speculative execution | 2-5x | Pre-compute likely requests | +| Quantization (INT4/INT8) | 4-8x | Reduced precision inference | +| Hardware compilation | 2-5x | TensorRT, custom kernels | +| Continuous batching | 3-10x | Maximum GPU utilization | +| Edge compute | 2-5x | Compute closer to user | +| **Combined** | **10-50x** | With all optimizations | + +### Consumer Device Contribution + +| Device Type | Contribution | Monthly Earnings | +|-------------|--------------|------------------| +| Data center GPU | Full training/inference | $100-500 | +| Consumer GPU | Inference, light training | $30-100 | +| Apple Silicon | Efficient inference | $15-50 | +| Desktop CPU | Data processing, embeddings | $5-20 | +| Mobile device | Edge inference | $2-10 | +| Browser | Light compute, idle cycles | $1-5 | + +This architecture creates a truly decentralized compute network that can undercut traditional cloud providers while providing competitive performance. 
diff --git a/docs/PLAN/PHASE11-Synor-Compute-L2-Part3-HeterogeneousCompute.md b/docs/PLAN/PHASE11-Synor-Compute-L2-Part3-HeterogeneousCompute.md new file mode 100644 index 0000000..628ca8f --- /dev/null +++ b/docs/PLAN/PHASE11-Synor-Compute-L2-Part3-HeterogeneousCompute.md @@ -0,0 +1,1564 @@ +# Phase 11 Part 3: Heterogeneous Multi-Processor Compute + +> **Goal**: Utilize ALL processor types simultaneously (CPU+GPU+TPU+NPU+LPU+Custom) with intelligent task scheduling to achieve maximum throughput and zero idle processors. + +--- + +## Executive Summary + +Modern compute workloads can be decomposed into subtasks that are optimal for different processor types: + +| Processor | Optimal For | Examples | +|-----------|-------------|----------| +| **CPU** | Sequential logic, control flow, I/O | Data loading, preprocessing, orchestration | +| **GPU** | Parallel matrix operations | Neural network layers, convolutions | +| **TPU** | Tensor operations, ML inference | Transformer attention, matrix multiply | +| **NPU** | Low-power inference | Edge inference, mobile AI | +| **LPU** | Sequential inference (Groq) | LLM token generation | +| **FPGA** | Custom bit-level operations | Cryptography, specialized kernels | +| **DSP** | Signal processing | Audio, video, sensor data | + +**Key Insight**: A single AI training job contains ALL these subtask types. By routing each subtask to the optimal processor, we achieve **2-5x speedup** over GPU-only execution. 
+ +--- + +## Architecture: Unified Heterogeneous Scheduler + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ SYNOR HETEROGENEOUS COMPUTE ENGINE │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ TASK DECOMPOSER │ │ +│ │ Analyzes workload → Identifies subtasks → Maps to optimal processors │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ HETEROGENEOUS SCHEDULER │ │ +│ │ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ │ +│ │ │ CPU │ │ GPU │ │ TPU │ │ NPU │ │ LPU │ │ FPGA │ │ DSP │ │ │ +│ │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │ │ +│ │ └──┬───┘ └──┬───┘ └──┬───┘ └──┬───┘ └──┬───┘ └──┬───┘ └──┬───┘ │ │ +│ └─────┼────────┼────────┼────────┼────────┼────────┼────────┼────────────┘ │ +│ │ │ │ │ │ │ │ │ +│ ▼ ▼ ▼ ▼ ▼ ▼ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ PROCESSOR FABRIC │ │ +│ │ │ │ +│ │ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ │ │ +│ │ │ CPU │ │ GPU │ │ TPU │ │ NPU │ │ LPU │ │ │ +│ │ │Cluster │ │Cluster │ │ Pods │ │ Array │ │ Rack │ │ │ +│ │ │ │ │ │ │ │ │ │ │ │ │ │ +│ │ │ ┌────┐ │ │ ┌────┐ │ │ ┌────┐ │ │ ┌────┐ │ │ ┌────┐ │ │ │ +│ │ │ │Core│ │ │ │CUDA│ │ │ │MXU │ │ │ │ NPE│ │ │ │TSP │ │ │ │ +│ │ │ │Core│ │ │ │CUDA│ │ │ │MXU │ │ │ │ NPE│ │ │ │TSP │ │ │ │ +│ │ │ │Core│ │ │ │CUDA│ │ │ │MXU │ │ │ │ NPE│ │ │ │TSP │ │ │ │ +│ │ │ └────┘ │ │ └────┘ │ │ └────┘ │ │ └────┘ │ │ └────┘ │ │ │ +│ │ └────────┘ └────────┘ └────────┘ └────────┘ └────────┘ │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ UNIFIED MEMORY FABRIC │ │ +│ │ Zero-copy 
data sharing │ Automatic placement │ Cache coherency │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Part 1: Processor Type Definitions + +### 1.1 Unified Processor Abstraction + +```rust +// synor-compute/src/heterogeneous/processor.rs + +/// Unified abstraction for any processor type +pub trait Processor: Send + Sync { + /// Processor type identifier + fn processor_type(&self) -> ProcessorType; + + /// Get capabilities + fn capabilities(&self) -> &ProcessorCapabilities; + + /// Check if processor can execute operation + fn can_execute(&self, op: &Operation) -> bool; + + /// Estimate execution time for operation + fn estimate_time(&self, op: &Operation) -> Duration; + + /// Estimate energy consumption for operation + fn estimate_energy(&self, op: &Operation) -> f64; // Joules + + /// Execute operation + async fn execute(&self, op: Operation) -> Result; + + /// Current utilization (0.0 - 1.0) + fn utilization(&self) -> f64; + + /// Available memory + fn available_memory(&self) -> u64; +} + +/// All supported processor types +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum ProcessorType { + /// Central Processing Unit + Cpu(CpuVariant), + /// Graphics Processing Unit + Gpu(GpuVariant), + /// Tensor Processing Unit (Google) + Tpu(TpuVersion), + /// Neural Processing Unit (various vendors) + Npu(NpuVariant), + /// Language Processing Unit (Groq) + Lpu, + /// Field Programmable Gate Array + Fpga(FpgaVendor), + /// Digital Signal Processor + Dsp(DspVariant), + /// Custom/Unknown Accelerator + Custom { vendor: String, model: String }, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum CpuVariant { + X86_64 { avx: AvxSupport }, + Arm64 { sve: bool }, + RiscV { vector: bool }, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum GpuVariant { + NvidiaCuda { compute_capability: (u8, u8) }, + 
AmdRocm { gfx_version: u32 },
+    IntelOneApi,
+    AppleMetal,
+    QualcommAdreno,
+    ArmMali,
+    WebGpu,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum TpuVersion {
+    V2, V3, V4, V4i, V5e, V5p,
+    EdgeTpu,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum NpuVariant {
+    AppleNeuralEngine { cores: u32 },
+    QualcommHexagon { version: u32 },
+    IntelVpu,
+    HuaweiAscend,
+    GoogleEdgeTpu,
+    Custom { tops: f32 },
+}
+```
+
+### 1.2 Processor Capabilities
+
+```rust
+// synor-compute/src/heterogeneous/capabilities.rs
+
+/// Detailed processor capabilities
+#[derive(Clone, Debug)]
+pub struct ProcessorCapabilities {
+    /// Compute throughput
+    pub compute: ComputeThroughput,
+    /// Memory specs
+    pub memory: MemorySpecs,
+    /// Supported operations
+    pub operations: HashSet<OperationType>,
+    /// Supported data types
+    pub data_types: HashSet<DataType>,
+    /// Power characteristics
+    pub power: PowerCharacteristics,
+    /// Optimal workload characteristics
+    pub optimal_for: Vec<WorkloadCharacteristic>,
+}
+
+#[derive(Clone, Debug)]
+pub struct ComputeThroughput {
+    /// FP64 TFLOPS
+    pub fp64_tflops: f64,
+    /// FP32 TFLOPS
+    pub fp32_tflops: f64,
+    /// FP16 TFLOPS
+    pub fp16_tflops: f64,
+    /// BF16 TFLOPS
+    pub bf16_tflops: f64,
+    /// INT8 TOPS
+    pub int8_tops: f64,
+    /// INT4 TOPS
+    pub int4_tops: f64,
+    /// Sparse operations multiplier
+    pub sparsity_speedup: f64,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum OperationType {
+    // Matrix operations
+    MatMul,
+    Conv2d,
+    Conv3d,
+    DepthwiseConv,
+    BatchNorm,
+    LayerNorm,
+    // Attention operations
+    SelfAttention,
+    CrossAttention,
+    FlashAttention,
+    // Activation functions
+    ReLU,
+    GeLU,
+    SiLU,
+    Softmax,
+    // Reduction operations
+    Sum,
+    Mean,
+    Max,
+    ArgMax,
+    // Data movement
+    Transpose,
+    Reshape,
+    Concat,
+    Split,
+    Gather,
+    Scatter,
+    // Special operations
+    Embedding,
+    RoPE, // Rotary Position Embedding
+    KVCache,
+    TopK,
+    Sampling,
+    // I/O operations
+    DataLoad,
+    DataPreprocess,
+    Tokenization,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum WorkloadCharacteristic { + /// High parallelism (GPU, TPU) + HighlyParallel, + /// Sequential dependencies (CPU, LPU) + Sequential, + /// Memory bandwidth bound (GPU) + MemoryBound, + /// Compute bound (TPU) + ComputeBound, + /// Low latency required (NPU, edge) + LowLatency, + /// Low power required (NPU, mobile) + LowPower, + /// Large batch sizes (GPU, TPU) + LargeBatch, + /// Small batch sizes (CPU, LPU) + SmallBatch, + /// Variable length sequences (LPU) + VariableLength, + /// Fixed tensor shapes (TPU) + FixedShape, +} +``` + +### 1.3 Processor Profiles by Type + +```rust +// synor-compute/src/heterogeneous/profiles.rs + +/// Pre-defined processor profiles +pub struct ProcessorProfiles; + +impl ProcessorProfiles { + /// NVIDIA H100 SXM profile + pub fn nvidia_h100() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 67.0, + fp32_tflops: 67.0, + fp16_tflops: 1979.0, // With sparsity + bf16_tflops: 1979.0, + int8_tops: 3958.0, + int4_tops: 7916.0, + sparsity_speedup: 2.0, + }, + memory: MemorySpecs { + capacity_gb: 80, + bandwidth_gbps: 3350, + type_: MemoryType::Hbm3, + }, + operations: [ + OperationType::MatMul, + OperationType::Conv2d, + OperationType::SelfAttention, + OperationType::FlashAttention, + // ... 
all GPU operations + ].into_iter().collect(), + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + WorkloadCharacteristic::ComputeBound, + ], + ..Default::default() + } + } + + /// Google TPU v5p profile + pub fn google_tpu_v5p() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp32_tflops: 459.0, + bf16_tflops: 918.0, + int8_tops: 1836.0, + ..Default::default() + }, + memory: MemorySpecs { + capacity_gb: 95, + bandwidth_gbps: 4800, + type_: MemoryType::Hbm2e, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::ComputeBound, + WorkloadCharacteristic::FixedShape, + WorkloadCharacteristic::LargeBatch, + ], + ..Default::default() + } + } + + /// Groq LPU profile + pub fn groq_lpu() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + int8_tops: 750.0, + ..Default::default() + }, + memory: MemorySpecs { + capacity_gb: 230, // SRAM! 
+ bandwidth_gbps: 80_000, // 80 TB/s internal + type_: MemoryType::Sram, + }, + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::SmallBatch, + WorkloadCharacteristic::VariableLength, + WorkloadCharacteristic::LowLatency, + ], + ..Default::default() + } + } + + /// Apple M3 Max Neural Engine profile + pub fn apple_neural_engine_m3() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + int8_tops: 18.0, + ..Default::default() + }, + memory: MemorySpecs { + capacity_gb: 0, // Uses unified memory + bandwidth_gbps: 400, + type_: MemoryType::Unified, + }, + optimal_for: vec![ + WorkloadCharacteristic::LowPower, + WorkloadCharacteristic::LowLatency, + WorkloadCharacteristic::SmallBatch, + ], + ..Default::default() + } + } + + /// AMD EPYC 9654 CPU profile + pub fn amd_epyc_9654() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 5.4, + fp32_tflops: 10.8, + ..Default::default() + }, + memory: MemorySpecs { + capacity_gb: 6144, // 6TB max + bandwidth_gbps: 460, + type_: MemoryType::Ddr5, + }, + operations: [ + OperationType::DataLoad, + OperationType::DataPreprocess, + OperationType::Tokenization, + // Sequential operations + ].into_iter().collect(), + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::MemoryBound, + ], + ..Default::default() + } + } +} +``` + +--- + +## Part 2: Task Decomposition Engine + +### 2.1 Workload Analyzer + +```rust +// synor-compute/src/heterogeneous/analyzer.rs + +/// Analyzes workloads and decomposes into optimal subtasks +pub struct WorkloadAnalyzer { + /// Operation cost models for each processor type + cost_models: HashMap, + /// Dependency graph builder + graph_builder: DependencyGraphBuilder, + /// ML model for workload prediction + predictor: WorkloadPredictor, +} + +impl WorkloadAnalyzer { + /// Analyze a computation graph and decompose into subtasks + pub async fn analyze(&self, graph: 
&ComputationGraph) -> WorkloadAnalysis { + // 1. Build dependency graph + let deps = self.graph_builder.build(graph); + + // 2. Identify operation types + let operations = self.identify_operations(graph); + + // 3. Estimate costs for each processor type + let cost_matrix = self.estimate_costs(&operations); + + // 4. Find optimal assignment + let assignment = self.optimize_assignment(&deps, &cost_matrix); + + // 5. Create execution plan + WorkloadAnalysis { + operations, + dependencies: deps, + cost_matrix, + optimal_assignment: assignment, + estimated_speedup: self.calculate_speedup(&assignment), + } + } + + /// Estimate operation costs across all processor types + fn estimate_costs(&self, operations: &[Operation]) -> CostMatrix { + let mut matrix = CostMatrix::new(operations.len(), self.cost_models.len()); + + for (op_idx, op) in operations.iter().enumerate() { + for (proc_idx, (proc_type, model)) in self.cost_models.iter().enumerate() { + let cost = if model.can_execute(op) { + model.estimate_cost(op) + } else { + f64::INFINITY // Can't execute on this processor + }; + matrix.set(op_idx, proc_idx, cost); + } + } + + matrix + } + + /// Optimize task-to-processor assignment + fn optimize_assignment( + &self, + deps: &DependencyGraph, + costs: &CostMatrix, + ) -> TaskAssignment { + // Use ILP (Integer Linear Programming) or heuristic + // to minimize total execution time considering: + // 1. Operation costs on each processor + // 2. Data transfer costs between processors + // 3. Dependency constraints (ordering) + // 4. 
Processor capacity constraints + + let solver = HeterogeneousSchedulingSolver::new(); + solver.solve(deps, costs) + } +} + +/// Cost matrix: operations × processor types +pub struct CostMatrix { + /// Rows: operations, Cols: processor types + data: Vec>, + /// Data transfer costs between processor types + transfer_costs: HashMap<(ProcessorType, ProcessorType), f64>, +} + +impl CostMatrix { + /// Get cost of operation on processor + pub fn get(&self, op: usize, proc: usize) -> f64 { + self.data[op][proc] + } + + /// Get data transfer cost between processors + pub fn transfer_cost(&self, from: ProcessorType, to: ProcessorType, bytes: u64) -> f64 { + if from == to { + 0.0 // Same processor type, no transfer + } else { + let per_byte = self.transfer_costs + .get(&(from, to)) + .unwrap_or(&1e-9); // Default: 1ns per byte + *per_byte * bytes as f64 + } + } +} +``` + +### 2.2 AI Training Decomposition Example + +```rust +// synor-compute/src/heterogeneous/training.rs + +/// Decompose AI training into heterogeneous subtasks +pub struct TrainingDecomposer; + +impl TrainingDecomposer { + /// Decompose a training iteration into processor-specific tasks + pub fn decompose_iteration( + &self, + model: &Model, + batch: &Batch, + available_processors: &[ProcessorInfo], + ) -> DecomposedIteration { + let mut tasks = Vec::new(); + + // ═══════════════════════════════════════════════════════════════ + // PHASE 1: DATA LOADING & PREPROCESSING → CPU + // ═══════════════════════════════════════════════════════════════ + tasks.push(Task { + id: TaskId::new(), + operation: Operation::DataLoad { + batch_ids: batch.ids.clone(), + shuffle: true, + }, + optimal_processor: ProcessorType::Cpu(CpuVariant::X86_64 { avx: AvxSupport::Avx512 }), + priority: TaskPriority::High, + dependencies: vec![], + }); + + tasks.push(Task { + id: TaskId::new(), + operation: Operation::DataPreprocess { + transforms: vec![ + Transform::Normalize, + Transform::Augment, + Transform::ToTensor, + ], + }, + 
optimal_processor: ProcessorType::Cpu(CpuVariant::X86_64 { avx: AvxSupport::Avx512 }), + priority: TaskPriority::High, + dependencies: vec![tasks[0].id], + }); + + // ═══════════════════════════════════════════════════════════════ + // PHASE 2: TOKENIZATION (for LLMs) → CPU or NPU + // ═══════════════════════════════════════════════════════════════ + if model.model_type == ModelType::Llm { + tasks.push(Task { + id: TaskId::new(), + operation: Operation::Tokenization { + vocab_size: model.vocab_size, + max_length: model.max_seq_len, + }, + optimal_processor: ProcessorType::Cpu(CpuVariant::X86_64 { avx: AvxSupport::Avx512 }), + priority: TaskPriority::High, + dependencies: vec![tasks[1].id], + }); + } + + // ═══════════════════════════════════════════════════════════════ + // PHASE 3: EMBEDDING LOOKUP → GPU (memory bandwidth bound) + // ═══════════════════════════════════════════════════════════════ + tasks.push(Task { + id: TaskId::new(), + operation: Operation::Embedding { + vocab_size: model.vocab_size, + embedding_dim: model.embedding_dim, + }, + optimal_processor: ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), // H100 + }), + priority: TaskPriority::High, + dependencies: vec![tasks.last().unwrap().id], + }); + + // ═══════════════════════════════════════════════════════════════ + // PHASE 4: TRANSFORMER LAYERS → TPU or GPU (compute bound) + // ═══════════════════════════════════════════════════════════════ + let embedding_task_id = tasks.last().unwrap().id; + + for layer_idx in 0..model.num_layers { + // Self-attention → TPU optimal (large matrix multiplies) + tasks.push(Task { + id: TaskId::new(), + operation: Operation::SelfAttention { + layer: layer_idx, + num_heads: model.num_heads, + head_dim: model.head_dim, + use_flash: true, + }, + optimal_processor: ProcessorType::Tpu(TpuVersion::V5p), + fallback_processor: Some(ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + })), + priority: TaskPriority::Critical, 
+ dependencies: vec![ + if layer_idx == 0 { embedding_task_id } else { tasks.last().unwrap().id } + ], + }); + + // FFN (Feed-Forward Network) → GPU optimal + tasks.push(Task { + id: TaskId::new(), + operation: Operation::FeedForward { + layer: layer_idx, + hidden_dim: model.ffn_dim, + activation: Activation::SiLU, + }, + optimal_processor: ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + }), + priority: TaskPriority::Critical, + dependencies: vec![tasks.last().unwrap().id], + }); + } + + // ═══════════════════════════════════════════════════════════════ + // PHASE 5: OUTPUT PROJECTION & LOSS → GPU + // ═══════════════════════════════════════════════════════════════ + tasks.push(Task { + id: TaskId::new(), + operation: Operation::OutputProjection { + vocab_size: model.vocab_size, + }, + optimal_processor: ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + }), + priority: TaskPriority::High, + dependencies: vec![tasks.last().unwrap().id], + }); + + tasks.push(Task { + id: TaskId::new(), + operation: Operation::CrossEntropyLoss {}, + optimal_processor: ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + }), + priority: TaskPriority::High, + dependencies: vec![tasks.last().unwrap().id], + }); + + // ═══════════════════════════════════════════════════════════════ + // PHASE 6: BACKWARD PASS → Same as forward, reversed + // ═══════════════════════════════════════════════════════════════ + let forward_tasks = tasks.clone(); + for task in forward_tasks.iter().rev() { + if let Some(backward_op) = task.operation.backward() { + tasks.push(Task { + id: TaskId::new(), + operation: backward_op, + optimal_processor: task.optimal_processor, + priority: task.priority, + dependencies: vec![tasks.last().unwrap().id], + }); + } + } + + // ═══════════════════════════════════════════════════════════════ + // PHASE 7: GRADIENT AGGREGATION → CPU (network I/O) + GPU (compute) + // 
═══════════════════════════════════════════════════════════════ + tasks.push(Task { + id: TaskId::new(), + operation: Operation::AllReduce { + algorithm: AllReduceAlgorithm::RingAllReduce, + }, + optimal_processor: ProcessorType::Cpu(CpuVariant::X86_64 { avx: AvxSupport::Avx512 }), + priority: TaskPriority::Critical, + dependencies: vec![tasks.last().unwrap().id], + }); + + // ═══════════════════════════════════════════════════════════════ + // PHASE 8: OPTIMIZER STEP → GPU + // ═══════════════════════════════════════════════════════════════ + tasks.push(Task { + id: TaskId::new(), + operation: Operation::OptimizerStep { + optimizer: OptimizerType::AdamW, + learning_rate: 1e-4, + }, + optimal_processor: ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + }), + priority: TaskPriority::High, + dependencies: vec![tasks.last().unwrap().id], + }); + + // ═══════════════════════════════════════════════════════════════ + // PHASE 9: CHECKPOINTING → CPU (I/O) + // ═══════════════════════════════════════════════════════════════ + tasks.push(Task { + id: TaskId::new(), + operation: Operation::Checkpoint { + async_: true, + }, + optimal_processor: ProcessorType::Cpu(CpuVariant::X86_64 { avx: AvxSupport::Avx512 }), + priority: TaskPriority::Low, + dependencies: vec![tasks.last().unwrap().id], + }); + + DecomposedIteration { + tasks, + estimated_time: self.estimate_total_time(&tasks), + processor_utilization: self.estimate_utilization(&tasks), + } + } +} +``` + +--- + +## Part 3: Heterogeneous Scheduler + +### 3.1 Multi-Queue Scheduler + +```rust +// synor-compute/src/heterogeneous/scheduler.rs + +/// Scheduler that manages tasks across all processor types +pub struct HeterogeneousScheduler { + /// Per-processor-type task queues + queues: HashMap, + /// Available processors + processors: Vec>, + /// Task dependency tracker + dependencies: DependencyTracker, + /// Load balancer + load_balancer: LoadBalancer, + /// Data placement optimizer + 
data_placement: DataPlacementOptimizer, +} + +impl HeterogeneousScheduler { + /// Schedule a decomposed workload + pub async fn schedule(&self, workload: DecomposedWorkload) -> Result<ScheduleResult> { + // 1. Build execution graph + let graph = self.dependencies.build_graph(&workload.tasks); + + // 2. Assign tasks to processors + let assignment = self.assign_tasks(&workload.tasks, &graph).await?; + + // 3. Optimize data placement + let data_plan = self.data_placement.optimize(&assignment).await?; + + // 4. Create execution schedule + let schedule = self.create_schedule(&assignment, &data_plan, &graph)?; + + Ok(ScheduleResult { + schedule, + data_plan, + estimated_makespan: self.estimate_makespan(&schedule), + processor_utilization: self.estimate_utilization(&schedule), + }) + } + + /// Assign tasks to optimal processors + async fn assign_tasks( + &self, + tasks: &[Task], + graph: &DependencyGraph, + ) -> Result<TaskAssignment> { + let mut assignment = TaskAssignment::new(); + + // Sort tasks by priority and dependencies (topological sort) + let sorted_tasks = graph.topological_sort(tasks); + + for task in sorted_tasks { + // Find best processor for this task + let best_processor = self.find_best_processor(&task).await?; + + // Check if we should steal work for load balancing + let final_processor = self.load_balancer + .maybe_rebalance(&task, best_processor, &assignment) + .await?; + + assignment.assign(task.id, final_processor); + } + + Ok(assignment) + } + + /// Find the best processor for a task + async fn find_best_processor(&self, task: &Task) -> Result<ProcessorId> { + let mut best_score = f64::NEG_INFINITY; + let mut best_processor = None; + + for processor in &self.processors { + if !processor.can_execute(&task.operation) { + continue; + } + + // Score = 1 / (execution_time + data_transfer_time) + let exec_time = processor.estimate_time(&task.operation); + let transfer_time = self.estimate_data_transfer_time(task, processor.as_ref()); + let total_time = exec_time + transfer_time; + + // Adjust for 
current load + let load_factor = 1.0 + processor.utilization(); + let adjusted_time = total_time.as_secs_f64() * load_factor; + + let score = 1.0 / adjusted_time; + + if score > best_score { + best_score = score; + best_processor = Some(processor.id()); + } + } + + best_processor.ok_or(Error::NoSuitableProcessor) + } + + /// Execute the schedule + pub async fn execute(&self, schedule: &Schedule) -> Result { + let mut handles = Vec::new(); + let results = Arc::new(Mutex::new(HashMap::new())); + let completed = Arc::new(AtomicUsize::new(0)); + + // Create execution contexts for each processor + let contexts: HashMap = self.processors + .iter() + .map(|p| (p.id(), ExecutionContext::new(p.clone()))) + .collect(); + + // Execute tasks in schedule order + for stage in &schedule.stages { + // Execute all tasks in this stage in parallel + let stage_handles: Vec<_> = stage.tasks + .iter() + .map(|task_id| { + let task = schedule.get_task(*task_id); + let processor_id = schedule.get_assignment(*task_id); + let context = contexts.get(&processor_id).unwrap().clone(); + let results = results.clone(); + let completed = completed.clone(); + + tokio::spawn(async move { + // Wait for dependencies + task.wait_for_dependencies(&results).await; + + // Execute on assigned processor + let result = context.execute(&task).await?; + + // Store result + results.lock().await.insert(task.id, result); + completed.fetch_add(1, Ordering::SeqCst); + + Ok::<_, Error>(()) + }) + }) + .collect(); + + // Wait for all tasks in stage to complete + for handle in stage_handles { + handle.await??; + } + } + + Ok(ExecutionResult { + results: Arc::try_unwrap(results).unwrap().into_inner(), + total_time: schedule.estimated_makespan, + processor_utilization: self.measure_utilization(&contexts), + }) + } +} +``` + +### 3.2 Work Stealing for Load Balancing + +```rust +// synor-compute/src/heterogeneous/work_stealing.rs + +/// Work stealing scheduler for load balancing +pub struct WorkStealingScheduler { + /// 
Per-processor work queues (deques for work stealing) + queues: HashMap, + /// Stealing policy + policy: StealingPolicy, +} + +impl WorkStealingScheduler { + /// Try to steal work for an idle processor + pub async fn try_steal(&self, idle_processor: ProcessorId) -> Option { + let idle_type = self.get_processor_type(idle_processor); + + // Find most loaded processor with compatible tasks + let mut best_victim = None; + let mut best_load = 0; + + for (proc_id, queue) in &self.queues { + if *proc_id == idle_processor { + continue; + } + + // Check if this queue has tasks compatible with idle processor + let compatible_count = queue.count_compatible(idle_type); + if compatible_count > best_load { + best_load = compatible_count; + best_victim = Some(*proc_id); + } + } + + // Steal from the most loaded compatible queue + if let Some(victim) = best_victim { + let victim_queue = self.queues.get(&victim)?; + + // Steal from the back of the queue (oldest tasks) + victim_queue.steal_compatible(idle_type).await + } else { + None + } + } + + /// Rebalance when processor utilization is uneven + pub async fn rebalance(&self) -> Vec { + let mut migrations = Vec::new(); + + // Calculate average utilization + let total_util: f64 = self.queues.values().map(|q| q.utilization()).sum(); + let avg_util = total_util / self.queues.len() as f64; + + // Find overloaded and underloaded processors + let mut overloaded: Vec<_> = self.queues.iter() + .filter(|(_, q)| q.utilization() > avg_util * 1.2) + .collect(); + let mut underloaded: Vec<_> = self.queues.iter() + .filter(|(_, q)| q.utilization() < avg_util * 0.8) + .collect(); + + // Sort by utilization + overloaded.sort_by(|a, b| b.1.utilization().partial_cmp(&a.1.utilization()).unwrap()); + underloaded.sort_by(|a, b| a.1.utilization().partial_cmp(&b.1.utilization()).unwrap()); + + // Migrate tasks from overloaded to underloaded + for (over_id, over_queue) in overloaded { + for (under_id, under_queue) in &underloaded { + if 
over_queue.utilization() <= avg_util { + break; + } + + let under_type = self.get_processor_type(**under_id); + + // Find tasks that can be migrated + if let Some(task) = over_queue.find_migratable(under_type) { + migrations.push(TaskMigration { + task_id: task.id, + from: *over_id, + to: **under_id, + }); + } + } + } + + migrations + } +} + +/// Work queue with lock-free deque for work stealing +pub struct WorkQueue { + /// Double-ended queue for work stealing + deque: crossbeam_deque::Injector, + /// Local queues per worker + local: Vec>, + /// Stealers for other workers + stealers: Vec>, + /// Current utilization + utilization: AtomicU64, +} + +impl WorkQueue { + /// Push task (owner pushes to front) + pub fn push(&self, task: Task) { + self.deque.push(task); + } + + /// Pop task (owner pops from front) + pub fn pop(&self) -> Option { + self.deque.steal().success() + } + + /// Steal task (thieves steal from back) + pub async fn steal_compatible(&self, processor_type: ProcessorType) -> Option { + // Try to steal a task compatible with the given processor type + loop { + match self.deque.steal() { + crossbeam_deque::Steal::Success(task) => { + if task.is_compatible_with(processor_type) { + return Some(task); + } else { + // Put it back and try again + self.deque.push(task); + } + } + crossbeam_deque::Steal::Empty => return None, + crossbeam_deque::Steal::Retry => continue, + } + } + } +} +``` + +### 3.3 Pipeline Parallelism Across Processors + +```rust +// synor-compute/src/heterogeneous/pipeline.rs + +/// Pipeline parallelism across heterogeneous processors +pub struct HeterogeneousPipeline { + /// Pipeline stages + stages: Vec, + /// Inter-stage buffers + buffers: Vec, + /// Synchronization + sync: PipelineSync, +} + +/// A stage in the pipeline assigned to a processor type +pub struct PipelineStage { + pub stage_id: usize, + pub operations: Vec, + pub processor_type: ProcessorType, + pub processors: Vec, // Multiple processors for parallelism +} + +impl 
HeterogeneousPipeline { + /// Create a pipeline for LLM inference + pub fn create_llm_pipeline( + model: &LlmModel, + available_processors: &ProcessorRegistry, + ) -> Self { + let mut stages = Vec::new(); + + // Stage 1: Tokenization → CPU + stages.push(PipelineStage { + stage_id: 0, + operations: vec![Operation::Tokenization { .. }], + processor_type: ProcessorType::Cpu(CpuVariant::X86_64 { .. }), + processors: available_processors.get_type(ProcessorType::Cpu(..)), + }); + + // Stage 2: Embedding → GPU (memory bound) + stages.push(PipelineStage { + stage_id: 1, + operations: vec![Operation::Embedding { .. }], + processor_type: ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. }), + processors: available_processors.get_type(ProcessorType::Gpu(..)), + }); + + // Stage 3: Transformer layers → TPU (if available) or GPU + let transformer_processor = if available_processors.has_tpu() { + ProcessorType::Tpu(TpuVersion::V5p) + } else { + ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (9, 0) }) + }; + + stages.push(PipelineStage { + stage_id: 2, + operations: model.layers.iter().flat_map(|l| l.operations()).collect(), + processor_type: transformer_processor, + processors: available_processors.get_type(transformer_processor), + }); + + // Stage 4: Token generation → LPU (if available, best for sequential) or GPU + let generation_processor = if available_processors.has_lpu() { + ProcessorType::Lpu + } else { + ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (9, 0) }) + }; + + stages.push(PipelineStage { + stage_id: 3, + operations: vec![ + Operation::OutputProjection { .. }, + Operation::Sampling { .. }, + ], + processor_type: generation_processor, + processors: available_processors.get_type(generation_processor), + }); + + // Stage 5: Detokenization → CPU + stages.push(PipelineStage { + stage_id: 4, + operations: vec![Operation::Detokenization { .. }], + processor_type: ProcessorType::Cpu(CpuVariant::X86_64 { .. 
}), + processors: available_processors.get_type(ProcessorType::Cpu(..)), + }); + + // Create inter-stage buffers + let buffers = (0..stages.len() - 1) + .map(|i| PipelineBuffer::new( + stages[i].processor_type, + stages[i + 1].processor_type, + )) + .collect(); + + Self { + stages, + buffers, + sync: PipelineSync::new(), + } + } + + /// Execute pipeline with micro-batching + pub async fn execute_stream( + &self, + input_stream: impl Stream, + ) -> impl Stream { + let (tx, rx) = mpsc::channel(1024); + + // Start pipeline stages + for (i, stage) in self.stages.iter().enumerate() { + let input_buffer = if i == 0 { + None + } else { + Some(self.buffers[i - 1].clone()) + }; + + let output_buffer = if i == self.stages.len() - 1 { + None + } else { + Some(self.buffers[i].clone()) + }; + + let stage = stage.clone(); + let tx = tx.clone(); + + tokio::spawn(async move { + stage.run(input_buffer, output_buffer, tx).await; + }); + } + + // Feed input stream to first stage + let first_buffer = self.buffers[0].clone(); + tokio::spawn(async move { + pin_mut!(input_stream); + while let Some(request) = input_stream.next().await { + first_buffer.push(request.into()).await; + } + }); + + ReceiverStream::new(rx) + } +} + +/// Buffer between pipeline stages with automatic data transfer +pub struct PipelineBuffer { + /// Source processor type + source_type: ProcessorType, + /// Destination processor type + dest_type: ProcessorType, + /// Data queue + queue: Arc>, + /// Transfer strategy + transfer: DataTransferStrategy, +} + +impl PipelineBuffer { + /// Push data from source stage + pub async fn push(&self, data: PipelineData) { + // Transfer data if processors have different memory spaces + let transferred = if self.needs_transfer() { + self.transfer.transfer(&data, self.source_type, self.dest_type).await + } else { + data + }; + + self.queue.push(transferred).unwrap(); + } + + /// Pop data for destination stage + pub async fn pop(&self) -> Option { + self.queue.pop() + } + + fn 
needs_transfer(&self) -> bool { + !self.source_type.shares_memory_with(&self.dest_type) + } +} +``` + +--- + +## Part 4: Data Movement Optimization + +### 4.1 Unified Memory Management + +```rust +// synor-compute/src/heterogeneous/memory.rs + +/// Unified memory manager across all processor types +pub struct UnifiedMemoryManager { + /// Memory allocators per processor type + allocators: HashMap>, + /// Data location tracker + locations: DataLocationTracker, + /// Transfer scheduler + transfer_scheduler: TransferScheduler, + /// Prefetch predictor + prefetcher: PrefetchPredictor, +} + +impl UnifiedMemoryManager { + /// Allocate tensor with optimal placement + pub async fn allocate_tensor( + &self, + shape: &[usize], + dtype: DataType, + hint: PlacementHint, + ) -> Result { + // Determine optimal initial placement + let location = match hint { + PlacementHint::Processor(proc_type) => proc_type, + PlacementHint::Operation(op) => self.optimal_location_for_op(&op), + PlacementHint::Auto => self.predict_optimal_location(shape, dtype), + }; + + // Allocate on chosen processor + let allocator = self.allocators.get(&location)?; + let ptr = allocator.allocate(shape.iter().product::() * dtype.size())?; + + // Register with location tracker + let handle = TensorHandle::new(ptr, shape.to_vec(), dtype); + self.locations.register(&handle, location); + + Ok(handle) + } + + /// Ensure tensor is available on specified processor + pub async fn ensure_on( + &self, + tensor: &TensorHandle, + target: ProcessorType, + ) -> Result { + let current_location = self.locations.get(tensor)?; + + if current_location == target { + // Already on target, return view + return Ok(TensorView::new(tensor, target)); + } + + // Check if already cached on target + if let Some(cached) = self.locations.get_cached(tensor, target) { + return Ok(cached); + } + + // Need to transfer + let transfer = self.transfer_scheduler.schedule_transfer( + tensor, + current_location, + target, + ).await?; + + // Execute 
transfer + transfer.execute().await?; + + // Register new location + self.locations.add_copy(tensor, target); + + Ok(TensorView::new(tensor, target)) + } + + /// Prefetch data before it's needed + pub async fn prefetch(&self, tensor: &TensorHandle, target: ProcessorType) { + // Don't wait, just schedule the transfer + let _ = self.transfer_scheduler.schedule_transfer_async( + tensor, + self.locations.get(tensor).unwrap_or(ProcessorType::Cpu(Default::default())), + target, + ).await; + } +} + +/// Optimized data transfer between processors +pub struct TransferScheduler { + /// Direct transfer paths (e.g., NVLink, PCIe P2P) + direct_paths: HashMap<(ProcessorType, ProcessorType), TransferPath>, + /// Transfer queue + queue: TransferQueue, +} + +impl TransferScheduler { + /// Schedule optimal transfer + pub async fn schedule_transfer( + &self, + tensor: &TensorHandle, + from: ProcessorType, + to: ProcessorType, + ) -> Result { + // Find optimal path + let path = self.find_optimal_path(from, to, tensor.size_bytes()); + + // Create transfer + let transfer = Transfer { + tensor: tensor.clone(), + path, + size: tensor.size_bytes(), + }; + + // Add to queue (batching similar transfers) + self.queue.enqueue(transfer.clone()).await; + + Ok(transfer) + } + + fn find_optimal_path( + &self, + from: ProcessorType, + to: ProcessorType, + size: usize, + ) -> TransferPath { + // Check for direct path first + if let Some(direct) = self.direct_paths.get(&(from, to)) { + return direct.clone(); + } + + // Check for direct path in reverse (bidirectional) + if let Some(direct) = self.direct_paths.get(&(to, from)) { + return direct.clone(); + } + + // Fall back to CPU-mediated transfer + TransferPath::CpuMediated { from, to } + } +} + +/// Available transfer paths +#[derive(Clone, Debug)] +pub enum TransferPath { + /// Direct GPU-to-GPU (NVLink, NVSwitch) + NvLink { bandwidth_gbps: u32 }, + /// PCIe peer-to-peer + PciePeerToPeer { gen: u8, lanes: u8 }, + /// Through CPU memory (slowest) + 
CpuMediated { from: ProcessorType, to: ProcessorType }, + /// Unified memory (Apple, some AMD APUs) + UnifiedMemory, + /// Network transfer (for distributed) + Network { protocol: NetworkProtocol }, +} +``` + +--- + +## Part 5: Example: Heterogeneous LLM Inference + +### 5.1 Complete Example Flow + +```rust +// synor-compute/src/examples/heterogeneous_llm.rs + +/// Example: Running LLM inference across CPU + GPU + TPU + LPU +pub async fn run_heterogeneous_inference( + prompt: &str, + model: &LlmModel, + processors: &ProcessorRegistry, +) -> Result { + let scheduler = HeterogeneousScheduler::new(processors); + + // ═══════════════════════════════════════════════════════════════ + // STEP 1: TOKENIZATION (CPU) + // CPU is optimal for string processing and variable-length operations + // ═══════════════════════════════════════════════════════════════ + let cpu = processors.get_best(ProcessorType::Cpu(..))?; + let tokens = cpu.execute(Operation::Tokenization { + text: prompt.to_string(), + vocab: model.vocab.clone(), + }).await?; + + println!("✓ Tokenization complete on CPU: {} tokens", tokens.len()); + + // ═══════════════════════════════════════════════════════════════ + // STEP 2: EMBEDDING LOOKUP (GPU) + // GPU is optimal for memory-bandwidth-bound operations + // ═══════════════════════════════════════════════════════════════ + let gpu = processors.get_best(ProcessorType::Gpu(..))?; + let embeddings = gpu.execute(Operation::Embedding { + tokens: tokens.clone(), + embedding_table: model.embedding_table.clone(), + }).await?; + + println!("✓ Embedding complete on GPU"); + + // ═══════════════════════════════════════════════════════════════ + // STEP 3: PREFILL (PARALLEL ATTENTION) → TPU or GPU + // TPU excels at large matrix multiplications with fixed shapes + // ═══════════════════════════════════════════════════════════════ + let prefill_processor = processors + .get_best(ProcessorType::Tpu(..)) + .or_else(|_| processors.get_best(ProcessorType::Gpu(..)))?; + + let 
mut hidden_states = embeddings; + + for layer_idx in 0..model.num_layers { + hidden_states = prefill_processor.execute(Operation::TransformerLayer { + layer: layer_idx, + input: hidden_states, + attention_mask: None, + kv_cache: None, // No cache for prefill + }).await?; + } + + println!("✓ Prefill complete on {:?}", prefill_processor.processor_type()); + + // ═══════════════════════════════════════════════════════════════ + // STEP 4: DECODE (SEQUENTIAL TOKEN GENERATION) → LPU or GPU + // LPU excels at sequential, low-batch operations (autoregressive) + // ═══════════════════════════════════════════════════════════════ + let decode_processor = processors + .get_best(ProcessorType::Lpu) + .or_else(|_| processors.get_best(ProcessorType::Gpu(..)))?; + + let mut generated_tokens = Vec::new(); + let mut kv_cache = KvCache::new(); + + for _ in 0..model.max_new_tokens { + // Run one decode step + let logits = decode_processor.execute(Operation::DecodeStep { + hidden_states: hidden_states.last_token(), + kv_cache: &mut kv_cache, + layers: &model.layers, + }).await?; + + // Sample next token + let next_token = decode_processor.execute(Operation::Sampling { + logits, + temperature: 0.7, + top_p: 0.9, + }).await?; + + if next_token == model.eos_token { + break; + } + + generated_tokens.push(next_token); + + // Get embedding for next iteration + hidden_states = gpu.execute(Operation::Embedding { + tokens: vec![next_token], + embedding_table: model.embedding_table.clone(), + }).await?; + } + + println!("✓ Decode complete on {:?}: {} tokens generated", + decode_processor.processor_type(), + generated_tokens.len()); + + // ═══════════════════════════════════════════════════════════════ + // STEP 5: DETOKENIZATION (CPU) + // CPU handles string operations and variable-length output + // ═══════════════════════════════════════════════════════════════ + let output = cpu.execute(Operation::Detokenization { + tokens: generated_tokens, + vocab: model.vocab.clone(), + }).await?; + + 
println!("✓ Detokenization complete on CPU"); + + Ok(output) +} +``` + +### 5.2 Utilization Report + +``` +╔═══════════════════════════════════════════════════════════════════════════╗ +║ HETEROGENEOUS INFERENCE REPORT ║ +╠═══════════════════════════════════════════════════════════════════════════╣ +║ ║ +║ Model: Llama-70B ║ +║ Input: 512 tokens ║ +║ Output: 256 tokens ║ +║ ║ +║ ┌────────────────────────────────────────────────────────────────────┐ ║ +║ │ PROCESSOR UTILIZATION │ ║ +║ ├────────────┬──────────┬──────────┬──────────┬────────────────────┤ ║ +║ │ Processor │ Time │ Util % │ Tasks │ Operations │ ║ +║ ├────────────┼──────────┼──────────┼──────────┼────────────────────┤ ║ +║ │ CPU │ 15ms │ 8% │ 2 │ Token, Detoken │ ║ +║ │ GPU (H100) │ 120ms │ 65% │ 257 │ Embedding (×257) │ ║ +║ │ TPU v5p │ 200ms │ 95% │ 80 │ Prefill layers │ ║ +║ │ LPU (Groq) │ 450ms │ 92% │ 256 │ Decode steps │ ║ +║ └────────────┴──────────┴──────────┴──────────┴────────────────────┘ ║ +║ ║ +║ Total Time: 785ms (vs 2400ms GPU-only = 3.1x speedup) ║ +║ Zero Idle Processors: ✓ ║ +║ ║ +║ ┌────────────────────────────────────────────────────────────────────┐ ║ +║ │ TIMELINE │ ║ +║ ├────────────────────────────────────────────────────────────────────┤ ║ +║ │ │ ║ +║ │ CPU ██░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██ │ ║ +║ │ │Tok Detok│ │ ║ +║ │ │ ║ +║ │ GPU ░░██████████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ │ ║ +║ │ │Embed×512 │ │ ║ +║ │ │ ║ +║ │ TPU ░░░░░░░░░░░░░░██████████████████████████░░░░░░░░░░░░░░░░░░░░ │ ║ +║ │ │Prefill (80 layers) │ │ ║ +║ │ │ ║ +║ │ LPU ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████████████████████████ │ ║ +║ │ │Decode (256 steps) │ │ ║ +║ │ │ ║ +║ │ 0ms 200ms 400ms 600ms 800ms │ ║ +║ └────────────────────────────────────────────────────────────────────┘ ║ +║ ║ +╚═══════════════════════════════════════════════════════════════════════════╝ +``` + +--- + +## Summary: Multi-Processor Advantages + +### Processor-Task Mapping + +| Task Type | Best 
Processor | Why | +|-----------|----------------|-----| +| Data loading, I/O | **CPU** | Sequential, system calls | +| Tokenization/Detokenization | **CPU** | String processing | +| Embedding lookup | **GPU** | Memory bandwidth | +| Matrix multiply (large) | **TPU** | Dedicated MXU units | +| Attention (prefill) | **TPU/GPU** | Parallel, compute-bound | +| Token generation (decode) | **LPU** | Sequential, low latency | +| On-device inference | **NPU** | Power efficient | +| Browser compute | **WebGPU** | Platform agnostic | +| Cryptography | **FPGA** | Custom bit operations | +| Signal processing | **DSP** | Specialized math | + +### Expected Speedups + +| Workload | GPU-Only | Heterogeneous | Speedup | +|----------|----------|---------------|---------| +| LLM Training | 1x | 1.5-2x | +50-100% | +| LLM Inference | 1x | 2-4x | +100-300% | +| Image Generation | 1x | 1.3-1.8x | +30-80% | +| RAG Pipeline | 1x | 2-3x | +100-200% | +| Real-time Video | 1x | 3-5x | +200-400% | + +### Zero Idle Guarantee + +The heterogeneous scheduler ensures: +1. **Parallel execution** across processor types +2. **Pipeline overlap** between stages +3. **Work stealing** when processors become idle +4. **Predictive prefetching** of data +5. **Dynamic rebalancing** based on actual throughput + +This architecture maximizes hardware utilization and minimizes total execution time by using EVERY available processor simultaneously. diff --git a/docs/PLAN/PHASE11-Synor-Compute-L2.md b/docs/PLAN/PHASE11-Synor-Compute-L2.md new file mode 100644 index 0000000..dea2dd9 --- /dev/null +++ b/docs/PLAN/PHASE11-Synor-Compute-L2.md @@ -0,0 +1,906 @@ +# Phase 11: Synor Compute L2 - Full-Stack Compute Platform + +> **Mission**: Build a decentralized compute platform capable of AI/ML training, inference, OS hosting, and general-purpose high-performance computing. 
+ +--- + +## Executive Summary + +Synor Compute L2 extends beyond the current WASM-only Synor VM to provide: +- **GPU Compute**: AI/ML training and inference with CUDA/ROCm support +- **Container Orchestration**: Docker-compatible workloads with Kubernetes-style scheduling +- **Persistent VMs**: Long-running virtual machines for OS hosting +- **Serverless Functions**: Short-lived compute for API backends and event processing +- **Edge Compute**: Low-latency compute at network edge nodes + +--- + +## Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ SYNOR COMPUTE L2 │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ APPLICATION LAYER │ │ +│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │ +│ │ AI/ML │ Serverless │ Containers │ Persistent │ Edge │ │ +│ │ Training │ Functions │ (Docker) │ VMs (Linux) │ Compute │ │ +│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ ORCHESTRATION LAYER │ │ +│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │ +│ │ Job │ Resource │ Network │ Storage │ Health │ │ +│ │ Scheduler │ Manager │ Fabric │ Orchestrator│ Monitor │ │ +│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ COMPUTE RUNTIME LAYER │ │ +│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │ +│ │ GPU │ Container │ MicroVM │ WASM │ Native │ │ +│ │ Runtime │ Runtime │ Runtime │ Runtime │ Runtime │ │ +│ │ (CUDA/ROCm)│ (containerd)│ (Firecracker)│ (Wasmtime) │ (gVisor) │ │ +│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │ +│ │ +│ 
┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ INFRASTRUCTURE LAYER │ │ +│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │ +│ │ Node │ Network │ Distributed │ Consensus │ Billing │ │ +│ │ Registry │ Overlay │ Storage │ (PoS+PoW) │ Metering │ │ +│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ SYNOR L1 BLOCKCHAIN (GHOSTDAG + DAG-RIDER) │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Milestone 1: GPU Compute Foundation (AI/ML Training & Inference) + +### 1.1 GPU Node Registration + +```rust +// synor-compute/src/gpu/node.rs + +/// GPU node capabilities +pub struct GpuNode { + /// Unique node ID + pub node_id: NodeId, + /// GPU specifications + pub gpus: Vec, + /// Total VRAM available (bytes) + pub total_vram: u64, + /// Available VRAM (bytes) + pub available_vram: u64, + /// CUDA compute capability (e.g., 8.6 for RTX 3090) + pub cuda_capability: Option<(u8, u8)>, + /// ROCm version (for AMD) + pub rocm_version: Option, + /// Network bandwidth (Gbps) + pub bandwidth_gbps: u32, + /// Geographic region + pub region: Region, + /// Stake amount (for PoS validation) + pub stake: u64, +} + +pub struct GpuSpec { + pub model: String, // "NVIDIA RTX 4090" + pub vram_gb: u32, // 24 + pub tensor_cores: u32, // 512 + pub cuda_cores: u32, // 16384 + pub memory_bandwidth: u32, // 1008 GB/s + pub fp32_tflops: f32, // 82.6 + pub fp16_tflops: f32, // 165.2 + pub int8_tops: f32, // 330.4 +} +``` + +### 1.2 AI/ML Job Specification + +```rust +// synor-compute/src/ai/job.rs + +/// AI/ML training job specification +pub struct TrainingJob { + /// Job ID + pub job_id: JobId, + /// Owner address + pub owner: Address, + /// Framework (PyTorch, TensorFlow, JAX) + pub 
framework: MlFramework, + /// Model specification + pub model: ModelSpec, + /// Dataset reference (Synor Storage CID) + pub dataset_cid: Cid, + /// Training configuration + pub config: TrainingConfig, + /// Resource requirements + pub resources: GpuResources, + /// Maximum budget (SYNOR tokens) + pub max_budget: u64, + /// Checkpoint interval (steps) + pub checkpoint_interval: u64, +} + +pub struct GpuResources { + pub min_gpus: u32, + pub max_gpus: u32, + pub min_vram_per_gpu: u64, + pub cuda_capability_min: Option<(u8, u8)>, + pub distributed: bool, // Multi-node training + pub priority: JobPriority, +} + +pub enum MlFramework { + PyTorch { version: String }, + TensorFlow { version: String }, + JAX { version: String }, + ONNX, + Custom { image: String }, +} + +pub struct TrainingConfig { + pub epochs: u32, + pub batch_size: u32, + pub learning_rate: f32, + pub optimizer: String, + pub mixed_precision: bool, + pub gradient_accumulation: u32, + pub distributed_strategy: DistributedStrategy, +} + +pub enum DistributedStrategy { + DataParallel, + ModelParallel, + PipelineParallel, + ZeRO { stage: u8 }, // DeepSpeed ZeRO stages 1-3 + FSDP, // Fully Sharded Data Parallel +} +``` + +### 1.3 Inference Service + +```rust +// synor-compute/src/ai/inference.rs + +/// Inference endpoint specification +pub struct InferenceEndpoint { + /// Endpoint ID + pub endpoint_id: EndpointId, + /// Model reference (Synor Storage CID) + pub model_cid: Cid, + /// Model format + pub format: ModelFormat, + /// Scaling configuration + pub scaling: AutoscaleConfig, + /// GPU requirements per replica + pub gpu_per_replica: GpuResources, + /// Request timeout + pub timeout_ms: u32, + /// Max batch size for batching inference + pub max_batch_size: u32, + /// Batching timeout + pub batch_timeout_ms: u32, +} + +pub enum ModelFormat { + PyTorch, + ONNX, + TensorRT, + Triton, + vLLM, // For LLM serving + TGI, // Text Generation Inference + Custom, +} + +pub struct AutoscaleConfig { + pub 
min_replicas: u32,
+    pub max_replicas: u32,
+    pub target_gpu_utilization: f32,
+    pub scale_up_threshold: f32,
+    pub scale_down_threshold: f32,
+    pub cooldown_seconds: u32,
+}
+```
+
+### 1.4 Pricing Model for GPU Compute
+
+| Resource | Unit | Price (SYNOR/unit) |
+|----------|------|-------------------|
+| GPU (RTX 4090 equivalent) | hour | 0.50 |
+| GPU (A100 80GB equivalent) | hour | 2.00 |
+| GPU (H100 equivalent) | hour | 4.00 |
+| VRAM | GB/hour | 0.01 |
+| Network egress | GB | 0.05 |
+| Storage (hot, NVMe) | GB/month | 0.10 |
+| Inference requests | 1M tokens | 0.10 |
+
+---
+
+## Milestone 2: Container Orchestration (Docker/Kubernetes-Compatible)
+
+### 2.1 Container Runtime
+
+```rust
+// synor-compute/src/container/runtime.rs
+
+/// Container specification (OCI-compatible)
+pub struct ContainerSpec {
+    /// Image reference
+    pub image: ImageRef,
+    /// Resource limits
+    pub resources: ContainerResources,
+    /// Environment variables
+    pub env: HashMap<String, String>,
+    /// Volume mounts
+    pub volumes: Vec<VolumeMount>,
+    /// Network configuration
+    pub network: NetworkConfig,
+    /// Security context
+    pub security: SecurityContext,
+    /// Health check
+    pub health_check: Option<HealthCheck>,
+}
+
+pub struct ContainerResources {
+    pub cpu_cores: f32, // 0.5, 1.0, 2.0, etc. 
+ pub memory_mb: u64, + pub gpu: Option, + pub ephemeral_storage_gb: u32, + pub network_bandwidth_mbps: u32, +} + +pub struct GpuAllocation { + pub count: u32, + pub vram_mb: u64, + pub shared: bool, // Allow GPU sharing via MPS/MIG +} +``` + +### 2.2 Service Mesh & Networking + +```rust +// synor-compute/src/network/mesh.rs + +/// Service definition for container orchestration +pub struct Service { + pub service_id: ServiceId, + pub name: String, + pub containers: Vec, + pub replicas: ReplicaConfig, + pub load_balancer: LoadBalancerConfig, + pub service_mesh: ServiceMeshConfig, +} + +pub struct ServiceMeshConfig { + pub mtls_enabled: bool, + pub traffic_policy: TrafficPolicy, + pub circuit_breaker: CircuitBreakerConfig, + pub retry_policy: RetryPolicy, + pub rate_limit: Option, +} + +pub struct LoadBalancerConfig { + pub algorithm: LoadBalancerAlgorithm, + pub health_check: HealthCheck, + pub sticky_sessions: bool, + pub ssl_termination: SslTermination, +} + +pub enum LoadBalancerAlgorithm { + RoundRobin, + LeastConnections, + WeightedRoundRobin { weights: Vec }, + IPHash, + Random, +} +``` + +### 2.3 Container Pricing + +| Resource | Unit | Price (SYNOR/unit) | +|----------|------|-------------------| +| CPU | core/hour | 0.02 | +| Memory | GB/hour | 0.005 | +| Ephemeral storage | GB/hour | 0.001 | +| Network ingress | GB | FREE | +| Network egress | GB | 0.05 | +| Load balancer | hour | 0.01 | +| Static IP | month | 2.00 | + +--- + +## Milestone 3: Persistent Virtual Machines (OS Hosting) + +### 3.1 MicroVM Architecture (Firecracker-based) + +```rust +// synor-compute/src/vm/microvm.rs + +/// Virtual machine specification +pub struct VmSpec { + /// VM ID + pub vm_id: VmId, + /// Owner address + pub owner: Address, + /// VM size + pub size: VmSize, + /// Boot image + pub image: VmImage, + /// Persistent volumes + pub volumes: Vec, + /// Network configuration + pub network: VmNetworkConfig, + /// SSH keys for access + pub ssh_keys: Vec, + /// Cloud-init user data 
+ pub user_data: Option, +} + +pub struct VmSize { + pub vcpus: u32, + pub memory_gb: u32, + pub gpu: Option, + pub network_bandwidth_gbps: u32, +} + +pub struct GpuPassthrough { + pub count: u32, + pub model: GpuModel, + pub vram_gb: u32, +} + +pub enum VmImage { + /// Pre-built images + Marketplace { image_id: String, version: String }, + /// Custom image from Synor Storage + Custom { cid: Cid, format: ImageFormat }, + /// Standard OS images + Ubuntu { version: String }, + Debian { version: String }, + AlmaLinux { version: String }, + Windows { version: String, license: WindowsLicense }, +} + +pub struct PersistentVolume { + pub volume_id: VolumeId, + pub size_gb: u32, + pub volume_type: VolumeType, + pub mount_path: String, + pub encrypted: bool, +} + +pub enum VolumeType { + /// High-performance NVMe SSD + NvmeSsd { iops: u32, throughput_mbps: u32 }, + /// Standard SSD + Ssd, + /// HDD for archival + Hdd, + /// Distributed storage (Synor Storage L2) + Distributed { replication: u8 }, +} +``` + +### 3.2 VM Lifecycle Management + +```rust +// synor-compute/src/vm/lifecycle.rs + +pub enum VmState { + Pending, + Provisioning, + Running, + Stopping, + Stopped, + Hibernating, + Hibernated, + Migrating, + Failed, + Terminated, +} + +pub struct VmManager { + /// Active VMs + vms: HashMap, + /// Node assignments + node_assignments: HashMap, + /// Live migration coordinator + migration_coordinator: MigrationCoordinator, +} + +impl VmManager { + /// Start a new VM + pub async fn create(&self, spec: VmSpec) -> Result; + + /// Stop a VM (preserves state) + pub async fn stop(&self, vm_id: &VmId) -> Result<(), VmError>; + + /// Start a stopped VM + pub async fn start(&self, vm_id: &VmId) -> Result<(), VmError>; + + /// Hibernate VM to storage (saves memory state) + pub async fn hibernate(&self, vm_id: &VmId) -> Result<(), VmError>; + + /// Live migrate VM to another node + pub async fn migrate(&self, vm_id: &VmId, target_node: NodeId) -> Result<(), VmError>; + + /// Resize VM 
(requires restart) + pub async fn resize(&self, vm_id: &VmId, new_size: VmSize) -> Result<(), VmError>; + + /// Snapshot VM state + pub async fn snapshot(&self, vm_id: &VmId) -> Result; + + /// Terminate and delete VM + pub async fn terminate(&self, vm_id: &VmId) -> Result<(), VmError>; +} +``` + +### 3.3 VM Pricing + +| VM Type | vCPUs | Memory | Storage | GPU | Price (SYNOR/month) | +|---------|-------|--------|---------|-----|---------------------| +| micro | 1 | 1 GB | 20 GB SSD | - | 5 | +| small | 2 | 4 GB | 50 GB SSD | - | 15 | +| medium | 4 | 8 GB | 100 GB SSD | - | 30 | +| large | 8 | 32 GB | 200 GB SSD | - | 80 | +| xlarge | 16 | 64 GB | 500 GB NVMe | - | 200 | +| gpu-small | 8 | 32 GB | 200 GB NVMe | 1x RTX 4090 | 400 | +| gpu-medium | 16 | 64 GB | 500 GB NVMe | 2x RTX 4090 | 750 | +| gpu-large | 32 | 128 GB | 1 TB NVMe | 4x A100 80GB | 2500 | +| gpu-xlarge | 64 | 256 GB | 2 TB NVMe | 8x H100 | 8000 | + +--- + +## Milestone 4: Serverless Functions (FaaS) + +### 4.1 Function Specification + +```rust +// synor-compute/src/serverless/function.rs + +/// Serverless function definition +pub struct Function { + pub function_id: FunctionId, + pub owner: Address, + pub name: String, + pub runtime: FunctionRuntime, + pub handler: String, + pub code: FunctionCode, + pub resources: FunctionResources, + pub triggers: Vec, + pub environment: HashMap, + pub timeout_ms: u32, + pub concurrency: ConcurrencyConfig, +} + +pub enum FunctionRuntime { + Node20, + Node22, + Python311, + Python312, + Rust, + Go122, + Java21, + Dotnet8, + Ruby33, + Custom { image: String }, +} + +pub struct FunctionCode { + /// Source code CID in Synor Storage + pub cid: Cid, + /// Entry point file + pub entry_point: String, + /// Dependencies (package.json, requirements.txt, etc.) 
+    pub dependencies: Option<Cid>,
+}
+
+pub struct FunctionResources {
+    pub memory_mb: u32, // 128, 256, 512, 1024, 2048, 4096, 8192
+    pub cpu_allocation: f32, // Proportional to memory
+    pub ephemeral_storage_mb: u32,
+    pub gpu: Option<GpuAllocation>,
+}
+
+pub enum FunctionTrigger {
+    /// HTTP endpoint
+    Http { path: String, methods: Vec<String> },
+    /// Scheduled execution (cron)
+    Schedule { cron: String },
+    /// Event from message queue
+    Queue { queue_name: String },
+    /// Storage events
+    Storage { bucket: String, events: Vec<String> },
+    /// Blockchain events
+    Blockchain { contract: Address, events: Vec<String> },
+    /// Webhook
+    Webhook { url: String },
+}
+```
+
+### 4.2 Cold Start Optimization
+
+```rust
+// synor-compute/src/serverless/warmup.rs
+
+/// Function warmup strategies
+pub struct WarmupConfig {
+    /// Minimum warm instances
+    pub min_instances: u32,
+    /// Provisioned concurrency
+    pub provisioned_concurrency: u32,
+    /// Warmup schedule
+    pub warmup_schedule: Option<String>,
+    /// Snapshot-based cold start (SnapStart)
+    pub snapstart_enabled: bool,
+}
+
+pub struct ColdStartOptimizer {
+    /// Pre-warmed function pools
+    pools: HashMap<FunctionRuntime, WarmPool>,
+    /// Snapshot cache
+    snapshots: LruCache<FunctionId, FunctionSnapshot>,
+    /// Prediction model for scaling
+    predictor: ScalingPredictor,
+}
+
+impl ColdStartOptimizer {
+    /// Get a warm instance or create one
+    pub async fn get_instance(&self, function: &Function) -> Result<FunctionInstance, FunctionError> {
+        // Try snapshot restore first (< 100ms)
+        if let Some(snapshot) = self.snapshots.get(&function.function_id) {
+            return self.restore_from_snapshot(snapshot).await;
+        }
+
+        // Try warm pool (< 50ms)
+        if let Some(instance) = self.pools.get(&function.runtime)?.get_warm() {
+            return Ok(instance);
+        }
+
+        // Cold start (1-5s depending on runtime)
+        self.cold_start(function).await
+    }
+}
+```
+
+### 4.3 Serverless Pricing
+
+| Resource | Unit | Price (SYNOR) |
+|----------|------|---------------|
+| Invocations | 1M requests | 0.20 |
+| Duration | GB-second | 0.00001 |
+| Provisioned concurrency | GB-hour | 0.01 |
+| HTTP 
Gateway | 1M requests | 0.10 |
+| Event bridge | 1M events | 0.50 |
+
+---
+
+## Milestone 5: Edge Compute
+
+### 5.1 Edge Node Architecture
+
+```rust
+// synor-compute/src/edge/node.rs
+
+/// Edge compute node
+pub struct EdgeNode {
+    pub node_id: NodeId,
+    pub location: GeoLocation,
+    pub capabilities: EdgeCapabilities,
+    pub latency_zones: Vec<LatencyZone>,
+    pub resources: EdgeResources,
+}
+
+pub struct EdgeCapabilities {
+    pub wasm_runtime: bool,
+    pub container_runtime: bool,
+    pub gpu_inference: bool,
+    pub video_transcoding: bool,
+    pub cdn_cache: bool,
+}
+
+pub struct EdgeResources {
+    pub cpu_cores: u32,
+    pub memory_gb: u32,
+    pub storage_gb: u32,
+    pub gpu: Option<GpuSpec>,
+    pub bandwidth_gbps: u32,
+}
+
+/// Edge function for low-latency compute
+pub struct EdgeFunction {
+    pub function_id: FunctionId,
+    pub code: WasmModule,
+    pub memory_limit: u32,
+    pub timeout_ms: u32,
+    pub allowed_regions: Vec<Region>,
+}
+```
+
+### 5.2 Edge Use Cases
+
+```rust
+// synor-compute/src/edge/usecases.rs
+
+/// CDN with compute at edge
+pub struct EdgeCdn {
+    /// Origin servers
+    origins: Vec<Origin>,
+    /// Cache rules
+    cache_rules: Vec<CacheRule>,
+    /// Edge workers for request/response transformation
+    workers: Vec<EdgeFunction>,
+}
+
+/// Real-time inference at edge
+pub struct EdgeInference {
+    /// Model optimized for edge (quantized, pruned)
+    model_id: ModelId,
+    /// Inference runtime (TensorRT, ONNX Runtime)
+    runtime: EdgeInferenceRuntime,
+    /// Max batch size
+    max_batch: u32,
+    /// Target latency
+    target_latency_ms: u32,
+}
+
+/// Video processing at edge
+pub struct EdgeVideoProcessor {
+    /// Transcoding profiles
+    profiles: Vec<TranscodeProfile>,
+    /// Real-time streaming
+    live_streaming: bool,
+    /// Adaptive bitrate
+    abr_enabled: bool,
+}
+```
+
+### 5.3 Edge Pricing
+
+| Resource | Unit | Price (SYNOR) |
+|----------|------|---------------|
+| Edge function invocations | 1M | 0.50 |
+| Edge function duration | GB-second | 0.00002 |
+| Edge bandwidth | GB | 0.08 |
+| Edge cache storage | GB/month | 0.02 |
+| Video 
transcoding | minute | 0.02 | + +--- + +## Milestone 6: Node Provider Economics + +### 6.1 Provider Registration + +```rust +// synor-compute/src/provider/registration.rs + +/// Compute provider registration +pub struct ProviderRegistration { + pub provider_id: ProviderId, + pub owner: Address, + /// Stake required to become provider + pub stake: u64, + /// Hardware specifications + pub hardware: HardwareManifest, + /// Network connectivity + pub network: NetworkManifest, + /// Geographic location + pub location: GeoLocation, + /// Availability SLA commitment + pub sla: SlaCommitment, +} + +pub struct HardwareManifest { + pub cpus: Vec, + pub memory_total_gb: u64, + pub gpus: Vec, + pub storage: Vec, + pub verified: bool, // Hardware attestation passed +} + +pub struct SlaCommitment { + pub uptime_percent: f32, // 99.9, 99.99, etc. + pub response_time_ms: u32, + pub data_durability: f32, + pub penalty_rate: f32, // Penalty for SLA violation +} +``` + +### 6.2 Provider Revenue Model + +| Revenue Source | Provider Share | Protocol Share | +|----------------|----------------|----------------| +| Compute fees | 85% | 15% | +| Storage fees | 80% | 20% | +| Network fees | 75% | 25% | +| SLA bonuses | 100% | 0% | +| Staking rewards | 100% | 0% | + +### 6.3 Slashing Conditions + +| Violation | Penalty | +|-----------|---------| +| Downtime > committed SLA | 1% stake per hour | +| Data loss | 10% stake + compensation | +| Malicious behavior | 100% stake | +| False hardware attestation | 50% stake | + +--- + +## Implementation Timeline + +### Phase 11.1: Foundation (Weeks 1-4) +- [ ] Node registration and hardware attestation +- [ ] Basic job scheduler +- [ ] WASM runtime integration (existing) +- [ ] Container runtime (containerd) +- [ ] Network overlay (WireGuard mesh) + +### Phase 11.2: GPU Compute (Weeks 5-8) +- [ ] GPU node registration +- [ ] NVIDIA driver integration +- [ ] CUDA runtime support +- [ ] Basic ML job execution +- [ ] Model storage integration + +### 
Phase 11.3: Container Orchestration (Weeks 9-12) +- [ ] OCI image support +- [ ] Service deployment +- [ ] Load balancing +- [ ] Auto-scaling +- [ ] Service mesh (mTLS) + +### Phase 11.4: Persistent VMs (Weeks 13-16) +- [ ] MicroVM runtime (Firecracker) +- [ ] VM lifecycle management +- [ ] Persistent storage +- [ ] Live migration +- [ ] Snapshot/restore + +### Phase 11.5: Serverless (Weeks 17-20) +- [ ] Function deployment +- [ ] Cold start optimization +- [ ] Event triggers +- [ ] API gateway +- [ ] Monitoring/logging + +### Phase 11.6: Edge Compute (Weeks 21-24) +- [ ] Edge node registration +- [ ] Edge function runtime +- [ ] CDN integration +- [ ] Edge inference +- [ ] Global anycast + +--- + +## Security Considerations + +### Isolation Levels + +| Workload Type | Isolation Technology | Security Level | +|---------------|---------------------|----------------| +| WASM | Wasmtime sandbox | High | +| Serverless | gVisor + seccomp | High | +| Containers | gVisor or Kata | Medium-High | +| VMs | Firecracker MicroVM | High | +| GPU | NVIDIA MIG/MPS | Medium | + +### Network Security + +- All inter-node traffic encrypted (WireGuard) +- mTLS for service-to-service communication +- Network policies for workload isolation +- DDoS protection at edge + +### Data Security + +- Encryption at rest (AES-256) +- Encryption in transit (TLS 1.3) +- Confidential computing support (AMD SEV, Intel SGX) +- Secure key management (HSM integration) + +--- + +## API Examples + +### Deploy AI Training Job + +```bash +synor compute train create \ + --framework pytorch \ + --model-config ./model.yaml \ + --dataset synor://datasets/imagenet \ + --gpus 8 \ + --gpu-type h100 \ + --distributed ddp \ + --epochs 100 \ + --checkpoint-interval 1000 \ + --max-budget 1000 +``` + +### Deploy Inference Endpoint + +```bash +synor compute inference deploy \ + --model synor://models/llama-70b \ + --format vllm \ + --min-replicas 2 \ + --max-replicas 10 \ + --gpu-per-replica 2 \ + --target-utilization 
0.7 +``` + +### Create Persistent VM + +```bash +synor compute vm create \ + --name my-dev-server \ + --image ubuntu:22.04 \ + --size gpu-small \ + --volume 100gb:nvme:/data \ + --ssh-key ~/.ssh/id_ed25519.pub \ + --region us-east +``` + +### Deploy Container Service + +```bash +synor compute service deploy \ + --name my-api \ + --image my-registry/my-api:latest \ + --replicas 3 \ + --cpu 2 \ + --memory 4gb \ + --port 8080 \ + --health-check /health \ + --autoscale 2-10 +``` + +### Deploy Serverless Function + +```bash +synor compute function deploy \ + --name process-image \ + --runtime python312 \ + --handler main.handler \ + --code ./function \ + --memory 1024 \ + --timeout 30000 \ + --trigger http:/api/process +``` + +--- + +## Comparison with Existing Synor VM + +| Feature | Current Synor VM | Synor Compute L2 | +|---------|------------------|------------------| +| Runtime | WASM only | WASM, Container, MicroVM | +| Timeout | 30 seconds | Unlimited (VMs) | +| Memory | 16 MB max | Up to 256 GB | +| GPU | ❌ | ✅ Full CUDA/ROCm | +| Networking | ❌ | ✅ Full TCP/UDP | +| File I/O | ❌ | ✅ Persistent volumes | +| Threading | ❌ | ✅ Multi-threaded | +| AI/ML | ❌ | ✅ Training + Inference | +| OS Hosting | ❌ | ✅ Full Linux/Windows | + +--- + +## Next Steps + +1. **Milestone 1**: Implement GPU node registration and attestation +2. **Milestone 2**: Build basic job scheduler with resource allocation +3. **Milestone 3**: Integrate containerd for container workloads +4. **Milestone 4**: Add Firecracker for MicroVM support +5. **Milestone 5**: Implement serverless function runtime +6. **Milestone 6**: Deploy edge nodes and CDN integration + +This plan transforms Synor from a smart contract platform into a full-stack decentralized cloud provider capable of competing with AWS/GCP/Azure while maintaining decentralization and censorship resistance.