feat(compute): add Phase 11 Synor Compute L2 heterogeneous compute layer
- Add synor-compute crate for heterogeneous compute orchestration - Implement processor abstraction for CPU/GPU/TPU/NPU/LPU/FPGA/DSP - Add device registry with cross-vendor capability tracking - Implement task scheduler with work stealing and load balancing - Add energy-aware and latency-aware balancing strategies - Create spot market for compute resources with order matching - Add memory manager with tensor handles and cross-device transfers - Support processor capability profiles (H100, TPU v5p, Groq LPU, etc.) - Implement priority work queues with task decomposition Processor types supported: - CPU (x86-64 AVX512, ARM64 SVE, RISC-V Vector) - GPU (NVIDIA CUDA, AMD ROCm, Intel OneAPI, Apple Metal) - TPU (v2-v5p, Edge TPU) - NPU (Apple Neural Engine, Qualcomm Hexagon, Intel VPU) - LPU (Groq Language Processing Unit) - FPGA (Xilinx, Intel Altera) - DSP (TI, Analog Devices) - WebGPU and WASM runtimes
This commit is contained in:
parent
8da34bc73d
commit
4c36ddbdc2
19 changed files with 11219 additions and 0 deletions
|
|
@ -9,6 +9,7 @@ members = [
|
||||||
"crates/synor-storage",
|
"crates/synor-storage",
|
||||||
"crates/synor-hosting",
|
"crates/synor-hosting",
|
||||||
"crates/synor-database",
|
"crates/synor-database",
|
||||||
|
"crates/synor-compute",
|
||||||
"crates/synor-governance",
|
"crates/synor-governance",
|
||||||
"crates/synor-rpc",
|
"crates/synor-rpc",
|
||||||
"crates/synor-vm",
|
"crates/synor-vm",
|
||||||
|
|
|
||||||
51
crates/synor-compute/Cargo.toml
Normal file
51
crates/synor-compute/Cargo.toml
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
[package]
name = "synor-compute"
version.workspace = true
edition.workspace = true
description = "Heterogeneous multi-processor compute platform for Synor blockchain"
license.workspace = true

[dependencies]
# Internal crates
synor-types = { path = "../synor-types" }
synor-crypto = { path = "../synor-crypto" }
synor-storage = { path = "../synor-storage" }

# Serialization
serde.workspace = true
serde_json.workspace = true
borsh.workspace = true
# bincode is pinned here (not workspace-managed) — used by ComputeError's From impl.
bincode = "1.3"

# Async runtime
tokio = { workspace = true, features = ["sync", "rt-multi-thread", "time", "macros"] }
async-trait = "0.1"
futures = "0.3"

# Concurrency (work-stealing deques, channels, concurrent maps)
parking_lot.workspace = true
crossbeam-deque = "0.8"
crossbeam-channel = "0.5"
dashmap = "5.5"

# Utilities
thiserror.workspace = true
tracing.workspace = true
hex.workspace = true

# Hashing
blake3.workspace = true

# Data structures
indexmap = "2.2"
priority-queue = "2.0"

# Time
chrono = { version = "0.4", features = ["serde"] }

# Random (used for DeviceId / JobId generation)
rand = "0.8"

[dev-dependencies]
tempfile.workspace = true
tokio-test = "0.4"
|
||||||
377
crates/synor-compute/src/device/mod.rs
Normal file
377
crates/synor-compute/src/device/mod.rs
Normal file
|
|
@ -0,0 +1,377 @@
|
||||||
|
//! Device registry and management.
|
||||||
|
//!
|
||||||
|
//! Supports all device types:
|
||||||
|
//! - Data center servers
|
||||||
|
//! - Desktop workstations
|
||||||
|
//! - Laptops
|
||||||
|
//! - Mobile devices (iOS, Android)
|
||||||
|
//! - Browsers (WebGPU, WASM)
|
||||||
|
//! - IoT devices
|
||||||
|
|
||||||
|
use crate::error::ComputeError;
|
||||||
|
use crate::processor::{GenericProcessor, Processor, ProcessorCapabilities, ProcessorId, ProcessorType};
|
||||||
|
use crate::{NodeId, ProcessorInfo};
|
||||||
|
use parking_lot::RwLock;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
/// Unique device identifier.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub struct DeviceId(pub [u8; 32]);
|
||||||
|
|
||||||
|
impl DeviceId {
|
||||||
|
/// Creates a new random device ID.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
use rand::Rng;
|
||||||
|
let mut bytes = [0u8; 32];
|
||||||
|
rand::thread_rng().fill(&mut bytes);
|
||||||
|
DeviceId(bytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates from bytes.
|
||||||
|
pub fn from_bytes(bytes: [u8; 32]) -> Self {
|
||||||
|
DeviceId(bytes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for DeviceId {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for DeviceId {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "dev_{}", hex::encode(&self.0[..8]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Device type classification.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub enum DeviceType {
|
||||||
|
/// Data center server.
|
||||||
|
DataCenter,
|
||||||
|
/// Desktop workstation.
|
||||||
|
Desktop,
|
||||||
|
/// Laptop.
|
||||||
|
Laptop,
|
||||||
|
/// Mobile phone.
|
||||||
|
Mobile,
|
||||||
|
/// Tablet.
|
||||||
|
Tablet,
|
||||||
|
/// IoT device.
|
||||||
|
IoT,
|
||||||
|
/// Browser (WebGPU/WASM).
|
||||||
|
Browser,
|
||||||
|
/// Edge server.
|
||||||
|
Edge,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DeviceType {
|
||||||
|
/// Returns typical reliability score (0-100).
|
||||||
|
pub fn reliability(&self) -> u32 {
|
||||||
|
match self {
|
||||||
|
DeviceType::DataCenter => 99,
|
||||||
|
DeviceType::Edge => 95,
|
||||||
|
DeviceType::Desktop => 80,
|
||||||
|
DeviceType::Laptop => 60,
|
||||||
|
DeviceType::Mobile => 40,
|
||||||
|
DeviceType::Tablet => 50,
|
||||||
|
DeviceType::IoT => 70,
|
||||||
|
DeviceType::Browser => 30,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns typical availability hours per day.
|
||||||
|
pub fn availability_hours(&self) -> f32 {
|
||||||
|
match self {
|
||||||
|
DeviceType::DataCenter => 24.0,
|
||||||
|
DeviceType::Edge => 24.0,
|
||||||
|
DeviceType::Desktop => 8.0,
|
||||||
|
DeviceType::Laptop => 6.0,
|
||||||
|
DeviceType::Mobile => 4.0,
|
||||||
|
DeviceType::Tablet => 4.0,
|
||||||
|
DeviceType::IoT => 24.0,
|
||||||
|
DeviceType::Browser => 2.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Device capabilities.
///
/// Static description of what a device can offer; dynamic state lives in
/// [`DeviceInfo::status`] and the processor-level utilization fields.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DeviceCapabilities {
    /// Device type.
    pub device_type: DeviceType,
    /// Available processors.
    pub processors: Vec<ProcessorType>,
    /// Total memory (GB).
    pub memory_gb: f32,
    /// Network bandwidth (Mbps).
    pub bandwidth_mbps: f32,
    /// Storage available (GB).
    pub storage_gb: f32,
    /// Battery powered (true for laptops/mobiles; affects scheduling).
    pub battery_powered: bool,
    /// Supports background execution (e.g. can run tasks while unfocused).
    pub background_execution: bool,
}

/// Device information.
///
/// The full registry record for one device: identity, ownership,
/// capabilities, and current status/accounting.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DeviceInfo {
    /// Device ID.
    pub id: DeviceId,
    /// Device type.
    pub device_type: DeviceType,
    /// Owner address.
    pub owner: [u8; 32],
    /// Capabilities.
    pub capabilities: DeviceCapabilities,
    /// Current status.
    pub status: DeviceStatus,
    /// Reputation score (0-100).
    pub reputation: u32,
    /// Total earnings (atomic SYNOR).
    pub earnings: u64,
    /// Geographic region (free-form string, e.g. "us-east").
    pub region: String,
}

/// Device status.
///
/// NOTE(review): `online_devices()` treats only `Online` and `Idle` as
/// available; `Busy` and `OnBattery` are excluded — confirm that is intended.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum DeviceStatus {
    /// Online and available.
    Online,
    /// Online but busy.
    Busy,
    /// Idle but available.
    Idle,
    /// On battery (reduced capacity).
    OnBattery,
    /// Offline.
    Offline,
    /// Maintenance.
    Maintenance,
}
|
||||||
|
|
||||||
|
/// Device registry managing all devices and processors.
///
/// All maps are guarded by independent `parking_lot::RwLock`s; methods take
/// each lock only for the duration of a single operation.
pub struct DeviceRegistry {
    /// Registered devices.
    devices: RwLock<HashMap<DeviceId, DeviceInfo>>,
    /// Node to device mapping.
    /// NOTE(review): this map is never populated by any method visible in
    /// this file — confirm whether it is written elsewhere or is dead state.
    node_devices: RwLock<HashMap<NodeId, Vec<DeviceId>>>,
    /// All processors (across all nodes).
    processors: RwLock<HashMap<ProcessorId, Arc<dyn Processor>>>,
    /// Processor to node mapping (used by `unregister_node` for cleanup).
    processor_nodes: RwLock<HashMap<ProcessorId, NodeId>>,
    /// Next processor ID (monotonic counter; see `next_processor_id`).
    next_processor_id: std::sync::atomic::AtomicU64,
}
|
||||||
|
|
||||||
|
impl DeviceRegistry {
|
||||||
|
/// Creates a new device registry.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
devices: RwLock::new(HashMap::new()),
|
||||||
|
node_devices: RwLock::new(HashMap::new()),
|
||||||
|
processors: RwLock::new(HashMap::new()),
|
||||||
|
processor_nodes: RwLock::new(HashMap::new()),
|
||||||
|
next_processor_id: std::sync::atomic::AtomicU64::new(0),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Registers a device.
|
||||||
|
pub fn register_device(&self, device: DeviceInfo) -> Result<DeviceId, ComputeError> {
|
||||||
|
let id = device.id;
|
||||||
|
self.devices.write().insert(id, device);
|
||||||
|
Ok(id)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unregisters a device.
|
||||||
|
pub fn unregister_device(&self, device_id: DeviceId) -> Result<(), ComputeError> {
|
||||||
|
self.devices.write().remove(&device_id);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets a device by ID.
|
||||||
|
pub fn get_device(&self, device_id: DeviceId) -> Option<DeviceInfo> {
|
||||||
|
self.devices.read().get(&device_id).cloned()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Registers a processor for a node.
|
||||||
|
pub fn register_processor(
|
||||||
|
&self,
|
||||||
|
node_id: NodeId,
|
||||||
|
info: ProcessorInfo,
|
||||||
|
) -> Result<(), ComputeError> {
|
||||||
|
let processor_id = info.id;
|
||||||
|
|
||||||
|
// Create a generic processor from the info
|
||||||
|
let processor: Arc<dyn Processor> = Arc::new(GenericProcessor::new(
|
||||||
|
processor_id,
|
||||||
|
info.processor_type,
|
||||||
|
info.capabilities,
|
||||||
|
));
|
||||||
|
|
||||||
|
self.processors.write().insert(processor_id, processor);
|
||||||
|
self.processor_nodes.write().insert(processor_id, node_id);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unregisters all processors for a node.
|
||||||
|
pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
|
||||||
|
let mut processors = self.processors.write();
|
||||||
|
let mut processor_nodes = self.processor_nodes.write();
|
||||||
|
|
||||||
|
// Find and remove all processors for this node
|
||||||
|
let to_remove: Vec<_> = processor_nodes
|
||||||
|
.iter()
|
||||||
|
.filter(|(_, n)| **n == node_id)
|
||||||
|
.map(|(p, _)| *p)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
for proc_id in to_remove {
|
||||||
|
processors.remove(&proc_id);
|
||||||
|
processor_nodes.remove(&proc_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets a processor by ID.
|
||||||
|
pub fn get_processor(&self, processor_id: ProcessorId) -> Result<Arc<dyn Processor>, ComputeError> {
|
||||||
|
self.processors
|
||||||
|
.read()
|
||||||
|
.get(&processor_id)
|
||||||
|
.cloned()
|
||||||
|
.ok_or(ComputeError::ProcessorNotFound(processor_id))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets all processors.
|
||||||
|
pub fn all_processors(&self) -> Vec<Arc<dyn Processor>> {
|
||||||
|
self.processors.read().values().cloned().collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets processors of a specific type.
|
||||||
|
pub fn processors_by_type(&self, proc_type: ProcessorType) -> Vec<Arc<dyn Processor>> {
|
||||||
|
self.processors
|
||||||
|
.read()
|
||||||
|
.values()
|
||||||
|
.filter(|p| p.processor_type() == proc_type)
|
||||||
|
.cloned()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the next processor ID.
|
||||||
|
pub fn next_processor_id(&self) -> ProcessorId {
|
||||||
|
ProcessorId(self.next_processor_id.fetch_add(1, std::sync::atomic::Ordering::SeqCst))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets total number of devices.
|
||||||
|
pub fn device_count(&self) -> usize {
|
||||||
|
self.devices.read().len()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets total number of processors.
|
||||||
|
pub fn processor_count(&self) -> usize {
|
||||||
|
self.processors.read().len()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets devices by type.
|
||||||
|
pub fn devices_by_type(&self, device_type: DeviceType) -> Vec<DeviceInfo> {
|
||||||
|
self.devices
|
||||||
|
.read()
|
||||||
|
.values()
|
||||||
|
.filter(|d| d.device_type == device_type)
|
||||||
|
.cloned()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets online devices.
|
||||||
|
pub fn online_devices(&self) -> Vec<DeviceInfo> {
|
||||||
|
self.devices
|
||||||
|
.read()
|
||||||
|
.values()
|
||||||
|
.filter(|d| d.status == DeviceStatus::Online || d.status == DeviceStatus::Idle)
|
||||||
|
.cloned()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Updates device status.
|
||||||
|
pub fn update_device_status(
|
||||||
|
&self,
|
||||||
|
device_id: DeviceId,
|
||||||
|
status: DeviceStatus,
|
||||||
|
) -> Result<(), ComputeError> {
|
||||||
|
if let Some(device) = self.devices.write().get_mut(&device_id) {
|
||||||
|
device.status = status;
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(ComputeError::Internal(format!("Device not found: {}", device_id)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for DeviceRegistry {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, AvxSupport};

    /// Two freshly generated ids should differ (collision probability for
    /// 32 random bytes is negligible).
    #[test]
    fn test_device_id() {
        let id1 = DeviceId::new();
        let id2 = DeviceId::new();
        assert_ne!(id1.0, id2.0);
    }

    /// Register → lookup → unregister round-trip through the registry.
    #[test]
    fn test_device_registry() {
        let registry = DeviceRegistry::new();

        let device = DeviceInfo {
            id: DeviceId::new(),
            device_type: DeviceType::Desktop,
            owner: [1u8; 32],
            capabilities: DeviceCapabilities {
                device_type: DeviceType::Desktop,
                processors: vec![ProcessorType::Cpu(CpuVariant::X86_64 {
                    avx: AvxSupport::Avx512,
                })],
                memory_gb: 64.0,
                bandwidth_mbps: 1000.0,
                storage_gb: 1000.0,
                battery_powered: false,
                background_execution: true,
            },
            status: DeviceStatus::Online,
            reputation: 100,
            earnings: 0,
            region: "us-east".to_string(),
        };

        let device_id = device.id;
        registry.register_device(device).unwrap();

        assert_eq!(registry.device_count(), 1);
        assert!(registry.get_device(device_id).is_some());

        registry.unregister_device(device_id).unwrap();
        assert_eq!(registry.device_count(), 0);
    }

    /// Spot-checks the heuristic tables on DeviceType.
    #[test]
    fn test_device_type_properties() {
        assert_eq!(DeviceType::DataCenter.reliability(), 99);
        assert_eq!(DeviceType::Mobile.reliability(), 40);
        assert_eq!(DeviceType::DataCenter.availability_hours(), 24.0);
        assert_eq!(DeviceType::Browser.availability_hours(), 2.0);
    }
}
|
||||||
92
crates/synor-compute/src/error.rs
Normal file
92
crates/synor-compute/src/error.rs
Normal file
|
|
@ -0,0 +1,92 @@
|
||||||
|
//! Error types for Synor Compute.
|
||||||
|
|
||||||
|
use crate::{JobId, NodeId, ProcessorId, ProcessorType};
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
/// Compute errors.
///
/// Most variants carry a human-readable `String` payload; id-bearing
/// variants (`JobNotFound`, `NodeNotFound`, `ProcessorNotFound`) carry the
/// typed identifier so callers can match on it.
#[derive(Debug, Error)]
pub enum ComputeError {
    /// Job not found.
    #[error("Job not found: {0}")]
    JobNotFound(JobId),

    /// Node not found.
    #[error("Node not found: {0}")]
    NodeNotFound(NodeId),

    /// Processor not found.
    #[error("Processor not found: {0}")]
    ProcessorNotFound(ProcessorId),

    /// No suitable processor for operation.
    #[error("No suitable processor for operation: {0}")]
    NoSuitableProcessor(String),

    /// Insufficient resources.
    #[error("Insufficient resources: {0}")]
    InsufficientResources(String),

    /// Task execution failed.
    #[error("Task execution failed: {0}")]
    TaskExecutionFailed(String),

    /// Scheduling failed.
    #[error("Scheduling failed: {0}")]
    SchedulingFailed(String),

    /// Memory allocation failed.
    #[error("Memory allocation failed: {0}")]
    MemoryAllocationFailed(String),

    /// Data transfer failed.
    #[error("Data transfer failed: {0}")]
    DataTransferFailed(String),

    /// Processor type not supported.
    #[error("Processor type not supported: {0:?}")]
    ProcessorTypeNotSupported(ProcessorType),

    /// Operation not supported on processor.
    #[error("Operation not supported on {0:?}: {1}")]
    OperationNotSupported(ProcessorType, String),

    /// Timeout, with the elapsed budget in milliseconds.
    #[error("Operation timed out after {0}ms")]
    Timeout(u64),

    /// Budget exceeded (amounts in atomic SYNOR).
    #[error("Budget exceeded: required {required}, available {available}")]
    BudgetExceeded { required: u64, available: u64 },

    /// Node already registered.
    #[error("Node already registered: {0}")]
    NodeAlreadyRegistered(NodeId),

    /// Invalid configuration.
    #[error("Invalid configuration: {0}")]
    InvalidConfiguration(String),

    /// Serialization error (bincode / serde_json, see the From impls below).
    #[error("Serialization error: {0}")]
    Serialization(String),

    /// Network error.
    #[error("Network error: {0}")]
    Network(String),

    /// Internal error — catch-all for invariant violations.
    #[error("Internal error: {0}")]
    Internal(String),
}
|
||||||
|
|
||||||
|
impl From<bincode::Error> for ComputeError {
|
||||||
|
fn from(err: bincode::Error) -> Self {
|
||||||
|
ComputeError::Serialization(err.to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<serde_json::Error> for ComputeError {
|
||||||
|
fn from(err: serde_json::Error) -> Self {
|
||||||
|
ComputeError::Serialization(err.to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
631
crates/synor-compute/src/lib.rs
Normal file
631
crates/synor-compute/src/lib.rs
Normal file
|
|
@ -0,0 +1,631 @@
|
||||||
|
//! Synor Compute L2 - Heterogeneous Multi-Processor Compute Platform
|
||||||
|
//!
|
||||||
|
//! Provides decentralized compute services with:
|
||||||
|
//!
|
||||||
|
//! - **Heterogeneous Scheduling**: CPU + GPU + TPU + NPU + LPU working simultaneously
|
||||||
|
//! - **Consumer Device Mesh**: Mobile, browser, desktop devices contributing compute
|
||||||
|
//! - **90% Cost Reduction**: Zero margins, spot markets, electricity arbitrage
|
||||||
|
//! - **10x Speed**: Caching, speculative execution, optimal processor assignment
|
||||||
|
//!
|
||||||
|
//! # Architecture
|
||||||
|
//!
|
||||||
|
//! ```text
|
||||||
|
//! ┌─────────────────────────────────────────────────────────────────────────────┐
|
||||||
|
//! │ SYNOR COMPUTE L2 │
|
||||||
|
//! ├─────────────────────────────────────────────────────────────────────────────┤
|
||||||
|
//! │ │
|
||||||
|
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
//! │ │ TASK DECOMPOSER │ │
|
||||||
|
//! │ │ Analyzes workload → Identifies subtasks → Maps to optimal processors │ │
|
||||||
|
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
|
||||||
|
//! │ │ │
|
||||||
|
//! │ ▼ │
|
||||||
|
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
//! │ │ HETEROGENEOUS SCHEDULER │ │
|
||||||
|
//! │ │ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ │
|
||||||
|
//! │ │ │ CPU │ │ GPU │ │ TPU │ │ NPU │ │ LPU │ │Custom│ │ │
|
||||||
|
//! │ │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │ │
|
||||||
|
//! │ │ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ │ │
|
||||||
|
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
|
||||||
|
//! │ │
|
||||||
|
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
//! │ │ UNIFIED MEMORY FABRIC │ │
|
||||||
|
//! │ │ Zero-copy data sharing │ Automatic placement │ Cache coherency │ │
|
||||||
|
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
|
||||||
|
//! │ │
|
||||||
|
//! └─────────────────────────────────────────────────────────────────────────────┘
|
||||||
|
//! ```
|
||||||
|
//!
|
||||||
|
//! # Pricing
|
||||||
|
//!
|
||||||
|
//! | Resource | Unit | Price (SYNOR) |
|
||||||
|
//! |----------|------|---------------|
|
||||||
|
//! | GPU (consumer) | hour | 0.10 |
|
||||||
|
//! | GPU (datacenter) | hour | 0.50-4.00 |
|
||||||
|
//! | CPU | core/hour | 0.02 |
|
||||||
|
//! | Memory | GB/hour | 0.005 |
|
||||||
|
//! | Inference | 1M tokens | 0.10 |
|
||||||
|
|
||||||
|
#![allow(dead_code)]
|
||||||
|
|
||||||
|
pub mod device;
|
||||||
|
pub mod error;
|
||||||
|
pub mod market;
|
||||||
|
pub mod memory;
|
||||||
|
pub mod processor;
|
||||||
|
pub mod scheduler;
|
||||||
|
pub mod task;
|
||||||
|
|
||||||
|
pub use device::{
|
||||||
|
DeviceCapabilities, DeviceId, DeviceInfo, DeviceRegistry, DeviceStatus, DeviceType,
|
||||||
|
};
|
||||||
|
pub use error::ComputeError;
|
||||||
|
pub use market::{
|
||||||
|
Auction, AuctionId, CloudComparison, CpuTier as MarketCpuTier, GpuTier as MarketGpuTier,
|
||||||
|
MarketStats, Order, OrderBook, OrderId, OrderSide, OrderType, PricingEngine, ProviderListing,
|
||||||
|
ResourceType, SpotMarket, Trade,
|
||||||
|
};
|
||||||
|
pub use memory::{MemoryManager, TensorHandle, TransferPath, UnifiedMemory};
|
||||||
|
pub use processor::{
|
||||||
|
ComputeThroughput, CpuVariant, GpuVariant, NpuVariant, Operation, OperationType, Processor,
|
||||||
|
ProcessorCapabilities, ProcessorId, ProcessorType, TpuVersion,
|
||||||
|
};
|
||||||
|
pub use scheduler::{
|
||||||
|
HeterogeneousScheduler, LoadBalancer, Schedule, ScheduleResult, TaskAssignment, WorkQueue,
|
||||||
|
};
|
||||||
|
pub use task::{
|
||||||
|
ComputeTask, DecomposedWorkload, Task, TaskDecomposer, TaskId, TaskPriority, TaskResult,
|
||||||
|
TaskStatus,
|
||||||
|
};
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use parking_lot::RwLock;
|
||||||
|
|
||||||
|
/// Compute node identifier.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub struct NodeId(pub u64);
|
||||||
|
|
||||||
|
impl std::fmt::Display for NodeId {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "node_{}", self.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Job identifier.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub struct JobId(pub [u8; 32]);
|
||||||
|
|
||||||
|
impl JobId {
|
||||||
|
/// Creates a new job ID.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
use rand::Rng;
|
||||||
|
let mut bytes = [0u8; 32];
|
||||||
|
rand::thread_rng().fill(&mut bytes);
|
||||||
|
JobId(bytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates from bytes.
|
||||||
|
pub fn from_bytes(bytes: [u8; 32]) -> Self {
|
||||||
|
JobId(bytes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for JobId {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for JobId {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "job_{}", hex::encode(&self.0[..8]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute job specification.
///
/// A user-submitted unit of work: what to run (`job_type`), what it needs
/// (`resources`), and the economic envelope (`max_budget`, `deadline`).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ComputeJob {
    /// Job ID.
    pub id: JobId,
    /// Owner address.
    pub owner: [u8; 32],
    /// Job type.
    pub job_type: JobType,
    /// Resource requirements.
    pub resources: ResourceRequirements,
    /// Input data reference (CID). `None` if the job carries no input.
    pub input_cid: Option<String>,
    /// Maximum budget (in atomic SYNOR).
    pub max_budget: u64,
    /// Priority level.
    pub priority: JobPriority,
    /// Created timestamp.
    /// NOTE(review): unit (seconds vs milliseconds) is not established in
    /// this file — confirm against the submitting caller.
    pub created_at: u64,
    /// Deadline (optional), same clock/unit as `created_at`.
    pub deadline: Option<u64>,
}
|
||||||
|
|
||||||
|
/// Job type classification.
///
/// Content-addressed artifacts (models, datasets, code) are referenced by
/// CID strings rather than embedded inline.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum JobType {
    /// AI/ML training job.
    Training {
        framework: MlFramework,
        model_cid: String,
        dataset_cid: String,
        epochs: u32,
        batch_size: u32,
    },
    /// AI/ML inference job.
    Inference {
        model_cid: String,
        input_format: String,
        batch_size: u32,
    },
    /// Container workload (OCI image + command + environment).
    Container {
        image: String,
        command: Vec<String>,
        env: HashMap<String, String>,
    },
    /// Serverless function.
    Serverless {
        runtime: FunctionRuntime,
        code_cid: String,
        handler: String,
    },
    /// General compute (WASM).
    Wasm {
        module_cid: String,
        entrypoint: String,
    },
}

/// ML framework specification. Version strings are free-form
/// (e.g. "2.1"); ONNX carries no version here.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum MlFramework {
    PyTorch { version: String },
    TensorFlow { version: String },
    JAX { version: String },
    ONNX,
}

/// Function runtime for serverless jobs; `Custom` points at an
/// arbitrary container image.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum FunctionRuntime {
    Node20,
    Python312,
    Rust,
    Go,
    Custom { image: String },
}
|
||||||
|
|
||||||
|
/// Job priority levels.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
|
||||||
|
pub enum JobPriority {
|
||||||
|
/// Background job, can be preempted.
|
||||||
|
Background = 0,
|
||||||
|
/// Normal priority.
|
||||||
|
Normal = 1,
|
||||||
|
/// High priority, faster scheduling.
|
||||||
|
High = 2,
|
||||||
|
/// Critical, guaranteed resources.
|
||||||
|
Critical = 3,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for JobPriority {
|
||||||
|
fn default() -> Self {
|
||||||
|
JobPriority::Normal
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Resource requirements for a job.
///
/// `Default` yields all-zero minimums, no GPU, no preferences — i.e. "run
/// anywhere".
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ResourceRequirements {
    /// Minimum CPU cores (fractional cores allowed).
    pub min_cpu_cores: f32,
    /// Minimum memory (GB).
    pub min_memory_gb: f32,
    /// GPU requirements; `None` means no GPU needed.
    pub gpu: Option<GpuRequirements>,
    /// Preferred processor types (in priority order).
    pub preferred_processors: Vec<ProcessorType>,
    /// Maximum latency (ms) - for inference.
    pub max_latency_ms: Option<u32>,
    /// Requires distributed execution.
    pub distributed: bool,
}

/// GPU resource requirements.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GpuRequirements {
    /// Minimum number of GPUs.
    pub min_count: u32,
    /// Maximum number of GPUs.
    pub max_count: u32,
    /// Minimum VRAM per GPU (GB).
    pub min_vram_gb: u32,
    /// Minimum compute capability as (major, minor).
    /// NOTE(review): terminology suggests NVIDIA CUDA compute capability —
    /// confirm how non-NVIDIA GPUs are matched against this.
    pub min_compute_capability: Option<(u8, u8)>,
    /// Allow GPU sharing (MPS/MIG).
    pub allow_sharing: bool,
}
|
||||||
|
|
||||||
|
/// Job execution status.
///
/// Lifecycle: `Queued` → `Starting` → `Running` → (`Completed` | `Failed`
/// | `Cancelled`).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum JobStatus {
    /// Queued, waiting for resources.
    Queued,
    /// Resources allocated, starting.
    Starting,
    /// Running.
    Running {
        // Fraction complete; presumably in [0.0, 1.0] — not enforced here.
        progress: f32,
        assigned_nodes: Vec<NodeId>,
    },
    /// Completed successfully.
    Completed {
        // CID of the result artifact.
        result_cid: String,
        duration_ms: u64,
        // Actual cost charged, in atomic SYNOR.
        cost: u64,
    },
    /// Failed, with a human-readable reason.
    Failed { error: String },
    /// Cancelled by user.
    Cancelled,
}
|
||||||
|
|
||||||
|
/// Compute node registration.
///
/// A node is an operator-run machine contributing one or more processors
/// to the cluster; `ComputeCluster::register_node` fans its processors out
/// into the `DeviceRegistry`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ComputeNode {
    /// Node ID.
    pub id: NodeId,
    /// Owner address.
    pub owner: [u8; 32],
    /// Available processors.
    pub processors: Vec<ProcessorInfo>,
    /// Total memory (GB).
    pub total_memory_gb: f32,
    /// Available memory (GB).
    pub available_memory_gb: f32,
    /// Network bandwidth (Gbps). Note: device-level bandwidth elsewhere in
    /// this crate is expressed in Mbps.
    pub bandwidth_gbps: f32,
    /// Geographic region.
    pub region: String,
    /// Stake amount (for PoS).
    pub stake: u64,
    /// Reputation score (0-100).
    pub reputation: u32,
    /// Current status.
    pub status: NodeStatus,
}

/// Processor information on a node.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorInfo {
    /// Processor ID (local to node).
    pub id: ProcessorId,
    /// Processor type.
    pub processor_type: ProcessorType,
    /// Capabilities.
    pub capabilities: ProcessorCapabilities,
    /// Current utilization (0.0 - 1.0).
    pub utilization: f32,
    /// Current temperature (Celsius), if the node reports one.
    pub temperature: Option<f32>,
}

/// Node status.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum NodeStatus {
    /// Online and accepting jobs.
    Online,
    /// Online but not accepting new jobs (finishing current work).
    Draining,
    /// Offline.
    Offline,
    /// Maintenance mode.
    Maintenance,
}
|
||||||
|
|
||||||
|
/// Compute cluster manager.
///
/// Top-level façade wiring together the device registry, scheduler, spot
/// market, and memory manager; shared subsystems are held behind `Arc`.
pub struct ComputeCluster {
    /// Registered nodes.
    nodes: RwLock<HashMap<NodeId, ComputeNode>>,
    /// Device registry (also shared with the scheduler, see `new`).
    device_registry: Arc<DeviceRegistry>,
    /// Heterogeneous scheduler.
    scheduler: Arc<HeterogeneousScheduler>,
    /// Spot market.
    spot_market: Arc<SpotMarket>,
    /// Memory manager.
    memory_manager: Arc<MemoryManager>,
    /// Active jobs.
    jobs: RwLock<HashMap<JobId, ComputeJob>>,
}
|
||||||
|
|
||||||
|
impl ComputeCluster {
|
||||||
|
/// Creates a new compute cluster.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
let device_registry = Arc::new(DeviceRegistry::new());
|
||||||
|
let scheduler = Arc::new(HeterogeneousScheduler::new(device_registry.clone()));
|
||||||
|
let spot_market = Arc::new(SpotMarket::new());
|
||||||
|
let memory_manager = Arc::new(MemoryManager::new());
|
||||||
|
|
||||||
|
Self {
|
||||||
|
nodes: RwLock::new(HashMap::new()),
|
||||||
|
device_registry,
|
||||||
|
scheduler,
|
||||||
|
spot_market,
|
||||||
|
memory_manager,
|
||||||
|
jobs: RwLock::new(HashMap::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Registers a compute node.
|
||||||
|
pub fn register_node(&self, node: ComputeNode) -> Result<(), ComputeError> {
|
||||||
|
let id = node.id;
|
||||||
|
|
||||||
|
// Register processors with device registry
|
||||||
|
for proc in &node.processors {
|
||||||
|
self.device_registry.register_processor(id, proc.clone())?;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.nodes.write().insert(id, node);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unregisters a compute node.
|
||||||
|
pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
|
||||||
|
self.device_registry.unregister_node(node_id)?;
|
||||||
|
self.nodes.write().remove(&node_id);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Submits a job for execution.
|
||||||
|
pub async fn submit_job(&self, job: ComputeJob) -> Result<JobId, ComputeError> {
|
||||||
|
let job_id = job.id;
|
||||||
|
|
||||||
|
// Decompose job into tasks
|
||||||
|
let tasks = self.decompose_job(&job)?;
|
||||||
|
|
||||||
|
// Schedule tasks
|
||||||
|
let schedule = self.scheduler.schedule(tasks).await?;
|
||||||
|
|
||||||
|
// Store job
|
||||||
|
self.jobs.write().insert(job_id, job);
|
||||||
|
|
||||||
|
// Execute schedule (async)
|
||||||
|
tokio::spawn({
|
||||||
|
let scheduler = self.scheduler.clone();
|
||||||
|
async move {
|
||||||
|
let _ = scheduler.execute(&schedule.schedule).await;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(job_id)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets job status.
|
||||||
|
pub fn get_job_status(&self, job_id: &JobId) -> Option<JobStatus> {
|
||||||
|
self.jobs.read().get(job_id).map(|_| JobStatus::Queued)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cancels a job.
|
||||||
|
pub fn cancel_job(&self, job_id: &JobId) -> Result<(), ComputeError> {
|
||||||
|
if self.jobs.write().remove(job_id).is_some() {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(ComputeError::JobNotFound(*job_id))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets cluster statistics.
|
||||||
|
pub fn stats(&self) -> ClusterStats {
|
||||||
|
let nodes = self.nodes.read();
|
||||||
|
let jobs = self.jobs.read();
|
||||||
|
|
||||||
|
let total_nodes = nodes.len();
|
||||||
|
let online_nodes = nodes.values().filter(|n| n.status == NodeStatus::Online).count();
|
||||||
|
|
||||||
|
let total_gpus: usize = nodes
|
||||||
|
.values()
|
||||||
|
.flat_map(|n| &n.processors)
|
||||||
|
.filter(|p| matches!(p.processor_type, ProcessorType::Gpu(_)))
|
||||||
|
.count();
|
||||||
|
|
||||||
|
let total_memory: f32 = nodes.values().map(|n| n.total_memory_gb).sum();
|
||||||
|
|
||||||
|
ClusterStats {
|
||||||
|
total_nodes,
|
||||||
|
online_nodes,
|
||||||
|
total_gpus,
|
||||||
|
total_memory_gb: total_memory,
|
||||||
|
active_jobs: jobs.len(),
|
||||||
|
queued_jobs: jobs.values().filter(|_| true).count(), // Simplified
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decomposes a job into schedulable tasks.
|
||||||
|
fn decompose_job(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
let decomposer = TaskDecomposer::new();
|
||||||
|
decomposer.decompose(job)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for ComputeCluster {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cluster statistics.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ClusterStats {
    /// Total registered nodes.
    pub total_nodes: usize,
    /// Nodes currently in `NodeStatus::Online`.
    pub online_nodes: usize,
    /// Total GPUs across cluster.
    pub total_gpus: usize,
    /// Total memory (GB), summed over all nodes.
    pub total_memory_gb: f32,
    /// Active jobs.
    pub active_jobs: usize,
    /// Queued jobs.
    /// NOTE(review): `ComputeCluster::stats` currently derives this from the
    /// same jobs map as `active_jobs` — per-job status is not yet tracked.
    pub queued_jobs: usize,
}
|
||||||
|
|
||||||
|
/// Pricing calculator for compute operations.
///
/// All amounts are in base token units; the `Default` price book's comments
/// imply 1 SYNOR = 1_000_000_000 units.
#[derive(Clone, Debug)]
pub struct ComputePricing {
    /// GPU cost per hour, by tier.
    pub gpu_hourly: HashMap<GpuTier, u64>,
    /// CPU cost per core-hour.
    pub cpu_core_hour: u64,
    /// Memory cost per GB-hour.
    pub memory_gb_hour: u64,
    /// Network egress cost per GB.
    pub network_egress_gb: u64,
    /// Inference cost per million tokens.
    pub inference_per_million_tokens: u64,
}
|
||||||
|
|
||||||
|
/// GPU pricing tiers, coarsest-grained pricing key for GPU hours.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum GpuTier {
    /// Consumer GPUs (RTX 30xx, 40xx).
    Consumer,
    /// Professional GPUs (RTX A series).
    Professional,
    /// Data center GPUs (A100).
    DataCenter,
    /// Latest generation (H100).
    Premium,
}
|
||||||
|
|
||||||
|
impl Default for ComputePricing {
|
||||||
|
fn default() -> Self {
|
||||||
|
let mut gpu_hourly = HashMap::new();
|
||||||
|
gpu_hourly.insert(GpuTier::Consumer, 100_000_000); // 0.10 SYNOR
|
||||||
|
gpu_hourly.insert(GpuTier::Professional, 300_000_000); // 0.30 SYNOR
|
||||||
|
gpu_hourly.insert(GpuTier::DataCenter, 2_000_000_000); // 2.00 SYNOR
|
||||||
|
gpu_hourly.insert(GpuTier::Premium, 4_000_000_000); // 4.00 SYNOR
|
||||||
|
|
||||||
|
Self {
|
||||||
|
gpu_hourly,
|
||||||
|
cpu_core_hour: 20_000_000, // 0.02 SYNOR
|
||||||
|
memory_gb_hour: 5_000_000, // 0.005 SYNOR
|
||||||
|
network_egress_gb: 50_000_000, // 0.05 SYNOR
|
||||||
|
inference_per_million_tokens: 100_000_000, // 0.10 SYNOR
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ComputePricing {
|
||||||
|
/// Estimates cost for a job.
|
||||||
|
pub fn estimate(&self, job: &ComputeJob, duration_hours: f32) -> u64 {
|
||||||
|
let mut cost = 0u64;
|
||||||
|
|
||||||
|
// CPU cost
|
||||||
|
cost += (self.cpu_core_hour as f32 * job.resources.min_cpu_cores * duration_hours) as u64;
|
||||||
|
|
||||||
|
// Memory cost
|
||||||
|
cost += (self.memory_gb_hour as f32 * job.resources.min_memory_gb * duration_hours) as u64;
|
||||||
|
|
||||||
|
// GPU cost
|
||||||
|
if let Some(gpu) = &job.resources.gpu {
|
||||||
|
let tier = GpuTier::Consumer; // Simplified
|
||||||
|
let gpu_cost = self.gpu_hourly.get(&tier).unwrap_or(&100_000_000);
|
||||||
|
cost += (*gpu_cost as f32 * gpu.min_count as f32 * duration_hours) as u64;
|
||||||
|
}
|
||||||
|
|
||||||
|
cost
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Two freshly generated job IDs should differ (they are random u64s,
    /// so a collision is astronomically unlikely).
    #[test]
    fn test_job_id() {
        let id1 = JobId::new();
        let id2 = JobId::new();
        assert_ne!(id1.0, id2.0);
    }

    /// A new cluster starts with no registered nodes.
    #[test]
    fn test_compute_cluster() {
        let cluster = ComputeCluster::new();
        let stats = cluster.stats();
        assert_eq!(stats.total_nodes, 0);
    }

    /// Estimating a CPU + memory + GPU inference job over one hour yields a
    /// non-zero cost under the default price book.
    #[test]
    fn test_pricing() {
        let pricing = ComputePricing::default();

        let job = ComputeJob {
            id: JobId::new(),
            owner: [0u8; 32],
            job_type: JobType::Inference {
                model_cid: "model123".to_string(),
                input_format: "json".to_string(),
                batch_size: 32,
            },
            resources: ResourceRequirements {
                min_cpu_cores: 4.0,
                min_memory_gb: 16.0,
                gpu: Some(GpuRequirements {
                    min_count: 1,
                    max_count: 1,
                    min_vram_gb: 16,
                    min_compute_capability: None,
                    allow_sharing: false,
                }),
                ..Default::default()
            },
            input_cid: None,
            max_budget: 1_000_000_000,
            priority: JobPriority::Normal,
            created_at: 0,
            deadline: None,
        };

        let cost = pricing.estimate(&job, 1.0);
        assert!(cost > 0);
    }

    /// Registering a single-CPU node makes it visible in cluster stats.
    #[test]
    fn test_node_registration() {
        let cluster = ComputeCluster::new();

        let node = ComputeNode {
            id: NodeId(1),
            owner: [1u8; 32],
            processors: vec![ProcessorInfo {
                id: ProcessorId(0),
                processor_type: ProcessorType::Cpu(CpuVariant::X86_64 {
                    avx: processor::AvxSupport::Avx512,
                }),
                capabilities: ProcessorCapabilities::default(),
                utilization: 0.0,
                temperature: Some(45.0),
            }],
            total_memory_gb: 64.0,
            available_memory_gb: 60.0,
            bandwidth_gbps: 10.0,
            region: "us-east".to_string(),
            stake: 1000,
            reputation: 100,
            status: NodeStatus::Online,
        };

        cluster.register_node(node).unwrap();
        assert_eq!(cluster.stats().total_nodes, 1);
    }
}
|
||||||
1151
crates/synor-compute/src/market/mod.rs
Normal file
1151
crates/synor-compute/src/market/mod.rs
Normal file
File diff suppressed because it is too large
Load diff
370
crates/synor-compute/src/memory/mod.rs
Normal file
370
crates/synor-compute/src/memory/mod.rs
Normal file
|
|
@ -0,0 +1,370 @@
|
||||||
|
//! Unified memory management for heterogeneous compute.
|
||||||
|
|
||||||
|
use crate::error::ComputeError;
|
||||||
|
use crate::processor::ProcessorType;
|
||||||
|
use parking_lot::RwLock;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
/// Tensor handle for memory management.
///
/// A lightweight descriptor for an allocation; it carries metadata only,
/// not the actual data buffer.
#[derive(Clone, Debug)]
pub struct TensorHandle {
    /// Unique ID.
    pub id: TensorId,
    /// Shape (one entry per dimension; empty means scalar).
    pub shape: Vec<usize>,
    /// Element data type.
    pub dtype: DataType,
    /// Total size in bytes (element count times element size).
    pub size_bytes: u64,
    /// Processor types the tensor is currently resident on.
    pub locations: Vec<ProcessorType>,
}
|
||||||
|
|
||||||
|
impl TensorHandle {
|
||||||
|
/// Creates a new tensor handle.
|
||||||
|
pub fn new(shape: Vec<usize>, dtype: DataType) -> Self {
|
||||||
|
let size_bytes = shape.iter().product::<usize>() as u64 * dtype.size_bytes() as u64;
|
||||||
|
Self {
|
||||||
|
id: TensorId::new(),
|
||||||
|
shape,
|
||||||
|
dtype,
|
||||||
|
size_bytes,
|
||||||
|
locations: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the number of elements.
|
||||||
|
pub fn numel(&self) -> usize {
|
||||||
|
self.shape.iter().product()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tensor identifier.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TensorId(pub u64);

impl TensorId {
    /// Creates a new random tensor ID.
    ///
    /// NOTE(review): IDs are random `u64`s with no collision check;
    /// collisions are astronomically unlikely but not impossible.
    pub fn new() -> Self {
        use rand::Rng;
        TensorId(rand::thread_rng().gen())
    }
}

impl Default for TensorId {
    // A "default" ID is a fresh random ID, not zero.
    fn default() -> Self {
        Self::new()
    }
}
|
||||||
|
|
||||||
|
/// Data types for tensors.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub enum DataType {
|
||||||
|
Float64,
|
||||||
|
Float32,
|
||||||
|
Float16,
|
||||||
|
BFloat16,
|
||||||
|
Int64,
|
||||||
|
Int32,
|
||||||
|
Int16,
|
||||||
|
Int8,
|
||||||
|
UInt8,
|
||||||
|
Bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DataType {
|
||||||
|
/// Returns size in bytes.
|
||||||
|
pub fn size_bytes(&self) -> usize {
|
||||||
|
match self {
|
||||||
|
DataType::Float64 | DataType::Int64 => 8,
|
||||||
|
DataType::Float32 | DataType::Int32 => 4,
|
||||||
|
DataType::Float16 | DataType::BFloat16 | DataType::Int16 => 2,
|
||||||
|
DataType::Int8 | DataType::UInt8 | DataType::Bool => 1,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Data transfer path between processors.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum TransferPath {
    /// Direct GPU-to-GPU via NVLink.
    NvLink,
    /// Direct GPU-to-GPU via PCIe P2P.
    PciePeerToPeer,
    /// Through CPU memory.
    CpuMediated,
    /// Unified memory (Apple Silicon).
    UnifiedMemory,
    /// Network transfer.
    Network,
    /// Same memory space (no transfer needed).
    SameMemory,
}

impl TransferPath {
    /// Returns the approximate sustained bandwidth of this path in GB/s.
    pub fn bandwidth_gbps(&self) -> f64 {
        // Rough figures for current-generation hardware, fastest first.
        match self {
            Self::SameMemory => f64::INFINITY, // no copy required
            Self::NvLink => 900.0,             // NVLink 4.0
            Self::UnifiedMemory => 400.0,      // Apple unified memory
            Self::PciePeerToPeer => 64.0,      // PCIe 5.0 x16
            Self::CpuMediated => 50.0,         // DDR5
            Self::Network => 10.0,             // ~100 Gbps link
        }
    }

    /// Estimates the wall-clock time needed to move `bytes` over this path.
    pub fn estimate_transfer_time(&self, bytes: u64) -> std::time::Duration {
        // Same-memory moves are free.
        if matches!(self, TransferPath::SameMemory) {
            return std::time::Duration::ZERO;
        }

        let bandwidth = self.bandwidth_gbps() * 1e9; // bytes per second
        std::time::Duration::from_secs_f64(bytes as f64 / bandwidth)
    }
}
|
||||||
|
|
||||||
|
/// Unified memory manager.
///
/// Tracks tensor allocations and per-processor-type memory usage. Tensor
/// and usage maps are interior-mutable; `limits` can only be changed via
/// `set_limit`, which takes `&mut self`.
pub struct MemoryManager {
    /// Allocated tensors, keyed by tensor ID.
    tensors: RwLock<HashMap<TensorId, TensorHandle>>,
    /// Memory usage in bytes, per processor type.
    usage: RwLock<HashMap<ProcessorType, u64>>,
    /// Memory limits in bytes, per processor type (absent = unlimited).
    limits: HashMap<ProcessorType, u64>,
}
|
||||||
|
|
||||||
|
impl MemoryManager {
|
||||||
|
/// Creates a new memory manager.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
tensors: RwLock::new(HashMap::new()),
|
||||||
|
usage: RwLock::new(HashMap::new()),
|
||||||
|
limits: HashMap::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sets memory limit for a processor type.
|
||||||
|
pub fn set_limit(&mut self, proc_type: ProcessorType, limit_bytes: u64) {
|
||||||
|
self.limits.insert(proc_type, limit_bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Allocates a tensor.
|
||||||
|
pub fn allocate(&self, shape: Vec<usize>, dtype: DataType) -> Result<TensorHandle, ComputeError> {
|
||||||
|
let handle = TensorHandle::new(shape, dtype);
|
||||||
|
self.tensors.write().insert(handle.id, handle.clone());
|
||||||
|
Ok(handle)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Frees a tensor.
|
||||||
|
pub fn free(&self, tensor_id: TensorId) -> Result<(), ComputeError> {
|
||||||
|
if let Some(handle) = self.tensors.write().remove(&tensor_id) {
|
||||||
|
// Update usage for all locations
|
||||||
|
let mut usage = self.usage.write();
|
||||||
|
for loc in &handle.locations {
|
||||||
|
if let Some(u) = usage.get_mut(loc) {
|
||||||
|
*u = u.saturating_sub(handle.size_bytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets a tensor handle.
|
||||||
|
pub fn get(&self, tensor_id: TensorId) -> Option<TensorHandle> {
|
||||||
|
self.tensors.read().get(&tensor_id).cloned()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Ensures tensor is on specified processor.
|
||||||
|
pub fn ensure_on(
|
||||||
|
&self,
|
||||||
|
tensor_id: TensorId,
|
||||||
|
target: ProcessorType,
|
||||||
|
) -> Result<TransferPath, ComputeError> {
|
||||||
|
let mut tensors = self.tensors.write();
|
||||||
|
|
||||||
|
if let Some(handle) = tensors.get_mut(&tensor_id) {
|
||||||
|
// Check if already on target
|
||||||
|
if handle.locations.contains(&target) {
|
||||||
|
return Ok(TransferPath::SameMemory);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine transfer path
|
||||||
|
let path = if handle.locations.is_empty() {
|
||||||
|
// New tensor, allocate on target
|
||||||
|
TransferPath::SameMemory
|
||||||
|
} else {
|
||||||
|
// Find best transfer path from existing location
|
||||||
|
self.find_best_path(&handle.locations[0], &target)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Record new location
|
||||||
|
handle.locations.push(target.clone());
|
||||||
|
|
||||||
|
// Update usage
|
||||||
|
let mut usage = self.usage.write();
|
||||||
|
*usage.entry(target).or_default() += handle.size_bytes;
|
||||||
|
|
||||||
|
Ok(path)
|
||||||
|
} else {
|
||||||
|
Err(ComputeError::Internal("Tensor not found".to_string()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Finds best transfer path between processors.
|
||||||
|
fn find_best_path(&self, from: &ProcessorType, to: &ProcessorType) -> TransferPath {
|
||||||
|
// Check for unified memory (Apple Silicon)
|
||||||
|
if self.shares_memory(from, to) {
|
||||||
|
return TransferPath::UnifiedMemory;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for NVLink between NVIDIA GPUs
|
||||||
|
if matches!(from, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. }))
|
||||||
|
&& matches!(to, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. }))
|
||||||
|
{
|
||||||
|
return TransferPath::NvLink;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for PCIe P2P between GPUs
|
||||||
|
if from.is_gpu() && to.is_gpu() {
|
||||||
|
return TransferPath::PciePeerToPeer;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Default to CPU-mediated transfer
|
||||||
|
TransferPath::CpuMediated
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Checks if two processor types share memory.
|
||||||
|
fn shares_memory(&self, a: &ProcessorType, b: &ProcessorType) -> bool {
|
||||||
|
use crate::processor::{CpuVariant, GpuVariant, NpuVariant};
|
||||||
|
|
||||||
|
match (a, b) {
|
||||||
|
// Apple Silicon unified memory
|
||||||
|
(ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal))
|
||||||
|
| (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
|
||||||
|
| (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
|
||||||
|
| (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true,
|
||||||
|
// Same type
|
||||||
|
_ if a == b => true,
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets current memory usage for a processor type.
|
||||||
|
pub fn usage(&self, proc_type: ProcessorType) -> u64 {
|
||||||
|
self.usage.read().get(&proc_type).copied().unwrap_or(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets available memory for a processor type.
|
||||||
|
pub fn available(&self, proc_type: ProcessorType) -> u64 {
|
||||||
|
let limit = self.limits.get(&proc_type).copied().unwrap_or(u64::MAX);
|
||||||
|
let used = self.usage(proc_type);
|
||||||
|
limit.saturating_sub(used)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets total allocated tensors.
|
||||||
|
pub fn tensor_count(&self) -> usize {
|
||||||
|
self.tensors.read().len()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for MemoryManager {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unified memory abstraction for zero-copy sharing.
pub struct UnifiedMemory {
    /// Base pointer (in unified address space).
    /// NOTE(review): `new` always sets this to 0 — placeholder, no real
    /// mapping is performed yet.
    pub base: u64,
    /// Size in bytes.
    pub size: u64,
    /// Processor types that can access this region.
    pub accessible_from: Vec<ProcessorType>,
}
|
||||||
|
|
||||||
|
impl UnifiedMemory {
|
||||||
|
/// Creates new unified memory region.
|
||||||
|
pub fn new(size: u64) -> Self {
|
||||||
|
Self {
|
||||||
|
base: 0, // Would be actual pointer in real implementation
|
||||||
|
size,
|
||||||
|
accessible_from: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Checks if accessible from processor type.
|
||||||
|
pub fn is_accessible_from(&self, proc_type: &ProcessorType) -> bool {
|
||||||
|
self.accessible_from.contains(proc_type)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Shape/dtype arithmetic: a 1024x1024 f32 tensor is 4 MiB.
    #[test]
    fn test_tensor_handle() {
        let handle = TensorHandle::new(vec![1024, 1024], DataType::Float32);
        assert_eq!(handle.numel(), 1024 * 1024);
        assert_eq!(handle.size_bytes, 1024 * 1024 * 4);
    }

    /// One representative dtype per element-width class.
    #[test]
    fn test_data_type_sizes() {
        assert_eq!(DataType::Float64.size_bytes(), 8);
        assert_eq!(DataType::Float32.size_bytes(), 4);
        assert_eq!(DataType::Float16.size_bytes(), 2);
        assert_eq!(DataType::Int8.size_bytes(), 1);
    }

    /// Relative ordering of transfer-path bandwidths.
    #[test]
    fn test_transfer_path_bandwidth() {
        assert!(TransferPath::NvLink.bandwidth_gbps() > TransferPath::PciePeerToPeer.bandwidth_gbps());
        assert!(TransferPath::SameMemory.bandwidth_gbps().is_infinite());
    }

    /// Allocate/free round-trip through the manager.
    #[test]
    fn test_memory_manager() {
        let manager = MemoryManager::new();

        let handle = manager.allocate(vec![1024, 1024], DataType::Float32).unwrap();
        assert_eq!(manager.tensor_count(), 1);

        manager.free(handle.id).unwrap();
        assert_eq!(manager.tensor_count(), 0);
    }

    /// First placement and re-placement on the same device both report
    /// `SameMemory` (no copy needed in either case).
    #[test]
    fn test_ensure_on() {
        let manager = MemoryManager::new();

        let handle = manager.allocate(vec![1024], DataType::Float32).unwrap();

        // First ensure should allocate
        let path = manager.ensure_on(
            handle.id,
            ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda {
                compute_capability: (8, 0),
            }),
        ).unwrap();

        assert_eq!(path, TransferPath::SameMemory);

        // Second ensure to same location should be same memory
        let path = manager.ensure_on(
            handle.id,
            ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda {
                compute_capability: (8, 0),
            }),
        ).unwrap();

        assert_eq!(path, TransferPath::SameMemory);
    }
}
|
||||||
547
crates/synor-compute/src/processor/capabilities.rs
Normal file
547
crates/synor-compute/src/processor/capabilities.rs
Normal file
|
|
@ -0,0 +1,547 @@
|
||||||
|
//! Processor capability definitions.
|
||||||
|
|
||||||
|
use super::operation::OperationType;
|
||||||
|
use super::types::PowerTier;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
|
/// Detailed processor capabilities.
///
/// A self-describing performance/feature profile used to match tasks to
/// devices.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorCapabilities {
    /// Compute throughput per precision.
    pub compute: ComputeThroughput,
    /// Memory specifications (capacity, bandwidth, technology).
    pub memory: MemorySpecs,
    /// Set of operations this processor can execute.
    pub operations: HashSet<OperationType>,
    /// Power characteristics.
    pub power: PowerCharacteristics,
    /// Workload shapes this processor is best suited for.
    pub optimal_for: Vec<WorkloadCharacteristic>,
}
|
||||||
|
|
||||||
|
impl Default for ProcessorCapabilities {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
compute: ComputeThroughput::default(),
|
||||||
|
memory: MemorySpecs::default(),
|
||||||
|
operations: Self::default_operations(),
|
||||||
|
power: PowerCharacteristics::default(),
|
||||||
|
optimal_for: vec![],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ProcessorCapabilities {
|
||||||
|
/// Default operations supported by most processors.
|
||||||
|
fn default_operations() -> HashSet<OperationType> {
|
||||||
|
[
|
||||||
|
OperationType::MatMul,
|
||||||
|
OperationType::Add,
|
||||||
|
OperationType::Mul,
|
||||||
|
OperationType::ReLU,
|
||||||
|
OperationType::Softmax,
|
||||||
|
OperationType::DataLoad,
|
||||||
|
OperationType::DataPreprocess,
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
    /// Creates CPU capabilities.
    ///
    /// Throughput is a rough analytical estimate: `cores * clock *
    /// FLOPs-per-cycle`, with 64 FP32 ops/cycle assumed for AVX-512 and 32
    /// otherwise. Memory figures are fixed typical values (64 GB DDR5),
    /// not measured from the actual host.
    pub fn cpu(cores: u32, clock_ghz: f32, avx512: bool) -> Self {
        let flops_per_cycle = if avx512 { 64.0 } else { 32.0 }; // FP32 ops per cycle with AVX
        let fp32_tflops = (cores as f64 * clock_ghz as f64 * flops_per_cycle) / 1000.0;

        Self {
            compute: ComputeThroughput {
                // NOTE(review): the fp16/int8/int4 scaling factors below are
                // heuristics, not per-microarchitecture measurements.
                fp64_tflops: fp32_tflops / 2.0,
                fp32_tflops,
                fp16_tflops: fp32_tflops * 2.0,
                bf16_tflops: fp32_tflops * 2.0,
                int8_tops: fp32_tflops * 4.0,
                int4_tops: fp32_tflops * 8.0,
                sparsity_speedup: 1.0, // no structured-sparsity speedup on CPUs
            },
            memory: MemorySpecs {
                capacity_bytes: 64 * 1024 * 1024 * 1024, // 64 GB typical
                bandwidth_gbps: 200, // DDR5
                type_: MemoryType::Ddr5,
            },
            operations: Self::cpu_operations(),
            power: PowerCharacteristics {
                tdp_watts: 125,
                efficiency: 0.8,
                power_tier: PowerTier::Medium,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::MemoryBound,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
|
||||||
|
|
||||||
|
    /// Operations typically supported by CPUs.
    ///
    /// CPUs can run everything here, but are only *optimal* for the data
    /// loading / tokenization / memory-layout operations.
    fn cpu_operations() -> HashSet<OperationType> {
        [
            // Matrix operations (slow but supported)
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            // Element-wise
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::Softmax,
            // Data operations (optimal)
            OperationType::DataLoad,
            OperationType::DataPreprocess,
            OperationType::Tokenization,
            OperationType::Detokenization,
            // Memory operations
            OperationType::Transpose,
            OperationType::Reshape,
            OperationType::Concat,
            OperationType::Split,
            // I/O
            OperationType::Checkpoint,
        ]
        .into_iter()
        .collect()
    }
|
||||||
|
|
||||||
|
    /// Creates NVIDIA GPU capabilities.
    ///
    /// Throughput is approximated from the CUDA core count at an assumed
    /// 1.5 GHz base clock (2 FLOPs/cycle FMA); tensor-core formats get an
    /// additional multiplier (4x for compute capability >= 8.0, else 2x).
    ///
    /// NOTE(review): the `tensor_cores` parameter is currently unused in
    /// the estimate — confirm whether it should feed into the multiplier.
    pub fn nvidia_gpu(
        cuda_cores: u32,
        tensor_cores: u32,
        vram_gb: u32,
        bandwidth_gbps: u32,
        compute_capability: (u8, u8),
    ) -> Self {
        // Approximate TFLOPS based on cores and typical clocks
        let base_clock_ghz = 1.5;
        let fp32_tflops = (cuda_cores as f64 * base_clock_ghz * 2.0) / 1000.0;
        let tensor_multiplier = if compute_capability.0 >= 8 { 4.0 } else { 2.0 };

        Self {
            compute: ComputeThroughput {
                fp64_tflops: fp32_tflops / 2.0,
                fp32_tflops,
                fp16_tflops: fp32_tflops * tensor_multiplier,
                bf16_tflops: fp32_tflops * tensor_multiplier,
                int8_tops: fp32_tflops * tensor_multiplier * 2.0,
                int4_tops: fp32_tflops * tensor_multiplier * 4.0,
                // 2:4 structured sparsity is available from Ampere (sm_80) on.
                sparsity_speedup: if compute_capability.0 >= 8 { 2.0 } else { 1.0 },
            },
            memory: MemorySpecs {
                capacity_bytes: vram_gb as u64 * 1024 * 1024 * 1024,
                bandwidth_gbps,
                // Hopper (sm_90) and newer ship HBM3; older data-center parts HBM2e.
                type_: if compute_capability.0 >= 9 {
                    MemoryType::Hbm3
                } else {
                    MemoryType::Hbm2e
                },
            },
            operations: Self::gpu_operations(compute_capability),
            power: PowerCharacteristics {
                tdp_watts: if compute_capability.0 >= 9 { 700 } else { 350 },
                efficiency: 0.9,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::ComputeBound,
            ],
        }
    }
|
||||||
|
|
||||||
|
    /// Operations supported by GPUs.
    ///
    /// `FlashAttention` is added only for compute capability >= 8.0.
    fn gpu_operations(compute_capability: (u8, u8)) -> HashSet<OperationType> {
        let mut ops: HashSet<OperationType> = [
            // Matrix operations (optimal)
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::Conv3d,
            OperationType::DepthwiseConv,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            // Attention
            OperationType::SelfAttention,
            OperationType::CrossAttention,
            // Element-wise
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::SiLU,
            OperationType::Softmax,
            // Reduction
            OperationType::Sum,
            OperationType::Mean,
            OperationType::Max,
            OperationType::ArgMax,
            // Memory operations
            OperationType::Transpose,
            OperationType::Reshape,
            OperationType::Concat,
            OperationType::Split,
            OperationType::Gather,
            OperationType::Scatter,
            // LLM specific
            OperationType::Embedding,
            OperationType::RoPE,
            OperationType::KVCache,
            OperationType::TopK,
            OperationType::Sampling,
        ]
        .into_iter()
        .collect();

        // FlashAttention for newer GPUs
        if compute_capability.0 >= 8 {
            ops.insert(OperationType::FlashAttention);
        }

        ops
    }
|
||||||
|
|
||||||
|
    /// Creates TPU capabilities.
    ///
    /// The tuple per version is (bf16 TFLOPS, HBM capacity GB, HBM bandwidth
    /// Gbps); the Edge TPU reports 0 memory/bandwidth because it uses host
    /// memory.
    pub fn tpu(version: super::TpuVersion) -> Self {
        let (bf16_tflops, memory_gb, bandwidth_gbps) = match version {
            super::TpuVersion::V5p => (918.0, 95, 4800),
            super::TpuVersion::V5e => (197.0, 16, 1600),
            super::TpuVersion::V4 => (275.0, 32, 2400),
            super::TpuVersion::V4i => (138.0, 32, 1200),
            super::TpuVersion::V3 => (123.0, 16, 900),
            super::TpuVersion::V2 => (46.0, 8, 600),
            super::TpuVersion::Edge => (4.0, 0, 0), // Uses host memory
        };

        Self {
            compute: ComputeThroughput {
                fp64_tflops: 0.0, // TPUs don't support FP64
                // NOTE(review): fp32/int8/int4 figures are heuristic
                // scalings of bf16, not published per-version specs.
                fp32_tflops: bf16_tflops / 2.0,
                fp16_tflops: bf16_tflops,
                bf16_tflops,
                int8_tops: bf16_tflops * 2.0,
                int4_tops: bf16_tflops * 4.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: memory_gb as u64 * 1024 * 1024 * 1024,
                bandwidth_gbps,
                type_: MemoryType::Hbm2e,
            },
            operations: Self::tpu_operations(),
            power: PowerCharacteristics {
                // Edge TPU is a ~2 W accelerator; datacenter parts ~400 W.
                tdp_watts: if matches!(version, super::TpuVersion::Edge) {
                    2
                } else {
                    400
                },
                efficiency: 0.95,
                power_tier: if matches!(version, super::TpuVersion::Edge) {
                    PowerTier::UltraLow
                } else {
                    PowerTier::High
                },
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::ComputeBound,
                WorkloadCharacteristic::FixedShape,
                WorkloadCharacteristic::LargeBatch,
            ],
        }
    }
|
||||||
|
|
||||||
|
/// Operations supported by TPUs.
|
||||||
|
fn tpu_operations() -> HashSet<OperationType> {
|
||||||
|
[
|
||||||
|
// Matrix operations (optimal)
|
||||||
|
OperationType::MatMul,
|
||||||
|
OperationType::Conv2d,
|
||||||
|
OperationType::BatchNorm,
|
||||||
|
OperationType::LayerNorm,
|
||||||
|
// Attention
|
||||||
|
OperationType::SelfAttention,
|
||||||
|
OperationType::CrossAttention,
|
||||||
|
OperationType::FlashAttention,
|
||||||
|
// Element-wise
|
||||||
|
OperationType::Add,
|
||||||
|
OperationType::Mul,
|
||||||
|
OperationType::ReLU,
|
||||||
|
OperationType::GeLU,
|
||||||
|
OperationType::SiLU,
|
||||||
|
OperationType::Softmax,
|
||||||
|
// Reduction
|
||||||
|
OperationType::Sum,
|
||||||
|
OperationType::Mean,
|
||||||
|
OperationType::Max,
|
||||||
|
// LLM specific
|
||||||
|
OperationType::Embedding,
|
||||||
|
OperationType::RoPE,
|
||||||
|
OperationType::KVCache,
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates LPU (Groq) capabilities.
|
||||||
|
pub fn lpu() -> Self {
|
||||||
|
Self {
|
||||||
|
compute: ComputeThroughput {
|
||||||
|
fp64_tflops: 0.0,
|
||||||
|
fp32_tflops: 0.0,
|
||||||
|
fp16_tflops: 188.0,
|
||||||
|
bf16_tflops: 188.0,
|
||||||
|
int8_tops: 750.0,
|
||||||
|
int4_tops: 1500.0,
|
||||||
|
sparsity_speedup: 1.0,
|
||||||
|
},
|
||||||
|
memory: MemorySpecs {
|
||||||
|
capacity_bytes: 230 * 1024 * 1024 * 1024, // 230 GB SRAM!
|
||||||
|
bandwidth_gbps: 80_000, // 80 TB/s internal
|
||||||
|
type_: MemoryType::Sram,
|
||||||
|
},
|
||||||
|
operations: Self::lpu_operations(),
|
||||||
|
power: PowerCharacteristics {
|
||||||
|
tdp_watts: 300,
|
||||||
|
efficiency: 0.98, // Very efficient for inference
|
||||||
|
power_tier: PowerTier::Medium,
|
||||||
|
},
|
||||||
|
optimal_for: vec![
|
||||||
|
WorkloadCharacteristic::Sequential,
|
||||||
|
WorkloadCharacteristic::SmallBatch,
|
||||||
|
WorkloadCharacteristic::VariableLength,
|
||||||
|
WorkloadCharacteristic::LowLatency,
|
||||||
|
],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Operations supported by Groq LPU.
|
||||||
|
fn lpu_operations() -> HashSet<OperationType> {
|
||||||
|
[
|
||||||
|
// Optimized for inference
|
||||||
|
OperationType::MatMul,
|
||||||
|
OperationType::LayerNorm,
|
||||||
|
OperationType::SelfAttention,
|
||||||
|
OperationType::Add,
|
||||||
|
OperationType::Mul,
|
||||||
|
OperationType::ReLU,
|
||||||
|
OperationType::GeLU,
|
||||||
|
OperationType::SiLU,
|
||||||
|
OperationType::Softmax,
|
||||||
|
OperationType::Embedding,
|
||||||
|
OperationType::RoPE,
|
||||||
|
OperationType::KVCache,
|
||||||
|
OperationType::TopK,
|
||||||
|
OperationType::Sampling,
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates Apple Neural Engine capabilities.
|
||||||
|
pub fn apple_neural_engine(cores: u32) -> Self {
|
||||||
|
let int8_tops = match cores {
|
||||||
|
16 => 18.0, // M3
|
||||||
|
32 => 35.0, // M3 Max
|
||||||
|
_ => cores as f64 * 1.1,
|
||||||
|
};
|
||||||
|
|
||||||
|
Self {
|
||||||
|
compute: ComputeThroughput {
|
||||||
|
fp64_tflops: 0.0,
|
||||||
|
fp32_tflops: int8_tops / 4.0,
|
||||||
|
fp16_tflops: int8_tops / 2.0,
|
||||||
|
bf16_tflops: int8_tops / 2.0,
|
||||||
|
int8_tops,
|
||||||
|
int4_tops: int8_tops * 2.0,
|
||||||
|
sparsity_speedup: 1.0,
|
||||||
|
},
|
||||||
|
memory: MemorySpecs {
|
||||||
|
capacity_bytes: 0, // Uses unified memory
|
||||||
|
bandwidth_gbps: 400,
|
||||||
|
type_: MemoryType::Unified,
|
||||||
|
},
|
||||||
|
operations: Self::npu_operations(),
|
||||||
|
power: PowerCharacteristics {
|
||||||
|
tdp_watts: 15,
|
||||||
|
efficiency: 0.95,
|
||||||
|
power_tier: PowerTier::UltraLow,
|
||||||
|
},
|
||||||
|
optimal_for: vec![
|
||||||
|
WorkloadCharacteristic::LowPower,
|
||||||
|
WorkloadCharacteristic::LowLatency,
|
||||||
|
WorkloadCharacteristic::SmallBatch,
|
||||||
|
],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Operations supported by NPUs.
|
||||||
|
fn npu_operations() -> HashSet<OperationType> {
|
||||||
|
[
|
||||||
|
// Inference optimized
|
||||||
|
OperationType::MatMul,
|
||||||
|
OperationType::Conv2d,
|
||||||
|
OperationType::DepthwiseConv,
|
||||||
|
OperationType::BatchNorm,
|
||||||
|
OperationType::LayerNorm,
|
||||||
|
OperationType::Add,
|
||||||
|
OperationType::Mul,
|
||||||
|
OperationType::ReLU,
|
||||||
|
OperationType::Softmax,
|
||||||
|
OperationType::Embedding,
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute throughput metrics.
///
/// Peak arithmetic throughput per numeric precision. A value of `0.0`
/// means the processor has no usable throughput at that precision
/// (e.g. TPUs and LPUs report zero FP64).
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ComputeThroughput {
    /// FP64 TFLOPS.
    pub fp64_tflops: f64,
    /// FP32 TFLOPS.
    pub fp32_tflops: f64,
    /// FP16 TFLOPS.
    pub fp16_tflops: f64,
    /// BF16 TFLOPS.
    pub bf16_tflops: f64,
    /// INT8 TOPS.
    pub int8_tops: f64,
    /// INT4 TOPS.
    pub int4_tops: f64,
    /// Speedup for sparse operations.
    /// Multiplier applied when sparsity is exploitable; 1.0 = no benefit.
    pub sparsity_speedup: f64,
}
|
||||||
|
|
||||||
|
/// Memory specifications.
///
/// Device-local memory description. `capacity_bytes` may be 0 for
/// processors that borrow host/unified memory (Edge TPU, Apple NPU).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MemorySpecs {
    /// Total capacity (bytes).
    pub capacity_bytes: u64,
    /// Bandwidth (GB/s).
    pub bandwidth_gbps: u32,
    /// Memory type.
    pub type_: MemoryType,
}
|
||||||
|
|
||||||
|
impl Default for MemorySpecs {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
capacity_bytes: 16 * 1024 * 1024 * 1024, // 16 GB
|
||||||
|
bandwidth_gbps: 500,
|
||||||
|
type_: MemoryType::Ddr5,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Memory types.
///
/// Distinguishes the physical memory technology backing a processor;
/// used alongside [`MemorySpecs`] for transfer/bandwidth reasoning.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum MemoryType {
    /// DDR4 RAM.
    Ddr4,
    /// DDR5 RAM.
    Ddr5,
    /// GDDR6/6X video memory.
    Gddr6,
    /// HBM2 (stacked high-bandwidth memory).
    Hbm2,
    /// HBM2e.
    Hbm2e,
    /// HBM3.
    Hbm3,
    /// SRAM (on-chip).
    Sram,
    /// Unified memory (Apple Silicon) shared between CPU/GPU/NPU.
    Unified,
    /// LPDDR (mobile).
    Lpddr,
}
|
||||||
|
|
||||||
|
/// Power characteristics.
///
/// Used by energy-aware scheduling to estimate Joules per operation.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PowerCharacteristics {
    /// TDP in watts.
    pub tdp_watts: u32,
    /// Efficiency factor (0.0 - 1.0).
    pub efficiency: f64,
    /// Power tier.
    pub power_tier: PowerTier,
}
|
||||||
|
|
||||||
|
impl Default for PowerCharacteristics {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
tdp_watts: 100,
|
||||||
|
efficiency: 0.8,
|
||||||
|
power_tier: PowerTier::Medium,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Workload characteristics for processor matching.
///
/// Each processor profile advertises the characteristics it is best
/// suited for (`optimal_for`); the scheduler matches these against a
/// task's observed traits. Parenthesized processor names are typical
/// good fits, not exclusives.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum WorkloadCharacteristic {
    /// High parallelism (GPU, TPU).
    HighlyParallel,
    /// Sequential dependencies (CPU, LPU).
    Sequential,
    /// Memory bandwidth bound (GPU).
    MemoryBound,
    /// Compute bound (TPU).
    ComputeBound,
    /// Low latency required (NPU, edge).
    LowLatency,
    /// Low power required (NPU, mobile).
    LowPower,
    /// Large batch sizes (GPU, TPU).
    LargeBatch,
    /// Small batch sizes (CPU, LPU).
    SmallBatch,
    /// Variable length sequences (LPU).
    VariableLength,
    /// Fixed tensor shapes (TPU).
    FixedShape,
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cpu_capabilities() {
        let cpu_caps = ProcessorCapabilities::cpu(32, 3.5, true);
        assert!(cpu_caps.compute.fp32_tflops > 0.0);
        // CPUs handle host-side I/O and text processing.
        for op in [OperationType::DataLoad, OperationType::Tokenization] {
            assert!(cpu_caps.operations.contains(&op));
        }
    }

    #[test]
    fn test_gpu_capabilities() {
        let gpu_caps = ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1000, (8, 9));
        // Half precision should outrun single precision on modern GPUs.
        assert!(gpu_caps.compute.fp16_tflops > gpu_caps.compute.fp32_tflops);
        // Compute capability 8.x advertises FlashAttention.
        assert!(gpu_caps.operations.contains(&OperationType::FlashAttention));
    }

    #[test]
    fn test_tpu_capabilities() {
        let tpu_caps = ProcessorCapabilities::tpu(super::super::TpuVersion::V5p);
        assert!(tpu_caps.compute.bf16_tflops > 900.0);
        // TPUs don't do I/O.
        assert!(!tpu_caps.operations.contains(&OperationType::DataLoad));
    }

    #[test]
    fn test_lpu_capabilities() {
        let lpu_caps = ProcessorCapabilities::lpu();
        // On-die SRAM yields very high internal bandwidth.
        assert!(lpu_caps.memory.bandwidth_gbps > 10000);
        assert!(lpu_caps
            .optimal_for
            .contains(&WorkloadCharacteristic::Sequential));
    }
}
|
||||||
339
crates/synor-compute/src/processor/mod.rs
Normal file
339
crates/synor-compute/src/processor/mod.rs
Normal file
|
|
@ -0,0 +1,339 @@
|
||||||
|
//! Processor abstractions for heterogeneous compute.
|
||||||
|
//!
|
||||||
|
//! Supports all processor types:
|
||||||
|
//! - CPU (x86_64, ARM64, RISC-V)
|
||||||
|
//! - GPU (NVIDIA CUDA, AMD ROCm, Intel OneAPI, Apple Metal)
|
||||||
|
//! - TPU (Google TPU v2-v5)
|
||||||
|
//! - NPU (Apple Neural Engine, Qualcomm Hexagon, Intel VPU)
|
||||||
|
//! - LPU (Groq Language Processing Unit)
|
||||||
|
//! - FPGA (Xilinx, Intel/Altera)
|
||||||
|
//! - DSP (Digital Signal Processors)
|
||||||
|
//! - Custom accelerators
|
||||||
|
|
||||||
|
mod capabilities;
|
||||||
|
mod operation;
|
||||||
|
mod profiles;
|
||||||
|
mod types;
|
||||||
|
|
||||||
|
pub use capabilities::{ComputeThroughput, MemorySpecs, MemoryType, ProcessorCapabilities};
|
||||||
|
pub use operation::{Operation, OperationType};
|
||||||
|
pub use profiles::ProcessorProfiles;
|
||||||
|
pub use types::*;
|
||||||
|
|
||||||
|
use crate::error::ComputeError;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
/// Unique processor identifier (within a node).
///
/// Newtype over `u64`; uniqueness is only node-local, so cross-node
/// identification must pair this with a node identifier.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ProcessorId(pub u64);
|
||||||
|
|
||||||
|
impl std::fmt::Display for ProcessorId {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "proc_{}", self.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unified abstraction for any processor type.
///
/// Implementors expose a static capability profile plus live state
/// (utilization, free memory) and an async [`Processor::execute`]
/// entry point. `Send + Sync` so processors can be shared across the
/// scheduler's worker threads.
#[async_trait]
pub trait Processor: Send + Sync {
    /// Get processor ID.
    fn id(&self) -> ProcessorId;

    /// Get processor type.
    fn processor_type(&self) -> ProcessorType;

    /// Get capabilities.
    fn capabilities(&self) -> &ProcessorCapabilities;

    /// Check if processor can execute operation.
    fn can_execute(&self, op: &Operation) -> bool;

    /// Estimate execution time for operation.
    fn estimate_time(&self, op: &Operation) -> Duration;

    /// Estimate energy consumption for operation (Joules).
    fn estimate_energy(&self, op: &Operation) -> f64;

    /// Execute operation.
    ///
    /// Consumes the operation; returns an error if it is unsupported
    /// or fails at runtime.
    async fn execute(&self, op: Operation) -> Result<OperationResult, ComputeError>;

    /// Current utilization (0.0 - 1.0).
    fn utilization(&self) -> f64;

    /// Available memory (bytes).
    fn available_memory(&self) -> u64;

    /// Check if this processor shares memory with another type.
    ///
    /// Default: only processors of the identical type share memory.
    fn shares_memory_with(&self, other: &ProcessorType) -> bool {
        // By default, processors don't share memory
        // Override for unified memory architectures (Apple Silicon, AMD APUs)
        self.processor_type() == *other
    }
}
|
||||||
|
|
||||||
|
/// Result of an operation execution.
///
/// Returned by [`Processor::execute`] on success; bundles the output
/// payload with measured/estimated execution metrics.
#[derive(Clone, Debug)]
pub struct OperationResult {
    /// Output data.
    /// May be empty for simulated executions.
    pub output: Vec<u8>,
    /// Execution time.
    pub duration: Duration,
    /// Energy consumed (Joules).
    pub energy: f64,
    /// Peak memory used (bytes).
    pub peak_memory: u64,
}
|
||||||
|
|
||||||
|
/// Generic processor implementation for simulation/testing.
///
/// Couples a static capability profile with lock-free live state so a
/// single instance can be shared across threads.
pub struct GenericProcessor {
    // Node-local identifier.
    id: ProcessorId,
    // Concrete hardware variant (CPU/GPU/TPU/NPU/LPU/...).
    processor_type: ProcessorType,
    // Static throughput/memory/power profile.
    capabilities: ProcessorCapabilities,
    // Utilization stored as an integer percentage (0-100);
    // `utilization()` converts it back to 0.0-1.0.
    utilization: std::sync::atomic::AtomicU64,
    // Remaining device memory in bytes, initialized to full capacity.
    available_memory: std::sync::atomic::AtomicU64,
}
|
||||||
|
|
||||||
|
impl GenericProcessor {
|
||||||
|
/// Creates a new generic processor.
|
||||||
|
pub fn new(
|
||||||
|
id: ProcessorId,
|
||||||
|
processor_type: ProcessorType,
|
||||||
|
capabilities: ProcessorCapabilities,
|
||||||
|
) -> Self {
|
||||||
|
let available_memory = capabilities.memory.capacity_bytes;
|
||||||
|
Self {
|
||||||
|
id,
|
||||||
|
processor_type,
|
||||||
|
capabilities,
|
||||||
|
utilization: std::sync::atomic::AtomicU64::new(0),
|
||||||
|
available_memory: std::sync::atomic::AtomicU64::new(available_memory),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a CPU processor.
|
||||||
|
pub fn cpu(id: ProcessorId, variant: CpuVariant) -> Self {
|
||||||
|
Self::new(
|
||||||
|
id,
|
||||||
|
ProcessorType::Cpu(variant),
|
||||||
|
ProcessorProfiles::cpu_default(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates an NVIDIA GPU processor.
|
||||||
|
pub fn nvidia_gpu(id: ProcessorId, compute_capability: (u8, u8)) -> Self {
|
||||||
|
let capabilities = match compute_capability {
|
||||||
|
(9, 0) => ProcessorProfiles::nvidia_h100(),
|
||||||
|
(8, 9) => ProcessorProfiles::nvidia_rtx_4090(),
|
||||||
|
(8, 6) => ProcessorProfiles::nvidia_rtx_3090(),
|
||||||
|
_ => ProcessorProfiles::nvidia_default(),
|
||||||
|
};
|
||||||
|
Self::new(
|
||||||
|
id,
|
||||||
|
ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability }),
|
||||||
|
capabilities,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a TPU processor.
|
||||||
|
pub fn tpu(id: ProcessorId, version: TpuVersion) -> Self {
|
||||||
|
let capabilities = match version {
|
||||||
|
TpuVersion::V5p => ProcessorProfiles::google_tpu_v5p(),
|
||||||
|
TpuVersion::V4 => ProcessorProfiles::google_tpu_v4(),
|
||||||
|
_ => ProcessorProfiles::google_tpu_default(),
|
||||||
|
};
|
||||||
|
Self::new(id, ProcessorType::Tpu(version), capabilities)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a Groq LPU processor.
|
||||||
|
pub fn lpu(id: ProcessorId) -> Self {
|
||||||
|
Self::new(id, ProcessorType::Lpu, ProcessorProfiles::groq_lpu())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates an Apple Neural Engine processor.
|
||||||
|
pub fn apple_neural_engine(id: ProcessorId, cores: u32) -> Self {
|
||||||
|
Self::new(
|
||||||
|
id,
|
||||||
|
ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores }),
|
||||||
|
ProcessorProfiles::apple_neural_engine(cores),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
impl Processor for GenericProcessor {
    fn id(&self) -> ProcessorId {
        self.id
    }

    fn processor_type(&self) -> ProcessorType {
        self.processor_type.clone()
    }

    fn capabilities(&self) -> &ProcessorCapabilities {
        &self.capabilities
    }

    // An operation is executable iff its type appears in the profile's
    // supported-operation set.
    fn can_execute(&self, op: &Operation) -> bool {
        self.capabilities.operations.contains(&op.op_type())
    }

    fn estimate_time(&self, op: &Operation) -> Duration {
        // Estimate based on FLOPS and operation complexity: pick the peak
        // throughput for the operation's precision, then divide work by rate.
        let flops_needed = op.estimated_flops();
        let throughput = match op.precision() {
            Precision::Fp32 => self.capabilities.compute.fp32_tflops,
            Precision::Fp16 => self.capabilities.compute.fp16_tflops,
            Precision::Bf16 => self.capabilities.compute.bf16_tflops,
            Precision::Int8 => self.capabilities.compute.int8_tops,
            Precision::Int4 => self.capabilities.compute.int4_tops,
            Precision::Fp64 => self.capabilities.compute.fp64_tflops,
        };

        if throughput > 0.0 {
            let tflops = throughput;
            // Throughput figures are in TFLOPS/TOPS (1e12 ops per second).
            let flops_per_second = tflops * 1e12;
            let seconds = flops_needed / flops_per_second;
            Duration::from_secs_f64(seconds)
        } else {
            // Zero throughput means the precision is unsupported in the
            // profile; return a pessimistic flat estimate instead of
            // dividing by zero.
            Duration::from_secs(1) // Fallback
        }
    }

    fn estimate_energy(&self, op: &Operation) -> f64 {
        // Estimate based on TDP and execution time.
        // energy (J) = TDP (W) * time (s), scaled by the profile's
        // efficiency factor.
        let duration = self.estimate_time(op);
        let watts = self.capabilities.power.tdp_watts as f64;
        let efficiency = self.capabilities.power.efficiency;
        watts * duration.as_secs_f64() * efficiency
    }

    // Simulated execution: validates support, computes estimates, and
    // pulses the utilization counter around a short async sleep. No real
    // computation is performed and `output` is always empty.
    async fn execute(&self, op: Operation) -> Result<OperationResult, ComputeError> {
        // Check if we can execute
        if !self.can_execute(&op) {
            return Err(ComputeError::OperationNotSupported(
                self.processor_type.clone(),
                format!("{:?}", op.op_type()),
            ));
        }

        // Simulate execution
        let duration = self.estimate_time(&op);
        let energy = self.estimate_energy(&op);

        // Update utilization (integer percent; 50 = half busy)
        self.utilization
            .store(50, std::sync::atomic::Ordering::Relaxed);

        // Simulate work (fixed 100us regardless of the estimate, to keep
        // tests fast)
        tokio::time::sleep(Duration::from_micros(100)).await;

        // Reset utilization
        self.utilization
            .store(0, std::sync::atomic::Ordering::Relaxed);

        Ok(OperationResult {
            output: vec![],
            duration,
            energy,
            peak_memory: op.estimated_memory(),
        })
    }

    fn utilization(&self) -> f64 {
        // Stored as integer percent; convert back to the 0.0-1.0 contract.
        self.utilization.load(std::sync::atomic::Ordering::Relaxed) as f64 / 100.0
    }

    fn available_memory(&self) -> u64 {
        self.available_memory
            .load(std::sync::atomic::Ordering::Relaxed)
    }

    // Overrides the default (same-type-only) rule to model Apple Silicon's
    // unified memory across CPU/GPU/NPU.
    fn shares_memory_with(&self, other: &ProcessorType) -> bool {
        match (&self.processor_type, other) {
            // Apple Silicon has unified memory
            (ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal))
            | (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
            | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
            | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true,
            // Same type always shares
            (a, b) if a == b => true,
            _ => false,
        }
    }
}
|
||||||
|
|
||||||
|
/// Precision for operations.
///
/// Selects which throughput figure in [`ComputeThroughput`] applies
/// when estimating execution time.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Precision {
    /// 64-bit floating point.
    Fp64,
    /// 32-bit floating point.
    Fp32,
    /// 16-bit floating point (IEEE half).
    Fp16,
    /// 16-bit brain floating point.
    Bf16,
    /// 8-bit integer.
    Int8,
    /// 4-bit integer.
    Int4,
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_processor_creation() {
        let cpu = GenericProcessor::cpu(
            ProcessorId(0),
            CpuVariant::X86_64 {
                avx: AvxSupport::Avx512,
            },
        );

        assert_eq!(cpu.id(), ProcessorId(0));
        assert!(matches!(cpu.processor_type(), ProcessorType::Cpu(_)));
    }

    #[test]
    fn test_gpu_creation() {
        let gpu = GenericProcessor::nvidia_gpu(ProcessorId(1), (9, 0));

        assert_eq!(gpu.id(), ProcessorId(1));
        assert!(matches!(
            gpu.processor_type(),
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. })
        ));
    }

    #[test]
    fn test_unified_memory() {
        let apple_cpu = GenericProcessor::new(
            ProcessorId(0),
            ProcessorType::Cpu(CpuVariant::Arm64 { sve: false }),
            ProcessorCapabilities::default(),
        );

        // Apple Silicon CPU and Metal GPU share unified memory.
        assert!(apple_cpu.shares_memory_with(&ProcessorType::Gpu(GpuVariant::AppleMetal)));
    }

    #[tokio::test]
    async fn test_operation_execution() {
        let cpu = GenericProcessor::cpu(
            ProcessorId(0),
            CpuVariant::X86_64 {
                avx: AvxSupport::Avx512,
            },
        );

        let op = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };

        // Whether execution succeeds depends on the CPU profile's
        // capabilities, but the outcome must be CONSISTENT with
        // `can_execute`: supported ops succeed, unsupported ops error.
        // (The previous `is_ok() || is_err()` assertion was a tautology
        // that could never fail.)
        let supported = cpu.can_execute(&op);
        let result = cpu.execute(op).await;
        assert_eq!(result.is_ok(), supported);
    }
}
|
||||||
543
crates/synor-compute/src/processor/operation.rs
Normal file
543
crates/synor-compute/src/processor/operation.rs
Normal file
|
|
@ -0,0 +1,543 @@
|
||||||
|
//! Operation definitions for heterogeneous compute.
|
||||||
|
|
||||||
|
use super::Precision;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
/// Operation types for processor matching.
///
/// A coarse taxonomy used as the key in each profile's supported-set;
/// parameterless by design so it can be hashed and compared cheaply.
/// See [`Operation`] for the parameterized counterparts.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum OperationType {
    // Matrix operations
    MatMul,
    Conv2d,
    Conv3d,
    DepthwiseConv,
    BatchNorm,
    LayerNorm,

    // Attention operations
    SelfAttention,
    CrossAttention,
    // Fused, memory-efficient attention kernel (newer GPUs/TPUs only).
    FlashAttention,

    // Element-wise operations
    Add,
    Mul,
    ReLU,
    GeLU,
    SiLU,
    Softmax,

    // Reduction operations
    Sum,
    Mean,
    Max,
    ArgMax,

    // Data movement
    Transpose,
    Reshape,
    Concat,
    Split,
    Gather,
    Scatter,

    // LLM specific
    Embedding,
    RoPE, // Rotary Position Embedding
    KVCache,
    TopK,
    Sampling,

    // I/O operations
    DataLoad,
    DataPreprocess,
    Tokenization,
    Detokenization,
    Checkpoint,

    // Distributed operations
    AllReduce,
    AllGather,
    ReduceScatter,

    // Training specific
    Backward,
    OptimizerStep,
    GradientClip,
}
|
||||||
|
|
||||||
|
/// Concrete operation with parameters.
///
/// The parameterized counterpart of [`OperationType`]: each variant
/// carries the dimensions needed to estimate FLOPs, memory, and time.
/// Fields named `async_` use a trailing underscore to avoid the
/// `async` keyword.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum Operation {
    /// Matrix multiplication.
    /// Computes an (m x k) by (k x n) product.
    MatMul {
        m: usize,
        n: usize,
        k: usize,
        precision: Precision,
    },

    /// 2D Convolution.
    // NOTE(review): assumes a square kernel (kernel_size x kernel_size)
    // and no stride/dilation parameters.
    Conv2d {
        batch: usize,
        in_channels: usize,
        out_channels: usize,
        height: usize,
        width: usize,
        kernel_size: usize,
        precision: Precision,
    },

    /// Batch normalization.
    BatchNorm {
        batch: usize,
        channels: usize,
        // Presumably elements per channel (flattened H*W) — TODO confirm.
        spatial: usize,
        precision: Precision,
    },

    /// Layer normalization.
    LayerNorm {
        batch: usize,
        seq_len: usize,
        hidden: usize,
        precision: Precision,
    },

    /// Self-attention.
    SelfAttention {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },

    /// Flash attention (fused, memory efficient).
    FlashAttention {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },

    /// Element-wise addition.
    Add {
        elements: usize,
        precision: Precision,
    },

    /// Element-wise multiplication.
    Mul {
        elements: usize,
        precision: Precision,
    },

    /// ReLU activation.
    ReLU { elements: usize },

    /// GeLU activation.
    GeLU { elements: usize },

    /// SiLU (Swish) activation.
    SiLU { elements: usize },

    /// Softmax.
    Softmax {
        batch: usize,
        seq_len: usize,
        precision: Precision,
    },

    /// Embedding lookup.
    Embedding {
        batch: usize,
        seq_len: usize,
        vocab_size: usize,
        embed_dim: usize,
        precision: Precision,
    },

    /// Rotary Position Embedding.
    RoPE {
        batch: usize,
        seq_len: usize,
        head_dim: usize,
        precision: Precision,
    },

    /// KV Cache update.
    KVCache {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },

    /// Top-K sampling.
    TopK {
        batch: usize,
        vocab_size: usize,
        k: usize,
    },

    /// Token sampling.
    Sampling {
        batch: usize,
        vocab_size: usize,
        temperature: f32,
    },

    /// Data loading from storage.
    DataLoad {
        bytes: usize,
        // `async_`: whether the load overlaps with compute.
        async_: bool,
    },

    /// Data preprocessing.
    DataPreprocess {
        batch: usize,
        // Named transforms applied to each sample.
        transforms: Vec<String>,
    },

    /// Tokenization.
    Tokenization {
        text_bytes: usize,
        vocab_size: usize,
    },

    /// Detokenization.
    Detokenization {
        tokens: usize,
        vocab_size: usize,
    },

    /// Checkpoint save.
    Checkpoint {
        bytes: usize,
        async_: bool,
    },

    /// All-reduce across devices.
    AllReduce {
        elements: usize,
        precision: Precision,
        devices: usize,
    },

    /// Backward pass for a layer.
    /// Wraps the forward op it differentiates; boxed to keep the enum small.
    Backward {
        forward_op: Box<Operation>,
    },

    /// Optimizer step.
    OptimizerStep {
        parameters: usize,
        // Optimizer name as free text (e.g. "adam").
        optimizer: String,
        precision: Precision,
    },

    /// Transpose.
    Transpose {
        shape: Vec<usize>,
        axes: Vec<usize>,
    },

    /// Reshape.
    Reshape {
        from: Vec<usize>,
        to: Vec<usize>,
    },

    /// Concatenate tensors.
    Concat {
        shapes: Vec<Vec<usize>>,
        axis: usize,
    },

    /// Generic operation.
    /// Escape hatch for ops not modeled above; carries explicit cost figures.
    Generic {
        op_type: OperationType,
        flops: f64,
        memory: u64,
    },
}
|
||||||
|
|
||||||
|
impl Operation {
|
||||||
|
/// Returns the operation type.
|
||||||
|
pub fn op_type(&self) -> OperationType {
|
||||||
|
match self {
|
||||||
|
Operation::MatMul { .. } => OperationType::MatMul,
|
||||||
|
Operation::Conv2d { .. } => OperationType::Conv2d,
|
||||||
|
Operation::BatchNorm { .. } => OperationType::BatchNorm,
|
||||||
|
Operation::LayerNorm { .. } => OperationType::LayerNorm,
|
||||||
|
Operation::SelfAttention { .. } => OperationType::SelfAttention,
|
||||||
|
Operation::FlashAttention { .. } => OperationType::FlashAttention,
|
||||||
|
Operation::Add { .. } => OperationType::Add,
|
||||||
|
Operation::Mul { .. } => OperationType::Mul,
|
||||||
|
Operation::ReLU { .. } => OperationType::ReLU,
|
||||||
|
Operation::GeLU { .. } => OperationType::GeLU,
|
||||||
|
Operation::SiLU { .. } => OperationType::SiLU,
|
||||||
|
Operation::Softmax { .. } => OperationType::Softmax,
|
||||||
|
Operation::Embedding { .. } => OperationType::Embedding,
|
||||||
|
Operation::RoPE { .. } => OperationType::RoPE,
|
||||||
|
Operation::KVCache { .. } => OperationType::KVCache,
|
||||||
|
Operation::TopK { .. } => OperationType::TopK,
|
||||||
|
Operation::Sampling { .. } => OperationType::Sampling,
|
||||||
|
Operation::DataLoad { .. } => OperationType::DataLoad,
|
||||||
|
Operation::DataPreprocess { .. } => OperationType::DataPreprocess,
|
||||||
|
Operation::Tokenization { .. } => OperationType::Tokenization,
|
||||||
|
Operation::Detokenization { .. } => OperationType::Detokenization,
|
||||||
|
Operation::Checkpoint { .. } => OperationType::Checkpoint,
|
||||||
|
Operation::AllReduce { .. } => OperationType::AllReduce,
|
||||||
|
Operation::Backward { .. } => OperationType::Backward,
|
||||||
|
Operation::OptimizerStep { .. } => OperationType::OptimizerStep,
|
||||||
|
Operation::Transpose { .. } => OperationType::Transpose,
|
||||||
|
Operation::Reshape { .. } => OperationType::Reshape,
|
||||||
|
Operation::Concat { .. } => OperationType::Concat,
|
||||||
|
Operation::Generic { op_type, .. } => *op_type,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the numeric precision used by this operation.
///
/// Variants that carry an explicit `precision` field report it directly;
/// `Backward` reports the precision of its wrapped forward op. All other
/// variants (activations, data movement, sampling, etc.) have no precision
/// field and default to [`Precision::Fp32`].
pub fn precision(&self) -> Precision {
    match self {
        Operation::MatMul { precision, .. }
        | Operation::Conv2d { precision, .. }
        | Operation::BatchNorm { precision, .. }
        | Operation::LayerNorm { precision, .. }
        | Operation::SelfAttention { precision, .. }
        | Operation::FlashAttention { precision, .. }
        | Operation::Add { precision, .. }
        | Operation::Mul { precision, .. }
        | Operation::Softmax { precision, .. }
        | Operation::Embedding { precision, .. }
        | Operation::RoPE { precision, .. }
        | Operation::KVCache { precision, .. }
        | Operation::AllReduce { precision, .. }
        | Operation::OptimizerStep { precision, .. } => *precision,
        // Backward inherits the forward pass's precision.
        Operation::Backward { forward_op } => forward_op.precision(),
        _ => Precision::Fp32, // Default for variants without a precision field.
    }
}
|
||||||
|
|
||||||
|
/// Estimates the floating-point operation count for this operation.
///
/// These are scheduling heuristics (order-of-magnitude estimates), not
/// exact instruction counts. Variants without a dedicated model fall
/// through to a nominal 1000 FLOPs.
pub fn estimated_flops(&self) -> f64 {
    match self {
        // MatMul: 2 * M * N * K (one multiply-add per inner-product step).
        Operation::MatMul { m, n, k, .. } => 2.0 * (*m as f64) * (*n as f64) * (*k as f64),

        // Conv2d: 2 * batch * out_channels * H * W * in_channels * K * K.
        // NOTE(review): this assumes stride 1 / "same" output size — the
        // variant's visible fields carry no stride, so confirm if strided
        // convs need a tighter estimate.
        Operation::Conv2d {
            batch,
            in_channels,
            out_channels,
            height,
            width,
            kernel_size,
            ..
        } => {
            2.0 * (*batch as f64)
                * (*out_channels as f64)
                * (*height as f64)
                * (*width as f64)
                * (*in_channels as f64)
                * (*kernel_size as f64)
                * (*kernel_size as f64)
        }

        // Attention: 4 * batch * seq^2 * head_dim * heads (QK^T plus the
        // attention-weighted V, each ~2 FLOPs per MAC). FlashAttention does
        // the same math, just with better memory locality.
        Operation::SelfAttention {
            batch,
            seq_len,
            num_heads,
            head_dim,
            ..
        }
        | Operation::FlashAttention {
            batch,
            seq_len,
            num_heads,
            head_dim,
            ..
        } => {
            4.0 * (*batch as f64)
                * (*seq_len as f64)
                * (*seq_len as f64)
                * (*head_dim as f64)
                * (*num_heads as f64)
        }

        // Element-wise ops: modeled as 1 FLOP per element (GeLU/SiLU cost
        // more per element in practice; treated as 1 for simplicity).
        Operation::Add { elements, .. }
        | Operation::Mul { elements, .. }
        | Operation::ReLU { elements }
        | Operation::GeLU { elements }
        | Operation::SiLU { elements } => *elements as f64,

        // Softmax: ~5 ops per element (exp, sum, div).
        // NOTE(review): this counts batch * seq_len only — if the variant
        // has a hidden/vocab dimension behind the `..`, the estimate omits
        // it; confirm against the enum definition.
        Operation::Softmax {
            batch, seq_len, ..
        } => 5.0 * (*batch as f64) * (*seq_len as f64),

        // Embedding: table lookup, minimal arithmetic (0.1 FLOP/element).
        Operation::Embedding {
            batch,
            seq_len,
            embed_dim,
            ..
        } => (*batch as f64) * (*seq_len as f64) * (*embed_dim as f64) * 0.1,

        // Backward pass: conventional ~2x-the-forward rule of thumb.
        Operation::Backward { forward_op } => forward_op.estimated_flops() * 2.0,

        // Generic carries an explicit FLOP count.
        Operation::Generic { flops, .. } => *flops,

        // I/O and other unmodeled operations: nominal compute cost.
        _ => 1000.0,
    }
}
|
||||||
|
|
||||||
|
/// Estimates memory usage (bytes).
|
||||||
|
pub fn estimated_memory(&self) -> u64 {
|
||||||
|
let precision_bytes = match self.precision() {
|
||||||
|
Precision::Fp64 => 8,
|
||||||
|
Precision::Fp32 => 4,
|
||||||
|
Precision::Fp16 | Precision::Bf16 => 2,
|
||||||
|
Precision::Int8 => 1,
|
||||||
|
Precision::Int4 => 1, // Rounded up
|
||||||
|
};
|
||||||
|
|
||||||
|
match self {
|
||||||
|
Operation::MatMul { m, n, k, .. } => {
|
||||||
|
// Input A (m×k) + Input B (k×n) + Output (m×n)
|
||||||
|
((*m * *k) + (*k * *n) + (*m * *n)) as u64 * precision_bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
Operation::SelfAttention {
|
||||||
|
batch,
|
||||||
|
seq_len,
|
||||||
|
num_heads,
|
||||||
|
head_dim,
|
||||||
|
..
|
||||||
|
} => {
|
||||||
|
// Q, K, V, Output, intermediate attention
|
||||||
|
5 * (*batch as u64)
|
||||||
|
* (*seq_len as u64)
|
||||||
|
* (*num_heads as u64)
|
||||||
|
* (*head_dim as u64)
|
||||||
|
* precision_bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
Operation::FlashAttention {
|
||||||
|
batch,
|
||||||
|
seq_len,
|
||||||
|
num_heads,
|
||||||
|
head_dim,
|
||||||
|
..
|
||||||
|
} => {
|
||||||
|
// FlashAttention uses much less memory
|
||||||
|
2 * (*batch as u64)
|
||||||
|
* (*seq_len as u64)
|
||||||
|
* (*num_heads as u64)
|
||||||
|
* (*head_dim as u64)
|
||||||
|
* precision_bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
Operation::KVCache {
|
||||||
|
batch,
|
||||||
|
seq_len,
|
||||||
|
num_heads,
|
||||||
|
head_dim,
|
||||||
|
..
|
||||||
|
} => {
|
||||||
|
// K and V caches
|
||||||
|
2 * (*batch as u64)
|
||||||
|
* (*seq_len as u64)
|
||||||
|
* (*num_heads as u64)
|
||||||
|
* (*head_dim as u64)
|
||||||
|
* precision_bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
Operation::Generic { memory, .. } => *memory,
|
||||||
|
|
||||||
|
_ => 1024 * 1024, // 1 MB default
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates the backward operation for this operation.
|
||||||
|
pub fn backward(&self) -> Option<Operation> {
|
||||||
|
match self {
|
||||||
|
Operation::MatMul { .. }
|
||||||
|
| Operation::Conv2d { .. }
|
||||||
|
| Operation::SelfAttention { .. }
|
||||||
|
| Operation::FlashAttention { .. }
|
||||||
|
| Operation::LayerNorm { .. }
|
||||||
|
| Operation::BatchNorm { .. } => Some(Operation::Backward {
|
||||||
|
forward_op: Box::new(self.clone()),
|
||||||
|
}),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A 1024³ FP32 matmul should estimate 2 * 1024³ ≈ 2.147e9 FLOPs.
    #[test]
    fn test_matmul_flops() {
        let op = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };

        let flops = op.estimated_flops();
        // 2 * 1024^3 = ~2.1 billion FLOPS
        assert!(flops > 2e9 && flops < 2.2e9);
    }

    /// FlashAttention's estimate (2 buffers) must come in below regular
    /// self-attention's (5 buffers) for identical dimensions.
    #[test]
    fn test_attention_memory() {
        let regular = Operation::SelfAttention {
            batch: 1,
            seq_len: 4096,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };

        let flash = Operation::FlashAttention {
            batch: 1,
            seq_len: 4096,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };

        // FlashAttention should use less memory
        assert!(flash.estimated_memory() < regular.estimated_memory());
    }

    /// `backward()` on a differentiable op wraps a clone of the forward op.
    #[test]
    fn test_backward_creation() {
        let forward = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };

        let backward = forward.backward();
        assert!(backward.is_some());

        if let Some(Operation::Backward { forward_op }) = backward {
            assert!(matches!(*forward_op, Operation::MatMul { .. }));
        }
    }
}
|
||||||
513
crates/synor-compute/src/processor/profiles.rs
Normal file
513
crates/synor-compute/src/processor/profiles.rs
Normal file
|
|
@ -0,0 +1,513 @@
|
||||||
|
//! Pre-defined processor profiles for common hardware.
|
||||||
|
|
||||||
|
use super::capabilities::{
|
||||||
|
ComputeThroughput, MemorySpecs, MemoryType, PowerCharacteristics, ProcessorCapabilities,
|
||||||
|
WorkloadCharacteristic,
|
||||||
|
};
|
||||||
|
use super::operation::OperationType;
|
||||||
|
use super::types::PowerTier;
|
||||||
|
use super::TpuVersion;
|
||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
|
/// Pre-defined processor profiles.
///
/// A namespace-only unit struct: every profile is an associated function
/// returning a fully-populated [`ProcessorCapabilities`] for a specific
/// piece of hardware. Throughput/bandwidth/TDP figures are vendor
/// spec-sheet numbers hard-coded here — treat them as nominal, not
/// measured.
pub struct ProcessorProfiles;

impl ProcessorProfiles {
    // ═══════════════════════════════════════════════════════════════
    // CPU PROFILES
    // ═══════════════════════════════════════════════════════════════

    /// Default CPU profile (8 cores @ 3.5 GHz, no AVX-512).
    pub fn cpu_default() -> ProcessorCapabilities {
        ProcessorCapabilities::cpu(8, 3.5, false)
    }

    /// AMD EPYC 9654 (96 cores).
    pub fn amd_epyc_9654() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 2.7,
                fp32_tflops: 5.4,
                fp16_tflops: 10.8,
                bf16_tflops: 10.8,
                int8_tops: 21.6,
                int4_tops: 43.2,
                sparsity_speedup: 1.0, // CPUs get no structured-sparsity boost.
            },
            memory: MemorySpecs {
                capacity_bytes: 6 * 1024 * 1024 * 1024 * 1024, // 6 TB max
                bandwidth_gbps: 460,
                type_: MemoryType::Ddr5,
            },
            // Reuse the generic CPU constructor's operation set only; the
            // throughput/memory/power figures above override its defaults.
            operations: ProcessorCapabilities::cpu(96, 2.4, false)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 360,
                efficiency: 0.85,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::MemoryBound,
            ],
        }
    }

    /// Intel Xeon w9-3595X (56 cores).
    pub fn intel_xeon_w9_3595x() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 3.2,
                fp32_tflops: 6.4,
                fp16_tflops: 12.8,
                bf16_tflops: 12.8,
                int8_tops: 25.6,
                int4_tops: 51.2,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 4 * 1024 * 1024 * 1024 * 1024, // 4 TB max
                bandwidth_gbps: 307,
                type_: MemoryType::Ddr5,
            },
            // Operation set from the generic CPU constructor (AVX-512 = true).
            operations: ProcessorCapabilities::cpu(56, 2.9, true)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 350,
                efficiency: 0.80,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::MemoryBound,
            ],
        }
    }

    /// Apple M3 Max CPU cores.
    pub fn apple_m3_max_cpu() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.3,
                fp32_tflops: 0.6,
                fp16_tflops: 1.2,
                bf16_tflops: 1.2,
                int8_tops: 2.4,
                int4_tops: 4.8,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 128 * 1024 * 1024 * 1024, // 128 GB unified
                bandwidth_gbps: 400,
                type_: MemoryType::Unified,
            },
            operations: ProcessorCapabilities::cpu(16, 4.0, false)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 40,
                efficiency: 0.95,
                power_tier: PowerTier::Low,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::LowPower,
            ],
        }
    }

    // ═══════════════════════════════════════════════════════════════
    // NVIDIA GPU PROFILES
    // ═══════════════════════════════════════════════════════════════

    /// Default NVIDIA GPU profile (mid-range Ampere-class card).
    pub fn nvidia_default() -> ProcessorCapabilities {
        ProcessorCapabilities::nvidia_gpu(8192, 256, 12, 600, (8, 0))
    }

    /// NVIDIA H100 SXM (80GB).
    pub fn nvidia_h100() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 67.0,
                fp32_tflops: 67.0,
                fp16_tflops: 1979.0, // With sparsity
                bf16_tflops: 1979.0,
                int8_tops: 3958.0,
                int4_tops: 7916.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 80 * 1024 * 1024 * 1024,
                bandwidth_gbps: 3350,
                type_: MemoryType::Hbm3,
            },
            // 16896 CUDA cores, 528 tensor cores, compute capability 9.0.
            operations: ProcessorCapabilities::nvidia_gpu(16896, 528, 80, 3350, (9, 0))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 700,
                efficiency: 0.90,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::ComputeBound,
            ],
        }
    }

    /// NVIDIA A100 (80GB).
    pub fn nvidia_a100() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 19.5,
                fp32_tflops: 19.5,
                fp16_tflops: 624.0, // With sparsity
                bf16_tflops: 624.0,
                int8_tops: 1248.0,
                int4_tops: 2496.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 80 * 1024 * 1024 * 1024,
                bandwidth_gbps: 2039,
                type_: MemoryType::Hbm2e,
            },
            operations: ProcessorCapabilities::nvidia_gpu(6912, 432, 80, 2039, (8, 0))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 400,
                efficiency: 0.88,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::ComputeBound,
            ],
        }
    }

    /// NVIDIA RTX 4090.
    pub fn nvidia_rtx_4090() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 1.3, // Consumer card: FP64 at 1/64 rate.
                fp32_tflops: 82.6,
                fp16_tflops: 330.4, // With sparsity
                bf16_tflops: 330.4,
                int8_tops: 660.8,
                int4_tops: 1321.6,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 24 * 1024 * 1024 * 1024,
                bandwidth_gbps: 1008,
                type_: MemoryType::Gddr6,
            },
            operations: ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1008, (8, 9))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 450,
                efficiency: 0.85,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
            ],
        }
    }

    /// NVIDIA RTX 3090.
    pub fn nvidia_rtx_3090() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.6,
                fp32_tflops: 35.6,
                fp16_tflops: 71.2,
                bf16_tflops: 71.2,
                int8_tops: 142.4,
                int4_tops: 284.8,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 24 * 1024 * 1024 * 1024,
                bandwidth_gbps: 936,
                type_: MemoryType::Gddr6,
            },
            operations: ProcessorCapabilities::nvidia_gpu(10496, 328, 24, 936, (8, 6))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 350,
                efficiency: 0.82,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
            ],
        }
    }

    // ═══════════════════════════════════════════════════════════════
    // AMD GPU PROFILES
    // ═══════════════════════════════════════════════════════════════

    /// AMD MI300X.
    pub fn amd_mi300x() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 163.4,
                fp32_tflops: 163.4,
                fp16_tflops: 1307.0,
                bf16_tflops: 1307.0,
                int8_tops: 2614.0,
                int4_tops: 5228.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 192 * 1024 * 1024 * 1024, // 192 GB HBM3
                bandwidth_gbps: 5300,
                type_: MemoryType::Hbm3,
            },
            // NOTE(review): no AMD-specific constructor exists yet, so this
            // borrows the NVIDIA constructor's operation set and strips
            // FlashAttention (different implementation on ROCm). Confirm an
            // `amd_gpu` constructor isn't warranted instead.
            operations: {
                let mut ops = ProcessorCapabilities::nvidia_gpu(16384, 512, 80, 5300, (9, 0))
                    .operations;
                ops.remove(&OperationType::FlashAttention); // Different implementation
                ops
            },
            power: PowerCharacteristics {
                tdp_watts: 750,
                efficiency: 0.88,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::MemoryBound, // High memory bandwidth
            ],
        }
    }

    /// AMD RX 7900 XTX.
    pub fn amd_rx_7900_xtx() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 1.9,
                fp32_tflops: 61.0,
                fp16_tflops: 122.0,
                bf16_tflops: 122.0,
                int8_tops: 244.0,
                int4_tops: 488.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 24 * 1024 * 1024 * 1024,
                bandwidth_gbps: 960,
                type_: MemoryType::Gddr6,
            },
            // Same borrow-from-NVIDIA hack as `amd_mi300x`; tensor-core
            // count is passed as 0 since RDNA3 has no tensor cores.
            operations: {
                let mut ops = ProcessorCapabilities::nvidia_gpu(6144, 0, 24, 960, (8, 0))
                    .operations;
                ops.remove(&OperationType::FlashAttention);
                ops
            },
            power: PowerCharacteristics {
                tdp_watts: 355,
                efficiency: 0.80,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
            ],
        }
    }

    // ═══════════════════════════════════════════════════════════════
    // GOOGLE TPU PROFILES
    // ═══════════════════════════════════════════════════════════════

    /// Default TPU profile (v4).
    pub fn google_tpu_default() -> ProcessorCapabilities {
        ProcessorCapabilities::tpu(TpuVersion::V4)
    }

    /// Google TPU v5p.
    pub fn google_tpu_v5p() -> ProcessorCapabilities {
        ProcessorCapabilities::tpu(TpuVersion::V5p)
    }

    /// Google TPU v4.
    pub fn google_tpu_v4() -> ProcessorCapabilities {
        ProcessorCapabilities::tpu(TpuVersion::V4)
    }

    /// Google Edge TPU (int8-only inference accelerator).
    pub fn google_edge_tpu() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                // Edge TPU has no floating-point datapath.
                fp64_tflops: 0.0,
                fp32_tflops: 0.0,
                fp16_tflops: 0.0,
                bf16_tflops: 0.0,
                int8_tops: 4.0,
                int4_tops: 8.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses host memory
                bandwidth_gbps: 0,
                type_: MemoryType::Unified,
            },
            // Hand-built minimal operation set: the Edge TPU only runs a
            // small fixed menu of quantized inference ops.
            operations: {
                let mut ops = HashSet::new();
                ops.insert(OperationType::MatMul);
                ops.insert(OperationType::Conv2d);
                ops.insert(OperationType::DepthwiseConv);
                ops.insert(OperationType::Add);
                ops.insert(OperationType::Mul);
                ops.insert(OperationType::ReLU);
                ops.insert(OperationType::Softmax);
                ops
            },
            power: PowerCharacteristics {
                tdp_watts: 2,
                efficiency: 0.95,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }

    // ═══════════════════════════════════════════════════════════════
    // GROQ LPU PROFILE
    // ═══════════════════════════════════════════════════════════════

    /// Groq LPU.
    pub fn groq_lpu() -> ProcessorCapabilities {
        ProcessorCapabilities::lpu()
    }

    // ═══════════════════════════════════════════════════════════════
    // APPLE NEURAL ENGINE PROFILES
    // ═══════════════════════════════════════════════════════════════

    /// Apple Neural Engine (generic, parameterized by core count).
    pub fn apple_neural_engine(cores: u32) -> ProcessorCapabilities {
        ProcessorCapabilities::apple_neural_engine(cores)
    }

    /// Apple M3 Neural Engine (16 cores).
    pub fn apple_m3_neural_engine() -> ProcessorCapabilities {
        ProcessorCapabilities::apple_neural_engine(16)
    }

    /// Apple M3 Max Neural Engine (16 cores).
    pub fn apple_m3_max_neural_engine() -> ProcessorCapabilities {
        ProcessorCapabilities::apple_neural_engine(16) // Same as M3
    }

    /// Apple A17 Pro Neural Engine (35 TOPS).
    pub fn apple_a17_pro_neural_engine() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: 4.4,
                fp16_tflops: 8.8,
                bf16_tflops: 8.8,
                int8_tops: 35.0,
                int4_tops: 70.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses unified memory
                bandwidth_gbps: 200,
                type_: MemoryType::Unified,
            },
            operations: ProcessorCapabilities::apple_neural_engine(16)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 8,
                efficiency: 0.98,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }

    // ═══════════════════════════════════════════════════════════════
    // QUALCOMM NPU PROFILES
    // ═══════════════════════════════════════════════════════════════

    /// Qualcomm Hexagon NPU (Snapdragon 8 Gen 3).
    pub fn qualcomm_hexagon_8g3() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: 3.0,
                fp16_tflops: 6.0,
                bf16_tflops: 6.0,
                int8_tops: 73.0, // 73 TOPS
                int4_tops: 146.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses system memory
                bandwidth_gbps: 77,
                type_: MemoryType::Lpddr,
            },
            // NOTE(review): borrows the Apple Neural Engine operation set as
            // a stand-in — presumably the supported mobile-NPU op menus are
            // similar; verify against Hexagon's actual capabilities.
            operations: ProcessorCapabilities::apple_neural_engine(16)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 10,
                efficiency: 0.95,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// H100 sanity: >1 PFLOP/s FP16 and exactly 80 GB of HBM.
    #[test]
    fn test_h100_profile() {
        let h100 = ProcessorProfiles::nvidia_h100();
        assert!(h100.compute.fp16_tflops > 1000.0);
        assert_eq!(h100.memory.capacity_bytes, 80 * 1024 * 1024 * 1024);
    }

    /// TPU v5p sanity: >900 TFLOP/s BF16.
    #[test]
    fn test_tpu_v5p_profile() {
        let tpu = ProcessorProfiles::google_tpu_v5p();
        assert!(tpu.compute.bf16_tflops > 900.0);
    }

    /// Groq LPU sanity: SRAM-based design should report enormous bandwidth.
    #[test]
    fn test_groq_lpu_profile() {
        let lpu = ProcessorProfiles::groq_lpu();
        assert!(lpu.memory.bandwidth_gbps > 50000); // Very high internal bandwidth
    }

    /// ANE sanity: low TDP and flagged as a low-power target.
    #[test]
    fn test_apple_ane_profile() {
        let ane = ProcessorProfiles::apple_m3_neural_engine();
        assert!(ane.power.tdp_watts < 20);
        assert!(ane.optimal_for.contains(&WorkloadCharacteristic::LowPower))
    }
}
|
||||||
367
crates/synor-compute/src/processor/types.rs
Normal file
367
crates/synor-compute/src/processor/types.rs
Normal file
|
|
@ -0,0 +1,367 @@
|
||||||
|
//! Processor type definitions.
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
/// All supported processor types.
///
/// Variants carry just enough vendor/architecture detail to drive
/// capability lookups and scheduling decisions; fine-grained specs live in
/// `ProcessorCapabilities`.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ProcessorType {
    /// Central Processing Unit.
    Cpu(CpuVariant),
    /// Graphics Processing Unit.
    Gpu(GpuVariant),
    /// Tensor Processing Unit (Google).
    Tpu(TpuVersion),
    /// Neural Processing Unit (various vendors).
    Npu(NpuVariant),
    /// Language Processing Unit (Groq).
    Lpu,
    /// Field Programmable Gate Array.
    Fpga(FpgaVendor),
    /// Digital Signal Processor.
    Dsp(DspVariant),
    /// WebGPU (browser).
    WebGpu,
    /// WebAssembly runtime.
    Wasm,
    /// Custom/Unknown accelerator, identified only by free-form strings.
    Custom {
        vendor: String,
        model: String,
    },
}

impl Default for ProcessorType {
    /// Defaults to a CPU with the default variant (x86-64 + AVX2), the
    /// lowest-common-denominator target.
    fn default() -> Self {
        ProcessorType::Cpu(CpuVariant::default())
    }
}
|
||||||
|
|
||||||
|
/// CPU architecture variants, each carrying its SIMD capability flag.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum CpuVariant {
    /// x86-64 architecture, with its AVX support level.
    X86_64 { avx: AvxSupport },
    /// ARM 64-bit architecture; `sve` flags Scalable Vector Extension support.
    Arm64 { sve: bool },
    /// RISC-V architecture; `vector` flags the V vector extension.
    RiscV { vector: bool },
}

impl Default for CpuVariant {
    /// x86-64 with AVX2 — a safe baseline for virtually all server CPUs
    /// since Haswell (2013).
    fn default() -> Self {
        CpuVariant::X86_64 {
            avx: AvxSupport::Avx2,
        }
    }
}
|
||||||
|
|
||||||
|
/// AVX instruction set support levels.
///
/// Derives `PartialOrd`/`Ord` so levels can be compared directly
/// (declaration order is capability order: `None < Avx < … < Avx10`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AvxSupport {
    /// No AVX.
    None,
    /// AVX (Sandy Bridge+).
    Avx,
    /// AVX2 (Haswell+).
    Avx2,
    /// AVX-512 (Skylake-X+).
    Avx512,
    /// AVX10 (future).
    Avx10,
}
|
||||||
|
|
||||||
|
/// GPU vendor variants, each carrying the vendor's own generation
/// identifier where one is meaningful.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum GpuVariant {
    /// NVIDIA CUDA GPU.
    NvidiaCuda {
        /// Compute capability (major, minor), e.g. (9, 0) for Hopper.
        compute_capability: (u8, u8),
    },
    /// AMD ROCm GPU.
    AmdRocm {
        /// GFX version (e.g., 1100 for RDNA3).
        gfx_version: u32,
    },
    /// Intel OneAPI GPU.
    IntelOneApi,
    /// Apple Metal GPU.
    AppleMetal,
    /// Qualcomm Adreno GPU.
    QualcommAdreno {
        /// Adreno model number.
        model: u32,
    },
    /// ARM Mali GPU.
    ArmMali {
        /// Mali generation (e.g., G710).
        model: u32,
    },
    /// IMG PowerVR GPU.
    ImgPowerVr,
}
|
||||||
|
|
||||||
|
/// Google TPU versions.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum TpuVersion {
    /// TPU v2.
    V2,
    /// TPU v3.
    V3,
    /// TPU v4.
    V4,
    /// TPU v4i (inference-optimized).
    V4i,
    /// TPU v5e (efficiency).
    V5e,
    /// TPU v5p (performance).
    V5p,
    /// Edge TPU (embedded, int8 inference only).
    Edge,
}
|
||||||
|
|
||||||
|
/// NPU (Neural Processing Unit) variants.
///
/// Covers on-device inference accelerators; the `Custom` variant describes
/// unknown hardware by raw throughput alone.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum NpuVariant {
    /// Apple Neural Engine.
    AppleNeuralEngine {
        /// Number of cores.
        cores: u32,
    },
    /// Qualcomm Hexagon DSP/NPU.
    QualcommHexagon {
        /// Version number.
        version: u32,
    },
    /// Intel VPU (Movidius).
    IntelVpu,
    /// Huawei Ascend.
    HuaweiAscend {
        /// Model (310, 910, etc.).
        model: u32,
    },
    /// Google Edge TPU.
    GoogleEdgeTpu,
    /// Samsung NPU.
    SamsungNpu,
    /// MediaTek APU.
    MediaTekApu {
        /// Version.
        version: u32,
    },
    /// Custom NPU.
    Custom {
        /// TOPS (Tera Operations Per Second).
        tops: u32,
    },
}
|
||||||
|
|
||||||
|
/// FPGA vendors.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FpgaVendor {
    /// Xilinx (AMD).
    Xilinx,
    /// Intel (Altera).
    Intel,
    /// Lattice.
    Lattice,
    /// Microchip.
    Microchip,
}
|
||||||
|
|
||||||
|
/// DSP (Digital Signal Processor) variants.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DspVariant {
    /// Texas Instruments DSP.
    TexasInstruments,
    /// Analog Devices DSP.
    AnalogDevices,
    /// Qualcomm Hexagon DSP.
    QualcommHexagon,
    /// Custom DSP.
    Custom,
}
|
||||||
|
|
||||||
|
impl ProcessorType {
    /// Returns whether this processor type supports CUDA.
    pub fn supports_cuda(&self) -> bool {
        matches!(self, ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. }))
    }

    /// Returns whether this processor type supports ROCm.
    pub fn supports_rocm(&self) -> bool {
        matches!(self, ProcessorType::Gpu(GpuVariant::AmdRocm { .. }))
    }

    /// Returns whether this processor type supports Metal.
    pub fn supports_metal(&self) -> bool {
        matches!(self, ProcessorType::Gpu(GpuVariant::AppleMetal))
    }

    /// Returns whether this processor type is a GPU (any vendor).
    pub fn is_gpu(&self) -> bool {
        matches!(self, ProcessorType::Gpu(_))
    }

    /// Returns whether this processor type is a CPU (any architecture).
    pub fn is_cpu(&self) -> bool {
        matches!(self, ProcessorType::Cpu(_))
    }

    /// Returns whether this processor type is suitable for parallel workloads.
    /// Covers GPUs, TPUs and FPGAs; NPUs and LPUs are deliberately excluded.
    pub fn is_parallel(&self) -> bool {
        matches!(
            self,
            ProcessorType::Gpu(_) | ProcessorType::Tpu(_) | ProcessorType::Fpga(_)
        )
    }

    /// Returns whether this processor type is suitable for sequential workloads.
    pub fn is_sequential(&self) -> bool {
        matches!(self, ProcessorType::Cpu(_) | ProcessorType::Lpu)
    }

    /// Returns whether this processor type is power-efficient.
    ///
    /// NOTE(review): this predicate does not line up exactly with
    /// `power_tier` — DSPs and WebGPU are tiered `Low` there but return
    /// `false` here. Confirm whether that divergence is intentional.
    pub fn is_low_power(&self) -> bool {
        matches!(
            self,
            ProcessorType::Npu(_) | ProcessorType::Tpu(TpuVersion::Edge) | ProcessorType::Wasm
        )
    }

    /// Returns the typical power consumption tier.
    ///
    /// Buckets are coarse heuristics keyed off vendor/generation only;
    /// per-device TDP lives in `PowerCharacteristics`.
    pub fn power_tier(&self) -> PowerTier {
        match self {
            ProcessorType::Npu(_) | ProcessorType::Wasm => PowerTier::UltraLow,
            ProcessorType::Cpu(CpuVariant::Arm64 { .. }) => PowerTier::Low,
            ProcessorType::Cpu(_) => PowerTier::Medium,
            ProcessorType::Gpu(GpuVariant::AppleMetal) => PowerTier::Medium,
            // Ampere (8.x) and newer datacenter/consumer NVIDIA cards pull
            // high power; older compute capabilities fall through to Medium.
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability })
                if compute_capability.0 >= 8 =>
            {
                PowerTier::High
            }
            ProcessorType::Gpu(_) => PowerTier::Medium,
            ProcessorType::Tpu(TpuVersion::Edge) => PowerTier::UltraLow,
            ProcessorType::Tpu(_) => PowerTier::High,
            ProcessorType::Lpu => PowerTier::Medium,
            ProcessorType::Fpga(_) => PowerTier::Medium,
            ProcessorType::Dsp(_) => PowerTier::Low,
            ProcessorType::WebGpu => PowerTier::Low,
            ProcessorType::Custom { .. } => PowerTier::Medium,
        }
    }
}
|
||||||
|
|
||||||
|
/// Power consumption tiers.
///
/// Variants are declared lowest-to-highest draw, so the derived
/// `PartialOrd`/`Ord` give `UltraLow < Low < Medium < High`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PowerTier {
    /// < 5W (mobile, IoT).
    UltraLow,
    /// 5-30W (laptop, tablet).
    Low,
    /// 30-150W (desktop, workstation).
    Medium,
    /// > 150W (server, data center).
    High,
}
|
||||||
|
|
||||||
|
/// Device class for routing decisions.
///
/// Coarse hardware category used to estimate availability and
/// reliability when scheduling work onto a device.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DeviceClass {
    /// Data center equipment.
    DataCenter,
    /// Desktop/workstation.
    Desktop,
    /// Laptop.
    Laptop,
    /// Mobile phone.
    Mobile,
    /// Tablet.
    Tablet,
    /// IoT device.
    IoT,
    /// Browser (WebGPU/WASM).
    Browser,
    /// Edge server.
    Edge,
}
|
||||||
|
|
||||||
|
impl DeviceClass {
|
||||||
|
/// Returns typical available compute hours per day.
|
||||||
|
pub fn typical_availability_hours(&self) -> f32 {
|
||||||
|
match self {
|
||||||
|
DeviceClass::DataCenter => 24.0,
|
||||||
|
DeviceClass::Desktop => 8.0,
|
||||||
|
DeviceClass::Laptop => 6.0,
|
||||||
|
DeviceClass::Mobile => 4.0,
|
||||||
|
DeviceClass::Tablet => 4.0,
|
||||||
|
DeviceClass::IoT => 24.0,
|
||||||
|
DeviceClass::Browser => 2.0,
|
||||||
|
DeviceClass::Edge => 24.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns reliability score (0-100).
|
||||||
|
pub fn reliability_score(&self) -> u32 {
|
||||||
|
match self {
|
||||||
|
DeviceClass::DataCenter => 99,
|
||||||
|
DeviceClass::Edge => 95,
|
||||||
|
DeviceClass::Desktop => 80,
|
||||||
|
DeviceClass::Laptop => 60,
|
||||||
|
DeviceClass::Mobile => 40,
|
||||||
|
DeviceClass::Tablet => 50,
|
||||||
|
DeviceClass::IoT => 70,
|
||||||
|
DeviceClass::Browser => 30,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // Capability predicates on representative hardware: CUDA GPU, AVX-512
    // x86 CPU, Groq-style LPU, and Apple NPU.
    #[test]
    fn test_processor_type_properties() {
        let nvidia = ProcessorType::Gpu(GpuVariant::NvidiaCuda {
            compute_capability: (9, 0),
        });
        assert!(nvidia.supports_cuda());
        assert!(nvidia.is_gpu());
        assert!(nvidia.is_parallel());

        let cpu = ProcessorType::Cpu(CpuVariant::X86_64 {
            avx: AvxSupport::Avx512,
        });
        assert!(cpu.is_cpu());
        assert!(cpu.is_sequential());

        let lpu = ProcessorType::Lpu;
        assert!(lpu.is_sequential());

        let npu = ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 });
        assert!(npu.is_low_power());
    }

    // Power-tier mapping: H100-class GPU (cc 9.0) is High, Apple NPU is
    // UltraLow, ARM64 CPU is Low.
    #[test]
    fn test_power_tiers() {
        let h100 = ProcessorType::Gpu(GpuVariant::NvidiaCuda {
            compute_capability: (9, 0),
        });
        assert_eq!(h100.power_tier(), PowerTier::High);

        let npu = ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 });
        assert_eq!(npu.power_tier(), PowerTier::UltraLow);

        let arm = ProcessorType::Cpu(CpuVariant::Arm64 { sve: false });
        assert_eq!(arm.power_tier(), PowerTier::Low);
    }

    // Spot-check availability hours and reliability scores at both extremes.
    #[test]
    fn test_device_class() {
        assert_eq!(DeviceClass::DataCenter.typical_availability_hours(), 24.0);
        assert_eq!(DeviceClass::Mobile.typical_availability_hours(), 4.0);
        assert_eq!(DeviceClass::DataCenter.reliability_score(), 99);
        assert_eq!(DeviceClass::Browser.reliability_score(), 30);
    }
}
|
||||||
810
crates/synor-compute/src/scheduler/load_balancer.rs
Normal file
810
crates/synor-compute/src/scheduler/load_balancer.rs
Normal file
|
|
@ -0,0 +1,810 @@
|
||||||
|
//! Load balancer with work stealing for heterogeneous compute.
|
||||||
|
//!
|
||||||
|
//! Supports:
|
||||||
|
//! - Cross-processor-type work migration
|
||||||
|
//! - Energy-aware balancing
|
||||||
|
//! - Latency-aware scheduling
|
||||||
|
//! - Real-time utilization metrics
|
||||||
|
|
||||||
|
use crate::device::{DeviceInfo, DeviceRegistry};
|
||||||
|
use crate::processor::{Operation, OperationType, ProcessorId, ProcessorType};
|
||||||
|
use crate::task::{Task, TaskId, TaskPriority};
|
||||||
|
use super::TaskAssignment;
|
||||||
|
use parking_lot::RwLock;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
|
/// Balancing strategy for the load balancer.
///
/// Selects which scoring formula `LoadBalancer::calculate_score` uses
/// when ranking candidate processors for a task.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BalancingStrategy {
    /// Optimize for speed (minimize execution time).
    Speed,
    /// Optimize for energy efficiency.
    Energy,
    /// Balance speed and energy. This is the default.
    Balanced,
    /// Optimize for cost (spot pricing).
    Cost,
    /// Optimize for latency (inference workloads).
    Latency,
}
|
||||||
|
|
||||||
|
impl Default for BalancingStrategy {
|
||||||
|
fn default() -> Self {
|
||||||
|
BalancingStrategy::Balanced
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Real-time processor metrics.
///
/// `Default` yields all-zero readings with `last_updated: None`,
/// meaning "no telemetry has been reported yet".
#[derive(Clone, Debug, Default)]
pub struct ProcessorMetrics {
    /// Current utilization (0.0 - 1.0).
    pub utilization: f64,
    /// Queue depth (pending tasks).
    pub queue_depth: u64,
    /// Average task completion time (ms).
    pub avg_completion_ms: f64,
    /// Tasks completed in last minute.
    pub throughput_per_min: u64,
    /// Current power draw (watts).
    pub power_watts: f64,
    /// Temperature (celsius).
    pub temperature: f64,
    /// Last updated timestamp; `None` until the first report via
    /// `LoadBalancer::update_metrics`.
    pub last_updated: Option<Instant>,
}
|
||||||
|
|
||||||
|
/// Load balancer for heterogeneous compute environments.
///
/// Tracks per-processor load counters and telemetry, scores candidate
/// processors according to the active [`BalancingStrategy`], and decides
/// when tasks should migrate or be stolen between processors.
pub struct LoadBalancer {
    /// Device registry for processor info.
    /// NOTE(review): stored by the constructors but not consulted in this
    /// file — confirm intended use before relying on it.
    device_registry: Option<Arc<DeviceRegistry>>,
    /// Current load per processor (task count).
    loads: RwLock<HashMap<ProcessorId, AtomicU64>>,
    /// Real-time metrics per processor.
    metrics: RwLock<HashMap<ProcessorId, ProcessorMetrics>>,
    /// Processor type mapping.
    processor_types: RwLock<HashMap<ProcessorId, ProcessorType>>,
    /// Work stealing threshold (0.0 - 1.0): relative load gap that must be
    /// exceeded before `should_steal` returns true. Fixed at construction.
    steal_threshold: f64,
    /// Rebalance threshold (0.0 - 1.0): hysteresis margin a competing
    /// processor must beat to win a migration. Fixed at construction.
    rebalance_threshold: f64,
    /// Current balancing strategy.
    strategy: RwLock<BalancingStrategy>,
    /// Migration history (to prevent thrashing). Grows unbounded until
    /// `cleanup_history` is called.
    migration_history: RwLock<Vec<MigrationRecord>>,
}
|
||||||
|
|
||||||
|
/// Record of a task migration.
#[derive(Clone, Debug)]
struct MigrationRecord {
    /// Task that was migrated.
    task_id: TaskId,
    /// Processor the scheduler originally suggested.
    from: ProcessorId,
    /// Processor the task was moved to.
    to: ProcessorId,
    /// When the migration decision was made.
    timestamp: Instant,
}
|
||||||
|
|
||||||
|
impl LoadBalancer {
|
||||||
|
/// Creates a new load balancer.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
device_registry: None,
|
||||||
|
loads: RwLock::new(HashMap::new()),
|
||||||
|
metrics: RwLock::new(HashMap::new()),
|
||||||
|
processor_types: RwLock::new(HashMap::new()),
|
||||||
|
steal_threshold: 0.3,
|
||||||
|
rebalance_threshold: 0.2,
|
||||||
|
strategy: RwLock::new(BalancingStrategy::default()),
|
||||||
|
migration_history: RwLock::new(Vec::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a load balancer with device registry.
|
||||||
|
pub fn with_registry(device_registry: Arc<DeviceRegistry>) -> Self {
|
||||||
|
Self {
|
||||||
|
device_registry: Some(device_registry),
|
||||||
|
loads: RwLock::new(HashMap::new()),
|
||||||
|
metrics: RwLock::new(HashMap::new()),
|
||||||
|
processor_types: RwLock::new(HashMap::new()),
|
||||||
|
steal_threshold: 0.3,
|
||||||
|
rebalance_threshold: 0.2,
|
||||||
|
strategy: RwLock::new(BalancingStrategy::default()),
|
||||||
|
migration_history: RwLock::new(Vec::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sets the balancing strategy.
|
||||||
|
pub fn set_strategy(&self, strategy: BalancingStrategy) {
|
||||||
|
*self.strategy.write() = strategy;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the current strategy.
|
||||||
|
pub fn strategy(&self) -> BalancingStrategy {
|
||||||
|
*self.strategy.read()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Register a processor with its type.
|
||||||
|
pub fn register_processor(&self, processor_id: ProcessorId, processor_type: ProcessorType) {
|
||||||
|
self.loads.write().insert(processor_id, AtomicU64::new(0));
|
||||||
|
self.metrics.write().insert(processor_id, ProcessorMetrics::default());
|
||||||
|
self.processor_types.write().insert(processor_id, processor_type);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unregister a processor.
|
||||||
|
pub fn unregister_processor(&self, processor_id: ProcessorId) {
|
||||||
|
self.loads.write().remove(&processor_id);
|
||||||
|
self.metrics.write().remove(&processor_id);
|
||||||
|
self.processor_types.write().remove(&processor_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Update real-time metrics for a processor.
|
||||||
|
pub fn update_metrics(&self, processor_id: ProcessorId, metrics: ProcessorMetrics) {
|
||||||
|
if let Some(existing) = self.metrics.write().get_mut(&processor_id) {
|
||||||
|
*existing = ProcessorMetrics {
|
||||||
|
last_updated: Some(Instant::now()),
|
||||||
|
..metrics
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get current load for a processor.
|
||||||
|
pub fn get_load(&self, processor_id: ProcessorId) -> u64 {
|
||||||
|
self.loads.read()
|
||||||
|
.get(&processor_id)
|
||||||
|
.map(|l| l.load(Ordering::Relaxed))
|
||||||
|
.unwrap_or(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Increment load for a processor.
|
||||||
|
pub fn increment_load(&self, processor_id: ProcessorId) {
|
||||||
|
if let Some(load) = self.loads.read().get(&processor_id) {
|
||||||
|
load.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decrement load for a processor.
|
||||||
|
pub fn decrement_load(&self, processor_id: ProcessorId) {
|
||||||
|
if let Some(load) = self.loads.read().get(&processor_id) {
|
||||||
|
load.fetch_sub(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
    /// Check if an operation can run on a processor type.
    ///
    /// Each arm is a conservative allowlist of [`OperationType`]s the
    /// processor family is assumed to handle; FPGAs (reprogrammable) and
    /// custom processors accept everything. Used by `calculate_score` to
    /// veto incompatible assignments.
    pub fn can_execute(&self, op: &Operation, processor_type: &ProcessorType) -> bool {
        let op_type = op.op_type();

        match processor_type {
            // CPUs can handle most sequential operations
            ProcessorType::Cpu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::Conv3d
                    | OperationType::DepthwiseConv
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Max
                    | OperationType::ArgMax
                    | OperationType::Embedding
                    | OperationType::TopK
                    | OperationType::Sampling
                    | OperationType::Tokenization
                    | OperationType::Detokenization
                    | OperationType::DataLoad
                    | OperationType::DataPreprocess
                    | OperationType::Transpose
                    | OperationType::Reshape
                    | OperationType::Concat
                    | OperationType::Split
            ),

            // GPUs excel at parallel operations (incl. attention, collectives,
            // and training steps)
            ProcessorType::Gpu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::Conv3d
                    | OperationType::DepthwiseConv
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::CrossAttention
                    | OperationType::FlashAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Max
                    | OperationType::ArgMax
                    | OperationType::Embedding
                    | OperationType::RoPE
                    | OperationType::KVCache
                    | OperationType::TopK
                    | OperationType::Sampling
                    | OperationType::Transpose
                    | OperationType::Reshape
                    | OperationType::Concat
                    | OperationType::Split
                    | OperationType::Gather
                    | OperationType::Scatter
                    | OperationType::AllReduce
                    | OperationType::AllGather
                    | OperationType::ReduceScatter
                    | OperationType::Backward
                    | OperationType::OptimizerStep
                    | OperationType::GradientClip
            ),

            // TPUs optimized for ML
            ProcessorType::Tpu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::CrossAttention
                    | OperationType::FlashAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Embedding
                    | OperationType::RoPE
                    | OperationType::KVCache
                    | OperationType::AllReduce
                    | OperationType::AllGather
                    | OperationType::ReduceScatter
                    | OperationType::Backward
                    | OperationType::OptimizerStep
            ),

            // NPUs for neural network inference
            ProcessorType::Npu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::DepthwiseConv
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
            ),

            // LPUs for sequential inference (optimized for LLMs)
            ProcessorType::Lpu => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::FlashAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Embedding
                    | OperationType::RoPE
                    | OperationType::KVCache
                    | OperationType::TopK
                    | OperationType::Sampling
            ),

            // FPGAs can be programmed for anything
            ProcessorType::Fpga(_) => true,

            // DSPs for signal processing
            ProcessorType::Dsp(_) => matches!(
                op_type,
                OperationType::Conv2d
                    | OperationType::DepthwiseConv
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Max
            ),

            // WebGPU has limited operations
            ProcessorType::WebGpu => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Transpose
                    | OperationType::Reshape
            ),

            // WASM for portable compute
            ProcessorType::Wasm => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Tokenization
                    | OperationType::Detokenization
            ),

            // Custom processors - assume they can handle anything
            ProcessorType::Custom { .. } => true,
        }
    }
|
||||||
|
|
||||||
|
/// Calculate a score for assigning a task to a processor.
|
||||||
|
fn calculate_score(
|
||||||
|
&self,
|
||||||
|
task: &Task,
|
||||||
|
processor_id: ProcessorId,
|
||||||
|
processor_type: &ProcessorType,
|
||||||
|
) -> f64 {
|
||||||
|
let strategy = *self.strategy.read();
|
||||||
|
let load = self.get_load(processor_id);
|
||||||
|
let metrics = self.metrics.read();
|
||||||
|
let proc_metrics = metrics.get(&processor_id);
|
||||||
|
|
||||||
|
// Base score from compatibility
|
||||||
|
if !self.can_execute(&task.operation, processor_type) {
|
||||||
|
return f64::NEG_INFINITY;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get utilization and metrics
|
||||||
|
let utilization = proc_metrics.map(|m| m.utilization).unwrap_or(load as f64 / 100.0);
|
||||||
|
let power = proc_metrics.map(|m| m.power_watts).unwrap_or(100.0);
|
||||||
|
let avg_completion = proc_metrics.map(|m| m.avg_completion_ms).unwrap_or(100.0);
|
||||||
|
|
||||||
|
// Calculate score based on strategy
|
||||||
|
match strategy {
|
||||||
|
BalancingStrategy::Speed => {
|
||||||
|
// Prioritize low utilization and fast completion
|
||||||
|
let speed_score = 1.0 / (avg_completion.max(1.0) * (1.0 + utilization));
|
||||||
|
|
||||||
|
// Bonus for powerful processor types
|
||||||
|
let type_bonus = match processor_type {
|
||||||
|
ProcessorType::Gpu(_) => 2.0,
|
||||||
|
ProcessorType::Tpu(_) => 2.5,
|
||||||
|
ProcessorType::Lpu => 3.0, // Fastest for inference
|
||||||
|
ProcessorType::Npu(_) => 1.5,
|
||||||
|
_ => 1.0,
|
||||||
|
};
|
||||||
|
|
||||||
|
speed_score * type_bonus
|
||||||
|
}
|
||||||
|
|
||||||
|
BalancingStrategy::Energy => {
|
||||||
|
// Prioritize low power consumption
|
||||||
|
let energy_score = 1.0 / power.max(1.0);
|
||||||
|
|
||||||
|
// Bonus for efficient processor types
|
||||||
|
let efficiency_bonus = match processor_type {
|
||||||
|
ProcessorType::Npu(_) => 3.0, // Most efficient
|
||||||
|
ProcessorType::Lpu => 2.0,
|
||||||
|
ProcessorType::Cpu(_) => 1.5,
|
||||||
|
ProcessorType::Wasm => 2.0, // Low overhead
|
||||||
|
_ => 1.0,
|
||||||
|
};
|
||||||
|
|
||||||
|
energy_score * efficiency_bonus * (1.0 - utilization * 0.5)
|
||||||
|
}
|
||||||
|
|
||||||
|
BalancingStrategy::Balanced => {
|
||||||
|
// Balance speed and energy
|
||||||
|
let speed = 1.0 / avg_completion.max(1.0);
|
||||||
|
let efficiency = 1.0 / power.max(1.0);
|
||||||
|
let load_factor = 1.0 - utilization;
|
||||||
|
|
||||||
|
(speed * 0.4 + efficiency * 0.3 + load_factor * 0.3)
|
||||||
|
}
|
||||||
|
|
||||||
|
BalancingStrategy::Cost => {
|
||||||
|
// Prioritize cheaper resources (consumer devices)
|
||||||
|
let cost_factor = match processor_type {
|
||||||
|
ProcessorType::Wasm => 0.1, // Cheapest (browser)
|
||||||
|
ProcessorType::WebGpu => 0.15,
|
||||||
|
ProcessorType::Cpu(_) => 0.2,
|
||||||
|
ProcessorType::Npu(_) => 0.3, // Mobile NPUs
|
||||||
|
ProcessorType::Gpu(_) => 0.5,
|
||||||
|
ProcessorType::Lpu => 0.8,
|
||||||
|
ProcessorType::Tpu(_) => 1.0, // Most expensive
|
||||||
|
_ => 0.5,
|
||||||
|
};
|
||||||
|
|
||||||
|
(1.0 - cost_factor) * (1.0 - utilization)
|
||||||
|
}
|
||||||
|
|
||||||
|
BalancingStrategy::Latency => {
|
||||||
|
// Prioritize low latency for inference
|
||||||
|
let latency_score = 1.0 / avg_completion.max(0.1);
|
||||||
|
|
||||||
|
// Bonus for low-latency processors
|
||||||
|
let latency_bonus = match processor_type {
|
||||||
|
ProcessorType::Lpu => 5.0, // Designed for low latency
|
||||||
|
ProcessorType::Npu(_) => 3.0,
|
||||||
|
ProcessorType::Gpu(_) => 2.0,
|
||||||
|
ProcessorType::Tpu(_) => 1.5,
|
||||||
|
_ => 1.0,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Priority boost for critical tasks
|
||||||
|
let priority_boost = match task.priority {
|
||||||
|
TaskPriority::Critical => 2.0,
|
||||||
|
TaskPriority::High => 1.5,
|
||||||
|
TaskPriority::Normal => 1.0,
|
||||||
|
TaskPriority::Background => 0.5,
|
||||||
|
};
|
||||||
|
|
||||||
|
latency_score * latency_bonus * priority_boost * (1.0 - utilization)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Maybe rebalance a task to a different processor.
|
||||||
|
pub fn maybe_rebalance(
|
||||||
|
&self,
|
||||||
|
task: &Task,
|
||||||
|
suggested_processor: ProcessorId,
|
||||||
|
current_assignment: &TaskAssignment,
|
||||||
|
) -> ProcessorId {
|
||||||
|
// Get all registered processors
|
||||||
|
let processor_types = self.processor_types.read();
|
||||||
|
|
||||||
|
// If we don't have processor info, use suggested
|
||||||
|
let suggested_type = match processor_types.get(&suggested_processor) {
|
||||||
|
Some(t) => t.clone(),
|
||||||
|
None => return suggested_processor,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Calculate score for suggested processor
|
||||||
|
let suggested_score = self.calculate_score(task, suggested_processor, &suggested_type);
|
||||||
|
|
||||||
|
// Find best alternative
|
||||||
|
let mut best_processor = suggested_processor;
|
||||||
|
let mut best_score = suggested_score;
|
||||||
|
|
||||||
|
for (proc_id, proc_type) in processor_types.iter() {
|
||||||
|
if *proc_id == suggested_processor {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let score = self.calculate_score(task, *proc_id, proc_type);
|
||||||
|
|
||||||
|
// Only switch if significantly better (prevents thrashing)
|
||||||
|
if score > best_score * (1.0 + self.rebalance_threshold) {
|
||||||
|
best_score = score;
|
||||||
|
best_processor = *proc_id;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Record migration if different
|
||||||
|
if best_processor != suggested_processor {
|
||||||
|
self.migration_history.write().push(MigrationRecord {
|
||||||
|
task_id: task.id,
|
||||||
|
from: suggested_processor,
|
||||||
|
to: best_processor,
|
||||||
|
timestamp: Instant::now(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
best_processor
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if work stealing should happen between two processors.
|
||||||
|
pub fn should_steal(&self, from: ProcessorId, to: ProcessorId) -> bool {
|
||||||
|
let from_load = self.get_load(from) as f64;
|
||||||
|
let to_load = self.get_load(to) as f64;
|
||||||
|
|
||||||
|
if from_load == 0.0 {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if processor types are compatible for the queued work
|
||||||
|
let processor_types = self.processor_types.read();
|
||||||
|
let from_type = processor_types.get(&from);
|
||||||
|
let to_type = processor_types.get(&to);
|
||||||
|
|
||||||
|
// Only steal between same processor types by default
|
||||||
|
// (cross-type stealing requires operation compatibility check)
|
||||||
|
match (from_type, to_type) {
|
||||||
|
(Some(ft), Some(tt)) if std::mem::discriminant(ft) == std::mem::discriminant(tt) => {
|
||||||
|
let diff = (from_load - to_load) / from_load;
|
||||||
|
diff > self.steal_threshold
|
||||||
|
}
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get rebalancing suggestions based on current load.
|
||||||
|
pub fn get_rebalance_suggestions(&self) -> Vec<(ProcessorId, ProcessorId)> {
|
||||||
|
let mut suggestions = Vec::new();
|
||||||
|
let loads = self.loads.read();
|
||||||
|
|
||||||
|
let load_values: Vec<_> = loads.iter()
|
||||||
|
.map(|(id, load)| (*id, load.load(Ordering::Relaxed)))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if load_values.is_empty() {
|
||||||
|
return suggestions;
|
||||||
|
}
|
||||||
|
|
||||||
|
let avg_load: f64 = load_values.iter().map(|(_, l)| *l as f64).sum::<f64>()
|
||||||
|
/ load_values.len() as f64;
|
||||||
|
|
||||||
|
let processor_types = self.processor_types.read();
|
||||||
|
|
||||||
|
let overloaded: Vec<_> = load_values.iter()
|
||||||
|
.filter(|(_, l)| *l as f64 > avg_load * (1.0 + self.rebalance_threshold))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let underloaded: Vec<_> = load_values.iter()
|
||||||
|
.filter(|(_, l)| (*l as f64) < avg_load * (1.0 - self.rebalance_threshold))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Only suggest migrations between compatible processor types
|
||||||
|
for (over_id, _) in overloaded {
|
||||||
|
let over_type = processor_types.get(over_id);
|
||||||
|
|
||||||
|
for (under_id, _) in &underloaded {
|
||||||
|
let under_type = processor_types.get(under_id);
|
||||||
|
|
||||||
|
// Check type compatibility
|
||||||
|
if let (Some(ot), Some(ut)) = (over_type, under_type) {
|
||||||
|
if std::mem::discriminant(ot) == std::mem::discriminant(ut) {
|
||||||
|
suggestions.push((*over_id, *under_id));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
suggestions
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get load statistics.
|
||||||
|
pub fn get_stats(&self) -> LoadBalancerStats {
|
||||||
|
let loads = self.loads.read();
|
||||||
|
let metrics = self.metrics.read();
|
||||||
|
|
||||||
|
let total_load: u64 = loads.values().map(|l| l.load(Ordering::Relaxed)).sum();
|
||||||
|
let processor_count = loads.len();
|
||||||
|
let avg_load = if processor_count > 0 {
|
||||||
|
total_load as f64 / processor_count as f64
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
|
||||||
|
let total_utilization: f64 = metrics.values().map(|m| m.utilization).sum();
|
||||||
|
let avg_utilization = if processor_count > 0 {
|
||||||
|
total_utilization / processor_count as f64
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
|
||||||
|
let total_power: f64 = metrics.values().map(|m| m.power_watts).sum();
|
||||||
|
let migrations = self.migration_history.read().len();
|
||||||
|
|
||||||
|
LoadBalancerStats {
|
||||||
|
total_load,
|
||||||
|
avg_load,
|
||||||
|
processor_count,
|
||||||
|
avg_utilization,
|
||||||
|
total_power_watts: total_power,
|
||||||
|
total_migrations: migrations,
|
||||||
|
strategy: *self.strategy.read(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clean up old migration history.
|
||||||
|
pub fn cleanup_history(&self, max_age: Duration) {
|
||||||
|
let cutoff = Instant::now() - max_age;
|
||||||
|
self.migration_history.write().retain(|r| r.timestamp > cutoff);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for LoadBalancer {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load balancer statistics.
///
/// Point-in-time snapshot produced by `LoadBalancer::get_stats`; values
/// are not updated after the snapshot is taken.
#[derive(Clone, Debug)]
pub struct LoadBalancerStats {
    /// Total tasks across all processors.
    pub total_load: u64,
    /// Average load per processor (0.0 when none are registered).
    pub avg_load: f64,
    /// Number of registered processors.
    pub processor_count: usize,
    /// Average utilization (0.0 - 1.0).
    pub avg_utilization: f64,
    /// Total power consumption (watts), summed over reported metrics.
    pub total_power_watts: f64,
    /// Total migrations currently retained in the history (records removed
    /// by `cleanup_history` are no longer counted).
    pub total_migrations: usize,
    /// Balancing strategy active at snapshot time.
    pub strategy: BalancingStrategy,
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, GpuVariant, Operation, Precision};
    use crate::task::TaskStatus;

    /// Builds a 1024x1024x1024 fp32 matmul task at the given priority.
    fn create_test_task(priority: TaskPriority) -> Task {
        Task {
            id: TaskId::new(),
            operation: Operation::MatMul {
                m: 1024,
                n: 1024,
                k: 1024,
                precision: Precision::Fp32,
            },
            priority,
            dependencies: Vec::new(),
            status: TaskStatus::Pending,
            deadline: None,
        }
    }

    /// Registers two default CPUs with ids 0 and 1 on the balancer.
    fn register_two_cpus(balancer: &LoadBalancer) {
        for id in [ProcessorId(0), ProcessorId(1)] {
            balancer.register_processor(id, ProcessorType::Cpu(CpuVariant::default()));
        }
    }

    #[test]
    fn test_load_tracking() {
        let balancer = LoadBalancer::new();
        register_two_cpus(&balancer);

        // Fresh processors start with no load.
        assert_eq!(balancer.get_load(ProcessorId(0)), 0);

        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(1));

        assert_eq!(balancer.get_load(ProcessorId(0)), 2);
        assert_eq!(balancer.get_load(ProcessorId(1)), 1);

        balancer.decrement_load(ProcessorId(0));
        assert_eq!(balancer.get_load(ProcessorId(0)), 1);
    }

    #[test]
    fn test_should_steal_same_type() {
        let balancer = LoadBalancer::new();
        register_two_cpus(&balancer);

        // Load processor 0 heavily and processor 1 lightly.
        (0..10).for_each(|_| balancer.increment_load(ProcessorId(0)));
        balancer.increment_load(ProcessorId(1));

        // Stealing is allowed between processors of the same type,
        // but only from the busier one toward the idler one.
        assert!(balancer.should_steal(ProcessorId(0), ProcessorId(1)));
        assert!(!balancer.should_steal(ProcessorId(1), ProcessorId(0)));
    }

    #[test]
    fn test_should_not_steal_different_types() {
        let balancer = LoadBalancer::new();

        // One CPU, one GPU.
        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(
            ProcessorId(1),
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) }),
        );

        // Overload the CPU.
        (0..10).for_each(|_| balancer.increment_load(ProcessorId(0)));

        // Work never migrates across processor types.
        assert!(!balancer.should_steal(ProcessorId(0), ProcessorId(1)));
    }

    #[test]
    fn test_can_execute() {
        let balancer = LoadBalancer::new();

        let matmul = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };

        let flash_attention = Operation::FlashAttention {
            batch: 32,
            seq_len: 2048,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };

        let cpu = ProcessorType::Cpu(CpuVariant::default());
        let gpu = ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) });
        let lpu = ProcessorType::Lpu;

        // MatMul runs on every processor class.
        for target in [&cpu, &gpu, &lpu] {
            assert!(balancer.can_execute(&matmul, target));
        }

        // FlashAttention is restricted to accelerator-class hardware.
        assert!(!balancer.can_execute(&flash_attention, &cpu));
        assert!(balancer.can_execute(&flash_attention, &gpu));
    }

    #[test]
    fn test_strategy_affects_scoring() {
        let balancer = LoadBalancer::new();

        let cpu_id = ProcessorId(0);
        let npu_id = ProcessorId(1);

        balancer.register_processor(cpu_id, ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(
            npu_id,
            ProcessorType::Npu(crate::processor::NpuVariant::AppleNeuralEngine { cores: 16 }),
        );

        let task = create_test_task(TaskPriority::Normal);

        // Under the energy strategy the NPU should win on efficiency.
        balancer.set_strategy(BalancingStrategy::Energy);
        let assignment = TaskAssignment::new();
        let result = balancer.maybe_rebalance(&task, cpu_id, &assignment);

        assert_eq!(result, npu_id);
    }

    #[test]
    fn test_stats() {
        let balancer = LoadBalancer::new();
        register_two_cpus(&balancer);

        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(1));

        let stats = balancer.get_stats();
        assert_eq!(stats.total_load, 3);
        assert_eq!(stats.processor_count, 2);
        assert!((stats.avg_load - 1.5).abs() < 0.01);
    }
}
|
||||||
559
crates/synor-compute/src/scheduler/mod.rs
Normal file
559
crates/synor-compute/src/scheduler/mod.rs
Normal file
|
|
@ -0,0 +1,559 @@
|
||||||
|
//! Heterogeneous scheduler for multi-processor task assignment.
|
||||||
|
//!
|
||||||
|
//! Features:
|
||||||
|
//! - Optimal task-to-processor assignment
|
||||||
|
//! - Work stealing for load balancing
|
||||||
|
//! - Pipeline parallelism across processor types
|
||||||
|
//! - Dynamic rebalancing based on actual throughput
|
||||||
|
|
||||||
|
mod load_balancer;
|
||||||
|
mod work_queue;
|
||||||
|
|
||||||
|
pub use load_balancer::LoadBalancer;
|
||||||
|
pub use work_queue::WorkQueue;
|
||||||
|
|
||||||
|
use crate::device::DeviceRegistry;
|
||||||
|
use crate::error::ComputeError;
|
||||||
|
use crate::processor::{Operation, Processor, ProcessorId, ProcessorType};
|
||||||
|
use crate::task::{Task, TaskId, TaskPriority};
|
||||||
|
use parking_lot::RwLock;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
/// Heterogeneous scheduler that manages tasks across all processor types.
///
/// Owns the device registry, per-type work queues, a load balancer, and
/// the set of schedules currently in flight.
pub struct HeterogeneousScheduler {
    /// Device registry used to enumerate processors and resolve processor ids.
    device_registry: Arc<DeviceRegistry>,
    /// Per-processor-type task queues.
    /// NOTE(review): not read by any method visible in this module —
    /// confirm it is used elsewhere or intended for future work.
    queues: RwLock<HashMap<ProcessorType, WorkQueue>>,
    /// Load balancer consulted after the initial processor choice.
    load_balancer: LoadBalancer,
    /// Schedules that have been created and not yet retired, keyed by id.
    active_schedules: RwLock<HashMap<ScheduleId, Schedule>>,
}
|
||||||
|
|
||||||
|
impl HeterogeneousScheduler {
|
||||||
|
/// Creates a new heterogeneous scheduler.
|
||||||
|
pub fn new(device_registry: Arc<DeviceRegistry>) -> Self {
|
||||||
|
Self {
|
||||||
|
device_registry,
|
||||||
|
queues: RwLock::new(HashMap::new()),
|
||||||
|
load_balancer: LoadBalancer::new(),
|
||||||
|
active_schedules: RwLock::new(HashMap::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Schedule a set of tasks for execution.
|
||||||
|
pub async fn schedule(&self, tasks: Vec<Task>) -> Result<ScheduleResult, ComputeError> {
|
||||||
|
if tasks.is_empty() {
|
||||||
|
return Ok(ScheduleResult {
|
||||||
|
schedule: Schedule::empty(),
|
||||||
|
estimated_makespan: Duration::ZERO,
|
||||||
|
processor_utilization: HashMap::new(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// 1. Build dependency graph
|
||||||
|
let deps = self.build_dependency_graph(&tasks);
|
||||||
|
|
||||||
|
// 2. Assign tasks to optimal processors
|
||||||
|
let assignment = self.assign_tasks(&tasks, &deps).await?;
|
||||||
|
|
||||||
|
// 3. Create execution schedule with stages
|
||||||
|
let schedule = self.create_schedule(&tasks, &assignment, &deps)?;
|
||||||
|
|
||||||
|
// 4. Estimate metrics
|
||||||
|
let makespan = self.estimate_makespan(&schedule);
|
||||||
|
let utilization = self.estimate_utilization(&schedule);
|
||||||
|
|
||||||
|
// 5. Store active schedule
|
||||||
|
self.active_schedules.write().insert(schedule.id, schedule.clone());
|
||||||
|
|
||||||
|
Ok(ScheduleResult {
|
||||||
|
schedule,
|
||||||
|
estimated_makespan: makespan,
|
||||||
|
processor_utilization: utilization,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
    /// Execute a schedule.
    ///
    /// Runs stages in order: every task in a stage is spawned on the
    /// tokio runtime concurrently, and the next stage starts only after
    /// all tasks of the current stage have finished (success or failure).
    /// Per-task outcomes are collected into the returned `ExecutionResult`.
    ///
    /// NOTE(review): a failed task does not stop later stages, so tasks
    /// whose dependencies failed are still executed — confirm intended.
    pub async fn execute(&self, schedule: &Schedule) -> Result<ExecutionResult, ComputeError> {
        let mut results = HashMap::new();
        let start = std::time::Instant::now();

        // Execute stages in order
        for stage in &schedule.stages {
            // Execute all tasks in this stage in parallel
            let mut handles = Vec::new();

            for task_id in &stage.tasks {
                // A task or assignment missing from the schedule is an
                // internal invariant violation, not a task failure.
                let task = schedule.tasks.get(task_id)
                    .ok_or_else(|| ComputeError::Internal(format!("Task not found: {:?}", task_id)))?;
                let processor_id = schedule.assignment.get(task_id)
                    .ok_or_else(|| ComputeError::Internal(format!("No assignment for task: {:?}", task_id)))?;

                let processor = self.device_registry.get_processor(processor_id)?;
                let task_clone = task.clone();

                // Spawn so all tasks of this stage run concurrently.
                handles.push(tokio::spawn(async move {
                    processor.execute(task_clone.operation).await
                }));
            }

            // Wait for all tasks in stage. Handles are in the same order
            // as `stage.tasks`, so index `i` maps back to the task id.
            for (i, handle) in handles.into_iter().enumerate() {
                let task_id = stage.tasks[i];
                match handle.await {
                    Ok(Ok(result)) => {
                        results.insert(task_id, TaskExecutionResult::Success(result));
                    }
                    Ok(Err(e)) => {
                        // Processor-level failure reported by the task itself.
                        results.insert(task_id, TaskExecutionResult::Failed(e.to_string()));
                    }
                    Err(e) => {
                        // Join error: the spawned task panicked or was cancelled.
                        results.insert(task_id, TaskExecutionResult::Failed(e.to_string()));
                    }
                }
            }
        }

        let total_time = start.elapsed();

        Ok(ExecutionResult {
            results,
            total_time,
            actual_utilization: self.measure_utilization(),
        })
    }
|
||||||
|
|
||||||
|
/// Assign tasks to optimal processors.
|
||||||
|
async fn assign_tasks(
|
||||||
|
&self,
|
||||||
|
tasks: &[Task],
|
||||||
|
deps: &DependencyGraph,
|
||||||
|
) -> Result<TaskAssignment, ComputeError> {
|
||||||
|
let mut assignment = TaskAssignment::new();
|
||||||
|
|
||||||
|
// Sort tasks by priority and dependencies (topological sort)
|
||||||
|
let sorted_tasks = self.topological_sort(tasks, deps);
|
||||||
|
|
||||||
|
for task in sorted_tasks {
|
||||||
|
// Find best processor for this task
|
||||||
|
let best_processor = self.find_best_processor(&task).await?;
|
||||||
|
|
||||||
|
// Check if we should rebalance
|
||||||
|
let final_processor = self.load_balancer
|
||||||
|
.maybe_rebalance(&task, best_processor, &assignment);
|
||||||
|
|
||||||
|
assignment.assign(task.id, final_processor);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(assignment)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Find the best processor for a task.
|
||||||
|
async fn find_best_processor(&self, task: &Task) -> Result<ProcessorId, ComputeError> {
|
||||||
|
let mut best_score = f64::NEG_INFINITY;
|
||||||
|
let mut best_processor = None;
|
||||||
|
|
||||||
|
// Get all available processors
|
||||||
|
let processors = self.device_registry.all_processors();
|
||||||
|
|
||||||
|
for processor in processors {
|
||||||
|
if !processor.can_execute(&task.operation) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate score based on multiple factors
|
||||||
|
let exec_time = processor.estimate_time(&task.operation);
|
||||||
|
let energy = processor.estimate_energy(&task.operation);
|
||||||
|
let load = processor.utilization();
|
||||||
|
|
||||||
|
// Score = 1 / (time * (1 + load) * energy_factor)
|
||||||
|
let time_factor = exec_time.as_secs_f64().max(0.001);
|
||||||
|
let load_factor = 1.0 + load;
|
||||||
|
let energy_factor = 1.0 + (energy / 1000.0); // Normalize energy
|
||||||
|
|
||||||
|
let score = 1.0 / (time_factor * load_factor * energy_factor);
|
||||||
|
|
||||||
|
if score > best_score {
|
||||||
|
best_score = score;
|
||||||
|
best_processor = Some(processor.id());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
best_processor.ok_or_else(|| {
|
||||||
|
ComputeError::NoSuitableProcessor(format!("{:?}", task.operation.op_type()))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build dependency graph from tasks.
|
||||||
|
fn build_dependency_graph(&self, tasks: &[Task]) -> DependencyGraph {
|
||||||
|
let mut graph = DependencyGraph::new();
|
||||||
|
|
||||||
|
for task in tasks {
|
||||||
|
graph.add_node(task.id);
|
||||||
|
for dep in &task.dependencies {
|
||||||
|
graph.add_edge(*dep, task.id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
graph
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Topological sort of tasks respecting dependencies.
|
||||||
|
fn topological_sort(&self, tasks: &[Task], deps: &DependencyGraph) -> Vec<Task> {
|
||||||
|
let mut sorted = Vec::new();
|
||||||
|
let mut visited = std::collections::HashSet::new();
|
||||||
|
let task_map: HashMap<TaskId, Task> = tasks.iter()
|
||||||
|
.map(|t| (t.id, t.clone()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
fn visit(
|
||||||
|
task_id: TaskId,
|
||||||
|
task_map: &HashMap<TaskId, Task>,
|
||||||
|
deps: &DependencyGraph,
|
||||||
|
visited: &mut std::collections::HashSet<TaskId>,
|
||||||
|
sorted: &mut Vec<Task>,
|
||||||
|
) {
|
||||||
|
if visited.contains(&task_id) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
visited.insert(task_id);
|
||||||
|
|
||||||
|
// Visit dependencies first
|
||||||
|
if let Some(task_deps) = deps.dependencies.get(&task_id) {
|
||||||
|
for dep in task_deps {
|
||||||
|
visit(*dep, task_map, deps, visited, sorted);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(task) = task_map.get(&task_id) {
|
||||||
|
sorted.push(task.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for task in tasks {
|
||||||
|
visit(task.id, &task_map, deps, &mut visited, &mut sorted);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort by priority within dependency constraints
|
||||||
|
sorted.sort_by(|a, b| b.priority.cmp(&a.priority));
|
||||||
|
|
||||||
|
sorted
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create execution schedule with parallel stages.
|
||||||
|
fn create_schedule(
|
||||||
|
&self,
|
||||||
|
tasks: &[Task],
|
||||||
|
assignment: &TaskAssignment,
|
||||||
|
deps: &DependencyGraph,
|
||||||
|
) -> Result<Schedule, ComputeError> {
|
||||||
|
let mut stages = Vec::new();
|
||||||
|
let mut scheduled = std::collections::HashSet::new();
|
||||||
|
let task_map: HashMap<TaskId, Task> = tasks.iter()
|
||||||
|
.map(|t| (t.id, t.clone()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
while scheduled.len() < tasks.len() {
|
||||||
|
let mut stage_tasks = Vec::new();
|
||||||
|
|
||||||
|
for task in tasks {
|
||||||
|
if scheduled.contains(&task.id) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if all dependencies are satisfied
|
||||||
|
let deps_satisfied = task.dependencies.iter()
|
||||||
|
.all(|dep| scheduled.contains(dep));
|
||||||
|
|
||||||
|
if deps_satisfied {
|
||||||
|
stage_tasks.push(task.id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if stage_tasks.is_empty() {
|
||||||
|
return Err(ComputeError::SchedulingFailed(
|
||||||
|
"Circular dependency detected".to_string()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
for task_id in &stage_tasks {
|
||||||
|
scheduled.insert(*task_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
stages.push(ScheduleStage {
|
||||||
|
stage_id: stages.len(),
|
||||||
|
tasks: stage_tasks,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Schedule {
|
||||||
|
id: ScheduleId::new(),
|
||||||
|
tasks: task_map,
|
||||||
|
assignment: assignment.clone(),
|
||||||
|
stages,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Estimate makespan (total execution time).
|
||||||
|
fn estimate_makespan(&self, schedule: &Schedule) -> Duration {
|
||||||
|
let mut total = Duration::ZERO;
|
||||||
|
|
||||||
|
for stage in &schedule.stages {
|
||||||
|
let mut max_stage_time = Duration::ZERO;
|
||||||
|
|
||||||
|
for task_id in &stage.tasks {
|
||||||
|
if let (Some(task), Some(proc_id)) = (
|
||||||
|
schedule.tasks.get(task_id),
|
||||||
|
schedule.assignment.get(task_id),
|
||||||
|
) {
|
||||||
|
if let Ok(processor) = self.device_registry.get_processor(proc_id) {
|
||||||
|
let time = processor.estimate_time(&task.operation);
|
||||||
|
max_stage_time = max_stage_time.max(time);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
total += max_stage_time;
|
||||||
|
}
|
||||||
|
|
||||||
|
total
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Estimate processor utilization.
|
||||||
|
fn estimate_utilization(&self, schedule: &Schedule) -> HashMap<ProcessorType, f64> {
|
||||||
|
let mut work_time: HashMap<ProcessorType, Duration> = HashMap::new();
|
||||||
|
let makespan = self.estimate_makespan(schedule);
|
||||||
|
|
||||||
|
for task_id in schedule.assignment.assignments.keys() {
|
||||||
|
if let (Some(task), Some(proc_id)) = (
|
||||||
|
schedule.tasks.get(task_id),
|
||||||
|
schedule.assignment.get(task_id),
|
||||||
|
) {
|
||||||
|
if let Ok(processor) = self.device_registry.get_processor(proc_id) {
|
||||||
|
let proc_type = processor.processor_type();
|
||||||
|
let time = processor.estimate_time(&task.operation);
|
||||||
|
*work_time.entry(proc_type).or_default() += time;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
work_time
|
||||||
|
.into_iter()
|
||||||
|
.map(|(proc_type, time)| {
|
||||||
|
let utilization = if makespan.as_secs_f64() > 0.0 {
|
||||||
|
time.as_secs_f64() / makespan.as_secs_f64()
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
(proc_type, utilization.min(1.0))
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Measure actual current utilization.
|
||||||
|
fn measure_utilization(&self) -> HashMap<ProcessorType, f64> {
|
||||||
|
let mut utilization = HashMap::new();
|
||||||
|
|
||||||
|
for processor in self.device_registry.all_processors() {
|
||||||
|
let proc_type = processor.processor_type();
|
||||||
|
let util = processor.utilization();
|
||||||
|
utilization
|
||||||
|
.entry(proc_type)
|
||||||
|
.and_modify(|u| *u = (*u + util) / 2.0)
|
||||||
|
.or_insert(util);
|
||||||
|
}
|
||||||
|
|
||||||
|
utilization
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Schedule identifier.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub struct ScheduleId(pub u64);
|
||||||
|
|
||||||
|
impl ScheduleId {
|
||||||
|
/// Creates a new schedule ID.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
use rand::Rng;
|
||||||
|
ScheduleId(rand::thread_rng().gen())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for ScheduleId {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Task-to-processor assignment.
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
pub struct TaskAssignment {
|
||||||
|
/// Map from task ID to processor ID.
|
||||||
|
pub assignments: HashMap<TaskId, ProcessorId>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TaskAssignment {
|
||||||
|
/// Creates a new empty assignment.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
assignments: HashMap::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Assigns a task to a processor.
|
||||||
|
pub fn assign(&mut self, task_id: TaskId, processor_id: ProcessorId) {
|
||||||
|
self.assignments.insert(task_id, processor_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the assigned processor for a task.
|
||||||
|
pub fn get(&self, task_id: &TaskId) -> Option<ProcessorId> {
|
||||||
|
self.assignments.get(task_id).copied()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dependency graph for tasks.
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
pub struct DependencyGraph {
|
||||||
|
/// Dependencies: task -> list of tasks it depends on.
|
||||||
|
pub dependencies: HashMap<TaskId, Vec<TaskId>>,
|
||||||
|
/// Dependents: task -> list of tasks that depend on it.
|
||||||
|
pub dependents: HashMap<TaskId, Vec<TaskId>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DependencyGraph {
|
||||||
|
/// Creates a new empty dependency graph.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
dependencies: HashMap::new(),
|
||||||
|
dependents: HashMap::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds a node (task) to the graph.
|
||||||
|
pub fn add_node(&mut self, task_id: TaskId) {
|
||||||
|
self.dependencies.entry(task_id).or_default();
|
||||||
|
self.dependents.entry(task_id).or_default();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds a dependency edge (from depends on to).
|
||||||
|
pub fn add_edge(&mut self, from: TaskId, to: TaskId) {
|
||||||
|
self.dependencies.entry(to).or_default().push(from);
|
||||||
|
self.dependents.entry(from).or_default().push(to);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Execution schedule.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct Schedule {
|
||||||
|
/// Schedule ID.
|
||||||
|
pub id: ScheduleId,
|
||||||
|
/// All tasks.
|
||||||
|
pub tasks: HashMap<TaskId, Task>,
|
||||||
|
/// Task assignments.
|
||||||
|
pub assignment: TaskAssignment,
|
||||||
|
/// Execution stages (tasks within a stage can run in parallel).
|
||||||
|
pub stages: Vec<ScheduleStage>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Schedule {
|
||||||
|
/// Creates an empty schedule.
|
||||||
|
pub fn empty() -> Self {
|
||||||
|
Self {
|
||||||
|
id: ScheduleId::new(),
|
||||||
|
tasks: HashMap::new(),
|
||||||
|
assignment: TaskAssignment::new(),
|
||||||
|
stages: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A stage of parallel tasks.
///
/// Stages execute sequentially; every task within one stage already has
/// its dependencies satisfied by earlier stages and may run concurrently.
#[derive(Clone, Debug)]
pub struct ScheduleStage {
    /// Stage index (position within `Schedule::stages`).
    pub stage_id: usize,
    /// Tasks in this stage (can run in parallel).
    pub tasks: Vec<TaskId>,
}
|
||||||
|
|
||||||
|
/// Result of scheduling.
#[derive(Clone, Debug)]
pub struct ScheduleResult {
    /// The schedule that was produced and registered as active.
    pub schedule: Schedule,
    /// Estimated total execution time (sum of per-stage maxima).
    pub estimated_makespan: Duration,
    /// Estimated processor utilization by type, each in [0.0, 1.0].
    pub processor_utilization: HashMap<ProcessorType, f64>,
}
|
||||||
|
|
||||||
|
/// Result of execution.
#[derive(Clone, Debug)]
pub struct ExecutionResult {
    /// Per-task outcome (success payload or failure description).
    pub results: HashMap<TaskId, TaskExecutionResult>,
    /// Total wall-clock time spent executing all stages.
    pub total_time: Duration,
    /// Actual processor utilization sampled after execution.
    pub actual_utilization: HashMap<ProcessorType, f64>,
}
|
||||||
|
|
||||||
|
/// Result of a single task execution.
#[derive(Clone, Debug)]
pub enum TaskExecutionResult {
    /// Task completed successfully with the operation's output.
    Success(crate::processor::OperationResult),
    /// Task failed; the string describes the error or panic.
    Failed(String),
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // The unused `create_test_task` helper and its `Precision`/`TaskStatus`
    // imports were removed: no test referenced them, so they only produced
    // dead-code and unused-import warnings.

    #[test]
    fn test_dependency_graph() {
        let mut graph = DependencyGraph::new();

        graph.add_node(TaskId(1));
        graph.add_node(TaskId(2));
        graph.add_node(TaskId(3));

        graph.add_edge(TaskId(1), TaskId(2)); // 2 depends on 1
        graph.add_edge(TaskId(1), TaskId(3)); // 3 depends on 1
        graph.add_edge(TaskId(2), TaskId(3)); // 3 depends on 2

        assert_eq!(graph.dependencies[&TaskId(2)], vec![TaskId(1)]);
        assert_eq!(graph.dependencies[&TaskId(3)], vec![TaskId(1), TaskId(2)]);
    }

    #[test]
    fn test_task_assignment() {
        let mut assignment = TaskAssignment::new();

        assignment.assign(TaskId(1), ProcessorId(0));
        assignment.assign(TaskId(2), ProcessorId(1));

        assert_eq!(assignment.get(&TaskId(1)), Some(ProcessorId(0)));
        assert_eq!(assignment.get(&TaskId(2)), Some(ProcessorId(1)));
        assert_eq!(assignment.get(&TaskId(3)), None);
    }
}
|
||||||
271
crates/synor-compute/src/scheduler/work_queue.rs
Normal file
271
crates/synor-compute/src/scheduler/work_queue.rs
Normal file
|
|
@ -0,0 +1,271 @@
|
||||||
|
//! Work queue with thread-safe task management.
|
||||||
|
|
||||||
|
use crate::processor::ProcessorType;
|
||||||
|
use crate::task::{Task, TaskId, TaskPriority};
|
||||||
|
use crossbeam_channel::{bounded, Receiver, Sender, TryRecvError};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
|
|
||||||
|
/// Work queue for a specific processor type.
///
/// Backed by a bounded crossbeam channel; `size` mirrors the channel's
/// occupancy and `processed` counts tasks popped for execution.
pub struct WorkQueue {
    /// Task sender (for producers).
    sender: Sender<Task>,
    /// Task receiver (for consumers).
    receiver: Receiver<Task>,
    /// Processor type this queue is for.
    processor_type: ProcessorType,
    /// Current queue size.
    /// NOTE(review): maintained manually alongside the channel;
    /// `QueueStealer` pulls from the channel without updating it, so it
    /// can overcount after steals — confirm.
    size: AtomicU64,
    /// Total tasks processed.
    processed: AtomicU64,
}
|
||||||
|
|
||||||
|
impl WorkQueue {
|
||||||
|
/// Creates a new work queue for a processor type.
|
||||||
|
pub fn new(processor_type: ProcessorType, capacity: usize) -> Self {
|
||||||
|
let (sender, receiver) = bounded(capacity.max(1024));
|
||||||
|
|
||||||
|
Self {
|
||||||
|
sender,
|
||||||
|
receiver,
|
||||||
|
processor_type,
|
||||||
|
size: AtomicU64::new(0),
|
||||||
|
processed: AtomicU64::new(0),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Push a task to the queue.
|
||||||
|
pub fn push(&self, task: Task) {
|
||||||
|
if self.sender.try_send(task).is_ok() {
|
||||||
|
self.size.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pop a task from the queue (ignores worker_id for compatibility).
|
||||||
|
pub fn pop(&self, _worker_id: usize) -> Option<Task> {
|
||||||
|
self.pop_any()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pop any task from the queue.
|
||||||
|
pub fn pop_any(&self) -> Option<Task> {
|
||||||
|
match self.receiver.try_recv() {
|
||||||
|
Ok(task) => {
|
||||||
|
self.size.fetch_sub(1, Ordering::Relaxed);
|
||||||
|
self.processed.fetch_add(1, Ordering::Relaxed);
|
||||||
|
Some(task)
|
||||||
|
}
|
||||||
|
Err(TryRecvError::Empty) | Err(TryRecvError::Disconnected) => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pop from global queue (alias for pop_any).
|
||||||
|
pub fn pop_global(&self) -> Option<Task> {
|
||||||
|
self.pop_any()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Steal a batch of tasks from another queue.
|
||||||
|
pub fn steal_batch_from(&self, other: &WorkQueue, max_tasks: usize) -> Vec<Task> {
|
||||||
|
let mut stolen = Vec::new();
|
||||||
|
|
||||||
|
while stolen.len() < max_tasks {
|
||||||
|
if let Some(task) = other.pop_any() {
|
||||||
|
stolen.push(task);
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Push stolen tasks to this queue
|
||||||
|
for task in &stolen {
|
||||||
|
// Tasks are already accounted for in `other`, just push to self
|
||||||
|
if self.sender.try_send(task.clone()).is_ok() {
|
||||||
|
self.size.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
stolen
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get current queue size.
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.size.load(Ordering::Relaxed) as usize
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if queue is empty.
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.len() == 0
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get number of tasks processed.
|
||||||
|
pub fn processed_count(&self) -> u64 {
|
||||||
|
self.processed.load(Ordering::Relaxed)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get processor type for this queue.
|
||||||
|
pub fn processor_type(&self) -> ProcessorType {
|
||||||
|
self.processor_type.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get utilization estimate (0.0 - 1.0).
|
||||||
|
pub fn utilization(&self) -> f64 {
|
||||||
|
let size = self.size.load(Ordering::Relaxed) as f64;
|
||||||
|
let capacity = self.sender.capacity().unwrap_or(1024) as f64;
|
||||||
|
(size / capacity).min(1.0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get a stealer for cross-queue work stealing.
|
||||||
|
pub fn get_stealer(&self) -> QueueStealer {
|
||||||
|
QueueStealer {
|
||||||
|
receiver: self.receiver.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stealer handle for cross-queue work stealing.
///
/// Holds a clone of the source queue's receiver, so steals contend
/// directly on the underlying channel.
#[derive(Clone)]
pub struct QueueStealer {
    // Receiver cloned from the source `WorkQueue`.
    receiver: Receiver<Task>,
}

impl QueueStealer {
    /// Try to steal a task.
    ///
    /// NOTE(review): this bypasses the source queue's `size` and
    /// `processed` counters, so its `len()` overcounts after a
    /// successful steal — confirm that is acceptable to callers.
    pub fn steal(&self) -> Option<Task> {
        self.receiver.try_recv().ok()
    }
}
|
||||||
|
|
||||||
|
/// Priority queue wrapper for tasks.
///
/// Maintains one bounded `WorkQueue` per `TaskPriority` level and
/// always serves the most urgent non-empty level first.
pub struct PriorityWorkQueue {
    /// Queues by priority level.
    queues: HashMap<TaskPriority, WorkQueue>,
    /// Processor type all sub-queues are bound to.
    /// NOTE(review): stored but not read by any method in this module —
    /// confirm external use.
    processor_type: ProcessorType,
}
|
||||||
|
|
||||||
|
impl PriorityWorkQueue {
|
||||||
|
/// Creates a new priority work queue.
|
||||||
|
pub fn new(processor_type: ProcessorType, capacity_per_priority: usize) -> Self {
|
||||||
|
let mut queues = HashMap::new();
|
||||||
|
|
||||||
|
for priority in [
|
||||||
|
TaskPriority::Critical,
|
||||||
|
TaskPriority::High,
|
||||||
|
TaskPriority::Normal,
|
||||||
|
TaskPriority::Background,
|
||||||
|
] {
|
||||||
|
queues.insert(priority, WorkQueue::new(processor_type.clone(), capacity_per_priority));
|
||||||
|
}
|
||||||
|
|
||||||
|
Self {
|
||||||
|
queues,
|
||||||
|
processor_type,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Push a task with its priority.
|
||||||
|
pub fn push(&self, task: Task) {
|
||||||
|
let priority = task.priority;
|
||||||
|
if let Some(queue) = self.queues.get(&priority) {
|
||||||
|
queue.push(task);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pop highest priority task available.
|
||||||
|
pub fn pop(&self, worker_id: usize) -> Option<Task> {
|
||||||
|
// Try priorities in order: Critical > High > Normal > Background
|
||||||
|
for priority in [
|
||||||
|
TaskPriority::Critical,
|
||||||
|
TaskPriority::High,
|
||||||
|
TaskPriority::Normal,
|
||||||
|
TaskPriority::Background,
|
||||||
|
] {
|
||||||
|
if let Some(queue) = self.queues.get(&priority) {
|
||||||
|
if let Some(task) = queue.pop(worker_id) {
|
||||||
|
return Some(task);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get total queue size.
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.queues.values().map(|q| q.len()).sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if all queues are empty.
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.queues.values().all(|q| q.is_empty())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, Operation, Precision};
    use crate::task::TaskStatus;

    /// Builds a matmul task with an explicit id and priority.
    fn create_test_task(id: u64, priority: TaskPriority) -> Task {
        Task {
            id: TaskId(id),
            operation: Operation::MatMul {
                m: 1024,
                n: 1024,
                k: 1024,
                precision: Precision::Fp32,
            },
            priority,
            dependencies: Vec::new(),
            status: TaskStatus::Pending,
            deadline: None,
        }
    }

    #[test]
    fn test_work_queue_basic() {
        let queue = WorkQueue::new(ProcessorType::Cpu(CpuVariant::default()), 100);

        assert!(queue.is_empty());

        queue.push(create_test_task(1, TaskPriority::Normal));
        queue.push(create_test_task(2, TaskPriority::Normal));
        assert_eq!(queue.len(), 2);

        // Each pop shrinks the queue until it is empty again.
        assert!(queue.pop(0).is_some());
        assert_eq!(queue.len(), 1);

        assert!(queue.pop(0).is_some());
        assert!(queue.is_empty());
    }

    #[test]
    fn test_priority_queue() {
        let queue = PriorityWorkQueue::new(ProcessorType::Cpu(CpuVariant::default()), 100);

        // Insert out of priority order on purpose.
        queue.push(create_test_task(1, TaskPriority::Background));
        queue.push(create_test_task(2, TaskPriority::Critical));
        queue.push(create_test_task(3, TaskPriority::Normal));

        // Tasks come out strictly by priority: Critical, Normal, Background.
        let expected = [
            (TaskId(2), TaskPriority::Critical),
            (TaskId(3), TaskPriority::Normal),
            (TaskId(1), TaskPriority::Background),
        ];
        for (id, priority) in expected {
            let task = queue.pop(0).unwrap();
            assert_eq!(task.id, id);
            assert_eq!(task.priority, priority);
        }
    }
}
|
||||||
543
crates/synor-compute/src/task/mod.rs
Normal file
543
crates/synor-compute/src/task/mod.rs
Normal file
|
|
@ -0,0 +1,543 @@
|
||||||
|
//! Task definitions and decomposition.
|
||||||
|
|
||||||
|
use crate::error::ComputeError;
|
||||||
|
use crate::processor::{Operation, OperationType, Precision, ProcessorType};
|
||||||
|
use crate::{ComputeJob, JobType};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
/// Unique task identifier.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub struct TaskId(pub u64);
|
||||||
|
|
||||||
|
impl TaskId {
|
||||||
|
/// Creates a new task ID.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
use rand::Rng;
|
||||||
|
TaskId(rand::thread_rng().gen())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for TaskId {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for TaskId {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "task_{}", self.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Task priority levels.
///
/// The explicit discriminants make the derived `Ord` follow urgency:
/// `Background < Normal < High < Critical`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum TaskPriority {
    /// Background work; may be preempted by anything above it.
    Background = 0,
    /// Normal priority (the default).
    Normal = 1,
    /// High priority.
    High = 2,
    /// Critical; must complete.
    Critical = 3,
}

/// Tasks default to `Normal` priority.
impl Default for TaskPriority {
    fn default() -> Self {
        TaskPriority::Normal
    }
}
|
||||||
|
|
||||||
|
/// Task execution status.
///
/// NOTE(review): the intended lifecycle appears to be
/// `Pending -> Queued -> Running -> {Completed | Failed | Cancelled}`,
/// inferred from the variant docs — confirm against the scheduler's
/// transition logic.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum TaskStatus {
    /// Waiting to be scheduled.
    Pending,
    /// Queued for execution.
    Queued,
    /// Currently executing.
    Running,
    /// Completed successfully.
    Completed,
    /// Failed.
    Failed,
    /// Cancelled.
    Cancelled,
}
|
||||||
|
|
||||||
|
/// A schedulable task: one operation plus the metadata the scheduler
/// needs (priority, dependency edges, status, optional deadline).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Task {
    /// Unique task ID.
    pub id: TaskId,
    /// Operation to execute.
    pub operation: Operation,
    /// Priority level used for queue ordering.
    pub priority: TaskPriority,
    /// Dependencies (tasks that must complete before this one runs).
    pub dependencies: Vec<TaskId>,
    /// Current execution status.
    pub status: TaskStatus,
    /// Deadline (optional).
    /// NOTE(review): the unit (unix timestamp vs. relative ms) is not
    /// established in this module — confirm with the scheduler's usage.
    pub deadline: Option<u64>,
}
|
||||||
|
|
||||||
|
impl Task {
|
||||||
|
/// Creates a new task.
|
||||||
|
pub fn new(operation: Operation) -> Self {
|
||||||
|
Self {
|
||||||
|
id: TaskId::new(),
|
||||||
|
operation,
|
||||||
|
priority: TaskPriority::Normal,
|
||||||
|
dependencies: Vec::new(),
|
||||||
|
status: TaskStatus::Pending,
|
||||||
|
deadline: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sets the priority.
|
||||||
|
pub fn with_priority(mut self, priority: TaskPriority) -> Self {
|
||||||
|
self.priority = priority;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds dependencies.
|
||||||
|
pub fn with_dependencies(mut self, deps: Vec<TaskId>) -> Self {
|
||||||
|
self.dependencies = deps;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sets deadline.
|
||||||
|
pub fn with_deadline(mut self, deadline: u64) -> Self {
|
||||||
|
self.deadline = Some(deadline);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Checks if task is compatible with a processor type.
|
||||||
|
pub fn is_compatible_with(&self, proc_type: ProcessorType) -> bool {
|
||||||
|
// Check based on operation type
|
||||||
|
let op_type = self.operation.op_type();
|
||||||
|
|
||||||
|
match proc_type {
|
||||||
|
ProcessorType::Cpu(_) => {
|
||||||
|
// CPUs can do most things, but slowly
|
||||||
|
true
|
||||||
|
}
|
||||||
|
ProcessorType::Gpu(_) => {
|
||||||
|
// GPUs are good for parallel operations
|
||||||
|
matches!(
|
||||||
|
op_type,
|
||||||
|
OperationType::MatMul
|
||||||
|
| OperationType::Conv2d
|
||||||
|
| OperationType::SelfAttention
|
||||||
|
| OperationType::FlashAttention
|
||||||
|
| OperationType::Embedding
|
||||||
|
| OperationType::Add
|
||||||
|
| OperationType::Mul
|
||||||
|
| OperationType::Softmax
|
||||||
|
)
|
||||||
|
}
|
||||||
|
ProcessorType::Tpu(_) => {
|
||||||
|
// TPUs are good for large matrix ops
|
||||||
|
matches!(
|
||||||
|
op_type,
|
||||||
|
OperationType::MatMul
|
||||||
|
| OperationType::Conv2d
|
||||||
|
| OperationType::SelfAttention
|
||||||
|
| OperationType::FlashAttention
|
||||||
|
)
|
||||||
|
}
|
||||||
|
ProcessorType::Lpu => {
|
||||||
|
// LPUs are good for sequential inference
|
||||||
|
matches!(
|
||||||
|
op_type,
|
||||||
|
OperationType::MatMul
|
||||||
|
| OperationType::SelfAttention
|
||||||
|
| OperationType::KVCache
|
||||||
|
| OperationType::Sampling
|
||||||
|
)
|
||||||
|
}
|
||||||
|
ProcessorType::Npu(_) => {
|
||||||
|
// NPUs are good for inference
|
||||||
|
matches!(
|
||||||
|
op_type,
|
||||||
|
OperationType::MatMul
|
||||||
|
| OperationType::Conv2d
|
||||||
|
| OperationType::Add
|
||||||
|
| OperationType::Softmax
|
||||||
|
)
|
||||||
|
}
|
||||||
|
_ => true, // Default to compatible
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Result of task execution.
#[derive(Clone, Debug)]
pub struct TaskResult {
    /// Task this result belongs to.
    pub task_id: TaskId,
    /// Raw output data produced by the operation.
    pub output: Vec<u8>,
    /// Wall-clock execution duration.
    pub duration: Duration,
    /// Energy consumed (Joules).
    pub energy: f64,
}

/// Compute task for job execution: a `Task` bundled with placement
/// constraints for the scheduler.
#[derive(Clone, Debug)]
pub struct ComputeTask {
    /// The underlying schedulable task.
    pub task: Task,
    /// Resource requirements the target device must satisfy.
    pub requirements: TaskRequirements,
    /// Preferred processor type.
    pub preferred_processor: Option<ProcessorType>,
    /// Fallback processor type.
    /// NOTE(review): presumably consulted when the preferred type is
    /// unavailable — confirm with the scheduler implementation.
    pub fallback_processor: Option<ProcessorType>,
}

/// Task resource requirements.
#[derive(Clone, Debug, Default)]
pub struct TaskRequirements {
    /// Minimum memory (bytes).
    pub min_memory: u64,
    /// Minimum compute throughput (TFLOPS).
    pub min_tflops: f64,
    /// Maximum acceptable latency (ms), if latency-bound.
    pub max_latency_ms: Option<u32>,
    /// Requires a specific numeric precision, if set.
    pub precision: Option<Precision>,
}

/// Decomposed workload: the flat task list plus aggregate cost estimates.
#[derive(Clone, Debug)]
pub struct DecomposedWorkload {
    /// All tasks.
    pub tasks: Vec<Task>,
    /// Total estimated FLOPS.
    pub estimated_flops: f64,
    /// Total estimated memory (presumably bytes — confirm against callers).
    pub estimated_memory: u64,
}

/// Task decomposer that breaks jobs into schedulable tasks.
pub struct TaskDecomposer {
    /// Default batch size for inference.
    inference_batch_size: usize,
    /// Default numeric precision applied to generated tensor ops.
    default_precision: Precision,
}
|
||||||
|
|
||||||
|
impl TaskDecomposer {
|
||||||
|
/// Creates a new task decomposer.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
inference_batch_size: 32,
|
||||||
|
default_precision: Precision::Fp16,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decomposes a job into tasks.
|
||||||
|
pub fn decompose(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
match &job.job_type {
|
||||||
|
JobType::Training { .. } => self.decompose_training(job),
|
||||||
|
JobType::Inference { .. } => self.decompose_inference(job),
|
||||||
|
JobType::Container { .. } => self.decompose_container(job),
|
||||||
|
JobType::Serverless { .. } => self.decompose_serverless(job),
|
||||||
|
JobType::Wasm { .. } => self.decompose_wasm(job),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decompose training job.
|
||||||
|
fn decompose_training(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
let mut tasks = Vec::new();
|
||||||
|
|
||||||
|
if let JobType::Training {
|
||||||
|
epochs,
|
||||||
|
batch_size,
|
||||||
|
..
|
||||||
|
} = &job.job_type
|
||||||
|
{
|
||||||
|
// Data loading task
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::DataLoad {
|
||||||
|
bytes: 1024 * 1024 * 100, // 100MB
|
||||||
|
async_: true,
|
||||||
|
})
|
||||||
|
.with_priority(TaskPriority::High),
|
||||||
|
);
|
||||||
|
|
||||||
|
let data_load_id = tasks[0].id;
|
||||||
|
|
||||||
|
// Preprocessing task
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::DataPreprocess {
|
||||||
|
batch: *batch_size as usize,
|
||||||
|
transforms: vec!["normalize".to_string(), "augment".to_string()],
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![data_load_id])
|
||||||
|
.with_priority(TaskPriority::High),
|
||||||
|
);
|
||||||
|
|
||||||
|
let preprocess_id = tasks[1].id;
|
||||||
|
|
||||||
|
// Forward pass (simplified as MatMul)
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::MatMul {
|
||||||
|
m: *batch_size as usize,
|
||||||
|
n: 4096,
|
||||||
|
k: 4096,
|
||||||
|
precision: self.default_precision,
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![preprocess_id])
|
||||||
|
.with_priority(TaskPriority::Critical),
|
||||||
|
);
|
||||||
|
|
||||||
|
let forward_id = tasks[2].id;
|
||||||
|
|
||||||
|
// Backward pass
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::Backward {
|
||||||
|
forward_op: Box::new(Operation::MatMul {
|
||||||
|
m: *batch_size as usize,
|
||||||
|
n: 4096,
|
||||||
|
k: 4096,
|
||||||
|
precision: self.default_precision,
|
||||||
|
}),
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![forward_id])
|
||||||
|
.with_priority(TaskPriority::Critical),
|
||||||
|
);
|
||||||
|
|
||||||
|
let backward_id = tasks[3].id;
|
||||||
|
|
||||||
|
// Optimizer step
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::OptimizerStep {
|
||||||
|
parameters: 1_000_000,
|
||||||
|
optimizer: "adamw".to_string(),
|
||||||
|
precision: self.default_precision,
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![backward_id])
|
||||||
|
.with_priority(TaskPriority::High),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(tasks)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decompose inference job.
|
||||||
|
fn decompose_inference(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
let mut tasks = Vec::new();
|
||||||
|
|
||||||
|
if let JobType::Inference { batch_size, .. } = &job.job_type {
|
||||||
|
// Tokenization (CPU optimal)
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::Tokenization {
|
||||||
|
text_bytes: 4096,
|
||||||
|
vocab_size: 32000,
|
||||||
|
})
|
||||||
|
.with_priority(TaskPriority::High),
|
||||||
|
);
|
||||||
|
|
||||||
|
let token_id = tasks[0].id;
|
||||||
|
|
||||||
|
// Embedding (GPU optimal)
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::Embedding {
|
||||||
|
batch: *batch_size as usize,
|
||||||
|
seq_len: 512,
|
||||||
|
vocab_size: 32000,
|
||||||
|
embed_dim: 4096,
|
||||||
|
precision: self.default_precision,
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![token_id])
|
||||||
|
.with_priority(TaskPriority::Critical),
|
||||||
|
);
|
||||||
|
|
||||||
|
let embed_id = tasks[1].id;
|
||||||
|
|
||||||
|
// Self-attention (TPU/GPU optimal)
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::SelfAttention {
|
||||||
|
batch: *batch_size as usize,
|
||||||
|
seq_len: 512,
|
||||||
|
num_heads: 32,
|
||||||
|
head_dim: 128,
|
||||||
|
precision: self.default_precision,
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![embed_id])
|
||||||
|
.with_priority(TaskPriority::Critical),
|
||||||
|
);
|
||||||
|
|
||||||
|
let attention_id = tasks[2].id;
|
||||||
|
|
||||||
|
// Sampling (LPU optimal)
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::Sampling {
|
||||||
|
batch: *batch_size as usize,
|
||||||
|
vocab_size: 32000,
|
||||||
|
temperature: 0.7,
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![attention_id])
|
||||||
|
.with_priority(TaskPriority::High),
|
||||||
|
);
|
||||||
|
|
||||||
|
let sample_id = tasks[3].id;
|
||||||
|
|
||||||
|
// Detokenization (CPU optimal)
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::Detokenization {
|
||||||
|
tokens: 256,
|
||||||
|
vocab_size: 32000,
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![sample_id])
|
||||||
|
.with_priority(TaskPriority::Normal),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(tasks)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decompose container job.
|
||||||
|
fn decompose_container(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
// Container jobs are typically a single task
|
||||||
|
Ok(vec![Task::new(Operation::Generic {
|
||||||
|
op_type: OperationType::DataLoad,
|
||||||
|
flops: 1e9,
|
||||||
|
memory: 1024 * 1024 * 1024,
|
||||||
|
})
|
||||||
|
.with_priority(TaskPriority::Normal)])
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decompose serverless function.
|
||||||
|
fn decompose_serverless(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
// Serverless is typically a single task
|
||||||
|
Ok(vec![Task::new(Operation::Generic {
|
||||||
|
op_type: OperationType::DataPreprocess,
|
||||||
|
flops: 1e6,
|
||||||
|
memory: 256 * 1024 * 1024,
|
||||||
|
})
|
||||||
|
.with_priority(TaskPriority::High)])
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decompose WASM job.
|
||||||
|
fn decompose_wasm(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
// WASM is typically a single task
|
||||||
|
Ok(vec![Task::new(Operation::Generic {
|
||||||
|
op_type: OperationType::DataPreprocess,
|
||||||
|
flops: 1e6,
|
||||||
|
memory: 16 * 1024 * 1024,
|
||||||
|
})
|
||||||
|
.with_priority(TaskPriority::Normal)])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `TaskDecomposer::new` takes no configuration, so `Default` simply
/// delegates to it.
impl Default for TaskDecomposer {
    fn default() -> Self {
        Self::new()
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// 1024^3 FP32 matmul used as a generic test operation.
    fn matmul_op() -> Operation {
        Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        }
    }

    #[test]
    fn test_task_creation() {
        let task = Task::new(matmul_op()).with_priority(TaskPriority::High);

        assert_eq!(task.priority, TaskPriority::High);
        assert!(task.dependencies.is_empty());
        assert_eq!(task.status, TaskStatus::Pending);
    }

    #[test]
    fn test_task_dependencies() {
        let loader = Task::new(Operation::DataLoad {
            bytes: 1000,
            async_: true,
        });
        let loader_id = loader.id;

        let dependent = Task::new(matmul_op()).with_dependencies(vec![loader_id]);

        assert_eq!(dependent.dependencies, vec![loader_id]);
    }

    #[test]
    fn test_task_compatibility() {
        let matmul_task = Task::new(matmul_op());

        // MatMul should run on both GPU- and TPU-class hardware.
        let gpu = ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda {
            compute_capability: (8, 0),
        });
        let tpu = ProcessorType::Tpu(crate::processor::TpuVersion::V5p);
        assert!(matmul_task.is_compatible_with(gpu));
        assert!(matmul_task.is_compatible_with(tpu));

        // DataLoad should run on plain CPUs.
        let data_load_task = Task::new(Operation::DataLoad {
            bytes: 1000,
            async_: true,
        });
        let cpu = ProcessorType::Cpu(crate::processor::CpuVariant::default());
        assert!(data_load_task.is_compatible_with(cpu));
    }

    #[test]
    fn test_task_decomposer() {
        let decomposer = TaskDecomposer::new();

        let job = ComputeJob {
            id: crate::JobId::new(),
            owner: [0u8; 32],
            job_type: JobType::Inference {
                model_cid: "model".to_string(),
                input_format: "json".to_string(),
                batch_size: 1,
            },
            resources: crate::ResourceRequirements::default(),
            input_cid: None,
            max_budget: 1_000_000,
            priority: crate::JobPriority::Normal,
            created_at: 0,
            deadline: None,
        };

        let tasks = decomposer.decompose(&job).unwrap();
        assert!(!tasks.is_empty());

        // Every task after the first must depend on something, i.e. the
        // decomposition forms a chain.
        for task in tasks.iter().skip(1) {
            assert!(!task.dependencies.is_empty());
        }
    }
}
|
||||||
1584
docs/PLAN/PHASE11-Synor-Compute-L2-Part2-HyperEfficiency.md
Normal file
1584
docs/PLAN/PHASE11-Synor-Compute-L2-Part2-HyperEfficiency.md
Normal file
File diff suppressed because it is too large
Load diff
1564
docs/PLAN/PHASE11-Synor-Compute-L2-Part3-HeterogeneousCompute.md
Normal file
1564
docs/PLAN/PHASE11-Synor-Compute-L2-Part3-HeterogeneousCompute.md
Normal file
File diff suppressed because it is too large
Load diff
906
docs/PLAN/PHASE11-Synor-Compute-L2.md
Normal file
906
docs/PLAN/PHASE11-Synor-Compute-L2.md
Normal file
|
|
@ -0,0 +1,906 @@
|
||||||
|
# Phase 11: Synor Compute L2 - Full-Stack Compute Platform
|
||||||
|
|
||||||
|
> **Mission**: Build a decentralized compute platform capable of AI/ML training, inference, OS hosting, and general-purpose high-performance computing.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Executive Summary
|
||||||
|
|
||||||
|
Synor Compute L2 extends beyond the current WASM-only Synor VM to provide:
|
||||||
|
- **GPU Compute**: AI/ML training and inference with CUDA/ROCm support
|
||||||
|
- **Container Orchestration**: Docker-compatible workloads with Kubernetes-style scheduling
|
||||||
|
- **Persistent VMs**: Long-running virtual machines for OS hosting
|
||||||
|
- **Serverless Functions**: Short-lived compute for API backends and event processing
|
||||||
|
- **Edge Compute**: Low-latency compute at network edge nodes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture Overview
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ SYNOR COMPUTE L2 │
|
||||||
|
├─────────────────────────────────────────────────────────────────────────────┤
|
||||||
|
│ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ APPLICATION LAYER │ │
|
||||||
|
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
|
||||||
|
│ │ AI/ML │ Serverless │ Containers │ Persistent │ Edge │ │
|
||||||
|
│ │ Training │ Functions │ (Docker) │ VMs (Linux) │ Compute │ │
|
||||||
|
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ ORCHESTRATION LAYER │ │
|
||||||
|
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
|
||||||
|
│ │ Job │ Resource │ Network │ Storage │ Health │ │
|
||||||
|
│ │ Scheduler │ Manager │ Fabric │ Orchestrator│ Monitor │ │
|
||||||
|
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ COMPUTE RUNTIME LAYER │ │
|
||||||
|
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
|
||||||
|
│ │ GPU │ Container │ MicroVM │ WASM │ Native │ │
|
||||||
|
│ │ Runtime │ Runtime │ Runtime │ Runtime │ Runtime │ │
|
||||||
|
│ │ (CUDA/ROCm)│ (containerd)│ (Firecracker)│ (Wasmtime) │ (gVisor) │ │
|
||||||
|
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ INFRASTRUCTURE LAYER │ │
|
||||||
|
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
|
||||||
|
│ │ Node │ Network │ Distributed │ Consensus │ Billing │ │
|
||||||
|
│ │ Registry │ Overlay │ Storage │ (PoS+PoW) │ Metering │ │
|
||||||
|
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ SYNOR L1 BLOCKCHAIN (GHOSTDAG + DAG-RIDER) │ │
|
||||||
|
│ └─────────────────────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │
|
||||||
|
└─────────────────────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Milestone 1: GPU Compute Foundation (AI/ML Training & Inference)
|
||||||
|
|
||||||
|
### 1.1 GPU Node Registration
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/gpu/node.rs
|
||||||
|
|
||||||
|
/// GPU node capabilities
|
||||||
|
pub struct GpuNode {
|
||||||
|
/// Unique node ID
|
||||||
|
pub node_id: NodeId,
|
||||||
|
/// GPU specifications
|
||||||
|
pub gpus: Vec<GpuSpec>,
|
||||||
|
/// Total VRAM available (bytes)
|
||||||
|
pub total_vram: u64,
|
||||||
|
/// Available VRAM (bytes)
|
||||||
|
pub available_vram: u64,
|
||||||
|
/// CUDA compute capability (e.g., 8.6 for RTX 3090)
|
||||||
|
pub cuda_capability: Option<(u8, u8)>,
|
||||||
|
/// ROCm version (for AMD)
|
||||||
|
pub rocm_version: Option<String>,
|
||||||
|
/// Network bandwidth (Gbps)
|
||||||
|
pub bandwidth_gbps: u32,
|
||||||
|
/// Geographic region
|
||||||
|
pub region: Region,
|
||||||
|
/// Stake amount (for PoS validation)
|
||||||
|
pub stake: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct GpuSpec {
|
||||||
|
pub model: String, // "NVIDIA RTX 4090"
|
||||||
|
pub vram_gb: u32, // 24
|
||||||
|
pub tensor_cores: u32, // 512
|
||||||
|
pub cuda_cores: u32, // 16384
|
||||||
|
pub memory_bandwidth: u32, // 1008 GB/s
|
||||||
|
pub fp32_tflops: f32, // 82.6
|
||||||
|
pub fp16_tflops: f32, // 165.2
|
||||||
|
pub int8_tops: f32, // 330.4
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.2 AI/ML Job Specification
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/ai/job.rs
|
||||||
|
|
||||||
|
/// AI/ML training job specification
|
||||||
|
pub struct TrainingJob {
|
||||||
|
/// Job ID
|
||||||
|
pub job_id: JobId,
|
||||||
|
/// Owner address
|
||||||
|
pub owner: Address,
|
||||||
|
/// Framework (PyTorch, TensorFlow, JAX)
|
||||||
|
pub framework: MlFramework,
|
||||||
|
/// Model specification
|
||||||
|
pub model: ModelSpec,
|
||||||
|
/// Dataset reference (Synor Storage CID)
|
||||||
|
pub dataset_cid: Cid,
|
||||||
|
/// Training configuration
|
||||||
|
pub config: TrainingConfig,
|
||||||
|
/// Resource requirements
|
||||||
|
pub resources: GpuResources,
|
||||||
|
/// Maximum budget (SYNOR tokens)
|
||||||
|
pub max_budget: u64,
|
||||||
|
/// Checkpoint interval (steps)
|
||||||
|
pub checkpoint_interval: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct GpuResources {
|
||||||
|
pub min_gpus: u32,
|
||||||
|
pub max_gpus: u32,
|
||||||
|
pub min_vram_per_gpu: u64,
|
||||||
|
pub cuda_capability_min: Option<(u8, u8)>,
|
||||||
|
pub distributed: bool, // Multi-node training
|
||||||
|
pub priority: JobPriority,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum MlFramework {
|
||||||
|
PyTorch { version: String },
|
||||||
|
TensorFlow { version: String },
|
||||||
|
JAX { version: String },
|
||||||
|
ONNX,
|
||||||
|
Custom { image: String },
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct TrainingConfig {
|
||||||
|
pub epochs: u32,
|
||||||
|
pub batch_size: u32,
|
||||||
|
pub learning_rate: f32,
|
||||||
|
pub optimizer: String,
|
||||||
|
pub mixed_precision: bool,
|
||||||
|
pub gradient_accumulation: u32,
|
||||||
|
pub distributed_strategy: DistributedStrategy,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum DistributedStrategy {
|
||||||
|
DataParallel,
|
||||||
|
ModelParallel,
|
||||||
|
PipelineParallel,
|
||||||
|
ZeRO { stage: u8 }, // DeepSpeed ZeRO stages 1-3
|
||||||
|
FSDP, // Fully Sharded Data Parallel
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.3 Inference Service
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/ai/inference.rs
|
||||||
|
|
||||||
|
/// Inference endpoint specification
|
||||||
|
pub struct InferenceEndpoint {
|
||||||
|
/// Endpoint ID
|
||||||
|
pub endpoint_id: EndpointId,
|
||||||
|
/// Model reference (Synor Storage CID)
|
||||||
|
pub model_cid: Cid,
|
||||||
|
/// Model format
|
||||||
|
pub format: ModelFormat,
|
||||||
|
/// Scaling configuration
|
||||||
|
pub scaling: AutoscaleConfig,
|
||||||
|
/// GPU requirements per replica
|
||||||
|
pub gpu_per_replica: GpuResources,
|
||||||
|
/// Request timeout
|
||||||
|
pub timeout_ms: u32,
|
||||||
|
/// Max batch size for batching inference
|
||||||
|
pub max_batch_size: u32,
|
||||||
|
/// Batching timeout
|
||||||
|
pub batch_timeout_ms: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum ModelFormat {
|
||||||
|
PyTorch,
|
||||||
|
ONNX,
|
||||||
|
TensorRT,
|
||||||
|
Triton,
|
||||||
|
vLLM, // For LLM serving
|
||||||
|
TGI, // Text Generation Inference
|
||||||
|
Custom,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct AutoscaleConfig {
|
||||||
|
pub min_replicas: u32,
|
||||||
|
pub max_replicas: u32,
|
||||||
|
pub target_gpu_utilization: f32,
|
||||||
|
pub scale_up_threshold: f32,
|
||||||
|
pub scale_down_threshold: f32,
|
||||||
|
pub cooldown_seconds: u32,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.4 Pricing Model for GPU Compute
|
||||||
|
|
||||||
|
| Resource | Unit | Price (SYNOR/unit) |
|
||||||
|
|----------|------|-------------------|
|
||||||
|
| GPU (RTX 4090 equivalent) | hour | 0.50 |
|
||||||
|
| GPU (A100 80GB equivalent) | hour | 2.00 |
|
||||||
|
| GPU (H100 equivalent) | hour | 4.00 |
|
||||||
|
| VRAM | GB/hour | 0.01 |
|
||||||
|
| Network egress | GB | 0.05 |
|
||||||
|
| Storage (hot, NVMe) | GB/month | 0.10 |
|
||||||
|
| Inference requests | 1M tokens | 0.10 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Milestone 2: Container Orchestration (Docker/Kubernetes-Compatible)
|
||||||
|
|
||||||
|
### 2.1 Container Runtime
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/container/runtime.rs
|
||||||
|
|
||||||
|
/// Container specification (OCI-compatible)
|
||||||
|
pub struct ContainerSpec {
|
||||||
|
/// Image reference
|
||||||
|
pub image: ImageRef,
|
||||||
|
/// Resource limits
|
||||||
|
pub resources: ContainerResources,
|
||||||
|
/// Environment variables
|
||||||
|
pub env: HashMap<String, String>,
|
||||||
|
/// Volume mounts
|
||||||
|
pub volumes: Vec<VolumeMount>,
|
||||||
|
/// Network configuration
|
||||||
|
pub network: NetworkConfig,
|
||||||
|
/// Security context
|
||||||
|
pub security: SecurityContext,
|
||||||
|
/// Health check
|
||||||
|
pub health_check: Option<HealthCheck>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct ContainerResources {
|
||||||
|
pub cpu_cores: f32, // 0.5, 1.0, 2.0, etc.
|
||||||
|
pub memory_mb: u64,
|
||||||
|
pub gpu: Option<GpuAllocation>,
|
||||||
|
pub ephemeral_storage_gb: u32,
|
||||||
|
pub network_bandwidth_mbps: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct GpuAllocation {
|
||||||
|
pub count: u32,
|
||||||
|
pub vram_mb: u64,
|
||||||
|
pub shared: bool, // Allow GPU sharing via MPS/MIG
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.2 Service Mesh & Networking
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/network/mesh.rs
|
||||||
|
|
||||||
|
/// Service definition for container orchestration
|
||||||
|
pub struct Service {
|
||||||
|
pub service_id: ServiceId,
|
||||||
|
pub name: String,
|
||||||
|
pub containers: Vec<ContainerSpec>,
|
||||||
|
pub replicas: ReplicaConfig,
|
||||||
|
pub load_balancer: LoadBalancerConfig,
|
||||||
|
pub service_mesh: ServiceMeshConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct ServiceMeshConfig {
|
||||||
|
pub mtls_enabled: bool,
|
||||||
|
pub traffic_policy: TrafficPolicy,
|
||||||
|
pub circuit_breaker: CircuitBreakerConfig,
|
||||||
|
pub retry_policy: RetryPolicy,
|
||||||
|
pub rate_limit: Option<RateLimitConfig>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct LoadBalancerConfig {
|
||||||
|
pub algorithm: LoadBalancerAlgorithm,
|
||||||
|
pub health_check: HealthCheck,
|
||||||
|
pub sticky_sessions: bool,
|
||||||
|
pub ssl_termination: SslTermination,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum LoadBalancerAlgorithm {
|
||||||
|
RoundRobin,
|
||||||
|
LeastConnections,
|
||||||
|
WeightedRoundRobin { weights: Vec<u32> },
|
||||||
|
IPHash,
|
||||||
|
Random,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.3 Container Pricing
|
||||||
|
|
||||||
|
| Resource | Unit | Price (SYNOR/unit) |
|
||||||
|
|----------|------|-------------------|
|
||||||
|
| CPU | core/hour | 0.02 |
|
||||||
|
| Memory | GB/hour | 0.005 |
|
||||||
|
| Ephemeral storage | GB/hour | 0.001 |
|
||||||
|
| Network ingress | GB | FREE |
|
||||||
|
| Network egress | GB | 0.05 |
|
||||||
|
| Load balancer | hour | 0.01 |
|
||||||
|
| Static IP | month | 2.00 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Milestone 3: Persistent Virtual Machines (OS Hosting)
|
||||||
|
|
||||||
|
### 3.1 MicroVM Architecture (Firecracker-based)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/vm/microvm.rs
|
||||||
|
|
||||||
|
/// Virtual machine specification
|
||||||
|
pub struct VmSpec {
|
||||||
|
/// VM ID
|
||||||
|
pub vm_id: VmId,
|
||||||
|
/// Owner address
|
||||||
|
pub owner: Address,
|
||||||
|
/// VM size
|
||||||
|
pub size: VmSize,
|
||||||
|
/// Boot image
|
||||||
|
pub image: VmImage,
|
||||||
|
/// Persistent volumes
|
||||||
|
pub volumes: Vec<PersistentVolume>,
|
||||||
|
/// Network configuration
|
||||||
|
pub network: VmNetworkConfig,
|
||||||
|
/// SSH keys for access
|
||||||
|
pub ssh_keys: Vec<SshPublicKey>,
|
||||||
|
/// Cloud-init user data
|
||||||
|
pub user_data: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct VmSize {
|
||||||
|
pub vcpus: u32,
|
||||||
|
pub memory_gb: u32,
|
||||||
|
pub gpu: Option<GpuPassthrough>,
|
||||||
|
pub network_bandwidth_gbps: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct GpuPassthrough {
|
||||||
|
pub count: u32,
|
||||||
|
pub model: GpuModel,
|
||||||
|
pub vram_gb: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum VmImage {
|
||||||
|
/// Pre-built images
|
||||||
|
Marketplace { image_id: String, version: String },
|
||||||
|
/// Custom image from Synor Storage
|
||||||
|
Custom { cid: Cid, format: ImageFormat },
|
||||||
|
/// Standard OS images
|
||||||
|
Ubuntu { version: String },
|
||||||
|
Debian { version: String },
|
||||||
|
AlmaLinux { version: String },
|
||||||
|
Windows { version: String, license: WindowsLicense },
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct PersistentVolume {
|
||||||
|
pub volume_id: VolumeId,
|
||||||
|
pub size_gb: u32,
|
||||||
|
pub volume_type: VolumeType,
|
||||||
|
pub mount_path: String,
|
||||||
|
pub encrypted: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Storage classes available for persistent volumes.
pub enum VolumeType {
    /// High-performance NVMe SSD with provisioned IOPS/throughput.
    NvmeSsd { iops: u32, throughput_mbps: u32 },
    /// Standard SSD.
    Ssd,
    /// HDD, intended for archival workloads.
    Hdd,
    /// Distributed storage backed by the Synor Storage L2,
    /// replicated `replication` times.
    Distributed { replication: u8 },
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.2 VM Lifecycle Management
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/vm/lifecycle.rs
|
||||||
|
|
||||||
|
/// Lifecycle states of a VM.
///
/// Typical flow: `Pending` → `Provisioning` → `Running`, with
/// stop/hibernate/migrate excursions, ending in `Terminated`.
pub enum VmState {
    /// Accepted but not yet scheduled.
    Pending,
    /// Being provisioned on a node.
    Provisioning,
    /// Up and serving.
    Running,
    /// Shutdown in progress.
    Stopping,
    /// Halted; disk state preserved.
    Stopped,
    /// Memory snapshot being written out.
    Hibernating,
    /// Fully hibernated to storage.
    Hibernated,
    /// Live migration to another node in progress.
    Migrating,
    /// Unrecoverable error state.
    Failed,
    /// Deleted; resources released.
    Terminated,
}
|
||||||
|
|
||||||
|
pub struct VmManager {
|
||||||
|
/// Active VMs
|
||||||
|
vms: HashMap<VmId, VmInstance>,
|
||||||
|
/// Node assignments
|
||||||
|
node_assignments: HashMap<VmId, NodeId>,
|
||||||
|
/// Live migration coordinator
|
||||||
|
migration_coordinator: MigrationCoordinator,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VmManager {
|
||||||
|
/// Start a new VM
|
||||||
|
pub async fn create(&self, spec: VmSpec) -> Result<VmId, VmError>;
|
||||||
|
|
||||||
|
/// Stop a VM (preserves state)
|
||||||
|
pub async fn stop(&self, vm_id: &VmId) -> Result<(), VmError>;
|
||||||
|
|
||||||
|
/// Start a stopped VM
|
||||||
|
pub async fn start(&self, vm_id: &VmId) -> Result<(), VmError>;
|
||||||
|
|
||||||
|
/// Hibernate VM to storage (saves memory state)
|
||||||
|
pub async fn hibernate(&self, vm_id: &VmId) -> Result<(), VmError>;
|
||||||
|
|
||||||
|
/// Live migrate VM to another node
|
||||||
|
pub async fn migrate(&self, vm_id: &VmId, target_node: NodeId) -> Result<(), VmError>;
|
||||||
|
|
||||||
|
/// Resize VM (requires restart)
|
||||||
|
pub async fn resize(&self, vm_id: &VmId, new_size: VmSize) -> Result<(), VmError>;
|
||||||
|
|
||||||
|
/// Snapshot VM state
|
||||||
|
pub async fn snapshot(&self, vm_id: &VmId) -> Result<SnapshotId, VmError>;
|
||||||
|
|
||||||
|
/// Terminate and delete VM
|
||||||
|
pub async fn terminate(&self, vm_id: &VmId) -> Result<(), VmError>;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.3 VM Pricing

| VM Type | vCPUs | Memory | Storage | GPU | Price (SYNOR/month) |
|---------|-------|--------|---------|-----|---------------------|
| micro | 1 | 1 GB | 20 GB SSD | - | 5 |
| small | 2 | 4 GB | 50 GB SSD | - | 15 |
| medium | 4 | 8 GB | 100 GB SSD | - | 30 |
| large | 8 | 32 GB | 200 GB SSD | - | 80 |
| xlarge | 16 | 64 GB | 500 GB NVMe | - | 200 |
| gpu-small | 8 | 32 GB | 200 GB NVMe | 1x RTX 4090 | 400 |
| gpu-medium | 16 | 64 GB | 500 GB NVMe | 2x RTX 4090 | 750 |
| gpu-large | 32 | 128 GB | 1 TB NVMe | 4x A100 80GB | 2500 |
| gpu-xlarge | 64 | 256 GB | 2 TB NVMe | 8x H100 | 8000 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Milestone 4: Serverless Functions (FaaS)
|
||||||
|
|
||||||
|
### 4.1 Function Specification
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/serverless/function.rs
|
||||||
|
|
||||||
|
/// Serverless function definition
|
||||||
|
pub struct Function {
|
||||||
|
pub function_id: FunctionId,
|
||||||
|
pub owner: Address,
|
||||||
|
pub name: String,
|
||||||
|
pub runtime: FunctionRuntime,
|
||||||
|
pub handler: String,
|
||||||
|
pub code: FunctionCode,
|
||||||
|
pub resources: FunctionResources,
|
||||||
|
pub triggers: Vec<FunctionTrigger>,
|
||||||
|
pub environment: HashMap<String, String>,
|
||||||
|
pub timeout_ms: u32,
|
||||||
|
pub concurrency: ConcurrencyConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Supported serverless language runtimes.
pub enum FunctionRuntime {
    /// Node.js 20.
    Node20,
    /// Node.js 22.
    Node22,
    /// Python 3.11.
    Python311,
    /// Python 3.12.
    Python312,
    /// Native Rust.
    Rust,
    /// Go 1.22.
    Go122,
    /// Java 21.
    Java21,
    /// .NET 8.
    Dotnet8,
    /// Ruby 3.3.
    Ruby33,
    /// Bring-your-own runtime via container image.
    Custom { image: String },
}
|
||||||
|
|
||||||
|
pub struct FunctionCode {
|
||||||
|
/// Source code CID in Synor Storage
|
||||||
|
pub cid: Cid,
|
||||||
|
/// Entry point file
|
||||||
|
pub entry_point: String,
|
||||||
|
/// Dependencies (package.json, requirements.txt, etc.)
|
||||||
|
pub dependencies: Option<Cid>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct FunctionResources {
|
||||||
|
pub memory_mb: u32, // 128, 256, 512, 1024, 2048, 4096, 8192
|
||||||
|
pub cpu_allocation: f32, // Proportional to memory
|
||||||
|
pub ephemeral_storage_mb: u32,
|
||||||
|
pub gpu: Option<GpuAllocation>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum FunctionTrigger {
|
||||||
|
/// HTTP endpoint
|
||||||
|
Http { path: String, methods: Vec<HttpMethod> },
|
||||||
|
/// Scheduled execution (cron)
|
||||||
|
Schedule { cron: String },
|
||||||
|
/// Event from message queue
|
||||||
|
Queue { queue_name: String },
|
||||||
|
/// Storage events
|
||||||
|
Storage { bucket: String, events: Vec<StorageEvent> },
|
||||||
|
/// Blockchain events
|
||||||
|
Blockchain { contract: Address, events: Vec<String> },
|
||||||
|
/// Webhook
|
||||||
|
Webhook { url: String },
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.2 Cold Start Optimization
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/serverless/warmup.rs
|
||||||
|
|
||||||
|
/// Warmup strategy for mitigating serverless cold starts.
pub struct WarmupConfig {
    /// Floor of instances kept warm at all times.
    pub min_instances: u32,
    /// Number of pre-initialised instances held ready.
    pub provisioned_concurrency: u32,
    /// Optional cron expression for scheduled warmups.
    pub warmup_schedule: Option<String>,
    /// Enable snapshot-based cold start (SnapStart-style restore).
    pub snapstart_enabled: bool,
}
|
||||||
|
|
||||||
|
pub struct ColdStartOptimizer {
|
||||||
|
/// Pre-warmed function pools
|
||||||
|
pools: HashMap<FunctionRuntime, WarmPool>,
|
||||||
|
/// Snapshot cache
|
||||||
|
snapshots: LruCache<FunctionId, FunctionSnapshot>,
|
||||||
|
/// Prediction model for scaling
|
||||||
|
predictor: ScalingPredictor,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ColdStartOptimizer {
|
||||||
|
/// Get a warm instance or create one
|
||||||
|
pub async fn get_instance(&self, function: &Function) -> Result<FunctionInstance, Error> {
|
||||||
|
// Try snapshot restore first (< 100ms)
|
||||||
|
if let Some(snapshot) = self.snapshots.get(&function.function_id) {
|
||||||
|
return self.restore_from_snapshot(snapshot).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try warm pool (< 50ms)
|
||||||
|
if let Some(instance) = self.pools.get(&function.runtime)?.get_warm() {
|
||||||
|
return Ok(instance);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cold start (1-5s depending on runtime)
|
||||||
|
self.cold_start(function).await
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.3 Serverless Pricing

| Resource | Unit | Price (SYNOR) |
|----------|------|---------------|
| Invocations | 1M requests | 0.20 |
| Duration | GB-second | 0.00001 |
| Provisioned concurrency | GB-hour | 0.01 |
| HTTP Gateway | 1M requests | 0.10 |
| Event bridge | 1M events | 0.50 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Milestone 5: Edge Compute
|
||||||
|
|
||||||
|
### 5.1 Edge Node Architecture
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/edge/node.rs
|
||||||
|
|
||||||
|
/// Edge compute node
|
||||||
|
pub struct EdgeNode {
|
||||||
|
pub node_id: NodeId,
|
||||||
|
pub location: GeoLocation,
|
||||||
|
pub capabilities: EdgeCapabilities,
|
||||||
|
pub latency_zones: Vec<LatencyZone>,
|
||||||
|
pub resources: EdgeResources,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Feature flags advertising what an edge node can run.
pub struct EdgeCapabilities {
    /// Can execute WASM workloads.
    pub wasm_runtime: bool,
    /// Can run containers.
    pub container_runtime: bool,
    /// Has a GPU usable for inference.
    pub gpu_inference: bool,
    /// Can transcode video.
    pub video_transcoding: bool,
    /// Can act as a CDN cache.
    pub cdn_cache: bool,
}
|
||||||
|
|
||||||
|
pub struct EdgeResources {
|
||||||
|
pub cpu_cores: u32,
|
||||||
|
pub memory_gb: u32,
|
||||||
|
pub storage_gb: u32,
|
||||||
|
pub gpu: Option<EdgeGpu>,
|
||||||
|
pub bandwidth_gbps: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Edge function for low-latency compute
|
||||||
|
pub struct EdgeFunction {
|
||||||
|
pub function_id: FunctionId,
|
||||||
|
pub code: WasmModule,
|
||||||
|
pub memory_limit: u32,
|
||||||
|
pub timeout_ms: u32,
|
||||||
|
pub allowed_regions: Vec<Region>,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5.2 Edge Use Cases
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/edge/usecases.rs
|
||||||
|
|
||||||
|
/// CDN with compute at edge
|
||||||
|
pub struct EdgeCdn {
|
||||||
|
/// Origin servers
|
||||||
|
origins: Vec<Origin>,
|
||||||
|
/// Cache rules
|
||||||
|
cache_rules: Vec<CacheRule>,
|
||||||
|
/// Edge workers for request/response transformation
|
||||||
|
workers: Vec<EdgeWorker>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Real-time inference at edge
|
||||||
|
pub struct EdgeInference {
|
||||||
|
/// Model optimized for edge (quantized, pruned)
|
||||||
|
model_id: ModelId,
|
||||||
|
/// Inference runtime (TensorRT, ONNX Runtime)
|
||||||
|
runtime: EdgeInferenceRuntime,
|
||||||
|
/// Max batch size
|
||||||
|
max_batch: u32,
|
||||||
|
/// Target latency
|
||||||
|
target_latency_ms: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Video processing at edge
|
||||||
|
pub struct EdgeVideoProcessor {
|
||||||
|
/// Transcoding profiles
|
||||||
|
profiles: Vec<TranscodingProfile>,
|
||||||
|
/// Real-time streaming
|
||||||
|
live_streaming: bool,
|
||||||
|
/// Adaptive bitrate
|
||||||
|
abr_enabled: bool,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5.3 Edge Pricing

| Resource | Unit | Price (SYNOR) |
|----------|------|---------------|
| Edge function invocations | 1M | 0.50 |
| Edge function duration | GB-second | 0.00002 |
| Edge bandwidth | GB | 0.08 |
| Edge cache storage | GB/month | 0.02 |
| Video transcoding | minute | 0.02 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Milestone 6: Node Provider Economics
|
||||||
|
|
||||||
|
### 6.1 Provider Registration
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/provider/registration.rs
|
||||||
|
|
||||||
|
/// Compute provider registration
|
||||||
|
pub struct ProviderRegistration {
|
||||||
|
pub provider_id: ProviderId,
|
||||||
|
pub owner: Address,
|
||||||
|
/// Stake required to become provider
|
||||||
|
pub stake: u64,
|
||||||
|
/// Hardware specifications
|
||||||
|
pub hardware: HardwareManifest,
|
||||||
|
/// Network connectivity
|
||||||
|
pub network: NetworkManifest,
|
||||||
|
/// Geographic location
|
||||||
|
pub location: GeoLocation,
|
||||||
|
/// Availability SLA commitment
|
||||||
|
pub sla: SlaCommitment,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct HardwareManifest {
|
||||||
|
pub cpus: Vec<CpuSpec>,
|
||||||
|
pub memory_total_gb: u64,
|
||||||
|
pub gpus: Vec<GpuSpec>,
|
||||||
|
pub storage: Vec<StorageSpec>,
|
||||||
|
pub verified: bool, // Hardware attestation passed
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Service-level commitment a provider makes at registration.
pub struct SlaCommitment {
    /// Promised uptime, e.g. 99.9 or 99.99.
    pub uptime_percent: f32,
    /// Promised response time in milliseconds.
    pub response_time_ms: u32,
    /// Promised data durability (fraction, e.g. 0.999999).
    pub data_durability: f32,
    /// Stake penalty rate applied on SLA violation.
    pub penalty_rate: f32,
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.2 Provider Revenue Model

| Revenue Source | Provider Share | Protocol Share |
|----------------|----------------|----------------|
| Compute fees | 85% | 15% |
| Storage fees | 80% | 20% |
| Network fees | 75% | 25% |
| SLA bonuses | 100% | 0% |
| Staking rewards | 100% | 0% |

### 6.3 Slashing Conditions

| Violation | Penalty |
|-----------|---------|
| Downtime > committed SLA | 1% stake per hour |
| Data loss | 10% stake + compensation |
| Malicious behavior | 100% stake |
| False hardware attestation | 50% stake |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation Timeline
|
||||||
|
|
||||||
|
### Phase 11.1: Foundation (Weeks 1-4)
|
||||||
|
- [ ] Node registration and hardware attestation
|
||||||
|
- [ ] Basic job scheduler
|
||||||
|
- [ ] WASM runtime integration (existing)
|
||||||
|
- [ ] Container runtime (containerd)
|
||||||
|
- [ ] Network overlay (WireGuard mesh)
|
||||||
|
|
||||||
|
### Phase 11.2: GPU Compute (Weeks 5-8)
|
||||||
|
- [ ] GPU node registration
|
||||||
|
- [ ] NVIDIA driver integration
|
||||||
|
- [ ] CUDA runtime support
|
||||||
|
- [ ] Basic ML job execution
|
||||||
|
- [ ] Model storage integration
|
||||||
|
|
||||||
|
### Phase 11.3: Container Orchestration (Weeks 9-12)
|
||||||
|
- [ ] OCI image support
|
||||||
|
- [ ] Service deployment
|
||||||
|
- [ ] Load balancing
|
||||||
|
- [ ] Auto-scaling
|
||||||
|
- [ ] Service mesh (mTLS)
|
||||||
|
|
||||||
|
### Phase 11.4: Persistent VMs (Weeks 13-16)
|
||||||
|
- [ ] MicroVM runtime (Firecracker)
|
||||||
|
- [ ] VM lifecycle management
|
||||||
|
- [ ] Persistent storage
|
||||||
|
- [ ] Live migration
|
||||||
|
- [ ] Snapshot/restore
|
||||||
|
|
||||||
|
### Phase 11.5: Serverless (Weeks 17-20)
|
||||||
|
- [ ] Function deployment
|
||||||
|
- [ ] Cold start optimization
|
||||||
|
- [ ] Event triggers
|
||||||
|
- [ ] API gateway
|
||||||
|
- [ ] Monitoring/logging
|
||||||
|
|
||||||
|
### Phase 11.6: Edge Compute (Weeks 21-24)
|
||||||
|
- [ ] Edge node registration
|
||||||
|
- [ ] Edge function runtime
|
||||||
|
- [ ] CDN integration
|
||||||
|
- [ ] Edge inference
|
||||||
|
- [ ] Global anycast
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
### Isolation Levels
|
||||||
|
|
||||||
|
| Workload Type | Isolation Technology | Security Level |
|
||||||
|
|---------------|---------------------|----------------|
|
||||||
|
| WASM | Wasmtime sandbox | High |
|
||||||
|
| Serverless | gVisor + seccomp | High |
|
||||||
|
| Containers | gVisor or Kata | Medium-High |
|
||||||
|
| VMs | Firecracker MicroVM | High |
|
||||||
|
| GPU | NVIDIA MIG/MPS | Medium |
|
||||||
|
|
||||||
|
### Network Security
|
||||||
|
|
||||||
|
- All inter-node traffic encrypted (WireGuard)
|
||||||
|
- mTLS for service-to-service communication
|
||||||
|
- Network policies for workload isolation
|
||||||
|
- DDoS protection at edge
|
||||||
|
|
||||||
|
### Data Security
|
||||||
|
|
||||||
|
- Encryption at rest (AES-256)
|
||||||
|
- Encryption in transit (TLS 1.3)
|
||||||
|
- Confidential computing support (AMD SEV, Intel SGX)
|
||||||
|
- Secure key management (HSM integration)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## API Examples
|
||||||
|
|
||||||
|
### Deploy AI Training Job
|
||||||
|
|
||||||
|
```bash
|
||||||
|
synor compute train create \
|
||||||
|
--framework pytorch \
|
||||||
|
--model-config ./model.yaml \
|
||||||
|
--dataset synor://datasets/imagenet \
|
||||||
|
--gpus 8 \
|
||||||
|
--gpu-type h100 \
|
||||||
|
--distributed ddp \
|
||||||
|
--epochs 100 \
|
||||||
|
--checkpoint-interval 1000 \
|
||||||
|
--max-budget 1000
|
||||||
|
```
|
||||||
|
|
||||||
|
### Deploy Inference Endpoint
|
||||||
|
|
||||||
|
```bash
|
||||||
|
synor compute inference deploy \
|
||||||
|
--model synor://models/llama-70b \
|
||||||
|
--format vllm \
|
||||||
|
--min-replicas 2 \
|
||||||
|
--max-replicas 10 \
|
||||||
|
--gpu-per-replica 2 \
|
||||||
|
--target-utilization 0.7
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create Persistent VM
|
||||||
|
|
||||||
|
```bash
|
||||||
|
synor compute vm create \
|
||||||
|
--name my-dev-server \
|
||||||
|
--image ubuntu:22.04 \
|
||||||
|
--size gpu-small \
|
||||||
|
--volume 100gb:nvme:/data \
|
||||||
|
--ssh-key ~/.ssh/id_ed25519.pub \
|
||||||
|
--region us-east
|
||||||
|
```
|
||||||
|
|
||||||
|
### Deploy Container Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
synor compute service deploy \
|
||||||
|
--name my-api \
|
||||||
|
--image my-registry/my-api:latest \
|
||||||
|
--replicas 3 \
|
||||||
|
--cpu 2 \
|
||||||
|
--memory 4gb \
|
||||||
|
--port 8080 \
|
||||||
|
--health-check /health \
|
||||||
|
--autoscale 2-10
|
||||||
|
```
|
||||||
|
|
||||||
|
### Deploy Serverless Function
|
||||||
|
|
||||||
|
```bash
|
||||||
|
synor compute function deploy \
|
||||||
|
--name process-image \
|
||||||
|
--runtime python312 \
|
||||||
|
--handler main.handler \
|
||||||
|
--code ./function \
|
||||||
|
--memory 1024 \
|
||||||
|
--timeout 30000 \
|
||||||
|
--trigger http:/api/process
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Comparison with Existing Synor VM

| Feature | Current Synor VM | Synor Compute L2 |
|---------|------------------|------------------|
| Runtime | WASM only | WASM, Container, MicroVM |
| Timeout | 30 seconds | Unlimited (VMs) |
| Memory | 16 MB max | Up to 256 GB |
| GPU | ❌ | ✅ Full CUDA/ROCm |
| Networking | ❌ | ✅ Full TCP/UDP |
| File I/O | ❌ | ✅ Persistent volumes |
| Threading | ❌ | ✅ Multi-threaded |
| AI/ML | ❌ | ✅ Training + Inference |
| OS Hosting | ❌ | ✅ Full Linux/Windows |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
1. **Milestone 1**: Implement GPU node registration and attestation
|
||||||
|
2. **Milestone 2**: Build basic job scheduler with resource allocation
|
||||||
|
3. **Milestone 3**: Integrate containerd for container workloads
|
||||||
|
4. **Milestone 4**: Add Firecracker for MicroVM support
|
||||||
|
5. **Milestone 5**: Implement serverless function runtime
|
||||||
|
6. **Milestone 6**: Deploy edge nodes and CDN integration
|
||||||
|
|
||||||
|
This plan transforms Synor from a smart contract platform into a full-stack decentralized cloud provider capable of competing with AWS/GCP/Azure while maintaining decentralization and censorship resistance.
|
||||||
Loading…
Add table
Reference in a new issue