diff --git a/Cargo.toml b/Cargo.toml index a9b8da5..af65ff4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ members = [ "crates/synor-storage", "crates/synor-hosting", "crates/synor-database", + "crates/synor-compute", "crates/synor-governance", "crates/synor-rpc", "crates/synor-vm", diff --git a/crates/synor-compute/Cargo.toml b/crates/synor-compute/Cargo.toml new file mode 100644 index 0000000..07d0b3a --- /dev/null +++ b/crates/synor-compute/Cargo.toml @@ -0,0 +1,51 @@ +[package] +name = "synor-compute" +version.workspace = true +edition.workspace = true +description = "Heterogeneous multi-processor compute platform for Synor blockchain" +license.workspace = true + +[dependencies] +# Internal crates +synor-types = { path = "../synor-types" } +synor-crypto = { path = "../synor-crypto" } +synor-storage = { path = "../synor-storage" } + +# Serialization +serde.workspace = true +serde_json.workspace = true +borsh.workspace = true +bincode = "1.3" + +# Async runtime +tokio = { workspace = true, features = ["sync", "rt-multi-thread", "time", "macros"] } +async-trait = "0.1" +futures = "0.3" + +# Concurrency +parking_lot.workspace = true +crossbeam-deque = "0.8" +crossbeam-channel = "0.5" +dashmap = "5.5" + +# Utilities +thiserror.workspace = true +tracing.workspace = true +hex.workspace = true + +# Hashing +blake3.workspace = true + +# Data structures +indexmap = "2.2" +priority-queue = "2.0" + +# Time +chrono = { version = "0.4", features = ["serde"] } + +# Random +rand = "0.8" + +[dev-dependencies] +tempfile.workspace = true +tokio-test = "0.4" diff --git a/crates/synor-compute/src/device/mod.rs b/crates/synor-compute/src/device/mod.rs new file mode 100644 index 0000000..cb56bbd --- /dev/null +++ b/crates/synor-compute/src/device/mod.rs @@ -0,0 +1,377 @@ +//! Device registry and management. +//! +//! Supports all device types: +//! - Data center servers +//! - Desktop workstations +//! - Laptops +//! - Mobile devices (iOS, Android) +//! 
- Browsers (WebGPU, WASM) +//! - IoT devices + +use crate::error::ComputeError; +use crate::processor::{GenericProcessor, Processor, ProcessorCapabilities, ProcessorId, ProcessorType}; +use crate::{NodeId, ProcessorInfo}; +use parking_lot::RwLock; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::sync::Arc; + +/// Unique device identifier. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct DeviceId(pub [u8; 32]); + +impl DeviceId { + /// Creates a new random device ID. + pub fn new() -> Self { + use rand::Rng; + let mut bytes = [0u8; 32]; + rand::thread_rng().fill(&mut bytes); + DeviceId(bytes) + } + + /// Creates from bytes. + pub fn from_bytes(bytes: [u8; 32]) -> Self { + DeviceId(bytes) + } +} + +impl Default for DeviceId { + fn default() -> Self { + Self::new() + } +} + +impl std::fmt::Display for DeviceId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "dev_{}", hex::encode(&self.0[..8])) + } +} + +/// Device type classification. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum DeviceType { + /// Data center server. + DataCenter, + /// Desktop workstation. + Desktop, + /// Laptop. + Laptop, + /// Mobile phone. + Mobile, + /// Tablet. + Tablet, + /// IoT device. + IoT, + /// Browser (WebGPU/WASM). + Browser, + /// Edge server. + Edge, +} + +impl DeviceType { + /// Returns typical reliability score (0-100). + pub fn reliability(&self) -> u32 { + match self { + DeviceType::DataCenter => 99, + DeviceType::Edge => 95, + DeviceType::Desktop => 80, + DeviceType::Laptop => 60, + DeviceType::Mobile => 40, + DeviceType::Tablet => 50, + DeviceType::IoT => 70, + DeviceType::Browser => 30, + } + } + + /// Returns typical availability hours per day. 
+    pub fn availability_hours(&self) -> f32 {
+        match self {
+            DeviceType::DataCenter => 24.0,
+            DeviceType::Edge => 24.0,
+            DeviceType::Desktop => 8.0,
+            DeviceType::Laptop => 6.0,
+            DeviceType::Mobile => 4.0,
+            DeviceType::Tablet => 4.0,
+            DeviceType::IoT => 24.0,
+            DeviceType::Browser => 2.0,
+        }
+    }
+}
+
+/// Device capabilities.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct DeviceCapabilities {
+    /// Device type.
+    pub device_type: DeviceType,
+    /// Available processors.
+    pub processors: Vec<ProcessorType>,
+    /// Total memory (GB).
+    pub memory_gb: f32,
+    /// Network bandwidth (Mbps).
+    pub bandwidth_mbps: f32,
+    /// Storage available (GB).
+    pub storage_gb: f32,
+    /// Battery powered.
+    pub battery_powered: bool,
+    /// Supports background execution.
+    pub background_execution: bool,
+}
+
+/// Device information.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct DeviceInfo {
+    /// Device ID.
+    pub id: DeviceId,
+    /// Device type.
+    pub device_type: DeviceType,
+    /// Owner address.
+    pub owner: [u8; 32],
+    /// Capabilities.
+    pub capabilities: DeviceCapabilities,
+    /// Current status.
+    pub status: DeviceStatus,
+    /// Reputation score (0-100).
+    pub reputation: u32,
+    /// Total earnings (atomic SYNOR).
+    pub earnings: u64,
+    /// Geographic region.
+    pub region: String,
+}
+
+/// Device status.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
+pub enum DeviceStatus {
+    /// Online and available.
+    Online,
+    /// Online but busy.
+    Busy,
+    /// Idle but available.
+    Idle,
+    /// On battery (reduced capacity).
+    OnBattery,
+    /// Offline.
+    Offline,
+    /// Maintenance.
+    Maintenance,
+}
+
+/// Device registry managing all devices and processors.
+pub struct DeviceRegistry {
+    /// Registered devices.
+    devices: RwLock<HashMap<DeviceId, DeviceInfo>>,
+    /// Node to device mapping.
+    node_devices: RwLock<HashMap<NodeId, Vec<DeviceId>>>,
+    /// All processors (across all nodes).
+    processors: RwLock<HashMap<ProcessorId, Arc<dyn Processor>>>,
+    /// Processor to node mapping.
+    processor_nodes: RwLock<HashMap<ProcessorId, NodeId>>,
+    /// Next processor ID.
+    next_processor_id: std::sync::atomic::AtomicU64,
+}
+
+impl DeviceRegistry {
+    /// Creates a new device registry.
+    pub fn new() -> Self {
+        Self {
+            devices: RwLock::new(HashMap::new()),
+            node_devices: RwLock::new(HashMap::new()),
+            processors: RwLock::new(HashMap::new()),
+            processor_nodes: RwLock::new(HashMap::new()),
+            next_processor_id: std::sync::atomic::AtomicU64::new(0),
+        }
+    }
+
+    /// Registers a device.
+    pub fn register_device(&self, device: DeviceInfo) -> Result<DeviceId, ComputeError> {
+        let id = device.id;
+        self.devices.write().insert(id, device);
+        Ok(id)
+    }
+
+    /// Unregisters a device.
+    pub fn unregister_device(&self, device_id: DeviceId) -> Result<(), ComputeError> {
+        self.devices.write().remove(&device_id);
+        Ok(())
+    }
+
+    /// Gets a device by ID.
+    pub fn get_device(&self, device_id: DeviceId) -> Option<DeviceInfo> {
+        self.devices.read().get(&device_id).cloned()
+    }
+
+    /// Registers a processor for a node.
+    pub fn register_processor(
+        &self,
+        node_id: NodeId,
+        info: ProcessorInfo,
+    ) -> Result<(), ComputeError> {
+        let processor_id = info.id;
+
+        // Create a generic processor from the info
+        let processor: Arc<dyn Processor> = Arc::new(GenericProcessor::new(
+            processor_id,
+            info.processor_type,
+            info.capabilities,
+        ));
+
+        self.processors.write().insert(processor_id, processor);
+        self.processor_nodes.write().insert(processor_id, node_id);
+
+        Ok(())
+    }
+
+    /// Unregisters all processors for a node.
+    pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
+        let mut processors = self.processors.write();
+        let mut processor_nodes = self.processor_nodes.write();
+
+        // Find and remove all processors for this node
+        let to_remove: Vec<_> = processor_nodes
+            .iter()
+            .filter(|(_, n)| **n == node_id)
+            .map(|(p, _)| *p)
+            .collect();
+
+        for proc_id in to_remove {
+            processors.remove(&proc_id);
+            processor_nodes.remove(&proc_id);
+        }
+
+        Ok(())
+    }
+
+    /// Gets a processor by ID.
+    pub fn get_processor(&self, processor_id: ProcessorId) -> Result<Arc<dyn Processor>, ComputeError> {
+        self.processors
+            .read()
+            .get(&processor_id)
+            .cloned()
+            .ok_or(ComputeError::ProcessorNotFound(processor_id))
+    }
+
+    /// Gets all processors.
+    pub fn all_processors(&self) -> Vec<Arc<dyn Processor>> {
+        self.processors.read().values().cloned().collect()
+    }
+
+    /// Gets processors of a specific type.
+    pub fn processors_by_type(&self, proc_type: ProcessorType) -> Vec<Arc<dyn Processor>> {
+        self.processors
+            .read()
+            .values()
+            .filter(|p| p.processor_type() == proc_type)
+            .cloned()
+            .collect()
+    }
+
+    /// Gets the next processor ID.
+    pub fn next_processor_id(&self) -> ProcessorId {
+        ProcessorId(self.next_processor_id.fetch_add(1, std::sync::atomic::Ordering::SeqCst))
+    }
+
+    /// Gets total number of devices.
+    pub fn device_count(&self) -> usize {
+        self.devices.read().len()
+    }
+
+    /// Gets total number of processors.
+    pub fn processor_count(&self) -> usize {
+        self.processors.read().len()
+    }
+
+    /// Gets devices by type.
+    pub fn devices_by_type(&self, device_type: DeviceType) -> Vec<DeviceInfo> {
+        self.devices
+            .read()
+            .values()
+            .filter(|d| d.device_type == device_type)
+            .cloned()
+            .collect()
+    }
+
+    /// Gets online devices.
+    pub fn online_devices(&self) -> Vec<DeviceInfo> {
+        self.devices
+            .read()
+            .values()
+            .filter(|d| d.status == DeviceStatus::Online || d.status == DeviceStatus::Idle)
+            .cloned()
+            .collect()
+    }
+
+    /// Updates device status.
+ pub fn update_device_status( + &self, + device_id: DeviceId, + status: DeviceStatus, + ) -> Result<(), ComputeError> { + if let Some(device) = self.devices.write().get_mut(&device_id) { + device.status = status; + Ok(()) + } else { + Err(ComputeError::Internal(format!("Device not found: {}", device_id))) + } + } +} + +impl Default for DeviceRegistry { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::processor::{CpuVariant, AvxSupport}; + + #[test] + fn test_device_id() { + let id1 = DeviceId::new(); + let id2 = DeviceId::new(); + assert_ne!(id1.0, id2.0); + } + + #[test] + fn test_device_registry() { + let registry = DeviceRegistry::new(); + + let device = DeviceInfo { + id: DeviceId::new(), + device_type: DeviceType::Desktop, + owner: [1u8; 32], + capabilities: DeviceCapabilities { + device_type: DeviceType::Desktop, + processors: vec![ProcessorType::Cpu(CpuVariant::X86_64 { + avx: AvxSupport::Avx512, + })], + memory_gb: 64.0, + bandwidth_mbps: 1000.0, + storage_gb: 1000.0, + battery_powered: false, + background_execution: true, + }, + status: DeviceStatus::Online, + reputation: 100, + earnings: 0, + region: "us-east".to_string(), + }; + + let device_id = device.id; + registry.register_device(device).unwrap(); + + assert_eq!(registry.device_count(), 1); + assert!(registry.get_device(device_id).is_some()); + + registry.unregister_device(device_id).unwrap(); + assert_eq!(registry.device_count(), 0); + } + + #[test] + fn test_device_type_properties() { + assert_eq!(DeviceType::DataCenter.reliability(), 99); + assert_eq!(DeviceType::Mobile.reliability(), 40); + assert_eq!(DeviceType::DataCenter.availability_hours(), 24.0); + assert_eq!(DeviceType::Browser.availability_hours(), 2.0); + } +} diff --git a/crates/synor-compute/src/error.rs b/crates/synor-compute/src/error.rs new file mode 100644 index 0000000..33a34ee --- /dev/null +++ b/crates/synor-compute/src/error.rs @@ -0,0 +1,92 @@ +//! 
Error types for Synor Compute. + +use crate::{JobId, NodeId, ProcessorId, ProcessorType}; +use thiserror::Error; + +/// Compute errors. +#[derive(Debug, Error)] +pub enum ComputeError { + /// Job not found. + #[error("Job not found: {0}")] + JobNotFound(JobId), + + /// Node not found. + #[error("Node not found: {0}")] + NodeNotFound(NodeId), + + /// Processor not found. + #[error("Processor not found: {0}")] + ProcessorNotFound(ProcessorId), + + /// No suitable processor for operation. + #[error("No suitable processor for operation: {0}")] + NoSuitableProcessor(String), + + /// Insufficient resources. + #[error("Insufficient resources: {0}")] + InsufficientResources(String), + + /// Task execution failed. + #[error("Task execution failed: {0}")] + TaskExecutionFailed(String), + + /// Scheduling failed. + #[error("Scheduling failed: {0}")] + SchedulingFailed(String), + + /// Memory allocation failed. + #[error("Memory allocation failed: {0}")] + MemoryAllocationFailed(String), + + /// Data transfer failed. + #[error("Data transfer failed: {0}")] + DataTransferFailed(String), + + /// Processor type not supported. + #[error("Processor type not supported: {0:?}")] + ProcessorTypeNotSupported(ProcessorType), + + /// Operation not supported on processor. + #[error("Operation not supported on {0:?}: {1}")] + OperationNotSupported(ProcessorType, String), + + /// Timeout. + #[error("Operation timed out after {0}ms")] + Timeout(u64), + + /// Budget exceeded. + #[error("Budget exceeded: required {required}, available {available}")] + BudgetExceeded { required: u64, available: u64 }, + + /// Node already registered. + #[error("Node already registered: {0}")] + NodeAlreadyRegistered(NodeId), + + /// Invalid configuration. + #[error("Invalid configuration: {0}")] + InvalidConfiguration(String), + + /// Serialization error. + #[error("Serialization error: {0}")] + Serialization(String), + + /// Network error. 
+ #[error("Network error: {0}")] + Network(String), + + /// Internal error. + #[error("Internal error: {0}")] + Internal(String), +} + +impl From for ComputeError { + fn from(err: bincode::Error) -> Self { + ComputeError::Serialization(err.to_string()) + } +} + +impl From for ComputeError { + fn from(err: serde_json::Error) -> Self { + ComputeError::Serialization(err.to_string()) + } +} diff --git a/crates/synor-compute/src/lib.rs b/crates/synor-compute/src/lib.rs new file mode 100644 index 0000000..6baef4c --- /dev/null +++ b/crates/synor-compute/src/lib.rs @@ -0,0 +1,631 @@ +//! Synor Compute L2 - Heterogeneous Multi-Processor Compute Platform +//! +//! Provides decentralized compute services with: +//! +//! - **Heterogeneous Scheduling**: CPU + GPU + TPU + NPU + LPU working simultaneously +//! - **Consumer Device Mesh**: Mobile, browser, desktop devices contributing compute +//! - **90% Cost Reduction**: Zero margins, spot markets, electricity arbitrage +//! - **10x Speed**: Caching, speculative execution, optimal processor assignment +//! +//! # Architecture +//! +//! ```text +//! ┌─────────────────────────────────────────────────────────────────────────────┐ +//! │ SYNOR COMPUTE L2 │ +//! ├─────────────────────────────────────────────────────────────────────────────┤ +//! │ │ +//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │ +//! │ │ TASK DECOMPOSER │ │ +//! │ │ Analyzes workload → Identifies subtasks → Maps to optimal processors │ │ +//! │ └─────────────────────────────────────────────────────────────────────────┘ │ +//! │ │ │ +//! │ ▼ │ +//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │ +//! │ │ HETEROGENEOUS SCHEDULER │ │ +//! │ │ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ │ +//! │ │ │ CPU │ │ GPU │ │ TPU │ │ NPU │ │ LPU │ │Custom│ │ │ +//! │ │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │ │ +//! │ │ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ │ │ +//! 
│ └─────────────────────────────────────────────────────────────────────────┘ │ +//! │ │ +//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │ +//! │ │ UNIFIED MEMORY FABRIC │ │ +//! │ │ Zero-copy data sharing │ Automatic placement │ Cache coherency │ │ +//! │ └─────────────────────────────────────────────────────────────────────────┘ │ +//! │ │ +//! └─────────────────────────────────────────────────────────────────────────────┘ +//! ``` +//! +//! # Pricing +//! +//! | Resource | Unit | Price (SYNOR) | +//! |----------|------|---------------| +//! | GPU (consumer) | hour | 0.10 | +//! | GPU (datacenter) | hour | 0.50-4.00 | +//! | CPU | core/hour | 0.02 | +//! | Memory | GB/hour | 0.005 | +//! | Inference | 1M tokens | 0.10 | + +#![allow(dead_code)] + +pub mod device; +pub mod error; +pub mod market; +pub mod memory; +pub mod processor; +pub mod scheduler; +pub mod task; + +pub use device::{ + DeviceCapabilities, DeviceId, DeviceInfo, DeviceRegistry, DeviceStatus, DeviceType, +}; +pub use error::ComputeError; +pub use market::{ + Auction, AuctionId, CloudComparison, CpuTier as MarketCpuTier, GpuTier as MarketGpuTier, + MarketStats, Order, OrderBook, OrderId, OrderSide, OrderType, PricingEngine, ProviderListing, + ResourceType, SpotMarket, Trade, +}; +pub use memory::{MemoryManager, TensorHandle, TransferPath, UnifiedMemory}; +pub use processor::{ + ComputeThroughput, CpuVariant, GpuVariant, NpuVariant, Operation, OperationType, Processor, + ProcessorCapabilities, ProcessorId, ProcessorType, TpuVersion, +}; +pub use scheduler::{ + HeterogeneousScheduler, LoadBalancer, Schedule, ScheduleResult, TaskAssignment, WorkQueue, +}; +pub use task::{ + ComputeTask, DecomposedWorkload, Task, TaskDecomposer, TaskId, TaskPriority, TaskResult, + TaskStatus, +}; + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::sync::Arc; + +use parking_lot::RwLock; + +/// Compute node identifier. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub struct NodeId(pub u64);
+
+impl std::fmt::Display for NodeId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "node_{}", self.0)
+    }
+}
+
+/// Job identifier.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub struct JobId(pub [u8; 32]);
+
+impl JobId {
+    /// Creates a new job ID.
+    pub fn new() -> Self {
+        use rand::Rng;
+        let mut bytes = [0u8; 32];
+        rand::thread_rng().fill(&mut bytes);
+        JobId(bytes)
+    }
+
+    /// Creates from bytes.
+    pub fn from_bytes(bytes: [u8; 32]) -> Self {
+        JobId(bytes)
+    }
+}
+
+impl Default for JobId {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl std::fmt::Display for JobId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "job_{}", hex::encode(&self.0[..8]))
+    }
+}
+
+/// Compute job specification.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ComputeJob {
+    /// Job ID.
+    pub id: JobId,
+    /// Owner address.
+    pub owner: [u8; 32],
+    /// Job type.
+    pub job_type: JobType,
+    /// Resource requirements.
+    pub resources: ResourceRequirements,
+    /// Input data reference (CID).
+    pub input_cid: Option<String>,
+    /// Maximum budget (in atomic SYNOR).
+    pub max_budget: u64,
+    /// Priority level.
+    pub priority: JobPriority,
+    /// Created timestamp.
+    pub created_at: u64,
+    /// Deadline (optional).
+    pub deadline: Option<u64>,
+}
+
+/// Job type classification.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub enum JobType {
+    /// AI/ML training job.
+    Training {
+        framework: MlFramework,
+        model_cid: String,
+        dataset_cid: String,
+        epochs: u32,
+        batch_size: u32,
+    },
+    /// AI/ML inference job.
+    Inference {
+        model_cid: String,
+        input_format: String,
+        batch_size: u32,
+    },
+    /// Container workload.
+    Container {
+        image: String,
+        command: Vec<String>,
+        env: HashMap<String, String>,
+    },
+    /// Serverless function.
+    Serverless {
+        runtime: FunctionRuntime,
+        code_cid: String,
+        handler: String,
+    },
+    /// General compute (WASM).
+    Wasm {
+        module_cid: String,
+        entrypoint: String,
+    },
+}
+
+/// ML framework specification.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub enum MlFramework {
+    PyTorch { version: String },
+    TensorFlow { version: String },
+    JAX { version: String },
+    ONNX,
+}
+
+/// Function runtime.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub enum FunctionRuntime {
+    Node20,
+    Python312,
+    Rust,
+    Go,
+    Custom { image: String },
+}
+
+/// Job priority levels.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
+pub enum JobPriority {
+    /// Background job, can be preempted.
+    Background = 0,
+    /// Normal priority.
+    Normal = 1,
+    /// High priority, faster scheduling.
+    High = 2,
+    /// Critical, guaranteed resources.
+    Critical = 3,
+}
+
+impl Default for JobPriority {
+    fn default() -> Self {
+        JobPriority::Normal
+    }
+}
+
+/// Resource requirements for a job.
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
+pub struct ResourceRequirements {
+    /// Minimum CPU cores.
+    pub min_cpu_cores: f32,
+    /// Minimum memory (GB).
+    pub min_memory_gb: f32,
+    /// GPU requirements.
+    pub gpu: Option<GpuRequirements>,
+    /// Preferred processor types (in priority order).
+    pub preferred_processors: Vec<ProcessorType>,
+    /// Maximum latency (ms) - for inference.
+    pub max_latency_ms: Option<u64>,
+    /// Requires distributed execution.
+    pub distributed: bool,
+}
+
+/// GPU resource requirements.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct GpuRequirements {
+    /// Minimum number of GPUs.
+    pub min_count: u32,
+    /// Maximum number of GPUs.
+    pub max_count: u32,
+    /// Minimum VRAM per GPU (GB).
+    pub min_vram_gb: u32,
+    /// Minimum compute capability.
+    pub min_compute_capability: Option<(u8, u8)>,
+    /// Allow GPU sharing (MPS/MIG).
+    pub allow_sharing: bool,
+}
+
+/// Job execution status.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub enum JobStatus {
+    /// Queued, waiting for resources.
+    Queued,
+    /// Resources allocated, starting.
+    Starting,
+    /// Running.
+    Running {
+        progress: f32,
+        assigned_nodes: Vec<NodeId>,
+    },
+    /// Completed successfully.
+    Completed {
+        result_cid: String,
+        duration_ms: u64,
+        cost: u64,
+    },
+    /// Failed.
+    Failed { error: String },
+    /// Cancelled by user.
+    Cancelled,
+}
+
+/// Compute node registration.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ComputeNode {
+    /// Node ID.
+    pub id: NodeId,
+    /// Owner address.
+    pub owner: [u8; 32],
+    /// Available processors.
+    pub processors: Vec<ProcessorInfo>,
+    /// Total memory (GB).
+    pub total_memory_gb: f32,
+    /// Available memory (GB).
+    pub available_memory_gb: f32,
+    /// Network bandwidth (Gbps).
+    pub bandwidth_gbps: f32,
+    /// Geographic region.
+    pub region: String,
+    /// Stake amount (for PoS).
+    pub stake: u64,
+    /// Reputation score (0-100).
+    pub reputation: u32,
+    /// Current status.
+    pub status: NodeStatus,
+}
+
+/// Processor information on a node.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ProcessorInfo {
+    /// Processor ID (local to node).
+    pub id: ProcessorId,
+    /// Processor type.
+    pub processor_type: ProcessorType,
+    /// Capabilities.
+    pub capabilities: ProcessorCapabilities,
+    /// Current utilization (0.0 - 1.0).
+    pub utilization: f32,
+    /// Current temperature (Celsius).
+    pub temperature: Option<f32>,
+}
+
+/// Node status.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
+pub enum NodeStatus {
+    /// Online and accepting jobs.
+    Online,
+    /// Online but not accepting new jobs.
+    Draining,
+    /// Offline.
+    Offline,
+    /// Maintenance mode.
+    Maintenance,
+}
+
+/// Compute cluster manager.
+pub struct ComputeCluster {
+    /// Registered nodes.
+    nodes: RwLock<HashMap<NodeId, ComputeNode>>,
+    /// Device registry.
+    device_registry: Arc<DeviceRegistry>,
+    /// Heterogeneous scheduler.
+    scheduler: Arc<HeterogeneousScheduler>,
+    /// Spot market.
+    spot_market: Arc<SpotMarket>,
+    /// Memory manager.
+    memory_manager: Arc<MemoryManager>,
+    /// Active jobs.
+    jobs: RwLock<HashMap<JobId, ComputeJob>>,
+}
+
+impl ComputeCluster {
+    /// Creates a new compute cluster.
+    pub fn new() -> Self {
+        let device_registry = Arc::new(DeviceRegistry::new());
+        let scheduler = Arc::new(HeterogeneousScheduler::new(device_registry.clone()));
+        let spot_market = Arc::new(SpotMarket::new());
+        let memory_manager = Arc::new(MemoryManager::new());
+
+        Self {
+            nodes: RwLock::new(HashMap::new()),
+            device_registry,
+            scheduler,
+            spot_market,
+            memory_manager,
+            jobs: RwLock::new(HashMap::new()),
+        }
+    }
+
+    /// Registers a compute node.
+    pub fn register_node(&self, node: ComputeNode) -> Result<(), ComputeError> {
+        let id = node.id;
+
+        // Register processors with device registry
+        for proc in &node.processors {
+            self.device_registry.register_processor(id, proc.clone())?;
+        }
+
+        self.nodes.write().insert(id, node);
+        Ok(())
+    }
+
+    /// Unregisters a compute node.
+    pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
+        self.device_registry.unregister_node(node_id)?;
+        self.nodes.write().remove(&node_id);
+        Ok(())
+    }
+
+    /// Submits a job for execution.
+    pub async fn submit_job(&self, job: ComputeJob) -> Result<JobId, ComputeError> {
+        let job_id = job.id;
+
+        // Decompose job into tasks
+        let tasks = self.decompose_job(&job)?;
+
+        // Schedule tasks
+        let schedule = self.scheduler.schedule(tasks).await?;
+
+        // Store job
+        self.jobs.write().insert(job_id, job);
+
+        // Execute schedule (async)
+        tokio::spawn({
+            let scheduler = self.scheduler.clone();
+            async move {
+                let _ = scheduler.execute(&schedule.schedule).await;
+            }
+        });
+
+        Ok(job_id)
+    }
+
+    /// Gets job status.
+    pub fn get_job_status(&self, job_id: &JobId) -> Option<JobStatus> {
+        self.jobs.read().get(job_id).map(|_| JobStatus::Queued)
+    }
+
+    /// Cancels a job.
+    pub fn cancel_job(&self, job_id: &JobId) -> Result<(), ComputeError> {
+        if self.jobs.write().remove(job_id).is_some() {
+            Ok(())
+        } else {
+            Err(ComputeError::JobNotFound(*job_id))
+        }
+    }
+
+    /// Gets cluster statistics.
+    pub fn stats(&self) -> ClusterStats {
+        let nodes = self.nodes.read();
+        let jobs = self.jobs.read();
+
+        let total_nodes = nodes.len();
+        let online_nodes = nodes.values().filter(|n| n.status == NodeStatus::Online).count();
+
+        let total_gpus: usize = nodes
+            .values()
+            .flat_map(|n| &n.processors)
+            .filter(|p| matches!(p.processor_type, ProcessorType::Gpu(_)))
+            .count();
+
+        let total_memory: f32 = nodes.values().map(|n| n.total_memory_gb).sum();
+
+        ClusterStats {
+            total_nodes,
+            online_nodes,
+            total_gpus,
+            total_memory_gb: total_memory,
+            active_jobs: jobs.len(),
+            queued_jobs: jobs.values().filter(|_| true).count(), // Simplified
+        }
+    }
+
+    /// Decomposes a job into schedulable tasks.
+    fn decompose_job(&self, job: &ComputeJob) -> Result<Vec<ComputeTask>, ComputeError> {
+        let decomposer = TaskDecomposer::new();
+        decomposer.decompose(job)
+    }
+}
+
+impl Default for ComputeCluster {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Cluster statistics.
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
+pub struct ClusterStats {
+    /// Total registered nodes.
+    pub total_nodes: usize,
+    /// Online nodes.
+    pub online_nodes: usize,
+    /// Total GPUs across cluster.
+    pub total_gpus: usize,
+    /// Total memory (GB).
+    pub total_memory_gb: f32,
+    /// Active jobs.
+    pub active_jobs: usize,
+    /// Queued jobs.
+    pub queued_jobs: usize,
+}
+
+/// Pricing calculator for compute operations.
+#[derive(Clone, Debug)]
+pub struct ComputePricing {
+    /// GPU cost per hour by type.
+    pub gpu_hourly: HashMap<GpuTier, u64>,
+    /// CPU cost per core-hour.
+    pub cpu_core_hour: u64,
+    /// Memory cost per GB-hour.
+    pub memory_gb_hour: u64,
+    /// Network egress per GB.
+    pub network_egress_gb: u64,
+    /// Inference per million tokens.
+ pub inference_per_million_tokens: u64, +} + +/// GPU pricing tiers. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum GpuTier { + /// Consumer GPUs (RTX 30xx, 40xx). + Consumer, + /// Professional GPUs (RTX A series). + Professional, + /// Data center GPUs (A100). + DataCenter, + /// Latest generation (H100). + Premium, +} + +impl Default for ComputePricing { + fn default() -> Self { + let mut gpu_hourly = HashMap::new(); + gpu_hourly.insert(GpuTier::Consumer, 100_000_000); // 0.10 SYNOR + gpu_hourly.insert(GpuTier::Professional, 300_000_000); // 0.30 SYNOR + gpu_hourly.insert(GpuTier::DataCenter, 2_000_000_000); // 2.00 SYNOR + gpu_hourly.insert(GpuTier::Premium, 4_000_000_000); // 4.00 SYNOR + + Self { + gpu_hourly, + cpu_core_hour: 20_000_000, // 0.02 SYNOR + memory_gb_hour: 5_000_000, // 0.005 SYNOR + network_egress_gb: 50_000_000, // 0.05 SYNOR + inference_per_million_tokens: 100_000_000, // 0.10 SYNOR + } + } +} + +impl ComputePricing { + /// Estimates cost for a job. 
+ pub fn estimate(&self, job: &ComputeJob, duration_hours: f32) -> u64 { + let mut cost = 0u64; + + // CPU cost + cost += (self.cpu_core_hour as f32 * job.resources.min_cpu_cores * duration_hours) as u64; + + // Memory cost + cost += (self.memory_gb_hour as f32 * job.resources.min_memory_gb * duration_hours) as u64; + + // GPU cost + if let Some(gpu) = &job.resources.gpu { + let tier = GpuTier::Consumer; // Simplified + let gpu_cost = self.gpu_hourly.get(&tier).unwrap_or(&100_000_000); + cost += (*gpu_cost as f32 * gpu.min_count as f32 * duration_hours) as u64; + } + + cost + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_job_id() { + let id1 = JobId::new(); + let id2 = JobId::new(); + assert_ne!(id1.0, id2.0); + } + + #[test] + fn test_compute_cluster() { + let cluster = ComputeCluster::new(); + let stats = cluster.stats(); + assert_eq!(stats.total_nodes, 0); + } + + #[test] + fn test_pricing() { + let pricing = ComputePricing::default(); + + let job = ComputeJob { + id: JobId::new(), + owner: [0u8; 32], + job_type: JobType::Inference { + model_cid: "model123".to_string(), + input_format: "json".to_string(), + batch_size: 32, + }, + resources: ResourceRequirements { + min_cpu_cores: 4.0, + min_memory_gb: 16.0, + gpu: Some(GpuRequirements { + min_count: 1, + max_count: 1, + min_vram_gb: 16, + min_compute_capability: None, + allow_sharing: false, + }), + ..Default::default() + }, + input_cid: None, + max_budget: 1_000_000_000, + priority: JobPriority::Normal, + created_at: 0, + deadline: None, + }; + + let cost = pricing.estimate(&job, 1.0); + assert!(cost > 0); + } + + #[test] + fn test_node_registration() { + let cluster = ComputeCluster::new(); + + let node = ComputeNode { + id: NodeId(1), + owner: [1u8; 32], + processors: vec![ProcessorInfo { + id: ProcessorId(0), + processor_type: ProcessorType::Cpu(CpuVariant::X86_64 { + avx: processor::AvxSupport::Avx512, + }), + capabilities: ProcessorCapabilities::default(), + utilization: 0.0, + 
temperature: Some(45.0), + }], + total_memory_gb: 64.0, + available_memory_gb: 60.0, + bandwidth_gbps: 10.0, + region: "us-east".to_string(), + stake: 1000, + reputation: 100, + status: NodeStatus::Online, + }; + + cluster.register_node(node).unwrap(); + assert_eq!(cluster.stats().total_nodes, 1); + } +} diff --git a/crates/synor-compute/src/market/mod.rs b/crates/synor-compute/src/market/mod.rs new file mode 100644 index 0000000..d7300b2 --- /dev/null +++ b/crates/synor-compute/src/market/mod.rs @@ -0,0 +1,1151 @@ +//! Spot market and pricing engine for compute resources. +//! +//! Implements real-time pricing based on supply/demand, geographic arbitrage, +//! and auction-based compute allocation for 90% cost reduction vs cloud providers. + +use crate::error::ComputeError; +use crate::processor::ProcessorType; +use crate::{NodeId, ProcessorId}; +use parking_lot::RwLock; +use serde::{Deserialize, Serialize}; +use std::collections::{BinaryHeap, HashMap}; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + +/// Unique order identifier. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct OrderId(pub u64); + +impl OrderId { + /// Creates a new order ID. + pub fn new() -> Self { + use rand::Rng; + OrderId(rand::thread_rng().gen()) + } +} + +impl Default for OrderId { + fn default() -> Self { + Self::new() + } +} + +/// Order side (buy or sell). +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum OrderSide { + /// Buying compute resources. + Buy, + /// Selling compute resources. + Sell, +} + +/// Order type. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum OrderType { + /// Market order - execute at best available price. + Market, + /// Limit order - execute only at specified price or better. + Limit, + /// Fill or kill - execute entirely or cancel. + FillOrKill, + /// Immediate or cancel - execute as much as possible immediately. 
+    ImmediateOrCancel,
+}
+
+/// Resource type being traded.
+#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum ResourceType {
+    /// GPU compute hours.
+    GpuHours(GpuTier),
+    /// CPU compute hours.
+    CpuHours(CpuTier),
+    /// TPU compute hours.
+    TpuHours,
+    /// NPU compute hours.
+    NpuHours,
+    /// LPU inference credits.
+    LpuCredits,
+    /// Memory GB-hours.
+    MemoryGbHours,
+    /// Network bandwidth GB.
+    NetworkGb,
+    /// Storage GB-hours.
+    StorageGbHours,
+}
+
+/// GPU performance tier.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum GpuTier {
+    /// Entry level (RTX 3060, etc.).
+    Entry,
+    /// Mid-range (RTX 4080, A4000, etc.).
+    Mid,
+    /// High-end (RTX 4090, A6000, etc.).
+    High,
+    /// Data center (A100, H100, etc.).
+    DataCenter,
+    /// Ultra (H100 SXM, B200, etc.).
+    Ultra,
+}
+
+/// CPU performance tier.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum CpuTier {
+    /// Mobile/ARM.
+    Mobile,
+    /// Desktop.
+    Desktop,
+    /// Workstation.
+    Workstation,
+    /// Server.
+    Server,
+}
+
+/// A market order for compute resources.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct Order {
+    /// Unique order ID.
+    pub id: OrderId,
+    /// Node placing the order.
+    pub node_id: NodeId,
+    /// Buy or sell.
+    pub side: OrderSide,
+    /// Order type.
+    pub order_type: OrderType,
+    /// Resource being traded.
+    pub resource: ResourceType,
+    /// Quantity in resource units.
+    pub quantity: f64,
+    /// Price per unit in Synor credits.
+    pub price: f64,
+    /// Remaining unfilled quantity.
+    pub remaining: f64,
+    /// Timestamp.
+    pub timestamp: u64,
+    /// Expiration (None = good till cancelled).
+    pub expires_at: Option<u64>,
+    /// Geographic region preference.
+    pub region: Option<String>,
+}
+
+impl Order {
+    /// Creates a new order.
+ pub fn new( + node_id: NodeId, + side: OrderSide, + order_type: OrderType, + resource: ResourceType, + quantity: f64, + price: f64, + ) -> Self { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(); + + Self { + id: OrderId::new(), + node_id, + side, + order_type, + resource, + quantity, + price, + remaining: quantity, + timestamp: now, + expires_at: None, + region: None, + } + } + + /// Sets expiration time. + pub fn with_expiration(mut self, expires_at: u64) -> Self { + self.expires_at = Some(expires_at); + self + } + + /// Sets region preference. + pub fn with_region(mut self, region: String) -> Self { + self.region = Some(region); + self + } + + /// Checks if order is expired. + pub fn is_expired(&self) -> bool { + if let Some(expires) = self.expires_at { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(); + now >= expires + } else { + false + } + } + + /// Checks if order is fully filled. + pub fn is_filled(&self) -> bool { + self.remaining <= 0.0 + } +} + +/// Order wrapper for priority queue (bid side - max heap). +#[derive(Clone, Debug)] +struct BidOrder(Arc>); + +impl PartialEq for BidOrder { + fn eq(&self, other: &Self) -> bool { + let a = self.0.read(); + let b = other.0.read(); + a.price == b.price && a.timestamp == b.timestamp + } +} + +impl Eq for BidOrder {} + +impl PartialOrd for BidOrder { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for BidOrder { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + let a = self.0.read(); + let b = other.0.read(); + // Higher price first, then earlier timestamp + match a.price.partial_cmp(&b.price) { + Some(std::cmp::Ordering::Equal) => b.timestamp.cmp(&a.timestamp), + Some(ord) => ord, + None => std::cmp::Ordering::Equal, + } + } +} + +/// Order wrapper for priority queue (ask side - min heap). 
#[derive(Clone, Debug)]
struct AskOrder(Arc<RwLock<Order>>);

impl PartialEq for AskOrder {
    fn eq(&self, other: &Self) -> bool {
        let (lhs, rhs) = (self.0.read(), other.0.read());
        lhs.price == rhs.price && lhs.timestamp == rhs.timestamp
    }
}

impl Eq for AskOrder {}

impl PartialOrd for AskOrder {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for AskOrder {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        use std::cmp::Ordering;
        let (lhs, rhs) = (self.0.read(), other.0.read());
        // Min-heap via reversed price comparison: the *lowest* ask surfaces
        // first; price ties go to the earlier timestamp.
        match rhs.price.partial_cmp(&lhs.price) {
            Some(Ordering::Equal) => rhs.timestamp.cmp(&lhs.timestamp),
            Some(order) => order,
            // NaN prices are treated as equal rather than panicking.
            None => Ordering::Equal,
        }
    }
}

/// Order book for a single resource type, matched with price-time priority.
pub struct OrderBook {
    /// Resource type this book handles.
    resource: ResourceType,
    /// Resting buy orders (max-heap: best bid on top).
    bids: RwLock<BinaryHeap<BidOrder>>,
    /// Resting sell orders (min-heap: best ask on top).
    asks: RwLock<BinaryHeap<AskOrder>>,
    /// Lookup of every resting order by id.
    orders: RwLock<HashMap<OrderId, Arc<RwLock<Order>>>>,
    /// Price of the most recent trade, if any.
    last_price: RwLock<Option<f64>>,
    /// Chronological record of executed trades.
    trades: RwLock<Vec<Trade>>,
}

/// A completed trade.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Trade {
    /// Buyer order ID.
    pub buyer_order: OrderId,
    /// Seller order ID.
    pub seller_order: OrderId,
    /// Buyer node.
    pub buyer: NodeId,
    /// Seller node.
    pub seller: NodeId,
    /// Trade price.
    pub price: f64,
    /// Trade quantity.
    pub quantity: f64,
    /// Timestamp.
    pub timestamp: u64,
}

impl OrderBook {
    /// Creates an empty book for the given resource type.
    pub fn new(resource: ResourceType) -> Self {
        Self {
            resource,
            bids: RwLock::new(BinaryHeap::new()),
            asks: RwLock::new(BinaryHeap::new()),
            orders: RwLock::new(HashMap::new()),
            last_price: RwLock::new(None),
            trades: RwLock::new(Vec::new()),
        }
    }

    /// Submits an order and attempts to match.
+ pub fn submit(&self, order: Order) -> Result, ComputeError> { + let order = Arc::new(RwLock::new(order)); + let mut trades = Vec::new(); + + // Try to match order + match order.read().side { + OrderSide::Buy => { + trades = self.match_buy(&order); + } + OrderSide::Sell => { + trades = self.match_sell(&order); + } + } + + // Add remaining to book if not filled + let (is_filled, is_ioc, id, side) = { + let guard = order.read(); + ( + guard.is_filled(), + matches!( + guard.order_type, + OrderType::ImmediateOrCancel | OrderType::FillOrKill + ), + guard.id, + guard.side, + ) + }; + + if !is_filled && !is_ioc { + self.orders.write().insert(id, order.clone()); + + match side { + OrderSide::Buy => { + self.bids.write().push(BidOrder(order)); + } + OrderSide::Sell => { + self.asks.write().push(AskOrder(order)); + } + } + } + + // Record trades + if !trades.is_empty() { + self.trades.write().extend(trades.clone()); + *self.last_price.write() = Some(trades.last().unwrap().price); + } + + Ok(trades) + } + + /// Matches a buy order against asks. 
+ fn match_buy(&self, buy: &Arc>) -> Vec { + let mut trades = Vec::new(); + let mut asks = self.asks.write(); + + while !buy.read().is_filled() { + // Get best ask + let best_ask = match asks.peek() { + Some(ask) => ask.clone(), + None => break, + }; + + let ask_order = best_ask.0.read(); + + // Check price compatibility + if buy.read().price < ask_order.price + && !matches!(buy.read().order_type, OrderType::Market) + { + break; + } + + // Remove from heap for modification + drop(ask_order); + let ask = asks.pop().unwrap().0; + + // Calculate trade + let mut buy_guard = buy.write(); + let mut ask_guard = ask.write(); + + let trade_qty = buy_guard.remaining.min(ask_guard.remaining); + let trade_price = ask_guard.price; // Use ask price + + buy_guard.remaining -= trade_qty; + ask_guard.remaining -= trade_qty; + + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(); + + trades.push(Trade { + buyer_order: buy_guard.id, + seller_order: ask_guard.id, + buyer: buy_guard.node_id, + seller: ask_guard.node_id, + price: trade_price, + quantity: trade_qty, + timestamp: now, + }); + + // Put ask back if not filled + drop(buy_guard); + drop(ask_guard); + + if !ask.read().is_filled() { + asks.push(AskOrder(ask)); + } + } + + trades + } + + /// Matches a sell order against bids. 
+ fn match_sell(&self, sell: &Arc>) -> Vec { + let mut trades = Vec::new(); + let mut bids = self.bids.write(); + + while !sell.read().is_filled() { + // Get best bid + let best_bid = match bids.peek() { + Some(bid) => bid.clone(), + None => break, + }; + + let bid_order = best_bid.0.read(); + + // Check price compatibility + if sell.read().price > bid_order.price + && !matches!(sell.read().order_type, OrderType::Market) + { + break; + } + + // Remove from heap for modification + drop(bid_order); + let bid = bids.pop().unwrap().0; + + // Calculate trade + let mut sell_guard = sell.write(); + let mut bid_guard = bid.write(); + + let trade_qty = sell_guard.remaining.min(bid_guard.remaining); + let trade_price = bid_guard.price; // Use bid price + + sell_guard.remaining -= trade_qty; + bid_guard.remaining -= trade_qty; + + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(); + + trades.push(Trade { + buyer_order: bid_guard.id, + seller_order: sell_guard.id, + buyer: bid_guard.node_id, + seller: sell_guard.node_id, + price: trade_price, + quantity: trade_qty, + timestamp: now, + }); + + // Put bid back if not filled + drop(sell_guard); + drop(bid_guard); + + if !bid.read().is_filled() { + bids.push(BidOrder(bid)); + } + } + + trades + } + + /// Gets the best bid price. + pub fn best_bid(&self) -> Option { + self.bids.read().peek().map(|b| b.0.read().price) + } + + /// Gets the best ask price. + pub fn best_ask(&self) -> Option { + self.asks.read().peek().map(|a| a.0.read().price) + } + + /// Gets the bid-ask spread. + pub fn spread(&self) -> Option { + match (self.best_bid(), self.best_ask()) { + (Some(bid), Some(ask)) => Some(ask - bid), + _ => None, + } + } + + /// Gets the mid price. + pub fn mid_price(&self) -> Option { + match (self.best_bid(), self.best_ask()) { + (Some(bid), Some(ask)) => Some((bid + ask) / 2.0), + _ => self.last_price.read().clone(), + } + } + + /// Gets last traded price. 
+ pub fn last_price(&self) -> Option { + *self.last_price.read() + } +} + +/// Regional electricity pricing for geographic arbitrage. +#[derive(Clone, Debug)] +pub struct RegionalPricing { + /// Region identifier. + pub region: String, + /// Electricity cost in USD per kWh. + pub electricity_cost: f64, + /// Carbon intensity (gCO2/kWh). + pub carbon_intensity: f64, + /// Current grid load factor (0.0-1.0). + pub grid_load: f64, + /// Renewable energy percentage. + pub renewable_pct: f64, +} + +impl RegionalPricing { + /// Calculates effective compute cost multiplier. + pub fn cost_multiplier(&self) -> f64 { + // Base on electricity cost, with grid load adjustment + let base = self.electricity_cost / 0.10; // Normalized to $0.10/kWh baseline + let load_adj = 1.0 + (self.grid_load - 0.5) * 0.2; // ±10% based on load + base * load_adj + } +} + +/// Default regional pricing data. +pub fn default_regional_pricing() -> Vec { + vec![ + RegionalPricing { + region: "us-west".to_string(), + electricity_cost: 0.12, + carbon_intensity: 200.0, + grid_load: 0.6, + renewable_pct: 0.35, + }, + RegionalPricing { + region: "us-east".to_string(), + electricity_cost: 0.11, + carbon_intensity: 350.0, + grid_load: 0.7, + renewable_pct: 0.15, + }, + RegionalPricing { + region: "eu-west".to_string(), + electricity_cost: 0.25, + carbon_intensity: 150.0, + grid_load: 0.5, + renewable_pct: 0.45, + }, + RegionalPricing { + region: "eu-north".to_string(), + electricity_cost: 0.08, + carbon_intensity: 50.0, + grid_load: 0.4, + renewable_pct: 0.90, + }, + RegionalPricing { + region: "asia-east".to_string(), + electricity_cost: 0.10, + carbon_intensity: 500.0, + grid_load: 0.8, + renewable_pct: 0.20, + }, + RegionalPricing { + region: "asia-south".to_string(), + electricity_cost: 0.07, + carbon_intensity: 600.0, + grid_load: 0.6, + renewable_pct: 0.10, + }, + ] +} + +/// Pricing engine for compute resources. +pub struct PricingEngine { + /// Base prices per resource type. 
+ base_prices: HashMap, + /// Regional pricing data. + regions: Vec, + /// Supply/demand factors. + supply_demand: RwLock>, + /// Time-of-day factors. + time_factors: Vec, // 24 hourly factors +} + +impl PricingEngine { + /// Creates a new pricing engine. + pub fn new() -> Self { + let mut base_prices = HashMap::new(); + + // Base prices in Synor credits per unit + // Designed to be ~90% cheaper than AWS/GCP/Azure + base_prices.insert(ResourceType::GpuHours(GpuTier::Entry), 0.05); + base_prices.insert(ResourceType::GpuHours(GpuTier::Mid), 0.15); + base_prices.insert(ResourceType::GpuHours(GpuTier::High), 0.30); + base_prices.insert(ResourceType::GpuHours(GpuTier::DataCenter), 0.80); + base_prices.insert(ResourceType::GpuHours(GpuTier::Ultra), 1.50); + base_prices.insert(ResourceType::CpuHours(CpuTier::Mobile), 0.001); + base_prices.insert(ResourceType::CpuHours(CpuTier::Desktop), 0.005); + base_prices.insert(ResourceType::CpuHours(CpuTier::Workstation), 0.015); + base_prices.insert(ResourceType::CpuHours(CpuTier::Server), 0.03); + base_prices.insert(ResourceType::TpuHours, 1.00); + base_prices.insert(ResourceType::NpuHours, 0.10); + base_prices.insert(ResourceType::LpuCredits, 0.50); + base_prices.insert(ResourceType::MemoryGbHours, 0.001); + base_prices.insert(ResourceType::NetworkGb, 0.01); + base_prices.insert(ResourceType::StorageGbHours, 0.0001); + + // Time-of-day factors (0 = midnight UTC) + // Lower prices during off-peak hours + let time_factors = vec![ + 0.7, 0.6, 0.5, 0.5, 0.5, 0.6, // 00:00 - 05:00 (off-peak) + 0.8, 0.9, 1.0, 1.0, 1.0, 1.0, // 06:00 - 11:00 (ramp up) + 1.0, 1.1, 1.2, 1.2, 1.1, 1.0, // 12:00 - 17:00 (peak) + 0.9, 0.9, 0.8, 0.8, 0.7, 0.7, // 18:00 - 23:00 (wind down) + ]; + + Self { + base_prices, + regions: default_regional_pricing(), + supply_demand: RwLock::new(HashMap::new()), + time_factors, + } + } + + /// Gets the current spot price for a resource. 
+ pub fn spot_price(&self, resource: &ResourceType, region: Option<&str>) -> f64 { + let base = self.base_prices.get(resource).copied().unwrap_or(0.1); + + // Apply supply/demand factor + let sd_factor = self + .supply_demand + .read() + .get(resource) + .copied() + .unwrap_or(1.0); + + // Apply time-of-day factor + let hour = chrono::Utc::now().hour() as usize; + let time_factor = self.time_factors.get(hour).copied().unwrap_or(1.0); + + // Apply regional factor + let region_factor = region + .and_then(|r| self.regions.iter().find(|p| p.region == r)) + .map(|p| p.cost_multiplier()) + .unwrap_or(1.0); + + base * sd_factor * time_factor * region_factor + } + + /// Updates supply/demand factor for a resource. + pub fn update_supply_demand(&self, resource: ResourceType, supply: f64, demand: f64) { + // Factor increases when demand > supply + let factor = if supply > 0.0 { + (demand / supply).sqrt().clamp(0.5, 2.0) + } else if demand > 0.0 { + 2.0 // Max factor when no supply + } else { + 1.0 + }; + + self.supply_demand.write().insert(resource, factor); + } + + /// Gets cheapest region for a resource. + pub fn cheapest_region(&self, resource: &ResourceType) -> &str { + self.regions + .iter() + .min_by(|a, b| { + let cost_a = self.spot_price(resource, Some(&a.region)); + let cost_b = self.spot_price(resource, Some(&b.region)); + cost_a.partial_cmp(&cost_b).unwrap() + }) + .map(|r| r.region.as_str()) + .unwrap_or("us-west") + } + + /// Gets greenest region for a resource. + pub fn greenest_region(&self) -> &str { + self.regions + .iter() + .max_by(|a, b| { + a.renewable_pct + .partial_cmp(&b.renewable_pct) + .unwrap() + }) + .map(|r| r.region.as_str()) + .unwrap_or("eu-north") + } + + /// Compares price to cloud providers. 
+ pub fn compare_to_cloud(&self, resource: &ResourceType, region: Option<&str>) -> CloudComparison { + let our_price = self.spot_price(resource, region); + + // Approximate cloud provider prices (USD/hour for GPU) + let (aws_price, gcp_price, azure_price) = match resource { + ResourceType::GpuHours(GpuTier::DataCenter) => (3.06, 2.95, 3.10), // A100 equivalents + ResourceType::GpuHours(GpuTier::Ultra) => (5.00, 4.50, 5.20), // H100 equivalents + ResourceType::GpuHours(GpuTier::High) => (1.50, 1.40, 1.60), // T4/A10 equivalents + ResourceType::CpuHours(CpuTier::Server) => (0.40, 0.35, 0.42), + _ => (1.0, 1.0, 1.0), + }; + + CloudComparison { + synor_price: our_price, + aws_price, + gcp_price, + azure_price, + aws_savings: ((aws_price - our_price) / aws_price * 100.0).max(0.0), + gcp_savings: ((gcp_price - our_price) / gcp_price * 100.0).max(0.0), + azure_savings: ((azure_price - our_price) / azure_price * 100.0).max(0.0), + } + } +} + +impl Default for PricingEngine { + fn default() -> Self { + Self::new() + } +} + +/// Comparison with cloud provider prices. +#[derive(Clone, Debug)] +pub struct CloudComparison { + /// Our spot price. + pub synor_price: f64, + /// AWS price. + pub aws_price: f64, + /// GCP price. + pub gcp_price: f64, + /// Azure price. + pub azure_price: f64, + /// Savings vs AWS (percentage). + pub aws_savings: f64, + /// Savings vs GCP (percentage). + pub gcp_savings: f64, + /// Savings vs Azure (percentage). + pub azure_savings: f64, +} + +/// Spot market for compute resources. +pub struct SpotMarket { + /// Order books per resource type. + order_books: HashMap, + /// Pricing engine. + pricing: PricingEngine, + /// Provider registry (node -> resources offered). + providers: RwLock>>, + /// Active auctions. + auctions: RwLock>, +} + +/// Unique auction identifier. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct AuctionId(pub u64); + +impl AuctionId { + /// Creates a new auction ID. 
+ pub fn new() -> Self { + use rand::Rng; + AuctionId(rand::thread_rng().gen()) + } +} + +impl Default for AuctionId { + fn default() -> Self { + Self::new() + } +} + +/// A provider's resource listing. +#[derive(Clone, Debug)] +pub struct ProviderListing { + /// Resource type offered. + pub resource: ResourceType, + /// Available quantity. + pub quantity: f64, + /// Minimum price accepted. + pub min_price: f64, + /// Region. + pub region: String, + /// Processor types available. + pub processors: Vec, +} + +/// An auction for compute resources. +#[derive(Clone, Debug)] +pub struct Auction { + /// Auction ID. + pub id: AuctionId, + /// Resource being auctioned. + pub resource: ResourceType, + /// Quantity needed. + pub quantity: f64, + /// Maximum price buyer will pay. + pub max_price: f64, + /// Current winning bid. + pub winning_bid: Option<(NodeId, f64)>, + /// All bids. + pub bids: Vec<(NodeId, f64)>, + /// Start time. + pub started: Instant, + /// Duration. + pub duration: Duration, + /// Whether auction is closed. + pub closed: bool, +} + +impl Auction { + /// Creates a new auction. + pub fn new(resource: ResourceType, quantity: f64, max_price: f64, duration: Duration) -> Self { + Self { + id: AuctionId::new(), + resource, + quantity, + max_price, + winning_bid: None, + bids: Vec::new(), + started: Instant::now(), + duration, + closed: false, + } + } + + /// Submits a bid. + pub fn bid(&mut self, node: NodeId, price: f64) -> bool { + if self.closed || price > self.max_price { + return false; + } + + self.bids.push((node, price)); + + // Update winning bid (lowest price wins) + if self.winning_bid.is_none() || price < self.winning_bid.unwrap().1 { + self.winning_bid = Some((node, price)); + } + + true + } + + /// Checks if auction has ended. + pub fn is_ended(&self) -> bool { + self.closed || self.started.elapsed() >= self.duration + } + + /// Closes the auction. 
+ pub fn close(&mut self) -> Option<(NodeId, f64)> { + self.closed = true; + self.winning_bid + } +} + +impl SpotMarket { + /// Creates a new spot market. + pub fn new() -> Self { + let mut order_books = HashMap::new(); + + // Create order books for common resources + for tier in [ + GpuTier::Entry, + GpuTier::Mid, + GpuTier::High, + GpuTier::DataCenter, + GpuTier::Ultra, + ] { + order_books.insert( + ResourceType::GpuHours(tier), + OrderBook::new(ResourceType::GpuHours(tier)), + ); + } + + for tier in [ + CpuTier::Mobile, + CpuTier::Desktop, + CpuTier::Workstation, + CpuTier::Server, + ] { + order_books.insert( + ResourceType::CpuHours(tier), + OrderBook::new(ResourceType::CpuHours(tier)), + ); + } + + order_books.insert(ResourceType::TpuHours, OrderBook::new(ResourceType::TpuHours)); + order_books.insert(ResourceType::NpuHours, OrderBook::new(ResourceType::NpuHours)); + order_books.insert(ResourceType::LpuCredits, OrderBook::new(ResourceType::LpuCredits)); + + Self { + order_books, + pricing: PricingEngine::new(), + providers: RwLock::new(HashMap::new()), + auctions: RwLock::new(HashMap::new()), + } + } + + /// Registers a compute provider. + pub fn register_provider(&self, node_id: NodeId, listings: Vec) { + self.providers.write().insert(node_id, listings); + } + + /// Submits an order. + pub fn submit_order(&self, order: Order) -> Result, ComputeError> { + let book = self.order_books.get(&order.resource).ok_or_else(|| { + ComputeError::Internal(format!("No order book for resource: {:?}", order.resource)) + })?; + + book.submit(order) + } + + /// Gets spot price for a resource. + pub fn spot_price(&self, resource: &ResourceType, region: Option<&str>) -> f64 { + // Check if there's a market price + if let Some(book) = self.order_books.get(resource) { + if let Some(mid) = book.mid_price() { + return mid; + } + } + + // Fall back to pricing engine + self.pricing.spot_price(resource, region) + } + + /// Starts an auction for compute resources. 
+ pub fn start_auction( + &self, + resource: ResourceType, + quantity: f64, + max_price: f64, + duration: Duration, + ) -> AuctionId { + let auction = Auction::new(resource, quantity, max_price, duration); + let id = auction.id; + self.auctions.write().insert(id, auction); + id + } + + /// Submits a bid to an auction. + pub fn bid_auction(&self, auction_id: AuctionId, node: NodeId, price: f64) -> bool { + if let Some(auction) = self.auctions.write().get_mut(&auction_id) { + auction.bid(node, price) + } else { + false + } + } + + /// Closes an auction and returns the winner. + pub fn close_auction(&self, auction_id: AuctionId) -> Option<(NodeId, f64)> { + if let Some(auction) = self.auctions.write().get_mut(&auction_id) { + auction.close() + } else { + None + } + } + + /// Gets pricing comparison with cloud providers. + pub fn compare_to_cloud(&self, resource: &ResourceType) -> CloudComparison { + self.pricing.compare_to_cloud(resource, None) + } + + /// Gets the cheapest region for a resource. + pub fn cheapest_region(&self, resource: &ResourceType) -> &str { + self.pricing.cheapest_region(resource) + } + + /// Gets all provider listings for a resource. + pub fn find_providers(&self, resource: &ResourceType) -> Vec<(NodeId, ProviderListing)> { + self.providers + .read() + .iter() + .flat_map(|(node, listings)| { + listings + .iter() + .filter(|l| &l.resource == resource) + .map(|l| (*node, l.clone())) + }) + .collect() + } + + /// Gets market stats for a resource. + pub fn market_stats(&self, resource: &ResourceType) -> Option { + let book = self.order_books.get(resource)?; + + Some(MarketStats { + best_bid: book.best_bid(), + best_ask: book.best_ask(), + spread: book.spread(), + mid_price: book.mid_price(), + last_price: book.last_price(), + }) + } +} + +impl Default for SpotMarket { + fn default() -> Self { + Self::new() + } +} + +/// Market statistics for a resource. +#[derive(Clone, Debug)] +pub struct MarketStats { + /// Best bid price. 
+ pub best_bid: Option, + /// Best ask price. + pub best_ask: Option, + /// Bid-ask spread. + pub spread: Option, + /// Mid price. + pub mid_price: Option, + /// Last traded price. + pub last_price: Option, +} + +use chrono::Timelike; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_order_creation() { + let order = Order::new( + NodeId(1), + OrderSide::Buy, + OrderType::Limit, + ResourceType::GpuHours(GpuTier::DataCenter), + 10.0, + 0.80, + ); + + assert_eq!(order.remaining, 10.0); + assert!(!order.is_filled()); + } + + #[test] + fn test_order_book_matching() { + let book = OrderBook::new(ResourceType::GpuHours(GpuTier::High)); + + // Add a sell order + let sell = Order::new( + NodeId(1), + OrderSide::Sell, + OrderType::Limit, + ResourceType::GpuHours(GpuTier::High), + 5.0, + 0.25, + ); + book.submit(sell).unwrap(); + + // Add a matching buy order + let buy = Order::new( + NodeId(2), + OrderSide::Buy, + OrderType::Limit, + ResourceType::GpuHours(GpuTier::High), + 3.0, + 0.30, + ); + let trades = book.submit(buy).unwrap(); + + assert_eq!(trades.len(), 1); + assert_eq!(trades[0].quantity, 3.0); + assert_eq!(trades[0].price, 0.25); // Uses ask price + } + + #[test] + fn test_pricing_engine() { + let engine = PricingEngine::new(); + + let price = engine.spot_price(&ResourceType::GpuHours(GpuTier::DataCenter), Some("eu-north")); + assert!(price > 0.0); + + // eu-north should be cheaper (low electricity cost) + let eu_price = engine.spot_price(&ResourceType::GpuHours(GpuTier::DataCenter), Some("eu-north")); + let eu_west_price = engine.spot_price(&ResourceType::GpuHours(GpuTier::DataCenter), Some("eu-west")); + + // eu-north has cheaper electricity + assert!(eu_price < eu_west_price); + } + + #[test] + fn test_cloud_comparison() { + let engine = PricingEngine::new(); + + let comparison = engine.compare_to_cloud(&ResourceType::GpuHours(GpuTier::DataCenter), None); + + // Should show significant savings + assert!(comparison.aws_savings > 50.0); + 
assert!(comparison.gcp_savings > 50.0); + assert!(comparison.azure_savings > 50.0); + } + + #[test] + fn test_auction() { + let mut auction = Auction::new( + ResourceType::GpuHours(GpuTier::Ultra), + 100.0, + 2.0, + Duration::from_secs(60), + ); + + // Submit bids + assert!(auction.bid(NodeId(1), 1.8)); + assert!(auction.bid(NodeId(2), 1.5)); + assert!(auction.bid(NodeId(3), 1.7)); + + // Price too high + assert!(!auction.bid(NodeId(4), 2.5)); + + // Lowest bid wins + let winner = auction.close(); + assert_eq!(winner, Some((NodeId(2), 1.5))); + } + + #[test] + fn test_spot_market() { + let market = SpotMarket::new(); + + // Register a provider + market.register_provider( + NodeId(1), + vec![ProviderListing { + resource: ResourceType::GpuHours(GpuTier::High), + quantity: 100.0, + min_price: 0.20, + region: "us-west".to_string(), + processors: vec![], + }], + ); + + // Get providers + let providers = market.find_providers(&ResourceType::GpuHours(GpuTier::High)); + assert_eq!(providers.len(), 1); + + // Get spot price + let price = market.spot_price(&ResourceType::GpuHours(GpuTier::High), None); + assert!(price > 0.0); + } +} diff --git a/crates/synor-compute/src/memory/mod.rs b/crates/synor-compute/src/memory/mod.rs new file mode 100644 index 0000000..ef0dd6b --- /dev/null +++ b/crates/synor-compute/src/memory/mod.rs @@ -0,0 +1,370 @@ +//! Unified memory management for heterogeneous compute. + +use crate::error::ComputeError; +use crate::processor::ProcessorType; +use parking_lot::RwLock; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::sync::Arc; + +/// Tensor handle for memory management. +#[derive(Clone, Debug)] +pub struct TensorHandle { + /// Unique ID. + pub id: TensorId, + /// Shape. + pub shape: Vec, + /// Data type. + pub dtype: DataType, + /// Size in bytes. + pub size_bytes: u64, + /// Current locations. + pub locations: Vec, +} + +impl TensorHandle { + /// Creates a new tensor handle. 
+ pub fn new(shape: Vec, dtype: DataType) -> Self { + let size_bytes = shape.iter().product::() as u64 * dtype.size_bytes() as u64; + Self { + id: TensorId::new(), + shape, + dtype, + size_bytes, + locations: Vec::new(), + } + } + + /// Gets the number of elements. + pub fn numel(&self) -> usize { + self.shape.iter().product() + } +} + +/// Tensor identifier. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct TensorId(pub u64); + +impl TensorId { + /// Creates a new tensor ID. + pub fn new() -> Self { + use rand::Rng; + TensorId(rand::thread_rng().gen()) + } +} + +impl Default for TensorId { + fn default() -> Self { + Self::new() + } +} + +/// Data types for tensors. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum DataType { + Float64, + Float32, + Float16, + BFloat16, + Int64, + Int32, + Int16, + Int8, + UInt8, + Bool, +} + +impl DataType { + /// Returns size in bytes. + pub fn size_bytes(&self) -> usize { + match self { + DataType::Float64 | DataType::Int64 => 8, + DataType::Float32 | DataType::Int32 => 4, + DataType::Float16 | DataType::BFloat16 | DataType::Int16 => 2, + DataType::Int8 | DataType::UInt8 | DataType::Bool => 1, + } + } +} + +/// Data transfer path between processors. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum TransferPath { + /// Direct GPU-to-GPU via NVLink. + NvLink, + /// Direct GPU-to-GPU via PCIe P2P. + PciePeerToPeer, + /// Through CPU memory. + CpuMediated, + /// Unified memory (Apple Silicon). + UnifiedMemory, + /// Network transfer. + Network, + /// Same memory space (no transfer needed). + SameMemory, +} + +impl TransferPath { + /// Returns approximate bandwidth in GB/s. 
+ pub fn bandwidth_gbps(&self) -> f64 { + match self { + TransferPath::NvLink => 900.0, // NVLink 4.0 + TransferPath::PciePeerToPeer => 64.0, // PCIe 5.0 x16 + TransferPath::CpuMediated => 50.0, // DDR5 + TransferPath::UnifiedMemory => 400.0, // Apple unified + TransferPath::Network => 10.0, // 100Gbps network + TransferPath::SameMemory => f64::INFINITY, + } + } + + /// Estimates transfer time for given bytes. + pub fn estimate_transfer_time(&self, bytes: u64) -> std::time::Duration { + if matches!(self, TransferPath::SameMemory) { + return std::time::Duration::ZERO; + } + + let bytes_f64 = bytes as f64; + let bandwidth = self.bandwidth_gbps() * 1e9; // Convert to bytes/s + let seconds = bytes_f64 / bandwidth; + std::time::Duration::from_secs_f64(seconds) + } +} + +/// Unified memory manager. +pub struct MemoryManager { + /// Allocated tensors. + tensors: RwLock>, + /// Memory usage per processor type. + usage: RwLock>, + /// Memory limits per processor type. + limits: HashMap, +} + +impl MemoryManager { + /// Creates a new memory manager. + pub fn new() -> Self { + Self { + tensors: RwLock::new(HashMap::new()), + usage: RwLock::new(HashMap::new()), + limits: HashMap::new(), + } + } + + /// Sets memory limit for a processor type. + pub fn set_limit(&mut self, proc_type: ProcessorType, limit_bytes: u64) { + self.limits.insert(proc_type, limit_bytes); + } + + /// Allocates a tensor. + pub fn allocate(&self, shape: Vec, dtype: DataType) -> Result { + let handle = TensorHandle::new(shape, dtype); + self.tensors.write().insert(handle.id, handle.clone()); + Ok(handle) + } + + /// Frees a tensor. 
+ pub fn free(&self, tensor_id: TensorId) -> Result<(), ComputeError> { + if let Some(handle) = self.tensors.write().remove(&tensor_id) { + // Update usage for all locations + let mut usage = self.usage.write(); + for loc in &handle.locations { + if let Some(u) = usage.get_mut(loc) { + *u = u.saturating_sub(handle.size_bytes); + } + } + } + Ok(()) + } + + /// Gets a tensor handle. + pub fn get(&self, tensor_id: TensorId) -> Option { + self.tensors.read().get(&tensor_id).cloned() + } + + /// Ensures tensor is on specified processor. + pub fn ensure_on( + &self, + tensor_id: TensorId, + target: ProcessorType, + ) -> Result { + let mut tensors = self.tensors.write(); + + if let Some(handle) = tensors.get_mut(&tensor_id) { + // Check if already on target + if handle.locations.contains(&target) { + return Ok(TransferPath::SameMemory); + } + + // Determine transfer path + let path = if handle.locations.is_empty() { + // New tensor, allocate on target + TransferPath::SameMemory + } else { + // Find best transfer path from existing location + self.find_best_path(&handle.locations[0], &target) + }; + + // Record new location + handle.locations.push(target.clone()); + + // Update usage + let mut usage = self.usage.write(); + *usage.entry(target).or_default() += handle.size_bytes; + + Ok(path) + } else { + Err(ComputeError::Internal("Tensor not found".to_string())) + } + } + + /// Finds best transfer path between processors. + fn find_best_path(&self, from: &ProcessorType, to: &ProcessorType) -> TransferPath { + // Check for unified memory (Apple Silicon) + if self.shares_memory(from, to) { + return TransferPath::UnifiedMemory; + } + + // Check for NVLink between NVIDIA GPUs + if matches!(from, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. })) + && matches!(to, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. 
})) + { + return TransferPath::NvLink; + } + + // Check for PCIe P2P between GPUs + if from.is_gpu() && to.is_gpu() { + return TransferPath::PciePeerToPeer; + } + + // Default to CPU-mediated transfer + TransferPath::CpuMediated + } + + /// Checks if two processor types share memory. + fn shares_memory(&self, a: &ProcessorType, b: &ProcessorType) -> bool { + use crate::processor::{CpuVariant, GpuVariant, NpuVariant}; + + match (a, b) { + // Apple Silicon unified memory + (ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) + | (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. })) + | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. })) + | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true, + // Same type + _ if a == b => true, + _ => false, + } + } + + /// Gets current memory usage for a processor type. + pub fn usage(&self, proc_type: ProcessorType) -> u64 { + self.usage.read().get(&proc_type).copied().unwrap_or(0) + } + + /// Gets available memory for a processor type. + pub fn available(&self, proc_type: ProcessorType) -> u64 { + let limit = self.limits.get(&proc_type).copied().unwrap_or(u64::MAX); + let used = self.usage(proc_type); + limit.saturating_sub(used) + } + + /// Gets total allocated tensors. + pub fn tensor_count(&self) -> usize { + self.tensors.read().len() + } +} + +impl Default for MemoryManager { + fn default() -> Self { + Self::new() + } +} + +/// Unified memory abstraction for zero-copy sharing. +pub struct UnifiedMemory { + /// Base pointer (in unified address space). + pub base: u64, + /// Size in bytes. + pub size: u64, + /// Accessible from these processor types. + pub accessible_from: Vec, +} + +impl UnifiedMemory { + /// Creates new unified memory region. 
+ pub fn new(size: u64) -> Self { + Self { + base: 0, // Would be actual pointer in real implementation + size, + accessible_from: Vec::new(), + } + } + + /// Checks if accessible from processor type. + pub fn is_accessible_from(&self, proc_type: &ProcessorType) -> bool { + self.accessible_from.contains(proc_type) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tensor_handle() { + let handle = TensorHandle::new(vec![1024, 1024], DataType::Float32); + assert_eq!(handle.numel(), 1024 * 1024); + assert_eq!(handle.size_bytes, 1024 * 1024 * 4); + } + + #[test] + fn test_data_type_sizes() { + assert_eq!(DataType::Float64.size_bytes(), 8); + assert_eq!(DataType::Float32.size_bytes(), 4); + assert_eq!(DataType::Float16.size_bytes(), 2); + assert_eq!(DataType::Int8.size_bytes(), 1); + } + + #[test] + fn test_transfer_path_bandwidth() { + assert!(TransferPath::NvLink.bandwidth_gbps() > TransferPath::PciePeerToPeer.bandwidth_gbps()); + assert!(TransferPath::SameMemory.bandwidth_gbps().is_infinite()); + } + + #[test] + fn test_memory_manager() { + let manager = MemoryManager::new(); + + let handle = manager.allocate(vec![1024, 1024], DataType::Float32).unwrap(); + assert_eq!(manager.tensor_count(), 1); + + manager.free(handle.id).unwrap(); + assert_eq!(manager.tensor_count(), 0); + } + + #[test] + fn test_ensure_on() { + let manager = MemoryManager::new(); + + let handle = manager.allocate(vec![1024], DataType::Float32).unwrap(); + + // First ensure should allocate + let path = manager.ensure_on( + handle.id, + ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { + compute_capability: (8, 0), + }), + ).unwrap(); + + assert_eq!(path, TransferPath::SameMemory); + + // Second ensure to same location should be same memory + let path = manager.ensure_on( + handle.id, + ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { + compute_capability: (8, 0), + }), + ).unwrap(); + + assert_eq!(path, TransferPath::SameMemory); + } +} diff --git 
a/crates/synor-compute/src/processor/capabilities.rs b/crates/synor-compute/src/processor/capabilities.rs new file mode 100644 index 0000000..bedb6aa --- /dev/null +++ b/crates/synor-compute/src/processor/capabilities.rs @@ -0,0 +1,547 @@ +//! Processor capability definitions. + +use super::operation::OperationType; +use super::types::PowerTier; +use serde::{Deserialize, Serialize}; +use std::collections::HashSet; + +/// Detailed processor capabilities. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ProcessorCapabilities { + /// Compute throughput. + pub compute: ComputeThroughput, + /// Memory specifications. + pub memory: MemorySpecs, + /// Supported operations. + pub operations: HashSet, + /// Power characteristics. + pub power: PowerCharacteristics, + /// Optimal workload characteristics. + pub optimal_for: Vec, +} + +impl Default for ProcessorCapabilities { + fn default() -> Self { + Self { + compute: ComputeThroughput::default(), + memory: MemorySpecs::default(), + operations: Self::default_operations(), + power: PowerCharacteristics::default(), + optimal_for: vec![], + } + } +} + +impl ProcessorCapabilities { + /// Default operations supported by most processors. + fn default_operations() -> HashSet { + [ + OperationType::MatMul, + OperationType::Add, + OperationType::Mul, + OperationType::ReLU, + OperationType::Softmax, + OperationType::DataLoad, + OperationType::DataPreprocess, + ] + .into_iter() + .collect() + } + + /// Creates CPU capabilities. 
+ pub fn cpu(cores: u32, clock_ghz: f32, avx512: bool) -> Self { + let flops_per_cycle = if avx512 { 64.0 } else { 32.0 }; // FP32 ops per cycle with AVX + let fp32_tflops = (cores as f64 * clock_ghz as f64 * flops_per_cycle) / 1000.0; + + Self { + compute: ComputeThroughput { + fp64_tflops: fp32_tflops / 2.0, + fp32_tflops, + fp16_tflops: fp32_tflops * 2.0, + bf16_tflops: fp32_tflops * 2.0, + int8_tops: fp32_tflops * 4.0, + int4_tops: fp32_tflops * 8.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 64 * 1024 * 1024 * 1024, // 64 GB typical + bandwidth_gbps: 200, // DDR5 + type_: MemoryType::Ddr5, + }, + operations: Self::cpu_operations(), + power: PowerCharacteristics { + tdp_watts: 125, + efficiency: 0.8, + power_tier: PowerTier::Medium, + }, + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::MemoryBound, + WorkloadCharacteristic::SmallBatch, + ], + } + } + + /// Operations typically supported by CPUs. + fn cpu_operations() -> HashSet { + [ + // Matrix operations (slow but supported) + OperationType::MatMul, + OperationType::Conv2d, + OperationType::BatchNorm, + OperationType::LayerNorm, + // Element-wise + OperationType::Add, + OperationType::Mul, + OperationType::ReLU, + OperationType::GeLU, + OperationType::Softmax, + // Data operations (optimal) + OperationType::DataLoad, + OperationType::DataPreprocess, + OperationType::Tokenization, + OperationType::Detokenization, + // Memory operations + OperationType::Transpose, + OperationType::Reshape, + OperationType::Concat, + OperationType::Split, + // I/O + OperationType::Checkpoint, + ] + .into_iter() + .collect() + } + + /// Creates NVIDIA GPU capabilities. 
+ pub fn nvidia_gpu( + cuda_cores: u32, + tensor_cores: u32, + vram_gb: u32, + bandwidth_gbps: u32, + compute_capability: (u8, u8), + ) -> Self { + // Approximate TFLOPS based on cores and typical clocks + let base_clock_ghz = 1.5; + let fp32_tflops = (cuda_cores as f64 * base_clock_ghz * 2.0) / 1000.0; + let tensor_multiplier = if compute_capability.0 >= 8 { 4.0 } else { 2.0 }; + + Self { + compute: ComputeThroughput { + fp64_tflops: fp32_tflops / 2.0, + fp32_tflops, + fp16_tflops: fp32_tflops * tensor_multiplier, + bf16_tflops: fp32_tflops * tensor_multiplier, + int8_tops: fp32_tflops * tensor_multiplier * 2.0, + int4_tops: fp32_tflops * tensor_multiplier * 4.0, + sparsity_speedup: if compute_capability.0 >= 8 { 2.0 } else { 1.0 }, + }, + memory: MemorySpecs { + capacity_bytes: vram_gb as u64 * 1024 * 1024 * 1024, + bandwidth_gbps, + type_: if compute_capability.0 >= 9 { + MemoryType::Hbm3 + } else { + MemoryType::Hbm2e + }, + }, + operations: Self::gpu_operations(compute_capability), + power: PowerCharacteristics { + tdp_watts: if compute_capability.0 >= 9 { 700 } else { 350 }, + efficiency: 0.9, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + WorkloadCharacteristic::ComputeBound, + ], + } + } + + /// Operations supported by GPUs. 
+ fn gpu_operations(compute_capability: (u8, u8)) -> HashSet { + let mut ops: HashSet = [ + // Matrix operations (optimal) + OperationType::MatMul, + OperationType::Conv2d, + OperationType::Conv3d, + OperationType::DepthwiseConv, + OperationType::BatchNorm, + OperationType::LayerNorm, + // Attention + OperationType::SelfAttention, + OperationType::CrossAttention, + // Element-wise + OperationType::Add, + OperationType::Mul, + OperationType::ReLU, + OperationType::GeLU, + OperationType::SiLU, + OperationType::Softmax, + // Reduction + OperationType::Sum, + OperationType::Mean, + OperationType::Max, + OperationType::ArgMax, + // Memory operations + OperationType::Transpose, + OperationType::Reshape, + OperationType::Concat, + OperationType::Split, + OperationType::Gather, + OperationType::Scatter, + // LLM specific + OperationType::Embedding, + OperationType::RoPE, + OperationType::KVCache, + OperationType::TopK, + OperationType::Sampling, + ] + .into_iter() + .collect(); + + // FlashAttention for newer GPUs + if compute_capability.0 >= 8 { + ops.insert(OperationType::FlashAttention); + } + + ops + } + + /// Creates TPU capabilities. 
+ pub fn tpu(version: super::TpuVersion) -> Self { + let (bf16_tflops, memory_gb, bandwidth_gbps) = match version { + super::TpuVersion::V5p => (918.0, 95, 4800), + super::TpuVersion::V5e => (197.0, 16, 1600), + super::TpuVersion::V4 => (275.0, 32, 2400), + super::TpuVersion::V4i => (138.0, 32, 1200), + super::TpuVersion::V3 => (123.0, 16, 900), + super::TpuVersion::V2 => (46.0, 8, 600), + super::TpuVersion::Edge => (4.0, 0, 0), // Uses host memory + }; + + Self { + compute: ComputeThroughput { + fp64_tflops: 0.0, // TPUs don't support FP64 + fp32_tflops: bf16_tflops / 2.0, + fp16_tflops: bf16_tflops, + bf16_tflops, + int8_tops: bf16_tflops * 2.0, + int4_tops: bf16_tflops * 4.0, + sparsity_speedup: 2.0, + }, + memory: MemorySpecs { + capacity_bytes: memory_gb as u64 * 1024 * 1024 * 1024, + bandwidth_gbps, + type_: MemoryType::Hbm2e, + }, + operations: Self::tpu_operations(), + power: PowerCharacteristics { + tdp_watts: if matches!(version, super::TpuVersion::Edge) { + 2 + } else { + 400 + }, + efficiency: 0.95, + power_tier: if matches!(version, super::TpuVersion::Edge) { + PowerTier::UltraLow + } else { + PowerTier::High + }, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::ComputeBound, + WorkloadCharacteristic::FixedShape, + WorkloadCharacteristic::LargeBatch, + ], + } + } + + /// Operations supported by TPUs. 
+ fn tpu_operations() -> HashSet { + [ + // Matrix operations (optimal) + OperationType::MatMul, + OperationType::Conv2d, + OperationType::BatchNorm, + OperationType::LayerNorm, + // Attention + OperationType::SelfAttention, + OperationType::CrossAttention, + OperationType::FlashAttention, + // Element-wise + OperationType::Add, + OperationType::Mul, + OperationType::ReLU, + OperationType::GeLU, + OperationType::SiLU, + OperationType::Softmax, + // Reduction + OperationType::Sum, + OperationType::Mean, + OperationType::Max, + // LLM specific + OperationType::Embedding, + OperationType::RoPE, + OperationType::KVCache, + ] + .into_iter() + .collect() + } + + /// Creates LPU (Groq) capabilities. + pub fn lpu() -> Self { + Self { + compute: ComputeThroughput { + fp64_tflops: 0.0, + fp32_tflops: 0.0, + fp16_tflops: 188.0, + bf16_tflops: 188.0, + int8_tops: 750.0, + int4_tops: 1500.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 230 * 1024 * 1024 * 1024, // 230 GB SRAM! + bandwidth_gbps: 80_000, // 80 TB/s internal + type_: MemoryType::Sram, + }, + operations: Self::lpu_operations(), + power: PowerCharacteristics { + tdp_watts: 300, + efficiency: 0.98, // Very efficient for inference + power_tier: PowerTier::Medium, + }, + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::SmallBatch, + WorkloadCharacteristic::VariableLength, + WorkloadCharacteristic::LowLatency, + ], + } + } + + /// Operations supported by Groq LPU. + fn lpu_operations() -> HashSet { + [ + // Optimized for inference + OperationType::MatMul, + OperationType::LayerNorm, + OperationType::SelfAttention, + OperationType::Add, + OperationType::Mul, + OperationType::ReLU, + OperationType::GeLU, + OperationType::SiLU, + OperationType::Softmax, + OperationType::Embedding, + OperationType::RoPE, + OperationType::KVCache, + OperationType::TopK, + OperationType::Sampling, + ] + .into_iter() + .collect() + } + + /// Creates Apple Neural Engine capabilities. 
+ pub fn apple_neural_engine(cores: u32) -> Self { + let int8_tops = match cores { + 16 => 18.0, // M3 + 32 => 35.0, // M3 Max + _ => cores as f64 * 1.1, + }; + + Self { + compute: ComputeThroughput { + fp64_tflops: 0.0, + fp32_tflops: int8_tops / 4.0, + fp16_tflops: int8_tops / 2.0, + bf16_tflops: int8_tops / 2.0, + int8_tops, + int4_tops: int8_tops * 2.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 0, // Uses unified memory + bandwidth_gbps: 400, + type_: MemoryType::Unified, + }, + operations: Self::npu_operations(), + power: PowerCharacteristics { + tdp_watts: 15, + efficiency: 0.95, + power_tier: PowerTier::UltraLow, + }, + optimal_for: vec![ + WorkloadCharacteristic::LowPower, + WorkloadCharacteristic::LowLatency, + WorkloadCharacteristic::SmallBatch, + ], + } + } + + /// Operations supported by NPUs. + fn npu_operations() -> HashSet { + [ + // Inference optimized + OperationType::MatMul, + OperationType::Conv2d, + OperationType::DepthwiseConv, + OperationType::BatchNorm, + OperationType::LayerNorm, + OperationType::Add, + OperationType::Mul, + OperationType::ReLU, + OperationType::Softmax, + OperationType::Embedding, + ] + .into_iter() + .collect() + } +} + +/// Compute throughput metrics. +#[derive(Clone, Debug, Default, Serialize, Deserialize)] +pub struct ComputeThroughput { + /// FP64 TFLOPS. + pub fp64_tflops: f64, + /// FP32 TFLOPS. + pub fp32_tflops: f64, + /// FP16 TFLOPS. + pub fp16_tflops: f64, + /// BF16 TFLOPS. + pub bf16_tflops: f64, + /// INT8 TOPS. + pub int8_tops: f64, + /// INT4 TOPS. + pub int4_tops: f64, + /// Speedup for sparse operations. + pub sparsity_speedup: f64, +} + +/// Memory specifications. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct MemorySpecs { + /// Total capacity (bytes). + pub capacity_bytes: u64, + /// Bandwidth (GB/s). + pub bandwidth_gbps: u32, + /// Memory type. 
+ pub type_: MemoryType, +} + +impl Default for MemorySpecs { + fn default() -> Self { + Self { + capacity_bytes: 16 * 1024 * 1024 * 1024, // 16 GB + bandwidth_gbps: 500, + type_: MemoryType::Ddr5, + } + } +} + +/// Memory types. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum MemoryType { + /// DDR4 RAM. + Ddr4, + /// DDR5 RAM. + Ddr5, + /// GDDR6/6X video memory. + Gddr6, + /// HBM2. + Hbm2, + /// HBM2e. + Hbm2e, + /// HBM3. + Hbm3, + /// SRAM (on-chip). + Sram, + /// Unified memory (Apple Silicon). + Unified, + /// LPDDR (mobile). + Lpddr, +} + +/// Power characteristics. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct PowerCharacteristics { + /// TDP in watts. + pub tdp_watts: u32, + /// Efficiency factor (0.0 - 1.0). + pub efficiency: f64, + /// Power tier. + pub power_tier: PowerTier, +} + +impl Default for PowerCharacteristics { + fn default() -> Self { + Self { + tdp_watts: 100, + efficiency: 0.8, + power_tier: PowerTier::Medium, + } + } +} + +/// Workload characteristics for processor matching. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum WorkloadCharacteristic { + /// High parallelism (GPU, TPU). + HighlyParallel, + /// Sequential dependencies (CPU, LPU). + Sequential, + /// Memory bandwidth bound (GPU). + MemoryBound, + /// Compute bound (TPU). + ComputeBound, + /// Low latency required (NPU, edge). + LowLatency, + /// Low power required (NPU, mobile). + LowPower, + /// Large batch sizes (GPU, TPU). + LargeBatch, + /// Small batch sizes (CPU, LPU). + SmallBatch, + /// Variable length sequences (LPU). + VariableLength, + /// Fixed tensor shapes (TPU). 
+ FixedShape, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cpu_capabilities() { + let caps = ProcessorCapabilities::cpu(32, 3.5, true); + assert!(caps.compute.fp32_tflops > 0.0); + assert!(caps.operations.contains(&OperationType::DataLoad)); + assert!(caps.operations.contains(&OperationType::Tokenization)); + } + + #[test] + fn test_gpu_capabilities() { + let caps = ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1000, (8, 9)); + assert!(caps.compute.fp16_tflops > caps.compute.fp32_tflops); + assert!(caps.operations.contains(&OperationType::FlashAttention)); + } + + #[test] + fn test_tpu_capabilities() { + let caps = ProcessorCapabilities::tpu(super::super::TpuVersion::V5p); + assert!(caps.compute.bf16_tflops > 900.0); + assert!(!caps.operations.contains(&OperationType::DataLoad)); // TPUs don't do I/O + } + + #[test] + fn test_lpu_capabilities() { + let caps = ProcessorCapabilities::lpu(); + assert!(caps.memory.bandwidth_gbps > 10000); // Very high internal bandwidth + assert!(caps.optimal_for.contains(&WorkloadCharacteristic::Sequential)); + } +} diff --git a/crates/synor-compute/src/processor/mod.rs b/crates/synor-compute/src/processor/mod.rs new file mode 100644 index 0000000..3bca36d --- /dev/null +++ b/crates/synor-compute/src/processor/mod.rs @@ -0,0 +1,339 @@ +//! Processor abstractions for heterogeneous compute. +//! +//! Supports all processor types: +//! - CPU (x86_64, ARM64, RISC-V) +//! - GPU (NVIDIA CUDA, AMD ROCm, Intel OneAPI, Apple Metal) +//! - TPU (Google TPU v2-v5) +//! - NPU (Apple Neural Engine, Qualcomm Hexagon, Intel VPU) +//! - LPU (Groq Language Processing Unit) +//! - FPGA (Xilinx, Intel/Altera) +//! - DSP (Digital Signal Processors) +//! 
- Custom accelerators + +mod capabilities; +mod operation; +mod profiles; +mod types; + +pub use capabilities::{ComputeThroughput, MemorySpecs, MemoryType, ProcessorCapabilities}; +pub use operation::{Operation, OperationType}; +pub use profiles::ProcessorProfiles; +pub use types::*; + +use crate::error::ComputeError; +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; +use std::time::Duration; + +/// Unique processor identifier (within a node). +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct ProcessorId(pub u64); + +impl std::fmt::Display for ProcessorId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "proc_{}", self.0) + } +} + +/// Unified abstraction for any processor type. +#[async_trait] +pub trait Processor: Send + Sync { + /// Get processor ID. + fn id(&self) -> ProcessorId; + + /// Get processor type. + fn processor_type(&self) -> ProcessorType; + + /// Get capabilities. + fn capabilities(&self) -> &ProcessorCapabilities; + + /// Check if processor can execute operation. + fn can_execute(&self, op: &Operation) -> bool; + + /// Estimate execution time for operation. + fn estimate_time(&self, op: &Operation) -> Duration; + + /// Estimate energy consumption for operation (Joules). + fn estimate_energy(&self, op: &Operation) -> f64; + + /// Execute operation. + async fn execute(&self, op: Operation) -> Result; + + /// Current utilization (0.0 - 1.0). + fn utilization(&self) -> f64; + + /// Available memory (bytes). + fn available_memory(&self) -> u64; + + /// Check if this processor shares memory with another type. + fn shares_memory_with(&self, other: &ProcessorType) -> bool { + // By default, processors don't share memory + // Override for unified memory architectures (Apple Silicon, AMD APUs) + self.processor_type() == *other + } +} + +/// Result of an operation execution. +#[derive(Clone, Debug)] +pub struct OperationResult { + /// Output data. 
+ pub output: Vec, + /// Execution time. + pub duration: Duration, + /// Energy consumed (Joules). + pub energy: f64, + /// Peak memory used (bytes). + pub peak_memory: u64, +} + +/// Generic processor implementation for simulation/testing. +pub struct GenericProcessor { + id: ProcessorId, + processor_type: ProcessorType, + capabilities: ProcessorCapabilities, + utilization: std::sync::atomic::AtomicU64, + available_memory: std::sync::atomic::AtomicU64, +} + +impl GenericProcessor { + /// Creates a new generic processor. + pub fn new( + id: ProcessorId, + processor_type: ProcessorType, + capabilities: ProcessorCapabilities, + ) -> Self { + let available_memory = capabilities.memory.capacity_bytes; + Self { + id, + processor_type, + capabilities, + utilization: std::sync::atomic::AtomicU64::new(0), + available_memory: std::sync::atomic::AtomicU64::new(available_memory), + } + } + + /// Creates a CPU processor. + pub fn cpu(id: ProcessorId, variant: CpuVariant) -> Self { + Self::new( + id, + ProcessorType::Cpu(variant), + ProcessorProfiles::cpu_default(), + ) + } + + /// Creates an NVIDIA GPU processor. + pub fn nvidia_gpu(id: ProcessorId, compute_capability: (u8, u8)) -> Self { + let capabilities = match compute_capability { + (9, 0) => ProcessorProfiles::nvidia_h100(), + (8, 9) => ProcessorProfiles::nvidia_rtx_4090(), + (8, 6) => ProcessorProfiles::nvidia_rtx_3090(), + _ => ProcessorProfiles::nvidia_default(), + }; + Self::new( + id, + ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability }), + capabilities, + ) + } + + /// Creates a TPU processor. + pub fn tpu(id: ProcessorId, version: TpuVersion) -> Self { + let capabilities = match version { + TpuVersion::V5p => ProcessorProfiles::google_tpu_v5p(), + TpuVersion::V4 => ProcessorProfiles::google_tpu_v4(), + _ => ProcessorProfiles::google_tpu_default(), + }; + Self::new(id, ProcessorType::Tpu(version), capabilities) + } + + /// Creates a Groq LPU processor. 
+ pub fn lpu(id: ProcessorId) -> Self { + Self::new(id, ProcessorType::Lpu, ProcessorProfiles::groq_lpu()) + } + + /// Creates an Apple Neural Engine processor. + pub fn apple_neural_engine(id: ProcessorId, cores: u32) -> Self { + Self::new( + id, + ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores }), + ProcessorProfiles::apple_neural_engine(cores), + ) + } +} + +#[async_trait] +impl Processor for GenericProcessor { + fn id(&self) -> ProcessorId { + self.id + } + + fn processor_type(&self) -> ProcessorType { + self.processor_type.clone() + } + + fn capabilities(&self) -> &ProcessorCapabilities { + &self.capabilities + } + + fn can_execute(&self, op: &Operation) -> bool { + self.capabilities.operations.contains(&op.op_type()) + } + + fn estimate_time(&self, op: &Operation) -> Duration { + // Estimate based on FLOPS and operation complexity + let flops_needed = op.estimated_flops(); + let throughput = match op.precision() { + Precision::Fp32 => self.capabilities.compute.fp32_tflops, + Precision::Fp16 => self.capabilities.compute.fp16_tflops, + Precision::Bf16 => self.capabilities.compute.bf16_tflops, + Precision::Int8 => self.capabilities.compute.int8_tops, + Precision::Int4 => self.capabilities.compute.int4_tops, + Precision::Fp64 => self.capabilities.compute.fp64_tflops, + }; + + if throughput > 0.0 { + let tflops = throughput; + let flops_per_second = tflops * 1e12; + let seconds = flops_needed / flops_per_second; + Duration::from_secs_f64(seconds) + } else { + Duration::from_secs(1) // Fallback + } + } + + fn estimate_energy(&self, op: &Operation) -> f64 { + // Estimate based on TDP and execution time + let duration = self.estimate_time(op); + let watts = self.capabilities.power.tdp_watts as f64; + let efficiency = self.capabilities.power.efficiency; + watts * duration.as_secs_f64() * efficiency + } + + async fn execute(&self, op: Operation) -> Result { + // Check if we can execute + if !self.can_execute(&op) { + return 
Err(ComputeError::OperationNotSupported( + self.processor_type.clone(), + format!("{:?}", op.op_type()), + )); + } + + // Simulate execution + let duration = self.estimate_time(&op); + let energy = self.estimate_energy(&op); + + // Update utilization + self.utilization + .store(50, std::sync::atomic::Ordering::Relaxed); + + // Simulate work + tokio::time::sleep(Duration::from_micros(100)).await; + + // Reset utilization + self.utilization + .store(0, std::sync::atomic::Ordering::Relaxed); + + Ok(OperationResult { + output: vec![], + duration, + energy, + peak_memory: op.estimated_memory(), + }) + } + + fn utilization(&self) -> f64 { + self.utilization.load(std::sync::atomic::Ordering::Relaxed) as f64 / 100.0 + } + + fn available_memory(&self) -> u64 { + self.available_memory + .load(std::sync::atomic::Ordering::Relaxed) + } + + fn shares_memory_with(&self, other: &ProcessorType) -> bool { + match (&self.processor_type, other) { + // Apple Silicon has unified memory + (ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) + | (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. })) + | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. })) + | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true, + // Same type always shares + (a, b) if a == b => true, + _ => false, + } + } +} + +/// Precision for operations. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum Precision { + Fp64, + Fp32, + Fp16, + Bf16, + Int8, + Int4, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_processor_creation() { + let cpu = GenericProcessor::cpu( + ProcessorId(0), + CpuVariant::X86_64 { + avx: AvxSupport::Avx512, + }, + ); + + assert_eq!(cpu.id(), ProcessorId(0)); + assert!(matches!(cpu.processor_type(), ProcessorType::Cpu(_))); + } + + #[test] + fn test_gpu_creation() { + let gpu = GenericProcessor::nvidia_gpu(ProcessorId(1), (9, 0)); + + assert_eq!(gpu.id(), ProcessorId(1)); + assert!(matches!( + gpu.processor_type(), + ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. }) + )); + } + + #[test] + fn test_unified_memory() { + let apple_cpu = GenericProcessor::new( + ProcessorId(0), + ProcessorType::Cpu(CpuVariant::Arm64 { sve: false }), + ProcessorCapabilities::default(), + ); + + assert!(apple_cpu.shares_memory_with(&ProcessorType::Gpu(GpuVariant::AppleMetal))); + } + + #[tokio::test] + async fn test_operation_execution() { + let cpu = GenericProcessor::cpu( + ProcessorId(0), + CpuVariant::X86_64 { + avx: AvxSupport::Avx512, + }, + ); + + let op = Operation::MatMul { + m: 1024, + n: 1024, + k: 1024, + precision: Precision::Fp32, + }; + + // CPU might not support all ops depending on capabilities + // This is testing the infrastructure + let result = cpu.execute(op).await; + // Result depends on capabilities + assert!(result.is_ok() || result.is_err()); + } +} diff --git a/crates/synor-compute/src/processor/operation.rs b/crates/synor-compute/src/processor/operation.rs new file mode 100644 index 0000000..41d52b3 --- /dev/null +++ b/crates/synor-compute/src/processor/operation.rs @@ -0,0 +1,543 @@ +//! Operation definitions for heterogeneous compute. + +use super::Precision; +use serde::{Deserialize, Serialize}; + +/// Operation types for processor matching. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum OperationType { + // Matrix operations + MatMul, + Conv2d, + Conv3d, + DepthwiseConv, + BatchNorm, + LayerNorm, + + // Attention operations + SelfAttention, + CrossAttention, + FlashAttention, + + // Element-wise operations + Add, + Mul, + ReLU, + GeLU, + SiLU, + Softmax, + + // Reduction operations + Sum, + Mean, + Max, + ArgMax, + + // Data movement + Transpose, + Reshape, + Concat, + Split, + Gather, + Scatter, + + // LLM specific + Embedding, + RoPE, // Rotary Position Embedding + KVCache, + TopK, + Sampling, + + // I/O operations + DataLoad, + DataPreprocess, + Tokenization, + Detokenization, + Checkpoint, + + // Distributed operations + AllReduce, + AllGather, + ReduceScatter, + + // Training specific + Backward, + OptimizerStep, + GradientClip, +} + +/// Concrete operation with parameters. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub enum Operation { + /// Matrix multiplication. + MatMul { + m: usize, + n: usize, + k: usize, + precision: Precision, + }, + + /// 2D Convolution. + Conv2d { + batch: usize, + in_channels: usize, + out_channels: usize, + height: usize, + width: usize, + kernel_size: usize, + precision: Precision, + }, + + /// Batch normalization. + BatchNorm { + batch: usize, + channels: usize, + spatial: usize, + precision: Precision, + }, + + /// Layer normalization. + LayerNorm { + batch: usize, + seq_len: usize, + hidden: usize, + precision: Precision, + }, + + /// Self-attention. + SelfAttention { + batch: usize, + seq_len: usize, + num_heads: usize, + head_dim: usize, + precision: Precision, + }, + + /// Flash attention (fused, memory efficient). + FlashAttention { + batch: usize, + seq_len: usize, + num_heads: usize, + head_dim: usize, + precision: Precision, + }, + + /// Element-wise addition. + Add { + elements: usize, + precision: Precision, + }, + + /// Element-wise multiplication. 
+ Mul { + elements: usize, + precision: Precision, + }, + + /// ReLU activation. + ReLU { elements: usize }, + + /// GeLU activation. + GeLU { elements: usize }, + + /// SiLU (Swish) activation. + SiLU { elements: usize }, + + /// Softmax. + Softmax { + batch: usize, + seq_len: usize, + precision: Precision, + }, + + /// Embedding lookup. + Embedding { + batch: usize, + seq_len: usize, + vocab_size: usize, + embed_dim: usize, + precision: Precision, + }, + + /// Rotary Position Embedding. + RoPE { + batch: usize, + seq_len: usize, + head_dim: usize, + precision: Precision, + }, + + /// KV Cache update. + KVCache { + batch: usize, + seq_len: usize, + num_heads: usize, + head_dim: usize, + precision: Precision, + }, + + /// Top-K sampling. + TopK { + batch: usize, + vocab_size: usize, + k: usize, + }, + + /// Token sampling. + Sampling { + batch: usize, + vocab_size: usize, + temperature: f32, + }, + + /// Data loading from storage. + DataLoad { + bytes: usize, + async_: bool, + }, + + /// Data preprocessing. + DataPreprocess { + batch: usize, + transforms: Vec, + }, + + /// Tokenization. + Tokenization { + text_bytes: usize, + vocab_size: usize, + }, + + /// Detokenization. + Detokenization { + tokens: usize, + vocab_size: usize, + }, + + /// Checkpoint save. + Checkpoint { + bytes: usize, + async_: bool, + }, + + /// All-reduce across devices. + AllReduce { + elements: usize, + precision: Precision, + devices: usize, + }, + + /// Backward pass for a layer. + Backward { + forward_op: Box, + }, + + /// Optimizer step. + OptimizerStep { + parameters: usize, + optimizer: String, + precision: Precision, + }, + + /// Transpose. + Transpose { + shape: Vec, + axes: Vec, + }, + + /// Reshape. + Reshape { + from: Vec, + to: Vec, + }, + + /// Concatenate tensors. + Concat { + shapes: Vec>, + axis: usize, + }, + + /// Generic operation. + Generic { + op_type: OperationType, + flops: f64, + memory: u64, + }, +} + +impl Operation { + /// Returns the operation type. 
+ pub fn op_type(&self) -> OperationType { + match self { + Operation::MatMul { .. } => OperationType::MatMul, + Operation::Conv2d { .. } => OperationType::Conv2d, + Operation::BatchNorm { .. } => OperationType::BatchNorm, + Operation::LayerNorm { .. } => OperationType::LayerNorm, + Operation::SelfAttention { .. } => OperationType::SelfAttention, + Operation::FlashAttention { .. } => OperationType::FlashAttention, + Operation::Add { .. } => OperationType::Add, + Operation::Mul { .. } => OperationType::Mul, + Operation::ReLU { .. } => OperationType::ReLU, + Operation::GeLU { .. } => OperationType::GeLU, + Operation::SiLU { .. } => OperationType::SiLU, + Operation::Softmax { .. } => OperationType::Softmax, + Operation::Embedding { .. } => OperationType::Embedding, + Operation::RoPE { .. } => OperationType::RoPE, + Operation::KVCache { .. } => OperationType::KVCache, + Operation::TopK { .. } => OperationType::TopK, + Operation::Sampling { .. } => OperationType::Sampling, + Operation::DataLoad { .. } => OperationType::DataLoad, + Operation::DataPreprocess { .. } => OperationType::DataPreprocess, + Operation::Tokenization { .. } => OperationType::Tokenization, + Operation::Detokenization { .. } => OperationType::Detokenization, + Operation::Checkpoint { .. } => OperationType::Checkpoint, + Operation::AllReduce { .. } => OperationType::AllReduce, + Operation::Backward { .. } => OperationType::Backward, + Operation::OptimizerStep { .. } => OperationType::OptimizerStep, + Operation::Transpose { .. } => OperationType::Transpose, + Operation::Reshape { .. } => OperationType::Reshape, + Operation::Concat { .. } => OperationType::Concat, + Operation::Generic { op_type, .. } => *op_type, + } + } + + /// Returns the precision used. + pub fn precision(&self) -> Precision { + match self { + Operation::MatMul { precision, .. } + | Operation::Conv2d { precision, .. } + | Operation::BatchNorm { precision, .. } + | Operation::LayerNorm { precision, .. 
} + | Operation::SelfAttention { precision, .. } + | Operation::FlashAttention { precision, .. } + | Operation::Add { precision, .. } + | Operation::Mul { precision, .. } + | Operation::Softmax { precision, .. } + | Operation::Embedding { precision, .. } + | Operation::RoPE { precision, .. } + | Operation::KVCache { precision, .. } + | Operation::AllReduce { precision, .. } + | Operation::OptimizerStep { precision, .. } => *precision, + Operation::Backward { forward_op } => forward_op.precision(), + _ => Precision::Fp32, // Default + } + } + + /// Estimates FLOPS for the operation. + pub fn estimated_flops(&self) -> f64 { + match self { + // MatMul: 2 * M * N * K (multiply-add) + Operation::MatMul { m, n, k, .. } => 2.0 * (*m as f64) * (*n as f64) * (*k as f64), + + // Conv2d: 2 * batch * out * H * W * in * K * K + Operation::Conv2d { + batch, + in_channels, + out_channels, + height, + width, + kernel_size, + .. + } => { + 2.0 * (*batch as f64) + * (*out_channels as f64) + * (*height as f64) + * (*width as f64) + * (*in_channels as f64) + * (*kernel_size as f64) + * (*kernel_size as f64) + } + + // Self-attention: 4 * batch * seq * seq * head_dim * heads + Operation::SelfAttention { + batch, + seq_len, + num_heads, + head_dim, + .. + } + | Operation::FlashAttention { + batch, + seq_len, + num_heads, + head_dim, + .. + } => { + 4.0 * (*batch as f64) + * (*seq_len as f64) + * (*seq_len as f64) + * (*head_dim as f64) + * (*num_heads as f64) + } + + // Element-wise: 1 FLOP per element + Operation::Add { elements, .. } + | Operation::Mul { elements, .. } + | Operation::ReLU { elements } + | Operation::GeLU { elements } + | Operation::SiLU { elements } => *elements as f64, + + // Softmax: ~5 ops per element (exp, sum, div) + Operation::Softmax { + batch, seq_len, .. + } => 5.0 * (*batch as f64) * (*seq_len as f64), + + // Embedding: just lookup, minimal FLOPS + Operation::Embedding { + batch, + seq_len, + embed_dim, + .. 
+ } => (*batch as f64) * (*seq_len as f64) * (*embed_dim as f64) * 0.1, + + // Backward: ~2x forward + Operation::Backward { forward_op } => forward_op.estimated_flops() * 2.0, + + // Generic + Operation::Generic { flops, .. } => *flops, + + // I/O operations: minimal compute + _ => 1000.0, + } + } + + /// Estimates memory usage (bytes). + pub fn estimated_memory(&self) -> u64 { + let precision_bytes = match self.precision() { + Precision::Fp64 => 8, + Precision::Fp32 => 4, + Precision::Fp16 | Precision::Bf16 => 2, + Precision::Int8 => 1, + Precision::Int4 => 1, // Rounded up + }; + + match self { + Operation::MatMul { m, n, k, .. } => { + // Input A (m×k) + Input B (k×n) + Output (m×n) + ((*m * *k) + (*k * *n) + (*m * *n)) as u64 * precision_bytes + } + + Operation::SelfAttention { + batch, + seq_len, + num_heads, + head_dim, + .. + } => { + // Q, K, V, Output, intermediate attention + 5 * (*batch as u64) + * (*seq_len as u64) + * (*num_heads as u64) + * (*head_dim as u64) + * precision_bytes + } + + Operation::FlashAttention { + batch, + seq_len, + num_heads, + head_dim, + .. + } => { + // FlashAttention uses much less memory + 2 * (*batch as u64) + * (*seq_len as u64) + * (*num_heads as u64) + * (*head_dim as u64) + * precision_bytes + } + + Operation::KVCache { + batch, + seq_len, + num_heads, + head_dim, + .. + } => { + // K and V caches + 2 * (*batch as u64) + * (*seq_len as u64) + * (*num_heads as u64) + * (*head_dim as u64) + * precision_bytes + } + + Operation::Generic { memory, .. } => *memory, + + _ => 1024 * 1024, // 1 MB default + } + } + + /// Creates the backward operation for this operation. + pub fn backward(&self) -> Option { + match self { + Operation::MatMul { .. } + | Operation::Conv2d { .. } + | Operation::SelfAttention { .. } + | Operation::FlashAttention { .. } + | Operation::LayerNorm { .. } + | Operation::BatchNorm { .. 
} => Some(Operation::Backward { + forward_op: Box::new(self.clone()), + }), + _ => None, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_matmul_flops() { + let op = Operation::MatMul { + m: 1024, + n: 1024, + k: 1024, + precision: Precision::Fp32, + }; + + let flops = op.estimated_flops(); + // 2 * 1024^3 = ~2.1 billion FLOPS + assert!(flops > 2e9 && flops < 2.2e9); + } + + #[test] + fn test_attention_memory() { + let regular = Operation::SelfAttention { + batch: 1, + seq_len: 4096, + num_heads: 32, + head_dim: 128, + precision: Precision::Fp16, + }; + + let flash = Operation::FlashAttention { + batch: 1, + seq_len: 4096, + num_heads: 32, + head_dim: 128, + precision: Precision::Fp16, + }; + + // FlashAttention should use less memory + assert!(flash.estimated_memory() < regular.estimated_memory()); + } + + #[test] + fn test_backward_creation() { + let forward = Operation::MatMul { + m: 1024, + n: 1024, + k: 1024, + precision: Precision::Fp32, + }; + + let backward = forward.backward(); + assert!(backward.is_some()); + + if let Some(Operation::Backward { forward_op }) = backward { + assert!(matches!(*forward_op, Operation::MatMul { .. })); + } + } +} diff --git a/crates/synor-compute/src/processor/profiles.rs b/crates/synor-compute/src/processor/profiles.rs new file mode 100644 index 0000000..f61be69 --- /dev/null +++ b/crates/synor-compute/src/processor/profiles.rs @@ -0,0 +1,513 @@ +//! Pre-defined processor profiles for common hardware. + +use super::capabilities::{ + ComputeThroughput, MemorySpecs, MemoryType, PowerCharacteristics, ProcessorCapabilities, + WorkloadCharacteristic, +}; +use super::operation::OperationType; +use super::types::PowerTier; +use super::TpuVersion; +use std::collections::HashSet; + +/// Pre-defined processor profiles. 
+pub struct ProcessorProfiles; + +impl ProcessorProfiles { + // ═══════════════════════════════════════════════════════════════ + // CPU PROFILES + // ═══════════════════════════════════════════════════════════════ + + /// Default CPU profile. + pub fn cpu_default() -> ProcessorCapabilities { + ProcessorCapabilities::cpu(8, 3.5, false) + } + + /// AMD EPYC 9654 (96 cores). + pub fn amd_epyc_9654() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 2.7, + fp32_tflops: 5.4, + fp16_tflops: 10.8, + bf16_tflops: 10.8, + int8_tops: 21.6, + int4_tops: 43.2, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 6 * 1024 * 1024 * 1024 * 1024, // 6 TB max + bandwidth_gbps: 460, + type_: MemoryType::Ddr5, + }, + operations: ProcessorCapabilities::cpu(96, 2.4, false) + .operations, + power: PowerCharacteristics { + tdp_watts: 360, + efficiency: 0.85, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::MemoryBound, + ], + } + } + + /// Intel Xeon w9-3595X (56 cores). + pub fn intel_xeon_w9_3595x() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 3.2, + fp32_tflops: 6.4, + fp16_tflops: 12.8, + bf16_tflops: 12.8, + int8_tops: 25.6, + int4_tops: 51.2, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 4 * 1024 * 1024 * 1024 * 1024, // 4 TB max + bandwidth_gbps: 307, + type_: MemoryType::Ddr5, + }, + operations: ProcessorCapabilities::cpu(56, 2.9, true) + .operations, + power: PowerCharacteristics { + tdp_watts: 350, + efficiency: 0.80, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::MemoryBound, + ], + } + } + + /// Apple M3 Max CPU cores. 
+ pub fn apple_m3_max_cpu() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 0.3, + fp32_tflops: 0.6, + fp16_tflops: 1.2, + bf16_tflops: 1.2, + int8_tops: 2.4, + int4_tops: 4.8, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 128 * 1024 * 1024 * 1024, // 128 GB unified + bandwidth_gbps: 400, + type_: MemoryType::Unified, + }, + operations: ProcessorCapabilities::cpu(16, 4.0, false) + .operations, + power: PowerCharacteristics { + tdp_watts: 40, + efficiency: 0.95, + power_tier: PowerTier::Low, + }, + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::LowPower, + ], + } + } + + // ═══════════════════════════════════════════════════════════════ + // NVIDIA GPU PROFILES + // ═══════════════════════════════════════════════════════════════ + + /// Default NVIDIA GPU profile. + pub fn nvidia_default() -> ProcessorCapabilities { + ProcessorCapabilities::nvidia_gpu(8192, 256, 12, 600, (8, 0)) + } + + /// NVIDIA H100 SXM (80GB). + pub fn nvidia_h100() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 67.0, + fp32_tflops: 67.0, + fp16_tflops: 1979.0, // With sparsity + bf16_tflops: 1979.0, + int8_tops: 3958.0, + int4_tops: 7916.0, + sparsity_speedup: 2.0, + }, + memory: MemorySpecs { + capacity_bytes: 80 * 1024 * 1024 * 1024, + bandwidth_gbps: 3350, + type_: MemoryType::Hbm3, + }, + operations: ProcessorCapabilities::nvidia_gpu(16896, 528, 80, 3350, (9, 0)) + .operations, + power: PowerCharacteristics { + tdp_watts: 700, + efficiency: 0.90, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + WorkloadCharacteristic::ComputeBound, + ], + } + } + + /// NVIDIA A100 (80GB). 
+ pub fn nvidia_a100() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 19.5, + fp32_tflops: 19.5, + fp16_tflops: 624.0, // With sparsity + bf16_tflops: 624.0, + int8_tops: 1248.0, + int4_tops: 2496.0, + sparsity_speedup: 2.0, + }, + memory: MemorySpecs { + capacity_bytes: 80 * 1024 * 1024 * 1024, + bandwidth_gbps: 2039, + type_: MemoryType::Hbm2e, + }, + operations: ProcessorCapabilities::nvidia_gpu(6912, 432, 80, 2039, (8, 0)) + .operations, + power: PowerCharacteristics { + tdp_watts: 400, + efficiency: 0.88, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + WorkloadCharacteristic::ComputeBound, + ], + } + } + + /// NVIDIA RTX 4090. + pub fn nvidia_rtx_4090() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 1.3, + fp32_tflops: 82.6, + fp16_tflops: 330.4, // With sparsity + bf16_tflops: 330.4, + int8_tops: 660.8, + int4_tops: 1321.6, + sparsity_speedup: 2.0, + }, + memory: MemorySpecs { + capacity_bytes: 24 * 1024 * 1024 * 1024, + bandwidth_gbps: 1008, + type_: MemoryType::Gddr6, + }, + operations: ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1008, (8, 9)) + .operations, + power: PowerCharacteristics { + tdp_watts: 450, + efficiency: 0.85, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + ], + } + } + + /// NVIDIA RTX 3090. 
+ pub fn nvidia_rtx_3090() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 0.6, + fp32_tflops: 35.6, + fp16_tflops: 71.2, + bf16_tflops: 71.2, + int8_tops: 142.4, + int4_tops: 284.8, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 24 * 1024 * 1024 * 1024, + bandwidth_gbps: 936, + type_: MemoryType::Gddr6, + }, + operations: ProcessorCapabilities::nvidia_gpu(10496, 328, 24, 936, (8, 6)) + .operations, + power: PowerCharacteristics { + tdp_watts: 350, + efficiency: 0.82, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + ], + } + } + + // ═══════════════════════════════════════════════════════════════ + // AMD GPU PROFILES + // ═══════════════════════════════════════════════════════════════ + + /// AMD MI300X. + pub fn amd_mi300x() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 163.4, + fp32_tflops: 163.4, + fp16_tflops: 1307.0, + bf16_tflops: 1307.0, + int8_tops: 2614.0, + int4_tops: 5228.0, + sparsity_speedup: 2.0, + }, + memory: MemorySpecs { + capacity_bytes: 192 * 1024 * 1024 * 1024, // 192 GB HBM3 + bandwidth_gbps: 5300, + type_: MemoryType::Hbm3, + }, + operations: { + let mut ops = ProcessorCapabilities::nvidia_gpu(16384, 512, 80, 5300, (9, 0)) + .operations; + ops.remove(&OperationType::FlashAttention); // Different implementation + ops + }, + power: PowerCharacteristics { + tdp_watts: 750, + efficiency: 0.88, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + WorkloadCharacteristic::MemoryBound, // High memory bandwidth + ], + } + } + + /// AMD RX 7900 XTX. 
+ pub fn amd_rx_7900_xtx() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 1.9, + fp32_tflops: 61.0, + fp16_tflops: 122.0, + bf16_tflops: 122.0, + int8_tops: 244.0, + int4_tops: 488.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 24 * 1024 * 1024 * 1024, + bandwidth_gbps: 960, + type_: MemoryType::Gddr6, + }, + operations: { + let mut ops = ProcessorCapabilities::nvidia_gpu(6144, 0, 24, 960, (8, 0)) + .operations; + ops.remove(&OperationType::FlashAttention); + ops + }, + power: PowerCharacteristics { + tdp_watts: 355, + efficiency: 0.80, + power_tier: PowerTier::High, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + ], + } + } + + // ═══════════════════════════════════════════════════════════════ + // GOOGLE TPU PROFILES + // ═══════════════════════════════════════════════════════════════ + + /// Default TPU profile. + pub fn google_tpu_default() -> ProcessorCapabilities { + ProcessorCapabilities::tpu(TpuVersion::V4) + } + + /// Google TPU v5p. + pub fn google_tpu_v5p() -> ProcessorCapabilities { + ProcessorCapabilities::tpu(TpuVersion::V5p) + } + + /// Google TPU v4. + pub fn google_tpu_v4() -> ProcessorCapabilities { + ProcessorCapabilities::tpu(TpuVersion::V4) + } + + /// Google Edge TPU. 
+ pub fn google_edge_tpu() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 0.0, + fp32_tflops: 0.0, + fp16_tflops: 0.0, + bf16_tflops: 0.0, + int8_tops: 4.0, + int4_tops: 8.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 0, // Uses host memory + bandwidth_gbps: 0, + type_: MemoryType::Unified, + }, + operations: { + let mut ops = HashSet::new(); + ops.insert(OperationType::MatMul); + ops.insert(OperationType::Conv2d); + ops.insert(OperationType::DepthwiseConv); + ops.insert(OperationType::Add); + ops.insert(OperationType::Mul); + ops.insert(OperationType::ReLU); + ops.insert(OperationType::Softmax); + ops + }, + power: PowerCharacteristics { + tdp_watts: 2, + efficiency: 0.95, + power_tier: PowerTier::UltraLow, + }, + optimal_for: vec![ + WorkloadCharacteristic::LowPower, + WorkloadCharacteristic::LowLatency, + WorkloadCharacteristic::SmallBatch, + ], + } + } + + // ═══════════════════════════════════════════════════════════════ + // GROQ LPU PROFILE + // ═══════════════════════════════════════════════════════════════ + + /// Groq LPU. + pub fn groq_lpu() -> ProcessorCapabilities { + ProcessorCapabilities::lpu() + } + + // ═══════════════════════════════════════════════════════════════ + // APPLE NEURAL ENGINE PROFILES + // ═══════════════════════════════════════════════════════════════ + + /// Apple Neural Engine (generic). + pub fn apple_neural_engine(cores: u32) -> ProcessorCapabilities { + ProcessorCapabilities::apple_neural_engine(cores) + } + + /// Apple M3 Neural Engine (16 cores). + pub fn apple_m3_neural_engine() -> ProcessorCapabilities { + ProcessorCapabilities::apple_neural_engine(16) + } + + /// Apple M3 Max Neural Engine (16 cores). + pub fn apple_m3_max_neural_engine() -> ProcessorCapabilities { + ProcessorCapabilities::apple_neural_engine(16) // Same as M3 + } + + /// Apple A17 Pro Neural Engine (35 TOPS). 
+ pub fn apple_a17_pro_neural_engine() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 0.0, + fp32_tflops: 4.4, + fp16_tflops: 8.8, + bf16_tflops: 8.8, + int8_tops: 35.0, + int4_tops: 70.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 0, // Uses unified memory + bandwidth_gbps: 200, + type_: MemoryType::Unified, + }, + operations: ProcessorCapabilities::apple_neural_engine(16) + .operations, + power: PowerCharacteristics { + tdp_watts: 8, + efficiency: 0.98, + power_tier: PowerTier::UltraLow, + }, + optimal_for: vec![ + WorkloadCharacteristic::LowPower, + WorkloadCharacteristic::LowLatency, + WorkloadCharacteristic::SmallBatch, + ], + } + } + + // ═══════════════════════════════════════════════════════════════ + // QUALCOMM NPU PROFILES + // ═══════════════════════════════════════════════════════════════ + + /// Qualcomm Hexagon NPU (Snapdragon 8 Gen 3). + pub fn qualcomm_hexagon_8g3() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 0.0, + fp32_tflops: 3.0, + fp16_tflops: 6.0, + bf16_tflops: 6.0, + int8_tops: 73.0, // 73 TOPS + int4_tops: 146.0, + sparsity_speedup: 1.0, + }, + memory: MemorySpecs { + capacity_bytes: 0, // Uses system memory + bandwidth_gbps: 77, + type_: MemoryType::Lpddr, + }, + operations: ProcessorCapabilities::apple_neural_engine(16) + .operations, + power: PowerCharacteristics { + tdp_watts: 10, + efficiency: 0.95, + power_tier: PowerTier::UltraLow, + }, + optimal_for: vec![ + WorkloadCharacteristic::LowPower, + WorkloadCharacteristic::LowLatency, + WorkloadCharacteristic::SmallBatch, + ], + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_h100_profile() { + let h100 = ProcessorProfiles::nvidia_h100(); + assert!(h100.compute.fp16_tflops > 1000.0); + assert_eq!(h100.memory.capacity_bytes, 80 * 1024 * 1024 * 1024); + } + + #[test] + fn test_tpu_v5p_profile() { + let tpu = 
ProcessorProfiles::google_tpu_v5p(); + assert!(tpu.compute.bf16_tflops > 900.0); + } + + #[test] + fn test_groq_lpu_profile() { + let lpu = ProcessorProfiles::groq_lpu(); + assert!(lpu.memory.bandwidth_gbps > 50000); // Very high internal bandwidth + } + + #[test] + fn test_apple_ane_profile() { + let ane = ProcessorProfiles::apple_m3_neural_engine(); + assert!(ane.power.tdp_watts < 20); + assert!(ane.optimal_for.contains(&WorkloadCharacteristic::LowPower)); + } +} diff --git a/crates/synor-compute/src/processor/types.rs b/crates/synor-compute/src/processor/types.rs new file mode 100644 index 0000000..7e9ac3e --- /dev/null +++ b/crates/synor-compute/src/processor/types.rs @@ -0,0 +1,367 @@ +//! Processor type definitions. + +use serde::{Deserialize, Serialize}; + +/// All supported processor types. +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum ProcessorType { + /// Central Processing Unit. + Cpu(CpuVariant), + /// Graphics Processing Unit. + Gpu(GpuVariant), + /// Tensor Processing Unit (Google). + Tpu(TpuVersion), + /// Neural Processing Unit (various vendors). + Npu(NpuVariant), + /// Language Processing Unit (Groq). + Lpu, + /// Field Programmable Gate Array. + Fpga(FpgaVendor), + /// Digital Signal Processor. + Dsp(DspVariant), + /// WebGPU (browser). + WebGpu, + /// WebAssembly runtime. + Wasm, + /// Custom/Unknown accelerator. + Custom { + vendor: String, + model: String, + }, +} + +impl Default for ProcessorType { + fn default() -> Self { + ProcessorType::Cpu(CpuVariant::default()) + } +} + +/// CPU architecture variants. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum CpuVariant { + /// x86-64 architecture. + X86_64 { avx: AvxSupport }, + /// ARM 64-bit architecture. + Arm64 { sve: bool }, + /// RISC-V architecture. 
+ RiscV { vector: bool }, +} + +impl Default for CpuVariant { + fn default() -> Self { + CpuVariant::X86_64 { + avx: AvxSupport::Avx2, + } + } +} + +/// AVX instruction set support levels. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)] +pub enum AvxSupport { + /// No AVX. + None, + /// AVX (Sandy Bridge+). + Avx, + /// AVX2 (Haswell+). + Avx2, + /// AVX-512 (Skylake-X+). + Avx512, + /// AVX10 (future). + Avx10, +} + +/// GPU vendor variants. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum GpuVariant { + /// NVIDIA CUDA GPU. + NvidiaCuda { + /// Compute capability (major, minor). + compute_capability: (u8, u8), + }, + /// AMD ROCm GPU. + AmdRocm { + /// GFX version (e.g., 1100 for RDNA3). + gfx_version: u32, + }, + /// Intel OneAPI GPU. + IntelOneApi, + /// Apple Metal GPU. + AppleMetal, + /// Qualcomm Adreno GPU. + QualcommAdreno { + /// Adreno model number. + model: u32, + }, + /// ARM Mali GPU. + ArmMali { + /// Mali generation (e.g., G710). + model: u32, + }, + /// IMG PowerVR GPU. + ImgPowerVr, +} + +/// Google TPU versions. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum TpuVersion { + /// TPU v2. + V2, + /// TPU v3. + V3, + /// TPU v4. + V4, + /// TPU v4i (inference). + V4i, + /// TPU v5e (efficiency). + V5e, + /// TPU v5p (performance). + V5p, + /// Edge TPU. + Edge, +} + +/// NPU (Neural Processing Unit) variants. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum NpuVariant { + /// Apple Neural Engine. + AppleNeuralEngine { + /// Number of cores. + cores: u32, + }, + /// Qualcomm Hexagon DSP/NPU. + QualcommHexagon { + /// Version number. + version: u32, + }, + /// Intel VPU (Movidius). + IntelVpu, + /// Huawei Ascend. + HuaweiAscend { + /// Model (310, 910, etc.). + model: u32, + }, + /// Google Edge TPU. + GoogleEdgeTpu, + /// Samsung NPU. + SamsungNpu, + /// MediaTek APU. 
+ MediaTekApu { + /// Version. + version: u32, + }, + /// Custom NPU. + Custom { + /// TOPS (Tera Operations Per Second). + tops: u32, + }, +} + +/// FPGA vendors. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum FpgaVendor { + /// Xilinx (AMD). + Xilinx, + /// Intel (Altera). + Intel, + /// Lattice. + Lattice, + /// Microchip. + Microchip, +} + +/// DSP (Digital Signal Processor) variants. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum DspVariant { + /// Texas Instruments DSP. + TexasInstruments, + /// Analog Devices DSP. + AnalogDevices, + /// Qualcomm Hexagon DSP. + QualcommHexagon, + /// Custom DSP. + Custom, +} + +impl ProcessorType { + /// Returns whether this processor type supports CUDA. + pub fn supports_cuda(&self) -> bool { + matches!(self, ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. })) + } + + /// Returns whether this processor type supports ROCm. + pub fn supports_rocm(&self) -> bool { + matches!(self, ProcessorType::Gpu(GpuVariant::AmdRocm { .. })) + } + + /// Returns whether this processor type supports Metal. + pub fn supports_metal(&self) -> bool { + matches!(self, ProcessorType::Gpu(GpuVariant::AppleMetal)) + } + + /// Returns whether this processor type is a GPU. + pub fn is_gpu(&self) -> bool { + matches!(self, ProcessorType::Gpu(_)) + } + + /// Returns whether this processor type is a CPU. + pub fn is_cpu(&self) -> bool { + matches!(self, ProcessorType::Cpu(_)) + } + + /// Returns whether this processor type is suitable for parallel workloads. + pub fn is_parallel(&self) -> bool { + matches!( + self, + ProcessorType::Gpu(_) | ProcessorType::Tpu(_) | ProcessorType::Fpga(_) + ) + } + + /// Returns whether this processor type is suitable for sequential workloads. + pub fn is_sequential(&self) -> bool { + matches!(self, ProcessorType::Cpu(_) | ProcessorType::Lpu) + } + + /// Returns whether this processor type is power-efficient. 
+ pub fn is_low_power(&self) -> bool { + matches!( + self, + ProcessorType::Npu(_) | ProcessorType::Tpu(TpuVersion::Edge) | ProcessorType::Wasm + ) + } + + /// Returns the typical power consumption tier. + pub fn power_tier(&self) -> PowerTier { + match self { + ProcessorType::Npu(_) | ProcessorType::Wasm => PowerTier::UltraLow, + ProcessorType::Cpu(CpuVariant::Arm64 { .. }) => PowerTier::Low, + ProcessorType::Cpu(_) => PowerTier::Medium, + ProcessorType::Gpu(GpuVariant::AppleMetal) => PowerTier::Medium, + ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability }) + if compute_capability.0 >= 8 => + { + PowerTier::High + } + ProcessorType::Gpu(_) => PowerTier::Medium, + ProcessorType::Tpu(TpuVersion::Edge) => PowerTier::UltraLow, + ProcessorType::Tpu(_) => PowerTier::High, + ProcessorType::Lpu => PowerTier::Medium, + ProcessorType::Fpga(_) => PowerTier::Medium, + ProcessorType::Dsp(_) => PowerTier::Low, + ProcessorType::WebGpu => PowerTier::Low, + ProcessorType::Custom { .. } => PowerTier::Medium, + } + } +} + +/// Power consumption tiers. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub enum PowerTier { + /// < 5W (mobile, IoT). + UltraLow, + /// 5-30W (laptop, tablet). + Low, + /// 30-150W (desktop, workstation). + Medium, + /// > 150W (server, data center). + High, +} + +/// Device class for routing decisions. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum DeviceClass { + /// Data center equipment. + DataCenter, + /// Desktop/workstation. + Desktop, + /// Laptop. + Laptop, + /// Mobile phone. + Mobile, + /// Tablet. + Tablet, + /// IoT device. + IoT, + /// Browser (WebGPU/WASM). + Browser, + /// Edge server. + Edge, +} + +impl DeviceClass { + /// Returns typical available compute hours per day. 
+ pub fn typical_availability_hours(&self) -> f32 { + match self { + DeviceClass::DataCenter => 24.0, + DeviceClass::Desktop => 8.0, + DeviceClass::Laptop => 6.0, + DeviceClass::Mobile => 4.0, + DeviceClass::Tablet => 4.0, + DeviceClass::IoT => 24.0, + DeviceClass::Browser => 2.0, + DeviceClass::Edge => 24.0, + } + } + + /// Returns reliability score (0-100). + pub fn reliability_score(&self) -> u32 { + match self { + DeviceClass::DataCenter => 99, + DeviceClass::Edge => 95, + DeviceClass::Desktop => 80, + DeviceClass::Laptop => 60, + DeviceClass::Mobile => 40, + DeviceClass::Tablet => 50, + DeviceClass::IoT => 70, + DeviceClass::Browser => 30, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_processor_type_properties() { + let nvidia = ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + }); + assert!(nvidia.supports_cuda()); + assert!(nvidia.is_gpu()); + assert!(nvidia.is_parallel()); + + let cpu = ProcessorType::Cpu(CpuVariant::X86_64 { + avx: AvxSupport::Avx512, + }); + assert!(cpu.is_cpu()); + assert!(cpu.is_sequential()); + + let lpu = ProcessorType::Lpu; + assert!(lpu.is_sequential()); + + let npu = ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 }); + assert!(npu.is_low_power()); + } + + #[test] + fn test_power_tiers() { + let h100 = ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + }); + assert_eq!(h100.power_tier(), PowerTier::High); + + let npu = ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 }); + assert_eq!(npu.power_tier(), PowerTier::UltraLow); + + let arm = ProcessorType::Cpu(CpuVariant::Arm64 { sve: false }); + assert_eq!(arm.power_tier(), PowerTier::Low); + } + + #[test] + fn test_device_class() { + assert_eq!(DeviceClass::DataCenter.typical_availability_hours(), 24.0); + assert_eq!(DeviceClass::Mobile.typical_availability_hours(), 4.0); + assert_eq!(DeviceClass::DataCenter.reliability_score(), 99); + 
assert_eq!(DeviceClass::Browser.reliability_score(), 30); + } +} diff --git a/crates/synor-compute/src/scheduler/load_balancer.rs b/crates/synor-compute/src/scheduler/load_balancer.rs new file mode 100644 index 0000000..17a695e --- /dev/null +++ b/crates/synor-compute/src/scheduler/load_balancer.rs @@ -0,0 +1,810 @@ +//! Load balancer with work stealing for heterogeneous compute. +//! +//! Supports: +//! - Cross-processor-type work migration +//! - Energy-aware balancing +//! - Latency-aware scheduling +//! - Real-time utilization metrics + +use crate::device::{DeviceInfo, DeviceRegistry}; +use crate::processor::{Operation, OperationType, ProcessorId, ProcessorType}; +use crate::task::{Task, TaskId, TaskPriority}; +use super::TaskAssignment; +use parking_lot::RwLock; +use std::collections::HashMap; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +/// Balancing strategy for the load balancer. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum BalancingStrategy { + /// Optimize for speed (minimize execution time). + Speed, + /// Optimize for energy efficiency. + Energy, + /// Balance speed and energy. + Balanced, + /// Optimize for cost (spot pricing). + Cost, + /// Optimize for latency (inference workloads). + Latency, +} + +impl Default for BalancingStrategy { + fn default() -> Self { + BalancingStrategy::Balanced + } +} + +/// Real-time processor metrics. +#[derive(Clone, Debug, Default)] +pub struct ProcessorMetrics { + /// Current utilization (0.0 - 1.0). + pub utilization: f64, + /// Queue depth (pending tasks). + pub queue_depth: u64, + /// Average task completion time (ms). + pub avg_completion_ms: f64, + /// Tasks completed in last minute. + pub throughput_per_min: u64, + /// Current power draw (watts). + pub power_watts: f64, + /// Temperature (celsius). + pub temperature: f64, + /// Last updated timestamp. 
+ pub last_updated: Option, +} + +/// Load balancer for heterogeneous compute environments. +pub struct LoadBalancer { + /// Device registry for processor info. + device_registry: Option>, + /// Current load per processor (task count). + loads: RwLock>, + /// Real-time metrics per processor. + metrics: RwLock>, + /// Processor type mapping. + processor_types: RwLock>, + /// Work stealing threshold (0.0 - 1.0). + steal_threshold: f64, + /// Rebalance threshold (0.0 - 1.0). + rebalance_threshold: f64, + /// Current balancing strategy. + strategy: RwLock, + /// Migration history (to prevent thrashing). + migration_history: RwLock>, +} + +/// Record of a task migration. +#[derive(Clone, Debug)] +struct MigrationRecord { + task_id: TaskId, + from: ProcessorId, + to: ProcessorId, + timestamp: Instant, +} + +impl LoadBalancer { + /// Creates a new load balancer. + pub fn new() -> Self { + Self { + device_registry: None, + loads: RwLock::new(HashMap::new()), + metrics: RwLock::new(HashMap::new()), + processor_types: RwLock::new(HashMap::new()), + steal_threshold: 0.3, + rebalance_threshold: 0.2, + strategy: RwLock::new(BalancingStrategy::default()), + migration_history: RwLock::new(Vec::new()), + } + } + + /// Creates a load balancer with device registry. + pub fn with_registry(device_registry: Arc) -> Self { + Self { + device_registry: Some(device_registry), + loads: RwLock::new(HashMap::new()), + metrics: RwLock::new(HashMap::new()), + processor_types: RwLock::new(HashMap::new()), + steal_threshold: 0.3, + rebalance_threshold: 0.2, + strategy: RwLock::new(BalancingStrategy::default()), + migration_history: RwLock::new(Vec::new()), + } + } + + /// Sets the balancing strategy. + pub fn set_strategy(&self, strategy: BalancingStrategy) { + *self.strategy.write() = strategy; + } + + /// Gets the current strategy. + pub fn strategy(&self) -> BalancingStrategy { + *self.strategy.read() + } + + /// Register a processor with its type. 
+ pub fn register_processor(&self, processor_id: ProcessorId, processor_type: ProcessorType) { + self.loads.write().insert(processor_id, AtomicU64::new(0)); + self.metrics.write().insert(processor_id, ProcessorMetrics::default()); + self.processor_types.write().insert(processor_id, processor_type); + } + + /// Unregister a processor. + pub fn unregister_processor(&self, processor_id: ProcessorId) { + self.loads.write().remove(&processor_id); + self.metrics.write().remove(&processor_id); + self.processor_types.write().remove(&processor_id); + } + + /// Update real-time metrics for a processor. + pub fn update_metrics(&self, processor_id: ProcessorId, metrics: ProcessorMetrics) { + if let Some(existing) = self.metrics.write().get_mut(&processor_id) { + *existing = ProcessorMetrics { + last_updated: Some(Instant::now()), + ..metrics + }; + } + } + + /// Get current load for a processor. + pub fn get_load(&self, processor_id: ProcessorId) -> u64 { + self.loads.read() + .get(&processor_id) + .map(|l| l.load(Ordering::Relaxed)) + .unwrap_or(0) + } + + /// Increment load for a processor. + pub fn increment_load(&self, processor_id: ProcessorId) { + if let Some(load) = self.loads.read().get(&processor_id) { + load.fetch_add(1, Ordering::Relaxed); + } + } + + /// Decrement load for a processor. + pub fn decrement_load(&self, processor_id: ProcessorId) { + if let Some(load) = self.loads.read().get(&processor_id) { + load.fetch_sub(1, Ordering::Relaxed); + } + } + + /// Check if an operation can run on a processor type. 
+    pub fn can_execute(&self, op: &Operation, processor_type: &ProcessorType) -> bool {
+        // Static capability table: which operation kinds each processor family
+        // supports. This gates scoring; incompatible pairs score NEG_INFINITY.
+        let op_type = op.op_type();
+
+        match processor_type {
+            // CPUs can handle most sequential operations
+            ProcessorType::Cpu(_) => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::Conv2d
+                    | OperationType::Conv3d
+                    | OperationType::DepthwiseConv
+                    | OperationType::BatchNorm
+                    | OperationType::LayerNorm
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::GeLU
+                    | OperationType::SiLU
+                    | OperationType::Softmax
+                    | OperationType::Sum
+                    | OperationType::Mean
+                    | OperationType::Max
+                    | OperationType::ArgMax
+                    | OperationType::Embedding
+                    | OperationType::TopK
+                    | OperationType::Sampling
+                    | OperationType::Tokenization
+                    | OperationType::Detokenization
+                    | OperationType::DataLoad
+                    | OperationType::DataPreprocess
+                    | OperationType::Transpose
+                    | OperationType::Reshape
+                    | OperationType::Concat
+                    | OperationType::Split
+            ),
+
+            // GPUs excel at parallel operations
+            ProcessorType::Gpu(_) => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::Conv2d
+                    | OperationType::Conv3d
+                    | OperationType::DepthwiseConv
+                    | OperationType::BatchNorm
+                    | OperationType::LayerNorm
+                    | OperationType::SelfAttention
+                    | OperationType::CrossAttention
+                    | OperationType::FlashAttention
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::GeLU
+                    | OperationType::SiLU
+                    | OperationType::Softmax
+                    | OperationType::Sum
+                    | OperationType::Mean
+                    | OperationType::Max
+                    | OperationType::ArgMax
+                    | OperationType::Embedding
+                    | OperationType::RoPE
+                    | OperationType::KVCache
+                    | OperationType::TopK
+                    | OperationType::Sampling
+                    | OperationType::Transpose
+                    | OperationType::Reshape
+                    | OperationType::Concat
+                    | OperationType::Split
+                    | OperationType::Gather
+                    | OperationType::Scatter
+                    | OperationType::AllReduce
+                    | OperationType::AllGather
+                    | OperationType::ReduceScatter
+                    | OperationType::Backward
+                    | OperationType::OptimizerStep
+                    | OperationType::GradientClip
+            ),
+
+            // TPUs optimized for ML
+            ProcessorType::Tpu(_) => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::Conv2d
+                    | OperationType::BatchNorm
+                    | OperationType::LayerNorm
+                    | OperationType::SelfAttention
+                    | OperationType::CrossAttention
+                    | OperationType::FlashAttention
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::GeLU
+                    | OperationType::SiLU
+                    | OperationType::Softmax
+                    | OperationType::Sum
+                    | OperationType::Mean
+                    | OperationType::Embedding
+                    | OperationType::RoPE
+                    | OperationType::KVCache
+                    | OperationType::AllReduce
+                    | OperationType::AllGather
+                    | OperationType::ReduceScatter
+                    | OperationType::Backward
+                    | OperationType::OptimizerStep
+            ),
+
+            // NPUs for neural network inference
+            ProcessorType::Npu(_) => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::Conv2d
+                    | OperationType::DepthwiseConv
+                    | OperationType::BatchNorm
+                    | OperationType::LayerNorm
+                    | OperationType::SelfAttention
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::GeLU
+                    | OperationType::SiLU
+                    | OperationType::Softmax
+                    | OperationType::Sum
+                    | OperationType::Mean
+            ),
+
+            // LPUs for sequential inference (optimized for LLMs)
+            ProcessorType::Lpu => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::LayerNorm
+                    | OperationType::SelfAttention
+                    | OperationType::FlashAttention
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::GeLU
+                    | OperationType::SiLU
+                    | OperationType::Softmax
+                    | OperationType::Embedding
+                    | OperationType::RoPE
+                    | OperationType::KVCache
+                    | OperationType::TopK
+                    | OperationType::Sampling
+            ),
+
+            // FPGAs can be programmed for anything
+            ProcessorType::Fpga(_) => true,
+
+            // DSPs for signal processing
+            ProcessorType::Dsp(_) => matches!(
+                op_type,
+                OperationType::Conv2d
+                    | OperationType::DepthwiseConv
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::Sum
+                    | OperationType::Mean
+                    | OperationType::Max
+            ),
+
+            // WebGPU has limited operations
+            ProcessorType::WebGpu => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::Conv2d
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::Softmax
+                    | OperationType::Sum
+                    | OperationType::Transpose
+                    | OperationType::Reshape
+            ),
+
+            // WASM for portable compute
+            ProcessorType::Wasm => matches!(
+                op_type,
+                OperationType::MatMul
+                    | OperationType::Add
+                    | OperationType::Mul
+                    | OperationType::ReLU
+                    | OperationType::Softmax
+                    | OperationType::Sum
+                    | OperationType::Mean
+                    | OperationType::Tokenization
+                    | OperationType::Detokenization
+            ),
+
+            // Custom processors - assume they can handle anything
+            ProcessorType::Custom { .. } => true,
+        }
+    }
+
+    /// Calculate a score for assigning a task to a processor.
+    ///
+    /// Higher is better; `NEG_INFINITY` means the processor cannot run the
+    /// task's operation at all.
+    fn calculate_score(
+        &self,
+        task: &Task,
+        processor_id: ProcessorId,
+        processor_type: &ProcessorType,
+    ) -> f64 {
+        let strategy = *self.strategy.read();
+        let load = self.get_load(processor_id);
+        let metrics = self.metrics.read();
+        let proc_metrics = metrics.get(&processor_id);
+
+        // Base score from compatibility
+        if !self.can_execute(&task.operation, processor_type) {
+            return f64::NEG_INFINITY;
+        }
+
+        // Get utilization and metrics; when no metrics were reported yet,
+        // fall back to rough defaults derived from the raw load counter.
+        let utilization = proc_metrics.map(|m| m.utilization).unwrap_or(load as f64 / 100.0);
+        let power = proc_metrics.map(|m| m.power_watts).unwrap_or(100.0);
+        let avg_completion = proc_metrics.map(|m| m.avg_completion_ms).unwrap_or(100.0);
+
+        // Calculate score based on strategy
+        match strategy {
+            BalancingStrategy::Speed => {
+                // Prioritize low utilization and fast completion
+                let speed_score = 1.0 / (avg_completion.max(1.0) * (1.0 + utilization));
+
+                // Bonus for powerful processor types
+                let type_bonus = match processor_type {
+                    ProcessorType::Gpu(_) => 2.0,
+                    ProcessorType::Tpu(_) => 2.5,
+                    ProcessorType::Lpu => 3.0, // Fastest for inference
+                    ProcessorType::Npu(_) => 1.5,
+                    _ => 1.0,
+                };
+
+                speed_score * type_bonus
+            }
+
+            BalancingStrategy::Energy => {
+                // Prioritize low power consumption
+                let energy_score = 1.0 / power.max(1.0);
+
+                // Bonus for efficient processor types
+                let efficiency_bonus = match processor_type {
+                    ProcessorType::Npu(_) => 3.0, // Most efficient
+                    ProcessorType::Lpu => 2.0,
+                    ProcessorType::Cpu(_) => 1.5,
+                    ProcessorType::Wasm => 2.0, // Low overhead
+                    _ => 1.0,
+                };
+
+                energy_score * efficiency_bonus * (1.0 - utilization * 0.5)
+            }
+
+            BalancingStrategy::Balanced => {
+                // Balance speed and energy
+                let speed = 1.0 / avg_completion.max(1.0);
+                let efficiency = 1.0 / power.max(1.0);
+                let load_factor = 1.0 - utilization;
+
+                (speed * 0.4 + efficiency * 0.3 + load_factor * 0.3)
+            }
+
+            BalancingStrategy::Cost => {
+                // Prioritize cheaper resources (consumer devices)
+                let cost_factor = match processor_type {
+                    ProcessorType::Wasm => 0.1, // Cheapest (browser)
+                    ProcessorType::WebGpu => 0.15,
+                    ProcessorType::Cpu(_) => 0.2,
+                    ProcessorType::Npu(_) => 0.3, // Mobile NPUs
+                    ProcessorType::Gpu(_) => 0.5,
+                    ProcessorType::Lpu => 0.8,
+                    ProcessorType::Tpu(_) => 1.0, // Most expensive
+                    _ => 0.5,
+                };
+
+                (1.0 - cost_factor) * (1.0 - utilization)
+            }
+
+            BalancingStrategy::Latency => {
+                // Prioritize low latency for inference
+                let latency_score = 1.0 / avg_completion.max(0.1);
+
+                // Bonus for low-latency processors
+                let latency_bonus = match processor_type {
+                    ProcessorType::Lpu => 5.0, // Designed for low latency
+                    ProcessorType::Npu(_) => 3.0,
+                    ProcessorType::Gpu(_) => 2.0,
+                    ProcessorType::Tpu(_) => 1.5,
+                    _ => 1.0,
+                };
+
+                // Priority boost for critical tasks
+                let priority_boost = match task.priority {
+                    TaskPriority::Critical => 2.0,
+                    TaskPriority::High => 1.5,
+                    TaskPriority::Normal => 1.0,
+                    TaskPriority::Background => 0.5,
+                };
+
+                latency_score * latency_bonus * priority_boost * (1.0 - utilization)
+            }
+        }
+    }
+
+    /// Maybe rebalance a task to a different processor.
+    ///
+    /// Returns the processor the task should actually run on: either the
+    /// suggested one, or a strictly better alternative found by scoring.
+    // NOTE(review): `current_assignment` is never read in this body — confirm
+    // whether assignment-aware scoring was intended here.
+    pub fn maybe_rebalance(
+        &self,
+        task: &Task,
+        suggested_processor: ProcessorId,
+        current_assignment: &TaskAssignment,
+    ) -> ProcessorId {
+        // Get all registered processors
+        let processor_types = self.processor_types.read();
+
+        // If we don't have processor info, use suggested
+        let suggested_type = match processor_types.get(&suggested_processor) {
+            Some(t) => t.clone(),
+            None => return suggested_processor,
+        };
+
+        // Calculate score for suggested processor
+        let suggested_score = self.calculate_score(task, suggested_processor, &suggested_type);
+
+        // Find best alternative
+        let mut best_processor = suggested_processor;
+        let mut best_score = suggested_score;
+
+        for (proc_id, proc_type) in processor_types.iter() {
+            if *proc_id == suggested_processor {
+                continue;
+            }
+
+            let score = self.calculate_score(task, *proc_id, proc_type);
+
+            // Only switch if significantly better (prevents thrashing)
+            if score > best_score * (1.0 + self.rebalance_threshold) {
+                best_score = score;
+                best_processor = *proc_id;
+            }
+        }
+
+        // Record migration if different
+        if best_processor != suggested_processor {
+            self.migration_history.write().push(MigrationRecord {
+                task_id: task.id,
+                from: suggested_processor,
+                to: best_processor,
+                timestamp: Instant::now(),
+            });
+        }
+
+        best_processor
+    }
+
+    /// Check if work stealing should happen between two processors.
+    pub fn should_steal(&self, from: ProcessorId, to: ProcessorId) -> bool {
+        let from_load = self.get_load(from) as f64;
+        let to_load = self.get_load(to) as f64;
+
+        // Nothing to steal from an idle processor.
+        if from_load == 0.0 {
+            return false;
+        }
+
+        // Check if processor types are compatible for the queued work
+        let processor_types = self.processor_types.read();
+        let from_type = processor_types.get(&from);
+        let to_type = processor_types.get(&to);
+
+        // Only steal between same processor types by default
+        // (cross-type stealing requires operation compatibility check)
+        match (from_type, to_type) {
+            (Some(ft), Some(tt)) if std::mem::discriminant(ft) == std::mem::discriminant(tt) => {
+                // Relative imbalance: fraction of `from`'s load above `to`'s.
+                let diff = (from_load - to_load) / from_load;
+                diff > self.steal_threshold
+            }
+            _ => false,
+        }
+    }
+
+    /// Get rebalancing suggestions based on current load.
+    ///
+    /// Returns `(overloaded, underloaded)` processor pairs of the same type
+    /// whose loads deviate from the mean by more than `rebalance_threshold`.
+    pub fn get_rebalance_suggestions(&self) -> Vec<(ProcessorId, ProcessorId)> {
+        let mut suggestions = Vec::new();
+        let loads = self.loads.read();
+
+        let load_values: Vec<_> = loads.iter()
+            .map(|(id, load)| (*id, load.load(Ordering::Relaxed)))
+            .collect();
+
+        if load_values.is_empty() {
+            return suggestions;
+        }
+
+        // FIX: `.sum::()` was missing its type parameter and did not compile.
+        let avg_load: f64 = load_values.iter().map(|(_, l)| *l as f64).sum::<f64>()
+            / load_values.len() as f64;
+
+        let processor_types = self.processor_types.read();
+
+        let overloaded: Vec<_> = load_values.iter()
+            .filter(|(_, l)| *l as f64 > avg_load * (1.0 + self.rebalance_threshold))
+            .collect();
+
+        let underloaded: Vec<_> = load_values.iter()
+            .filter(|(_, l)| (*l as f64) < avg_load * (1.0 - self.rebalance_threshold))
+            .collect();
+
+        // Only suggest migrations between compatible processor types
+        for (over_id, _) in overloaded {
+            let over_type = processor_types.get(over_id);
+
+            for (under_id, _) in &underloaded {
+                let under_type = processor_types.get(under_id);
+
+                // Check type compatibility
+                if let (Some(ot), Some(ut)) = (over_type, under_type) {
+                    if std::mem::discriminant(ot) == std::mem::discriminant(ut) {
+                        suggestions.push((*over_id, *under_id));
+                    }
+                }
+            }
+        }
+
+        suggestions
+    }
+
+    /// Get load statistics.
+    pub fn get_stats(&self) -> LoadBalancerStats {
+        let loads = self.loads.read();
+        let metrics = self.metrics.read();
+
+        let total_load: u64 = loads.values().map(|l| l.load(Ordering::Relaxed)).sum();
+        let processor_count = loads.len();
+        let avg_load = if processor_count > 0 {
+            total_load as f64 / processor_count as f64
+        } else {
+            0.0
+        };
+
+        let total_utilization: f64 = metrics.values().map(|m| m.utilization).sum();
+        let avg_utilization = if processor_count > 0 {
+            total_utilization / processor_count as f64
+        } else {
+            0.0
+        };
+
+        let total_power: f64 = metrics.values().map(|m| m.power_watts).sum();
+        let migrations = self.migration_history.read().len();
+
+        LoadBalancerStats {
+            total_load,
+            avg_load,
+            processor_count,
+            avg_utilization,
+            total_power_watts: total_power,
+            total_migrations: migrations,
+            strategy: *self.strategy.read(),
+        }
+    }
+
+    /// Clean up old migration history.
+    pub fn cleanup_history(&self, max_age: Duration) {
+        // FIX: `Instant::now() - max_age` panics if `max_age` exceeds the
+        // platform clock's epoch/uptime; `checked_sub` makes this safe. When the
+        // cutoff is not representable, no record can be older than it, so
+        // retaining everything is the correct result.
+        if let Some(cutoff) = Instant::now().checked_sub(max_age) {
+            self.migration_history.write().retain(|r| r.timestamp > cutoff);
+        }
+    }
+}
+
+impl Default for LoadBalancer {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Load balancer statistics.
+#[derive(Clone, Debug)]
+pub struct LoadBalancerStats {
+    /// Total tasks across all processors.
+    pub total_load: u64,
+    /// Average load per processor.
+    pub avg_load: f64,
+    /// Number of registered processors.
+    pub processor_count: usize,
+    /// Average utilization (0.0 - 1.0).
+    pub avg_utilization: f64,
+    /// Total power consumption (watts).
+    pub total_power_watts: f64,
+    /// Total migrations performed.
+    pub total_migrations: usize,
+    /// Current balancing strategy.
+    pub strategy: BalancingStrategy,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::processor::{CpuVariant, GpuVariant, Operation, Precision};
+    use crate::task::TaskStatus;
+
+    fn create_test_task(priority: TaskPriority) -> Task {
+        Task {
+            id: TaskId::new(),
+            operation: Operation::MatMul {
+                m: 1024,
+                n: 1024,
+                k: 1024,
+                precision: Precision::Fp32,
+            },
+            priority,
+            dependencies: vec![],
+            status: TaskStatus::Pending,
+            deadline: None,
+        }
+    }
+
+    #[test]
+    fn test_load_tracking() {
+        let balancer = LoadBalancer::new();
+
+        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
+        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));
+
+        assert_eq!(balancer.get_load(ProcessorId(0)), 0);
+
+        balancer.increment_load(ProcessorId(0));
+        balancer.increment_load(ProcessorId(0));
+        balancer.increment_load(ProcessorId(1));
+
+        assert_eq!(balancer.get_load(ProcessorId(0)), 2);
+        assert_eq!(balancer.get_load(ProcessorId(1)), 1);
+
+        balancer.decrement_load(ProcessorId(0));
+        assert_eq!(balancer.get_load(ProcessorId(0)), 1);
+    }
+
+    #[test]
+    fn test_should_steal_same_type() {
+        let balancer = LoadBalancer::new();
+
+        // Register two CPUs
+        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
+        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));
+
+        // Give processor 0 high load
+        for _ in 0..10 {
+            balancer.increment_load(ProcessorId(0));
+        }
+        balancer.increment_load(ProcessorId(1));
+
+        // Should steal between same types
+        assert!(balancer.should_steal(ProcessorId(0), ProcessorId(1)));
+        assert!(!balancer.should_steal(ProcessorId(1), ProcessorId(0)));
+    }
+
+    #[test]
+    fn test_should_not_steal_different_types() {
+        let balancer = LoadBalancer::new();
+
+        // Register CPU and GPU
+        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
+        balancer.register_processor(
+            ProcessorId(1),
+            ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) }),
+        );
+
+        // Give CPU high load
+        for _ in 0..10 {
+            balancer.increment_load(ProcessorId(0));
+        }
+
+        // Should NOT steal between different types
+        assert!(!balancer.should_steal(ProcessorId(0), ProcessorId(1)));
+    }
+
+    #[test]
+    fn test_can_execute() {
+        let balancer = LoadBalancer::new();
+
+        let matmul = Operation::MatMul {
+            m: 1024,
+            n: 1024,
+            k: 1024,
+            precision: Precision::Fp32,
+        };
+
+        let flash_attention = Operation::FlashAttention {
+            batch: 32,
+            seq_len: 2048,
+            num_heads: 32,
+            head_dim: 128,
+            precision: Precision::Fp16,
+        };
+
+        let cpu = ProcessorType::Cpu(CpuVariant::default());
+        let gpu = ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) });
+        let lpu = ProcessorType::Lpu;
+
+        // MatMul can run on all
+        assert!(balancer.can_execute(&matmul, &cpu));
+        assert!(balancer.can_execute(&matmul, &gpu));
+        assert!(balancer.can_execute(&matmul, &lpu));
+
+        // FlashAttention only on GPU/TPU/LPU
+        assert!(!balancer.can_execute(&flash_attention, &cpu));
+        assert!(balancer.can_execute(&flash_attention, &gpu));
+    }
+
+    #[test]
+    fn test_strategy_affects_scoring() {
+        let balancer = LoadBalancer::new();
+
+        let cpu_id = ProcessorId(0);
+        let npu_id = ProcessorId(1);
+
+        balancer.register_processor(cpu_id, ProcessorType::Cpu(CpuVariant::default()));
+        balancer.register_processor(npu_id, ProcessorType::Npu(crate::processor::NpuVariant::AppleNeuralEngine { cores: 16 }));
+
+        let task = create_test_task(TaskPriority::Normal);
+
+        // Energy strategy should prefer NPU
+        balancer.set_strategy(BalancingStrategy::Energy);
+        let assignment = TaskAssignment::new();
+        let result = balancer.maybe_rebalance(&task, cpu_id, &assignment);
+
+        // NPU should be preferred for energy efficiency
+        assert_eq!(result, npu_id);
+    }
+
+    #[test]
+    fn test_stats() {
+        let balancer = LoadBalancer::new();
+
+        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
+        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));
+
+        balancer.increment_load(ProcessorId(0));
+        balancer.increment_load(ProcessorId(0));
+        balancer.increment_load(ProcessorId(1));
+
+        let stats = balancer.get_stats();
+        assert_eq!(stats.total_load, 3);
+        assert_eq!(stats.processor_count, 2);
+        assert!((stats.avg_load - 1.5).abs() < 0.01);
+    }
+}
diff --git a/crates/synor-compute/src/scheduler/mod.rs b/crates/synor-compute/src/scheduler/mod.rs
new file mode 100644
index 0000000..aaf6b5e
--- /dev/null
+++ b/crates/synor-compute/src/scheduler/mod.rs
@@ -0,0 +1,559 @@
+//! Heterogeneous scheduler for multi-processor task assignment.
+//!
+//! Features:
+//! - Optimal task-to-processor assignment
+//! - Work stealing for load balancing
+//! - Pipeline parallelism across processor types
+//! - Dynamic rebalancing based on actual throughput
+
+mod load_balancer;
+mod work_queue;
+
+pub use load_balancer::LoadBalancer;
+pub use work_queue::WorkQueue;
+
+use crate::device::DeviceRegistry;
+use crate::error::ComputeError;
+use crate::processor::{Operation, Processor, ProcessorId, ProcessorType};
+use crate::task::{Task, TaskId, TaskPriority};
+use parking_lot::RwLock;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::Duration;
+
+/// Heterogeneous scheduler that manages tasks across all processor types.
+// NOTE(review): the generic parameters below were reconstructed from usage —
+// verify against the original source.
+pub struct HeterogeneousScheduler {
+    /// Device registry.
+    device_registry: Arc<DeviceRegistry>,
+    /// Per-processor-type task queues.
+    queues: RwLock<HashMap<ProcessorType, WorkQueue>>,
+    /// Load balancer.
+    load_balancer: LoadBalancer,
+    /// Active schedules.
+    active_schedules: RwLock<HashMap<ScheduleId, Schedule>>,
+}
+
+impl HeterogeneousScheduler {
+    /// Creates a new heterogeneous scheduler.
+    pub fn new(device_registry: Arc<DeviceRegistry>) -> Self {
+        Self {
+            device_registry,
+            queues: RwLock::new(HashMap::new()),
+            load_balancer: LoadBalancer::new(),
+            active_schedules: RwLock::new(HashMap::new()),
+        }
+    }
+
+    /// Schedule a set of tasks for execution.
+    ///
+    /// Builds a dependency graph, assigns each task to its best processor,
+    /// groups tasks into parallel stages, and returns the schedule together
+    /// with an estimated makespan and per-processor-type utilization.
+    pub async fn schedule(&self, tasks: Vec<Task>) -> Result<ScheduleResult, ComputeError> {
+        // Empty input is a valid no-op schedule, not an error.
+        if tasks.is_empty() {
+            return Ok(ScheduleResult {
+                schedule: Schedule::empty(),
+                estimated_makespan: Duration::ZERO,
+                processor_utilization: HashMap::new(),
+            });
+        }
+
+        // 1. Build dependency graph
+        let deps = self.build_dependency_graph(&tasks);
+
+        // 2. Assign tasks to optimal processors
+        let assignment = self.assign_tasks(&tasks, &deps).await?;
+
+        // 3. Create execution schedule with stages
+        let schedule = self.create_schedule(&tasks, &assignment, &deps)?;
+
+        // 4. Estimate metrics
+        let makespan = self.estimate_makespan(&schedule);
+        let utilization = self.estimate_utilization(&schedule);
+
+        // 5. Store active schedule
+        self.active_schedules.write().insert(schedule.id, schedule.clone());
+
+        Ok(ScheduleResult {
+            schedule,
+            estimated_makespan: makespan,
+            processor_utilization: utilization,
+        })
+    }
+
+    /// Execute a schedule.
+ pub async fn execute(&self, schedule: &Schedule) -> Result { + let mut results = HashMap::new(); + let start = std::time::Instant::now(); + + // Execute stages in order + for stage in &schedule.stages { + // Execute all tasks in this stage in parallel + let mut handles = Vec::new(); + + for task_id in &stage.tasks { + let task = schedule.tasks.get(task_id) + .ok_or_else(|| ComputeError::Internal(format!("Task not found: {:?}", task_id)))?; + let processor_id = schedule.assignment.get(task_id) + .ok_or_else(|| ComputeError::Internal(format!("No assignment for task: {:?}", task_id)))?; + + let processor = self.device_registry.get_processor(processor_id)?; + let task_clone = task.clone(); + + handles.push(tokio::spawn(async move { + processor.execute(task_clone.operation).await + })); + } + + // Wait for all tasks in stage + for (i, handle) in handles.into_iter().enumerate() { + let task_id = stage.tasks[i]; + match handle.await { + Ok(Ok(result)) => { + results.insert(task_id, TaskExecutionResult::Success(result)); + } + Ok(Err(e)) => { + results.insert(task_id, TaskExecutionResult::Failed(e.to_string())); + } + Err(e) => { + results.insert(task_id, TaskExecutionResult::Failed(e.to_string())); + } + } + } + } + + let total_time = start.elapsed(); + + Ok(ExecutionResult { + results, + total_time, + actual_utilization: self.measure_utilization(), + }) + } + + /// Assign tasks to optimal processors. 
+ async fn assign_tasks( + &self, + tasks: &[Task], + deps: &DependencyGraph, + ) -> Result { + let mut assignment = TaskAssignment::new(); + + // Sort tasks by priority and dependencies (topological sort) + let sorted_tasks = self.topological_sort(tasks, deps); + + for task in sorted_tasks { + // Find best processor for this task + let best_processor = self.find_best_processor(&task).await?; + + // Check if we should rebalance + let final_processor = self.load_balancer + .maybe_rebalance(&task, best_processor, &assignment); + + assignment.assign(task.id, final_processor); + } + + Ok(assignment) + } + + /// Find the best processor for a task. + async fn find_best_processor(&self, task: &Task) -> Result { + let mut best_score = f64::NEG_INFINITY; + let mut best_processor = None; + + // Get all available processors + let processors = self.device_registry.all_processors(); + + for processor in processors { + if !processor.can_execute(&task.operation) { + continue; + } + + // Calculate score based on multiple factors + let exec_time = processor.estimate_time(&task.operation); + let energy = processor.estimate_energy(&task.operation); + let load = processor.utilization(); + + // Score = 1 / (time * (1 + load) * energy_factor) + let time_factor = exec_time.as_secs_f64().max(0.001); + let load_factor = 1.0 + load; + let energy_factor = 1.0 + (energy / 1000.0); // Normalize energy + + let score = 1.0 / (time_factor * load_factor * energy_factor); + + if score > best_score { + best_score = score; + best_processor = Some(processor.id()); + } + } + + best_processor.ok_or_else(|| { + ComputeError::NoSuitableProcessor(format!("{:?}", task.operation.op_type())) + }) + } + + /// Build dependency graph from tasks. 
+    fn build_dependency_graph(&self, tasks: &[Task]) -> DependencyGraph {
+        let mut graph = DependencyGraph::new();
+
+        for task in tasks {
+            graph.add_node(task.id);
+            for dep in &task.dependencies {
+                graph.add_edge(*dep, task.id);
+            }
+        }
+
+        graph
+    }
+
+    /// Topological sort of tasks respecting dependencies.
+    ///
+    /// Uses Kahn's algorithm, always emitting the highest-priority task among
+    /// those whose dependencies have already been emitted. The previous
+    /// implementation ran a DFS post-order and then globally re-sorted by
+    /// priority, which destroyed the topological order it had just computed;
+    /// this version honors priority *within* dependency constraints.
+    // NOTE(review): assumes TaskPriority's Ord ranks more-urgent priorities as
+    // greater — the same assumption the previous descending sort made. Confirm
+    // against the TaskPriority declaration.
+    fn topological_sort(&self, tasks: &[Task], deps: &DependencyGraph) -> Vec<Task> {
+        let task_map: HashMap<TaskId, Task> = tasks.iter()
+            .map(|t| (t.id, t.clone()))
+            .collect();
+
+        // Count only dependencies that are part of this task set; external
+        // (already-completed) dependencies cannot block ordering here.
+        let mut pending: HashMap<TaskId, usize> = tasks.iter()
+            .map(|t| {
+                let n = t.dependencies.iter()
+                    .filter(|d| task_map.contains_key(d))
+                    .count();
+                (t.id, n)
+            })
+            .collect();
+
+        let mut ready: Vec<TaskId> = tasks.iter()
+            .filter(|t| pending[&t.id] == 0)
+            .map(|t| t.id)
+            .collect();
+
+        let mut sorted = Vec::with_capacity(tasks.len());
+
+        while !ready.is_empty() {
+            // Pick the highest-priority ready task.
+            let best = ready.iter()
+                .enumerate()
+                .max_by_key(|(_, id)| task_map[id].priority)
+                .map(|(i, _)| i)
+                .expect("ready is non-empty");
+            let id = ready.swap_remove(best);
+
+            if let Some(task) = task_map.get(&id) {
+                sorted.push(task.clone());
+            }
+
+            // Unblock dependents of the emitted task.
+            if let Some(dependents) = deps.dependents.get(&id) {
+                for dependent in dependents {
+                    if let Some(count) = pending.get_mut(dependent) {
+                        if *count > 0 {
+                            *count -= 1;
+                            if *count == 0 {
+                                ready.push(*dependent);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // Cycle fallback: append anything not emitted so callers still see
+        // every task (create_schedule reports the cycle explicitly).
+        if sorted.len() < tasks.len() {
+            let emitted: std::collections::HashSet<TaskId> =
+                sorted.iter().map(|t| t.id).collect();
+            sorted.extend(tasks.iter().filter(|t| !emitted.contains(&t.id)).cloned());
+        }
+
+        sorted
+    }
+
+    /// Create execution schedule with parallel stages.
+ fn create_schedule( + &self, + tasks: &[Task], + assignment: &TaskAssignment, + deps: &DependencyGraph, + ) -> Result { + let mut stages = Vec::new(); + let mut scheduled = std::collections::HashSet::new(); + let task_map: HashMap = tasks.iter() + .map(|t| (t.id, t.clone())) + .collect(); + + while scheduled.len() < tasks.len() { + let mut stage_tasks = Vec::new(); + + for task in tasks { + if scheduled.contains(&task.id) { + continue; + } + + // Check if all dependencies are satisfied + let deps_satisfied = task.dependencies.iter() + .all(|dep| scheduled.contains(dep)); + + if deps_satisfied { + stage_tasks.push(task.id); + } + } + + if stage_tasks.is_empty() { + return Err(ComputeError::SchedulingFailed( + "Circular dependency detected".to_string() + )); + } + + for task_id in &stage_tasks { + scheduled.insert(*task_id); + } + + stages.push(ScheduleStage { + stage_id: stages.len(), + tasks: stage_tasks, + }); + } + + Ok(Schedule { + id: ScheduleId::new(), + tasks: task_map, + assignment: assignment.clone(), + stages, + }) + } + + /// Estimate makespan (total execution time). + fn estimate_makespan(&self, schedule: &Schedule) -> Duration { + let mut total = Duration::ZERO; + + for stage in &schedule.stages { + let mut max_stage_time = Duration::ZERO; + + for task_id in &stage.tasks { + if let (Some(task), Some(proc_id)) = ( + schedule.tasks.get(task_id), + schedule.assignment.get(task_id), + ) { + if let Ok(processor) = self.device_registry.get_processor(proc_id) { + let time = processor.estimate_time(&task.operation); + max_stage_time = max_stage_time.max(time); + } + } + } + + total += max_stage_time; + } + + total + } + + /// Estimate processor utilization. 
+ fn estimate_utilization(&self, schedule: &Schedule) -> HashMap { + let mut work_time: HashMap = HashMap::new(); + let makespan = self.estimate_makespan(schedule); + + for task_id in schedule.assignment.assignments.keys() { + if let (Some(task), Some(proc_id)) = ( + schedule.tasks.get(task_id), + schedule.assignment.get(task_id), + ) { + if let Ok(processor) = self.device_registry.get_processor(proc_id) { + let proc_type = processor.processor_type(); + let time = processor.estimate_time(&task.operation); + *work_time.entry(proc_type).or_default() += time; + } + } + } + + work_time + .into_iter() + .map(|(proc_type, time)| { + let utilization = if makespan.as_secs_f64() > 0.0 { + time.as_secs_f64() / makespan.as_secs_f64() + } else { + 0.0 + }; + (proc_type, utilization.min(1.0)) + }) + .collect() + } + + /// Measure actual current utilization. + fn measure_utilization(&self) -> HashMap { + let mut utilization = HashMap::new(); + + for processor in self.device_registry.all_processors() { + let proc_type = processor.processor_type(); + let util = processor.utilization(); + utilization + .entry(proc_type) + .and_modify(|u| *u = (*u + util) / 2.0) + .or_insert(util); + } + + utilization + } +} + +/// Schedule identifier. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct ScheduleId(pub u64); + +impl ScheduleId { + /// Creates a new schedule ID. + pub fn new() -> Self { + use rand::Rng; + ScheduleId(rand::thread_rng().gen()) + } +} + +impl Default for ScheduleId { + fn default() -> Self { + Self::new() + } +} + +/// Task-to-processor assignment. +#[derive(Clone, Debug, Default)] +pub struct TaskAssignment { + /// Map from task ID to processor ID. + pub assignments: HashMap, +} + +impl TaskAssignment { + /// Creates a new empty assignment. + pub fn new() -> Self { + Self { + assignments: HashMap::new(), + } + } + + /// Assigns a task to a processor. 
+ pub fn assign(&mut self, task_id: TaskId, processor_id: ProcessorId) { + self.assignments.insert(task_id, processor_id); + } + + /// Gets the assigned processor for a task. + pub fn get(&self, task_id: &TaskId) -> Option { + self.assignments.get(task_id).copied() + } +} + +/// Dependency graph for tasks. +#[derive(Clone, Debug, Default)] +pub struct DependencyGraph { + /// Dependencies: task -> list of tasks it depends on. + pub dependencies: HashMap>, + /// Dependents: task -> list of tasks that depend on it. + pub dependents: HashMap>, +} + +impl DependencyGraph { + /// Creates a new empty dependency graph. + pub fn new() -> Self { + Self { + dependencies: HashMap::new(), + dependents: HashMap::new(), + } + } + + /// Adds a node (task) to the graph. + pub fn add_node(&mut self, task_id: TaskId) { + self.dependencies.entry(task_id).or_default(); + self.dependents.entry(task_id).or_default(); + } + + /// Adds a dependency edge (from depends on to). + pub fn add_edge(&mut self, from: TaskId, to: TaskId) { + self.dependencies.entry(to).or_default().push(from); + self.dependents.entry(from).or_default().push(to); + } +} + +/// Execution schedule. +#[derive(Clone, Debug)] +pub struct Schedule { + /// Schedule ID. + pub id: ScheduleId, + /// All tasks. + pub tasks: HashMap, + /// Task assignments. + pub assignment: TaskAssignment, + /// Execution stages (tasks within a stage can run in parallel). + pub stages: Vec, +} + +impl Schedule { + /// Creates an empty schedule. + pub fn empty() -> Self { + Self { + id: ScheduleId::new(), + tasks: HashMap::new(), + assignment: TaskAssignment::new(), + stages: Vec::new(), + } + } +} + +/// A stage of parallel tasks. +#[derive(Clone, Debug)] +pub struct ScheduleStage { + /// Stage index. + pub stage_id: usize, + /// Tasks in this stage (can run in parallel). + pub tasks: Vec, +} + +/// Result of scheduling. +#[derive(Clone, Debug)] +pub struct ScheduleResult { + /// The schedule. 
+ pub schedule: Schedule, + /// Estimated total execution time. + pub estimated_makespan: Duration, + /// Estimated processor utilization by type. + pub processor_utilization: HashMap, +} + +/// Result of execution. +#[derive(Clone, Debug)] +pub struct ExecutionResult { + /// Results per task. + pub results: HashMap, + /// Total execution time. + pub total_time: Duration, + /// Actual processor utilization. + pub actual_utilization: HashMap, +} + +/// Result of a single task execution. +#[derive(Clone, Debug)] +pub enum TaskExecutionResult { + /// Task completed successfully. + Success(crate::processor::OperationResult), + /// Task failed. + Failed(String), +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::processor::Precision; + use crate::task::TaskStatus; + + fn create_test_task(id: u64, op: Operation, deps: Vec) -> Task { + Task { + id: TaskId(id), + operation: op, + priority: TaskPriority::Normal, + dependencies: deps, + status: TaskStatus::Pending, + deadline: None, + } + } + + #[test] + fn test_dependency_graph() { + let mut graph = DependencyGraph::new(); + + graph.add_node(TaskId(1)); + graph.add_node(TaskId(2)); + graph.add_node(TaskId(3)); + + graph.add_edge(TaskId(1), TaskId(2)); // 2 depends on 1 + graph.add_edge(TaskId(1), TaskId(3)); // 3 depends on 1 + graph.add_edge(TaskId(2), TaskId(3)); // 3 depends on 2 + + assert_eq!(graph.dependencies[&TaskId(2)], vec![TaskId(1)]); + assert_eq!(graph.dependencies[&TaskId(3)], vec![TaskId(1), TaskId(2)]); + } + + #[test] + fn test_task_assignment() { + let mut assignment = TaskAssignment::new(); + + assignment.assign(TaskId(1), ProcessorId(0)); + assignment.assign(TaskId(2), ProcessorId(1)); + + assert_eq!(assignment.get(&TaskId(1)), Some(ProcessorId(0))); + assert_eq!(assignment.get(&TaskId(2)), Some(ProcessorId(1))); + assert_eq!(assignment.get(&TaskId(3)), None); + } +} diff --git a/crates/synor-compute/src/scheduler/work_queue.rs b/crates/synor-compute/src/scheduler/work_queue.rs new file mode 
100644 index 0000000..fba13d1
--- /dev/null
+++ b/crates/synor-compute/src/scheduler/work_queue.rs
@@ -0,0 +1,271 @@
+//! Work queue with thread-safe task management.
+
+use crate::processor::ProcessorType;
+use crate::task::{Task, TaskId, TaskPriority};
+use crossbeam_channel::{bounded, Receiver, Sender, TryRecvError};
+use std::collections::HashMap;
+use std::sync::atomic::{AtomicU64, Ordering};
+
+/// Work queue for a specific processor type.
+pub struct WorkQueue {
+    /// Task sender (for producers).
+    sender: Sender<Task>,
+    /// Task receiver (for consumers).
+    receiver: Receiver<Task>,
+    /// Processor type this queue is for.
+    processor_type: ProcessorType,
+    /// Current queue size.
+    size: AtomicU64,
+    /// Total tasks processed.
+    processed: AtomicU64,
+}
+
+impl WorkQueue {
+    /// Creates a new work queue for a processor type.
+    ///
+    /// NOTE(review): the effective channel capacity is `capacity.max(1024)`,
+    /// i.e. at least 1024 regardless of the requested value — confirm intent
+    /// (tests pass 100 and actually get 1024).
+    pub fn new(processor_type: ProcessorType, capacity: usize) -> Self {
+        let (sender, receiver) = bounded(capacity.max(1024));
+
+        Self {
+            sender,
+            receiver,
+            processor_type,
+            size: AtomicU64::new(0),
+            processed: AtomicU64::new(0),
+        }
+    }
+
+    /// Push a task to the queue.
+    ///
+    /// NOTE(review): if the channel is full the task is silently dropped —
+    /// the `try_send` error is ignored. Confirm this lossy behavior is intended.
+    pub fn push(&self, task: Task) {
+        if self.sender.try_send(task).is_ok() {
+            self.size.fetch_add(1, Ordering::Relaxed);
+        }
+    }
+
+    /// Pop a task from the queue (ignores worker_id for compatibility).
+    pub fn pop(&self, _worker_id: usize) -> Option<Task> {
+        self.pop_any()
+    }
+
+    /// Pop any task from the queue.
+    ///
+    /// Decrements `size` and increments `processed` on a successful receive.
+    pub fn pop_any(&self) -> Option<Task> {
+        match self.receiver.try_recv() {
+            Ok(task) => {
+                self.size.fetch_sub(1, Ordering::Relaxed);
+                self.processed.fetch_add(1, Ordering::Relaxed);
+                Some(task)
+            }
+            Err(TryRecvError::Empty) | Err(TryRecvError::Disconnected) => None,
+        }
+    }
+
+    /// Pop from global queue (alias for pop_any).
+    pub fn pop_global(&self) -> Option<Task> {
+        self.pop_any()
+    }
+
+    /// Steal a batch of tasks from another queue.
+    ///
+    /// NOTE(review): the stolen tasks are both re-enqueued locally AND
+    /// returned to the caller; if the caller also executes the returned
+    /// batch, each task runs twice. Popping from `other` also bumps its
+    /// `processed` counter even though nothing was executed. Verify against
+    /// call sites.
+    pub fn steal_batch_from(&self, other: &WorkQueue, max_tasks: usize) -> Vec<Task> {
+        let mut stolen = Vec::new();
+
+        while stolen.len() < max_tasks {
+            if let Some(task) = other.pop_any() {
+                stolen.push(task);
+            } else {
+                break;
+            }
+        }
+
+        // Push stolen tasks to this queue
+        for task in &stolen {
+            // Tasks are already accounted for in `other`, just push to self
+            if self.sender.try_send(task.clone()).is_ok() {
+                self.size.fetch_add(1, Ordering::Relaxed);
+            }
+        }
+
+        stolen
+    }
+
+    /// Get current queue size.
+    pub fn len(&self) -> usize {
+        self.size.load(Ordering::Relaxed) as usize
+    }
+
+    /// Check if queue is empty.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Get number of tasks processed.
+    pub fn processed_count(&self) -> u64 {
+        self.processed.load(Ordering::Relaxed)
+    }
+
+    /// Get processor type for this queue.
+    pub fn processor_type(&self) -> ProcessorType {
+        self.processor_type.clone()
+    }
+
+    /// Get utilization estimate (0.0 - 1.0).
+    pub fn utilization(&self) -> f64 {
+        let size = self.size.load(Ordering::Relaxed) as f64;
+        let capacity = self.sender.capacity().unwrap_or(1024) as f64;
+        (size / capacity).min(1.0)
+    }
+
+    /// Get a stealer for cross-queue work stealing.
+    pub fn get_stealer(&self) -> QueueStealer {
+        QueueStealer {
+            receiver: self.receiver.clone(),
+        }
+    }
+}
+
+/// Stealer handle for cross-queue work stealing.
+#[derive(Clone)]
+pub struct QueueStealer {
+    receiver: Receiver<Task>,
+}
+
+impl QueueStealer {
+    /// Try to steal a task.
+    ///
+    /// NOTE(review): this receives directly from the channel and bypasses
+    /// the owning queue's `size`/`processed` counters, which will drift —
+    /// confirm that is acceptable.
+    pub fn steal(&self) -> Option<Task> {
+        self.receiver.try_recv().ok()
+    }
+}
+
+/// Priority queue wrapper for tasks.
+pub struct PriorityWorkQueue {
+    /// Queues by priority level.
+    queues: HashMap<TaskPriority, WorkQueue>,
+    /// Processor type.
+    processor_type: ProcessorType,
+}
+
+impl PriorityWorkQueue {
+    /// Creates a new priority work queue.
+    pub fn new(processor_type: ProcessorType, capacity_per_priority: usize) -> Self {
+        let mut queues = HashMap::new();
+
+        // One bounded queue per priority level.
+        for priority in [
+            TaskPriority::Critical,
+            TaskPriority::High,
+            TaskPriority::Normal,
+            TaskPriority::Background,
+        ] {
+            queues.insert(priority, WorkQueue::new(processor_type.clone(), capacity_per_priority));
+        }
+
+        Self {
+            queues,
+            processor_type,
+        }
+    }
+
+    /// Push a task with its priority.
+    pub fn push(&self, task: Task) {
+        let priority = task.priority;
+        if let Some(queue) = self.queues.get(&priority) {
+            queue.push(task);
+        }
+    }
+
+    /// Pop highest priority task available.
+    pub fn pop(&self, worker_id: usize) -> Option<Task> {
+        // Try priorities in order: Critical > High > Normal > Background
+        for priority in [
+            TaskPriority::Critical,
+            TaskPriority::High,
+            TaskPriority::Normal,
+            TaskPriority::Background,
+        ] {
+            if let Some(queue) = self.queues.get(&priority) {
+                if let Some(task) = queue.pop(worker_id) {
+                    return Some(task);
+                }
+            }
+        }
+        None
+    }
+
+    /// Get total queue size.
+    pub fn len(&self) -> usize {
+        self.queues.values().map(|q| q.len()).sum()
+    }
+
+    /// Check if all queues are empty.
+    pub fn is_empty(&self) -> bool {
+        self.queues.values().all(|q| q.is_empty())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::processor::{CpuVariant, Operation, Precision};
+    use crate::task::TaskStatus;
+
+    // Builds a minimal pending MatMul task with a fixed id and the given priority.
+    fn create_test_task(id: u64, priority: TaskPriority) -> Task {
+        Task {
+            id: TaskId(id),
+            operation: Operation::MatMul {
+                m: 1024,
+                n: 1024,
+                k: 1024,
+                precision: Precision::Fp32,
+            },
+            priority,
+            dependencies: vec![],
+            status: TaskStatus::Pending,
+            deadline: None,
+        }
+    }
+
+    // FIFO push/pop and the size counter on a single-priority queue.
+    #[test]
+    fn test_work_queue_basic() {
+        let queue = WorkQueue::new(
+            ProcessorType::Cpu(CpuVariant::default()),
+            100,
+        );
+
+        assert!(queue.is_empty());
+
+        queue.push(create_test_task(1, TaskPriority::Normal));
+        queue.push(create_test_task(2, TaskPriority::Normal));
+
+        assert_eq!(queue.len(), 2);
+
+        let task1 = queue.pop(0);
+        assert!(task1.is_some());
+        assert_eq!(queue.len(), 1);
+
+        let task2 = queue.pop(0);
+        assert!(task2.is_some());
+        assert!(queue.is_empty());
+    }
+
+    // Pop order must follow priority regardless of insertion order.
+    #[test]
+    fn test_priority_queue() {
+        let queue = PriorityWorkQueue::new(
+            ProcessorType::Cpu(CpuVariant::default()),
+            100,
+        );
+
+        queue.push(create_test_task(1, TaskPriority::Background));
+        queue.push(create_test_task(2, TaskPriority::Critical));
+        queue.push(create_test_task(3, TaskPriority::Normal));
+
+        // Should get Critical first
+        let task = queue.pop(0).unwrap();
+        assert_eq!(task.id, TaskId(2));
+        assert_eq!(task.priority, TaskPriority::Critical);
+
+        // Then Normal
+        let task = queue.pop(0).unwrap();
+        assert_eq!(task.id, TaskId(3));
+
+        // Then Background
+        let task = queue.pop(0).unwrap();
+        assert_eq!(task.id, TaskId(1));
+    }
+}
diff --git a/crates/synor-compute/src/task/mod.rs b/crates/synor-compute/src/task/mod.rs
new file mode 100644
index 0000000..a51b3f9
--- /dev/null
+++ b/crates/synor-compute/src/task/mod.rs
@@ -0,0 +1,543 @@
+//! Task definitions and decomposition.
+
+use crate::error::ComputeError;
+use crate::processor::{Operation, OperationType, Precision, ProcessorType};
+use crate::{ComputeJob, JobType};
+use serde::{Deserialize, Serialize};
+use std::time::Duration;
+
+/// Unique task identifier.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub struct TaskId(pub u64);
+
+impl TaskId {
+    /// Creates a new task ID from a random u64.
+    pub fn new() -> Self {
+        use rand::Rng;
+        TaskId(rand::thread_rng().gen())
+    }
+}
+
+impl Default for TaskId {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl std::fmt::Display for TaskId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "task_{}", self.0)
+    }
+}
+
+/// Task priority levels.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub enum TaskPriority {
+    /// Background, can be preempted.
+    Background = 0,
+    /// Normal priority.
+    Normal = 1,
+    /// High priority.
+    High = 2,
+    /// Critical, must complete.
+    Critical = 3,
+}
+
+impl Default for TaskPriority {
+    fn default() -> Self {
+        TaskPriority::Normal
+    }
+}
+
+/// Task execution status.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
+pub enum TaskStatus {
+    /// Waiting to be scheduled.
+    Pending,
+    /// Queued for execution.
+    Queued,
+    /// Currently executing.
+    Running,
+    /// Completed successfully.
+    Completed,
+    /// Failed.
+    Failed,
+    /// Cancelled.
+    Cancelled,
+}
+
+/// A schedulable task.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct Task {
+    /// Task ID.
+    pub id: TaskId,
+    /// Operation to execute.
+    pub operation: Operation,
+    /// Priority level.
+    pub priority: TaskPriority,
+    /// Dependencies (tasks that must complete first).
+    pub dependencies: Vec<TaskId>,
+    /// Current status.
+    pub status: TaskStatus,
+    /// Deadline (optional).
+    /// NOTE(review): units/epoch of this u64 are unspecified — confirm.
+    pub deadline: Option<u64>,
+}
+
+impl Task {
+    /// Creates a new task.
+    pub fn new(operation: Operation) -> Self {
+        Self {
+            id: TaskId::new(),
+            operation,
+            priority: TaskPriority::Normal,
+            dependencies: Vec::new(),
+            status: TaskStatus::Pending,
+            deadline: None,
+        }
+    }
+
+    /// Sets the priority.
+    pub fn with_priority(mut self, priority: TaskPriority) -> Self {
+        self.priority = priority;
+        self
+    }
+
+    /// Adds dependencies.
+    pub fn with_dependencies(mut self, deps: Vec<TaskId>) -> Self {
+        self.dependencies = deps;
+        self
+    }
+
+    /// Sets deadline.
+    pub fn with_deadline(mut self, deadline: u64) -> Self {
+        self.deadline = Some(deadline);
+        self
+    }
+
+    /// Checks if task is compatible with a processor type.
+    pub fn is_compatible_with(&self, proc_type: ProcessorType) -> bool {
+        // Check based on operation type
+        let op_type = self.operation.op_type();
+
+        match proc_type {
+            ProcessorType::Cpu(_) => {
+                // CPUs can do most things, but slowly
+                true
+            }
+            ProcessorType::Gpu(_) => {
+                // GPUs are good for parallel operations
+                matches!(
+                    op_type,
+                    OperationType::MatMul
+                        | OperationType::Conv2d
+                        | OperationType::SelfAttention
+                        | OperationType::FlashAttention
+                        | OperationType::Embedding
+                        | OperationType::Add
+                        | OperationType::Mul
+                        | OperationType::Softmax
+                )
+            }
+            ProcessorType::Tpu(_) => {
+                // TPUs are good for large matrix ops
+                matches!(
+                    op_type,
+                    OperationType::MatMul
+                        | OperationType::Conv2d
+                        | OperationType::SelfAttention
+                        | OperationType::FlashAttention
+                )
+            }
+            ProcessorType::Lpu => {
+                // LPUs are good for sequential inference
+                matches!(
+                    op_type,
+                    OperationType::MatMul
+                        | OperationType::SelfAttention
+                        | OperationType::KVCache
+                        | OperationType::Sampling
+                )
+            }
+            ProcessorType::Npu(_) => {
+                // NPUs are good for inference
+                matches!(
+                    op_type,
+                    OperationType::MatMul
+                        | OperationType::Conv2d
+                        | OperationType::Add
+                        | OperationType::Softmax
+                )
+            }
+            _ => true, // Default to compatible
+        }
+    }
+}
+
+/// Result of task execution.
+#[derive(Clone, Debug)]
+pub struct TaskResult {
+    /// Task ID.
+    pub task_id: TaskId,
+    /// Output data.
+    pub output: Vec<u8>,
+    /// Execution duration.
+    pub duration: Duration,
+    /// Energy consumed (Joules).
+    pub energy: f64,
+}
+
+/// Compute task for job execution.
+#[derive(Clone, Debug)]
+pub struct ComputeTask {
+    /// Task.
+    pub task: Task,
+    /// Resource requirements.
+    pub requirements: TaskRequirements,
+    /// Preferred processor type.
+    pub preferred_processor: Option<ProcessorType>,
+    /// Fallback processor type.
+    pub fallback_processor: Option<ProcessorType>,
+}
+
+/// Task resource requirements.
+#[derive(Clone, Debug, Default)]
+pub struct TaskRequirements {
+    /// Minimum memory (bytes).
+    pub min_memory: u64,
+    /// Minimum TFLOPS.
+    pub min_tflops: f64,
+    /// Maximum latency (ms).
+    pub max_latency_ms: Option<u64>,
+    /// Requires specific precision.
+    pub precision: Option<Precision>,
+}
+
+/// Decomposed workload.
+#[derive(Clone, Debug)]
+pub struct DecomposedWorkload {
+    /// All tasks.
+    /// NOTE(review): the element type was lost in transit; ComputeTask
+    /// (task + requirements) fits this aggregate best — confirm vs Task.
+    pub tasks: Vec<ComputeTask>,
+    /// Total estimated FLOPS.
+    pub estimated_flops: f64,
+    /// Total estimated memory.
+    pub estimated_memory: u64,
+}
+
+/// Task decomposer that breaks jobs into schedulable tasks.
+pub struct TaskDecomposer {
+    /// Default batch size for inference.
+    /// NOTE(review): currently unread — inference uses the job's own
+    /// batch_size; confirm whether this field is still needed.
+    inference_batch_size: usize,
+    /// Default precision.
+    default_precision: Precision,
+}
+
+impl TaskDecomposer {
+    /// Creates a new task decomposer.
+    pub fn new() -> Self {
+        Self {
+            inference_batch_size: 32,
+            default_precision: Precision::Fp16,
+        }
+    }
+
+    /// Decomposes a job into tasks.
+    pub fn decompose(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
+        match &job.job_type {
+            JobType::Training { .. } => self.decompose_training(job),
+            JobType::Inference { .. } => self.decompose_inference(job),
+            JobType::Container { .. } => self.decompose_container(job),
+            JobType::Serverless { .. } => self.decompose_serverless(job),
+            JobType::Wasm { .. } => self.decompose_wasm(job),
+        }
+    }
+
+    /// Decompose training job.
+    fn decompose_training(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
+        let mut tasks = Vec::new();
+
+        // NOTE(review): `epochs` is not used — only a single
+        // load/preprocess/forward/backward/step chain is emitted. Confirm
+        // whether per-epoch expansion is expected here or downstream.
+        if let JobType::Training { batch_size, .. } = &job.job_type {
+            // Data loading task
+            tasks.push(
+                Task::new(Operation::DataLoad {
+                    bytes: 1024 * 1024 * 100, // 100MB
+                    async_: true,
+                })
+                .with_priority(TaskPriority::High),
+            );
+
+            let data_load_id = tasks[0].id;
+
+            // Preprocessing task
+            tasks.push(
+                Task::new(Operation::DataPreprocess {
+                    batch: *batch_size as usize,
+                    transforms: vec!["normalize".to_string(), "augment".to_string()],
+                })
+                .with_dependencies(vec![data_load_id])
+                .with_priority(TaskPriority::High),
+            );
+
+            let preprocess_id = tasks[1].id;
+
+            // Forward pass (simplified as MatMul)
+            tasks.push(
+                Task::new(Operation::MatMul {
+                    m: *batch_size as usize,
+                    n: 4096,
+                    k: 4096,
+                    precision: self.default_precision,
+                })
+                .with_dependencies(vec![preprocess_id])
+                .with_priority(TaskPriority::Critical),
+            );
+
+            let forward_id = tasks[2].id;
+
+            // Backward pass
+            tasks.push(
+                Task::new(Operation::Backward {
+                    forward_op: Box::new(Operation::MatMul {
+                        m: *batch_size as usize,
+                        n: 4096,
+                        k: 4096,
+                        precision: self.default_precision,
+                    }),
+                })
+                .with_dependencies(vec![forward_id])
+                .with_priority(TaskPriority::Critical),
+            );
+
+            let backward_id = tasks[3].id;
+
+            // Optimizer step
+            tasks.push(
+                Task::new(Operation::OptimizerStep {
+                    parameters: 1_000_000,
+                    optimizer: "adamw".to_string(),
+                    precision: self.default_precision,
+                })
+                .with_dependencies(vec![backward_id])
+                .with_priority(TaskPriority::High),
+            );
+        }
+
+        Ok(tasks)
+    }
+
+    /// Decompose inference job.
+    fn decompose_inference(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
+        let mut tasks = Vec::new();
+
+        if let JobType::Inference { batch_size, .. } = &job.job_type {
+            // Tokenization (CPU optimal)
+            tasks.push(
+                Task::new(Operation::Tokenization {
+                    text_bytes: 4096,
+                    vocab_size: 32000,
+                })
+                .with_priority(TaskPriority::High),
+            );
+
+            let token_id = tasks[0].id;
+
+            // Embedding (GPU optimal)
+            tasks.push(
+                Task::new(Operation::Embedding {
+                    batch: *batch_size as usize,
+                    seq_len: 512,
+                    vocab_size: 32000,
+                    embed_dim: 4096,
+                    precision: self.default_precision,
+                })
+                .with_dependencies(vec![token_id])
+                .with_priority(TaskPriority::Critical),
+            );
+
+            let embed_id = tasks[1].id;
+
+            // Self-attention (TPU/GPU optimal)
+            tasks.push(
+                Task::new(Operation::SelfAttention {
+                    batch: *batch_size as usize,
+                    seq_len: 512,
+                    num_heads: 32,
+                    head_dim: 128,
+                    precision: self.default_precision,
+                })
+                .with_dependencies(vec![embed_id])
+                .with_priority(TaskPriority::Critical),
+            );
+
+            let attention_id = tasks[2].id;
+
+            // Sampling (LPU optimal)
+            tasks.push(
+                Task::new(Operation::Sampling {
+                    batch: *batch_size as usize,
+                    vocab_size: 32000,
+                    temperature: 0.7,
+                })
+                .with_dependencies(vec![attention_id])
+                .with_priority(TaskPriority::High),
+            );
+
+            let sample_id = tasks[3].id;
+
+            // Detokenization (CPU optimal)
+            tasks.push(
+                Task::new(Operation::Detokenization {
+                    tokens: 256,
+                    vocab_size: 32000,
+                })
+                .with_dependencies(vec![sample_id])
+                .with_priority(TaskPriority::Normal),
+            );
+        }
+
+        Ok(tasks)
+    }
+
+    /// Decompose container job.
+    fn decompose_container(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
+        // Container jobs are typically a single task
+        Ok(vec![Task::new(Operation::Generic {
+            op_type: OperationType::DataLoad,
+            flops: 1e9,
+            memory: 1024 * 1024 * 1024,
+        })
+        .with_priority(TaskPriority::Normal)])
+    }
+
+    /// Decompose serverless function.
+    fn decompose_serverless(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
+        // Serverless is typically a single task
+        Ok(vec![Task::new(Operation::Generic {
+            op_type: OperationType::DataPreprocess,
+            flops: 1e6,
+            memory: 256 * 1024 * 1024,
+        })
+        .with_priority(TaskPriority::High)])
+    }
+
+    /// Decompose WASM job.
+    fn decompose_wasm(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
+        // WASM is typically a single task
+        Ok(vec![Task::new(Operation::Generic {
+            op_type: OperationType::DataPreprocess,
+            flops: 1e6,
+            memory: 16 * 1024 * 1024,
+        })
+        .with_priority(TaskPriority::Normal)])
+    }
+}
+
+impl Default for TaskDecomposer {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_task_creation() {
+        let task = Task::new(Operation::MatMul {
+            m: 1024,
+            n: 1024,
+            k: 1024,
+            precision: Precision::Fp32,
+        })
+        .with_priority(TaskPriority::High);
+
+        assert_eq!(task.priority, TaskPriority::High);
+        assert!(task.dependencies.is_empty());
+        assert_eq!(task.status, TaskStatus::Pending);
+    }
+
+    #[test]
+    fn test_task_dependencies() {
+        let task1 = Task::new(Operation::DataLoad {
+            bytes: 1000,
+            async_: true,
+        });
+        let task1_id = task1.id;
+
+        let task2 = Task::new(Operation::MatMul {
+            m: 1024,
+            n: 1024,
+            k: 1024,
+            precision: Precision::Fp32,
+        })
+        .with_dependencies(vec![task1_id]);
+
+        assert_eq!(task2.dependencies, vec![task1_id]);
+    }
+
+    #[test]
+    fn test_task_compatibility() {
+        let matmul_task = Task::new(Operation::MatMul {
+            m: 1024,
+            n: 1024,
+            k: 1024,
+            precision: Precision::Fp32,
+        });
+
+        // MatMul should be compatible with GPU and TPU
+        assert!(matmul_task.is_compatible_with(ProcessorType::Gpu(
+            crate::processor::GpuVariant::NvidiaCuda {
+                compute_capability: (8, 0)
+            }
+        )));
+        assert!(matmul_task.is_compatible_with(ProcessorType::Tpu(
+            crate::processor::TpuVersion::V5p
+        )));
+
+        let data_load_task = Task::new(Operation::DataLoad {
+            bytes: 1000,
+            async_: true,
+        });
+
+
// DataLoad should be compatible with CPU
+        assert!(data_load_task.is_compatible_with(ProcessorType::Cpu(
+            crate::processor::CpuVariant::default()
+        )));
+    }
+
+    // End-to-end: an inference job must decompose into a non-empty,
+    // dependency-chained task list.
+    #[test]
+    fn test_task_decomposer() {
+        let decomposer = TaskDecomposer::new();
+
+        let job = ComputeJob {
+            id: crate::JobId::new(),
+            owner: [0u8; 32],
+            job_type: JobType::Inference {
+                model_cid: "model".to_string(),
+                input_format: "json".to_string(),
+                batch_size: 1,
+            },
+            resources: crate::ResourceRequirements::default(),
+            input_cid: None,
+            max_budget: 1_000_000,
+            priority: crate::JobPriority::Normal,
+            created_at: 0,
+            deadline: None,
+        };
+
+        let tasks = decomposer.decompose(&job).unwrap();
+        assert!(!tasks.is_empty());
+
+        // Check dependencies form a chain
+        for (i, task) in tasks.iter().enumerate() {
+            if i > 0 {
+                assert!(!task.dependencies.is_empty());
+            }
+        }
+    }
+}
diff --git a/docs/PLAN/PHASE11-Synor-Compute-L2-Part2-HyperEfficiency.md b/docs/PLAN/PHASE11-Synor-Compute-L2-Part2-HyperEfficiency.md
new file mode 100644
index 0000000..e3127bd
--- /dev/null
+++ b/docs/PLAN/PHASE11-Synor-Compute-L2-Part2-HyperEfficiency.md
@@ -0,0 +1,1584 @@
+# Phase 11 Part 2: Hyper-Efficient Distributed Compute
+
+> **Goal**: 90% cost reduction vs AWS/GCP/Azure + 10x speed improvement through innovative architecture
+
+---
+
+## Executive Summary
+
+Traditional cloud providers have structural inefficiencies:
+- **30-60% profit margins** built into pricing
+- **Centralized data centers** with high real estate/cooling costs
+- **Idle capacity** that customers pay for but don't use
+- **Geographic lock-in** preventing arbitrage on electricity costs
+- **Billions of idle consumer devices** completely untapped
+
+Synor Compute eliminates these inefficiencies through:
+1. **Protocol-only overhead** (no corporate margin)
+2. **Distributed infrastructure** (homes, offices, edge locations)
+3. **Real-time spot markets** (fill idle capacity instantly)
+4.
**Global electricity arbitrage** (route to cheapest regions) +5. **Consumer device mesh** (phones, browsers, desktops) + +--- + +## Part 1: Cost Reduction Architecture + +### 1.1 Zero-Margin Protocol Design + +```rust +// synor-compute/src/economics/pricing.rs + +/// Dynamic pricing engine with near-zero overhead +pub struct DynamicPricingEngine { + /// Base cost = provider's actual cost (electricity + depreciation) + base_cost_calculator: BaseCostCalculator, + /// Protocol fee (only fee in system) + protocol_fee_percent: f32, // 5-10% for network sustainability + /// Real-time supply/demand + market_state: MarketState, + /// Geographic cost map + geo_costs: GeoCostMap, +} + +impl DynamicPricingEngine { + /// Calculate price for compute job + pub fn calculate_price(&self, job: &ComputeJob) -> Price { + // 1. Calculate provider's actual cost + let base_cost = self.base_cost_calculator.compute_cost( + job.resources(), + job.duration_estimate(), + job.provider_location(), + ); + + // 2. Apply supply/demand multiplier (0.5x to 2x) + let demand_multiplier = self.market_state.demand_multiplier( + job.resource_type(), + job.urgency(), + ); + + // 3. 
Add minimal protocol fee
+        let protocol_fee = base_cost * self.protocol_fee_percent;
+
+        Price {
+            base: base_cost,
+            demand_adjustment: base_cost * (demand_multiplier - 1.0),
+            protocol_fee,
+            total: base_cost * demand_multiplier + protocol_fee,
+        }
+    }
+}
+
+/// Provider's actual operating cost
+pub struct BaseCostCalculator {
+    /// Electricity cost by region ($/kWh)
+    electricity_rates: HashMap<Region, f64>,
+    /// Hardware depreciation rates
+    depreciation: HardwareDepreciation,
+    /// Cooling efficiency (PUE)
+    pue_by_climate: HashMap<Climate, f64>,
+}
+
+impl BaseCostCalculator {
+    pub fn compute_cost(
+        &self,
+        resources: &Resources,
+        duration: Duration,
+        location: &GeoLocation,
+    ) -> f64 {
+        let region = location.region();
+        let electricity_rate = self.electricity_rates.get(&region).unwrap_or(&0.10);
+        let pue = self.pue_by_climate.get(&location.climate()).unwrap_or(&1.5);
+
+        // Power consumption
+        let power_kw = resources.estimated_power_kw();
+        let energy_kwh = power_kw * duration.as_hours() * pue;
+        let electricity_cost = energy_kwh * electricity_rate;
+
+        // Hardware depreciation
+        let depreciation_cost = self.depreciation.cost_per_hour(resources)
+            * duration.as_hours();
+
+        // Network cost (minimal for most jobs)
+        let network_cost = resources.network_gb() * 0.01; // $0.01/GB
+
+        electricity_cost + depreciation_cost + network_cost
+    }
+}
+```
+
+### 1.2 Geographic Electricity Arbitrage
+
+```rust
+// synor-compute/src/scheduler/geo_arbitrage.rs
+
+/// Routes compute to cheapest electricity regions
+pub struct GeoArbitrageScheduler {
+    /// Real-time electricity prices by region
+    electricity_prices: Arc<RwLock<HashMap<Region, ElectricityPrice>>>,
+    /// Provider locations and capabilities
+    providers: ProviderRegistry,
+    /// Latency requirements
+    latency_constraints: LatencyConstraints,
+}
+
+/// Real-time electricity pricing
+pub struct ElectricityPrice {
+    pub region: Region,
+    pub price_per_kwh: f64,
+    pub carbon_intensity: f64, // gCO2/kWh
+    pub renewable_percent: f64,
+    pub timestamp: Timestamp,
+    pub
forecast_24h: Vec, // Predicted prices +} + +impl GeoArbitrageScheduler { + /// Find cheapest region for job + pub async fn find_optimal_region( + &self, + job: &ComputeJob, + ) -> Result { + let prices = self.electricity_prices.read(); + + // Get regions with available capacity + let available_regions = self.providers + .regions_with_capacity(job.resources()) + .await?; + + // Filter by latency requirements + let viable_regions: Vec<_> = available_regions + .into_iter() + .filter(|r| self.meets_latency_requirements(r, job)) + .collect(); + + // Sort by total cost (electricity + network) + let mut scored: Vec<_> = viable_regions + .iter() + .map(|region| { + let electricity = prices.get(region).map(|p| p.price_per_kwh).unwrap_or(0.15); + let network_cost = self.network_cost_to_user(region, job.user_location()); + let total_score = electricity * job.estimated_kwh() + network_cost; + (region, total_score) + }) + .collect(); + + scored.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); + + Ok(SchedulingDecision { + region: scored[0].0.clone(), + estimated_cost: scored[0].1, + alternatives: scored[1..].iter().map(|(r, c)| (*r, *c)).collect(), + }) + } +} + +/// Electricity price feeds from multiple sources +pub struct ElectricityPriceFeed { + sources: Vec>, +} + +#[async_trait] +pub trait ElectricityDataSource: Send + Sync { + async fn get_prices(&self, regions: &[Region]) -> Result, Error>; +} + +// Implementations for various markets: +// - US: PJM, CAISO, ERCOT, MISO, etc. 
+// - Europe: EPEX SPOT, Nord Pool +// - Asia: JEPX (Japan), KPX (Korea) +``` + +### 1.3 Spot Market for Idle Capacity + +```rust +// synor-compute/src/market/spot.rs + +/// Real-time spot market for compute resources +pub struct SpotMarket { + /// Order book for each resource type + order_books: HashMap, + /// Matching engine + matcher: MatchingEngine, + /// Price discovery + price_discovery: PriceDiscovery, +} + +/// Compute resource order +pub struct SpotOrder { + pub order_id: OrderId, + pub order_type: OrderType, + pub resource_type: ResourceType, + pub quantity: ResourceQuantity, + pub price_limit: Option, // None = market order + pub duration: Duration, + pub preemptible: bool, // Can be interrupted + pub constraints: JobConstraints, +} + +pub enum OrderType { + /// Provider offering compute + Ask { + provider_id: ProviderId, + available_until: Timestamp, + interruptible: bool, + }, + /// User requesting compute + Bid { + user_id: UserId, + deadline: Option, + priority: Priority, + }, +} + +impl SpotMarket { + /// Submit order to market + pub async fn submit_order(&self, order: SpotOrder) -> Result { + let book = self.order_books.get_mut(&order.resource_type)?; + + match order.order_type { + OrderType::Ask { .. } => { + // Provider offering capacity + book.add_ask(order.clone()); + + // Try to match with existing bids + let matches = self.matcher.match_asks(&book); + self.execute_matches(matches).await + } + OrderType::Bid { .. 
} => { + // User requesting capacity + if let Some(price_limit) = order.price_limit { + // Limit order - add to book + book.add_bid(order.clone()); + } + + // Try to match immediately + let matches = self.matcher.match_bids(&book, &order); + self.execute_matches(matches).await + } + } + } + + /// Get current spot price for resource + pub fn spot_price(&self, resource: &ResourceType) -> SpotPrice { + let book = &self.order_books[resource]; + SpotPrice { + bid: book.best_bid(), + ask: book.best_ask(), + last_trade: book.last_trade_price(), + volume_24h: book.volume_24h(), + } + } +} + +/// Preemptible compute (like AWS Spot Instances) +pub struct PreemptibleCompute { + /// Discount vs on-demand (typically 70-90%) + discount_percent: f32, + /// Warning time before preemption + warning_seconds: u32, + /// Checkpoint strategy + checkpoint: CheckpointStrategy, +} + +impl PreemptibleCompute { + /// Price at 10-30% of on-demand + pub fn calculate_price(&self, on_demand_price: f64) -> f64 { + on_demand_price * (1.0 - self.discount_percent as f64 / 100.0) + } + + /// Handle preemption gracefully + pub async fn preempt(&self, job: &mut ComputeJob) -> Result<(), Error> { + // 1. Send warning to job + job.send_preemption_warning(self.warning_seconds).await?; + + // 2. Trigger checkpoint if configured + if let CheckpointStrategy::Auto = self.checkpoint { + job.checkpoint().await?; + } + + // 3. Migrate or terminate + if let Some(new_capacity) = self.find_alternative_capacity(job).await? 
{ + job.migrate_to(new_capacity).await + } else { + job.terminate_gracefully().await + } + } +} +``` + +### 1.4 Cost Comparison Calculator + +```rust +// synor-compute/src/economics/comparison.rs + +/// Compare Synor vs traditional cloud pricing +pub struct CostComparison { + synor_pricing: DynamicPricingEngine, + aws_pricing: AwsPricingData, + gcp_pricing: GcpPricingData, + azure_pricing: AzurePricingData, +} + +impl CostComparison { + pub fn compare(&self, workload: &Workload) -> ComparisonResult { + let synor_cost = self.synor_pricing.calculate_total(workload); + let aws_cost = self.aws_pricing.calculate_total(workload); + let gcp_cost = self.gcp_pricing.calculate_total(workload); + let azure_cost = self.azure_pricing.calculate_total(workload); + + let min_cloud = aws_cost.min(gcp_cost).min(azure_cost); + let savings_percent = ((min_cloud - synor_cost) / min_cloud) * 100.0; + + ComparisonResult { + synor: synor_cost, + aws: aws_cost, + gcp: gcp_cost, + azure: azure_cost, + savings_vs_cheapest: savings_percent, + savings_breakdown: self.breakdown_savings(workload), + } + } + + fn breakdown_savings(&self, workload: &Workload) -> SavingsBreakdown { + SavingsBreakdown { + // No cloud margin + margin_elimination: 35.0, // ~35% of cloud pricing is margin + // Distributed infrastructure + infrastructure_savings: 15.0, // No data center overhead + // Spot/preemptible usage + spot_savings: 20.0, // If using preemptible + // Geographic arbitrage + geo_arbitrage: 10.0, // Routing to cheap electricity + // Consumer device usage (if applicable) + consumer_devices: 10.0, // Free compute from devices + // Total + total: 90.0, + } + } +} +``` + +--- + +## Part 2: 10x Speed Architecture + +### 2.1 Intelligent Caching Layer + +```rust +// synor-compute/src/acceleration/cache.rs + +/// Multi-tier caching for inference acceleration +pub struct InferenceCache { + /// L1: Hot cache in GPU memory + gpu_cache: GpuCache, + /// L2: Warm cache in system memory + memory_cache: MemoryCache, 
+ /// L3: Cold cache on NVMe + nvme_cache: NvmeCache, + /// L4: Distributed cache across nodes + distributed_cache: DistributedCache, + /// Semantic cache for similar queries + semantic_cache: SemanticCache, +} + +impl InferenceCache { + /// Check all cache tiers for result + pub async fn get(&self, request: &InferenceRequest) -> Option { + // 1. Exact match in hot cache (sub-ms) + if let Some(result) = self.gpu_cache.get(&request.hash()).await { + return Some(CachedResult::exact(result)); + } + + // 2. Exact match in memory cache (~1ms) + if let Some(result) = self.memory_cache.get(&request.hash()).await { + // Promote to GPU cache + self.gpu_cache.insert(&request.hash(), &result).await; + return Some(CachedResult::exact(result)); + } + + // 3. Semantic similarity search (~5ms) + if let Some((similar_req, result, similarity)) = + self.semantic_cache.find_similar(request, 0.95).await + { + // If >95% similar, reuse result + return Some(CachedResult::semantic(result, similarity)); + } + + // 4. 
Check distributed cache (~10-50ms) + if let Some(result) = self.distributed_cache.get(&request.hash()).await { + self.memory_cache.insert(&request.hash(), &result).await; + return Some(CachedResult::exact(result)); + } + + None + } +} + +/// Semantic cache using embeddings +pub struct SemanticCache { + /// Embedding model for queries + embedder: EmbeddingModel, + /// Vector index for similarity search + index: VectorIndex, + /// Cached results + results: HashMap, +} + +impl SemanticCache { + /// Find semantically similar cached query + pub async fn find_similar( + &self, + request: &InferenceRequest, + min_similarity: f32, + ) -> Option<(InferenceRequest, InferenceResult, f32)> { + // Embed the query + let embedding = self.embedder.embed(&request.input).await?; + + // Search for similar + let results = self.index.search(&embedding, 1, min_similarity).await; + + results.first().map(|r| { + let cached_result = self.results.get(&r.id).unwrap(); + (r.request.clone(), cached_result.clone(), r.similarity) + }) + } +} +``` + +### 2.2 Speculative Execution + +```rust +// synor-compute/src/acceleration/speculative.rs + +/// Speculative execution for predictable workloads +pub struct SpeculativeExecutor { + /// Prediction model for next likely requests + predictor: RequestPredictor, + /// Pre-computed results + precomputed: PrecomputedResults, + /// Background speculation workers + workers: Vec, +} + +impl SpeculativeExecutor { + /// Predict and pre-execute likely next requests + pub async fn speculate(&self, context: &UserContext) -> Vec { + // 1. Predict likely next requests + let predictions = self.predictor.predict_next(context, 5).await; + + // 2. Execute in background if not cached + let mut futures = Vec::new(); + for (request, probability) in predictions { + if probability > 0.3 && !self.is_cached(&request).await { + futures.push(self.execute_speculative(request, probability)); + } + } + + // 3. 
Store results for instant retrieval + let results = join_all(futures).await; + for result in &results { + self.precomputed.store(result).await; + } + + results + } + + /// Check if speculative result is available + pub async fn get_speculative(&self, request: &InferenceRequest) -> Option { + self.precomputed.get(&request.hash()).await + } +} + +/// Request pattern predictor using ML +pub struct RequestPredictor { + /// Sequence model for request patterns + model: SequenceModel, + /// User behavior history + history: UserHistoryStore, +} + +impl RequestPredictor { + pub async fn predict_next( + &self, + context: &UserContext, + count: usize, + ) -> Vec<(InferenceRequest, f32)> { + // Get user's recent request history + let history = self.history.get_recent(&context.user_id, 10).await; + + // Predict next likely requests + let predictions = self.model.predict(&history, count).await; + + predictions + .into_iter() + .map(|(req, prob)| (req, prob)) + .collect() + } +} +``` + +### 2.3 Model Optimization Pipeline + +```rust +// synor-compute/src/acceleration/optimization.rs + +/// Automatic model optimization for faster inference +pub struct ModelOptimizer { + /// Quantization engine + quantizer: Quantizer, + /// Pruning engine + pruner: Pruner, + /// Distillation engine + distiller: Distiller, + /// Compilation (TensorRT, etc.) + compiler: ModelCompiler, +} + +impl ModelOptimizer { + /// Optimize model for target hardware + pub async fn optimize( + &self, + model: &Model, + target: &HardwareTarget, + constraints: &OptimizationConstraints, + ) -> Result { + let mut optimized = model.clone(); + + // 1. Quantization (FP32 → FP16 → INT8 → INT4) + if constraints.allow_quantization { + optimized = self.quantizer.quantize( + &optimized, + constraints.min_precision, + constraints.max_accuracy_loss, + ).await?; + } + + // 2. 
Pruning (remove unimportant weights) + if constraints.allow_pruning { + optimized = self.pruner.prune( + &optimized, + constraints.max_sparsity, + constraints.max_accuracy_loss, + ).await?; + } + + // 3. Compile for target hardware + optimized = self.compiler.compile(&optimized, target).await?; + + Ok(optimized) + } +} + +/// Quantization levels +pub enum QuantizationLevel { + FP32, // Full precision (baseline) + FP16, // Half precision (~2x speedup) + BF16, // Brain float 16 (better range) + INT8, // 8-bit integer (~4x speedup) + INT4, // 4-bit integer (~8x speedup) + FP8, // 8-bit float (H100+) + Mixed, // Dynamic mixed precision +} + +/// Hardware-specific compilation +pub struct ModelCompiler { + /// TensorRT for NVIDIA + tensorrt: TensorRtCompiler, + /// ROCm MIGraphX for AMD + migraphx: MiGraphXCompiler, + /// OpenVINO for Intel + openvino: OpenVinoCompiler, + /// Core ML for Apple + coreml: CoreMlCompiler, +} + +impl ModelCompiler { + pub async fn compile( + &self, + model: &Model, + target: &HardwareTarget, + ) -> Result { + match target.vendor { + Vendor::Nvidia => self.tensorrt.compile(model, target).await, + Vendor::Amd => self.migraphx.compile(model, target).await, + Vendor::Intel => self.openvino.compile(model, target).await, + Vendor::Apple => self.coreml.compile(model, target).await, + _ => Ok(model.clone().into()), + } + } +} +``` + +### 2.4 Continuous Batching + +```rust +// synor-compute/src/acceleration/batching.rs + +/// Continuous batching for maximum GPU utilization +pub struct ContinuousBatcher { + /// Request queue + queue: RequestQueue, + /// Active batches + active_batches: Vec, + /// Batching configuration + config: BatchConfig, +} + +pub struct BatchConfig { + /// Maximum batch size + pub max_batch_size: usize, + /// Maximum wait time for batching + pub max_wait_ms: u64, + /// Enable dynamic batching + pub dynamic: bool, + /// Enable iteration-level batching (for LLMs) + pub iteration_level: bool, +} + +impl ContinuousBatcher { + /// 
Process requests with continuous batching
+    pub async fn process(&self) -> Result<(), Error> {
+        loop {
+            // 1. Collect requests up to batch size or timeout
+            let requests = self.queue.collect_batch(
+                self.config.max_batch_size,
+                self.config.max_wait_ms,
+            ).await;
+
+            if requests.is_empty() {
+                continue;
+            }
+
+            // 2. Create batch
+            let batch = self.create_batch(requests)?;
+
+            // 3. Execute batch
+            let results = self.execute_batch(batch).await?;
+
+            // 4. Dispatch results to individual requests
+            self.dispatch_results(results).await;
+        }
+    }
+
+    /// Iteration-level batching for LLMs (vLLM-style)
+    pub async fn process_iterative(&self) -> Result<(), Error> {
+        let mut active_sequences: Vec<ActiveSequence> = Vec::new();
+
+        loop {
+            // 1. Add new requests to active sequences
+            while active_sequences.len() < self.config.max_batch_size {
+                if let Some(req) = self.queue.try_pop() {
+                    active_sequences.push(ActiveSequence::new(req));
+                } else {
+                    break;
+                }
+            }
+
+            if active_sequences.is_empty() {
+                tokio::time::sleep(Duration::from_millis(1)).await;
+                continue;
+            }
+
+            // 2. Run one iteration for all active sequences
+            let next_tokens = self.run_iteration(&active_sequences).await?;
+
+            // 3. Update sequences and remove completed ones
+            let mut completed = Vec::new();
+            for (i, (seq, token)) in active_sequences.iter_mut()
+                .zip(next_tokens.iter())
+                .enumerate()
+            {
+                seq.append_token(*token);
+                if seq.is_complete() {
+                    completed.push(i);
+                }
+            }
+
+            // 4.
Return completed sequences + for i in completed.into_iter().rev() { + let seq = active_sequences.remove(i); + seq.complete().await; + } + } + } +} +``` + +### 2.5 Speed Comparison + +| Optimization | Speedup Factor | Notes | +|--------------|----------------|-------| +| Semantic caching | 100-1000x | Cache hits are instant | +| Speculative execution | 2-5x | For predictable workloads | +| INT8 quantization | 2-4x | Minimal accuracy loss | +| INT4 quantization | 4-8x | For LLMs with good quality | +| TensorRT compilation | 2-5x | Hardware-specific optimization | +| Continuous batching | 3-10x | Maximum GPU utilization | +| KV cache optimization | 2-3x | For LLM inference | +| **Combined** | **10-50x** | Achievable with all optimizations | + +--- + +## Part 3: Consumer Device Mesh Network + +### 3.1 Universal Device Support + +```rust +// synor-compute/src/mesh/device.rs + +/// Any device that can contribute compute +pub enum DeviceType { + /// Data center GPU (NVIDIA A100, H100) + DataCenterGpu { + model: GpuModel, + vram_gb: u32, + tensor_cores: u32, + }, + /// Consumer GPU (RTX 3090, 4090) + ConsumerGpu { + model: GpuModel, + vram_gb: u32, + }, + /// Mobile device (phone, tablet) + Mobile { + platform: MobilePlatform, + chip: MobileChip, + gpu: MobileGpu, + }, + /// Desktop/Laptop CPU + Cpu { + vendor: CpuVendor, + cores: u32, + threads: u32, + avx_support: AvxSupport, + }, + /// Browser (WebGPU/WebAssembly) + Browser { + runtime: BrowserRuntime, + gpu_available: bool, + wasm_simd: bool, + }, + /// Apple Silicon (M1, M2, M3) + AppleSilicon { + chip: AppleChip, + gpu_cores: u32, + neural_engine_cores: u32, + unified_memory_gb: u32, + }, + /// TPU (if accessible) + Tpu { + version: TpuVersion, + chips: u32, + }, + /// Custom accelerator (Groq LPU, Cerebras, etc.) 
+ CustomAccelerator { + vendor: String, + model: String, + tops: f32, // Tera operations per second + }, +} + +pub enum MobilePlatform { + Ios, + Android, +} + +pub enum MobileChip { + // Apple + A15Bionic, + A16Bionic, + A17Pro, + // Qualcomm + Snapdragon8Gen1, + Snapdragon8Gen2, + Snapdragon8Gen3, + // Samsung + Exynos2200, + Exynos2400, + // Google + Tensor, + TensorG2, + TensorG3, + // MediaTek + Dimensity9000, + Dimensity9300, +} + +pub enum MobileGpu { + // Apple + AppleGpu { cores: u32 }, + // Qualcomm + Adreno { model: u32 }, + // ARM + MaliG { model: u32 }, + // IMG + PowerVR { model: String }, +} +``` + +### 3.2 Device Capability Registry + +```rust +// synor-compute/src/mesh/registry.rs + +/// Central registry of all contributing devices +pub struct DeviceRegistry { + /// All registered devices + devices: HashMap, + /// Devices by capability + by_capability: CapabilityIndex, + /// Devices by location + by_location: GeoIndex, + /// Device reputation scores + reputation: ReputationStore, +} + +/// Detailed device capabilities +pub struct DeviceInfo { + pub device_id: DeviceId, + pub device_type: DeviceType, + pub owner: Address, + /// Compute capabilities + pub compute: ComputeCapabilities, + /// Network capabilities + pub network: NetworkCapabilities, + /// Availability schedule + pub availability: AvailabilitySchedule, + /// Current status + pub status: DeviceStatus, + /// Reputation score (0-100) + pub reputation: u32, +} + +pub struct ComputeCapabilities { + /// FLOPS (single precision) + pub fp32_gflops: f64, + /// FLOPS (half precision) + pub fp16_gflops: f64, + /// Integer operations + pub int8_tops: f64, + /// Memory bandwidth (GB/s) + pub memory_bandwidth: f64, + /// Available memory (GB) + pub memory_gb: f64, + /// Supported frameworks + pub frameworks: Vec, + /// Supported model formats + pub model_formats: Vec, +} + +pub struct NetworkCapabilities { + /// Download speed (Mbps) + pub download_mbps: f64, + /// Upload speed (Mbps) + pub 
upload_mbps: f64, + /// Latency to nearest edge (ms) + pub edge_latency_ms: u32, + /// NAT type + pub nat_type: NatType, +} + +/// When device is available +pub struct AvailabilitySchedule { + /// Always available + pub always: bool, + /// Available hours (UTC) + pub hours: Option>, + /// Available only when idle + pub idle_only: bool, + /// Available only when charging (mobile) + pub charging_only: bool, + /// Minimum battery level (mobile) + pub min_battery: Option, +} +``` + +### 3.3 Mobile SDK + +```rust +// synor-compute/src/sdk/mobile.rs + +/// Mobile SDK for contributing compute +pub struct SynorMobileSDK { + /// Device identifier + device_id: DeviceId, + /// User wallet + wallet: Wallet, + /// Local inference runtime + runtime: MobileInferenceRuntime, + /// Task queue + tasks: TaskQueue, + /// Earnings tracker + earnings: EarningsTracker, +} + +impl SynorMobileSDK { + /// Initialize SDK + pub async fn init(config: MobileConfig) -> Result { + // 1. Generate or load device ID + let device_id = Self::get_or_create_device_id().await?; + + // 2. Initialize wallet + let wallet = Wallet::load_or_create(&config.keystore_path).await?; + + // 3. Detect device capabilities + let capabilities = Self::detect_capabilities().await?; + + // 4. Initialize inference runtime + let runtime = MobileInferenceRuntime::new(&capabilities)?; + + // 5. Register with network + Self::register_device(&device_id, &capabilities).await?; + + Ok(Self { + device_id, + wallet, + runtime, + tasks: TaskQueue::new(), + earnings: EarningsTracker::new(), + }) + } + + /// Start contributing compute + pub async fn start_contributing(&self, settings: ContributionSettings) -> Result<(), Error> { + loop { + // 1. Check if we should be active + if !self.should_be_active(&settings).await { + tokio::time::sleep(Duration::from_secs(60)).await; + continue; + } + + // 2. Get available tasks + let task = self.get_next_task().await?; + + // 3. 
Execute task + let result = self.execute_task(&task).await?; + + // 4. Submit result and earn rewards + let reward = self.submit_result(&task, &result).await?; + self.earnings.add(reward); + } + } + + /// Check contribution conditions + async fn should_be_active(&self, settings: &ContributionSettings) -> bool { + // Check battery + if let Some(min_battery) = settings.min_battery { + if Self::battery_level() < min_battery { + return false; + } + } + + // Check if charging + if settings.charging_only && !Self::is_charging() { + return false; + } + + // Check if idle + if settings.idle_only && !Self::is_idle() { + return false; + } + + // Check thermal state + if Self::thermal_state() == ThermalState::Critical { + return false; + } + + true + } +} + +/// Mobile inference runtime +pub struct MobileInferenceRuntime { + /// Core ML for iOS + #[cfg(target_os = "ios")] + coreml: CoreMlRuntime, + /// NNAPI/GPU delegate for Android + #[cfg(target_os = "android")] + tflite: TfLiteRuntime, + /// Metal for Apple GPUs + #[cfg(any(target_os = "ios", target_os = "macos"))] + metal: MetalRuntime, + /// OpenCL for Android GPUs + #[cfg(target_os = "android")] + opencl: OpenClRuntime, +} +``` + +### 3.4 Browser SDK (WebGPU + WASM) + +```typescript +// synor-compute/sdk/browser/src/index.ts + +/** + * Browser SDK for contributing compute via WebGPU/WASM + */ +export class SynorBrowserSDK { + private deviceId: string; + private wallet: BrowserWallet; + private runtime: BrowserRuntime; + private webgpu: GPUDevice | null; + private worker: Worker; + + /** + * Initialize SDK in browser + */ + static async init(config: BrowserConfig): Promise { + const sdk = new SynorBrowserSDK(); + + // 1. Check WebGPU support + if (navigator.gpu) { + const adapter = await navigator.gpu.requestAdapter(); + if (adapter) { + sdk.webgpu = await adapter.requestDevice(); + console.log('WebGPU available'); + } + } + + // 2. Initialize WASM runtime + sdk.runtime = await BrowserRuntime.init(); + + // 3. 
Create/load wallet + sdk.wallet = await BrowserWallet.loadOrCreate(); + + // 4. Generate device ID + sdk.deviceId = await sdk.generateDeviceId(); + + // 5. Start worker thread + sdk.worker = new Worker(new URL('./worker.ts', import.meta.url)); + + return sdk; + } + + /** + * Start contributing compute + */ + async startContributing(settings: ContributionSettings): Promise { + // Register capabilities + const capabilities = await this.detectCapabilities(); + await this.registerDevice(capabilities); + + // Start task loop in worker + this.worker.postMessage({ + type: 'start', + settings, + capabilities, + }); + + // Listen for results + this.worker.onmessage = async (event) => { + if (event.data.type === 'task_complete') { + await this.submitResult(event.data.taskId, event.data.result); + } else if (event.data.type === 'earnings_update') { + this.onEarningsUpdate?.(event.data.earnings); + } + }; + } + + /** + * Detect browser compute capabilities + */ + private async detectCapabilities(): Promise { + return { + // WebGPU + webgpu: { + available: !!this.webgpu, + maxBufferSize: this.webgpu?.limits.maxBufferSize, + maxComputeWorkgroupSizeX: this.webgpu?.limits.maxComputeWorkgroupSizeX, + }, + // WASM + wasm: { + simd: await this.checkWasmSimd(), + threads: await this.checkWasmThreads(), + memory64: await this.checkWasmMemory64(), + }, + // Hardware + hardwareConcurrency: navigator.hardwareConcurrency, + deviceMemory: (navigator as any).deviceMemory, + // Network + connection: (navigator as any).connection, + }; + } + + /** + * Execute inference task using WebGPU + */ + private async executeWithWebGPU(task: InferenceTask): Promise { + // Load model if not cached + if (!this.modelCache.has(task.modelId)) { + const model = await this.loadModel(task.modelId); + this.modelCache.set(task.modelId, model); + } + + const model = this.modelCache.get(task.modelId)!; + + // Create input buffer + const inputBuffer = this.webgpu!.createBuffer({ + size: task.input.byteLength, + 
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST, + }); + this.webgpu!.queue.writeBuffer(inputBuffer, 0, task.input); + + // Execute compute shader + const commandEncoder = this.webgpu!.createCommandEncoder(); + const passEncoder = commandEncoder.beginComputePass(); + passEncoder.setPipeline(model.pipeline); + passEncoder.setBindGroup(0, model.bindGroup); + passEncoder.dispatchWorkgroups( + Math.ceil(task.input.length / 256) + ); + passEncoder.end(); + + // Read results + const outputBuffer = this.webgpu!.createBuffer({ + size: model.outputSize, + usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST, + }); + commandEncoder.copyBufferToBuffer( + model.outputBuffer, 0, + outputBuffer, 0, + model.outputSize + ); + + this.webgpu!.queue.submit([commandEncoder.finish()]); + + await outputBuffer.mapAsync(GPUMapMode.READ); + const result = new Float32Array(outputBuffer.getMappedRange()); + outputBuffer.unmap(); + + return { output: result }; + } +} +``` + +### 3.5 Desktop App SDK + +```rust +// synor-compute/src/sdk/desktop.rs + +/// Desktop SDK for contributing compute +pub struct SynorDesktopSDK { + device_id: DeviceId, + wallet: Wallet, + /// GPU runtime (CUDA, ROCm, Metal, etc.) + gpu_runtime: Option, + /// CPU runtime + cpu_runtime: CpuRuntime, + /// Task scheduler + scheduler: LocalScheduler, + /// System monitor + monitor: SystemMonitor, +} + +impl SynorDesktopSDK { + pub async fn init() -> Result { + // 1. Detect all available compute resources + let gpus = GpuDetector::detect_all().await?; + let cpus = CpuDetector::detect().await?; + + // 2. Initialize runtimes + let gpu_runtime = if !gpus.is_empty() { + Some(GpuRuntime::init(&gpus).await?) + } else { + None + }; + + let cpu_runtime = CpuRuntime::init(&cpus).await?; + + // 3. 
Start system monitor + let monitor = SystemMonitor::new(); + + Ok(Self { + device_id: DeviceId::generate(), + wallet: Wallet::load_or_create().await?, + gpu_runtime, + cpu_runtime, + scheduler: LocalScheduler::new(), + monitor, + }) + } + + /// Configure resource sharing + pub fn configure(&mut self, config: DesktopContributionConfig) { + // How much GPU to share (0-100%) + self.scheduler.set_gpu_limit(config.gpu_share_percent); + + // How much CPU to share + self.scheduler.set_cpu_limit(config.cpu_share_percent); + + // How much memory to share + self.scheduler.set_memory_limit(config.memory_share_percent); + + // Only run when idle + self.scheduler.set_idle_only(config.idle_only); + + // Power mode preferences + self.scheduler.set_power_mode(config.power_mode); + } +} + +/// GPU detection for all platforms +pub struct GpuDetector; + +impl GpuDetector { + pub async fn detect_all() -> Result, Error> { + let mut gpus = Vec::new(); + + // NVIDIA (CUDA) + #[cfg(feature = "cuda")] + gpus.extend(Self::detect_nvidia().await?); + + // AMD (ROCm) + #[cfg(feature = "rocm")] + gpus.extend(Self::detect_amd().await?); + + // Intel (OneAPI) + #[cfg(feature = "oneapi")] + gpus.extend(Self::detect_intel().await?); + + // Apple (Metal) + #[cfg(target_os = "macos")] + gpus.extend(Self::detect_apple().await?); + + Ok(gpus) + } + + #[cfg(feature = "cuda")] + async fn detect_nvidia() -> Result, Error> { + use nvml_wrapper::Nvml; + + let nvml = Nvml::init()?; + let device_count = nvml.device_count()?; + + let mut gpus = Vec::new(); + for i in 0..device_count { + let device = nvml.device_by_index(i)?; + gpus.push(GpuInfo { + vendor: GpuVendor::Nvidia, + name: device.name()?, + vram_bytes: device.memory_info()?.total, + compute_capability: device.cuda_compute_capability()?, + driver_version: nvml.sys_driver_version()?, + }); + } + + Ok(gpus) + } +} +``` + +### 3.6 Contribution Rewards Model + +```rust +// synor-compute/src/economics/rewards.rs + +/// Reward calculation for device 
contributors +pub struct RewardCalculator { + /// Base reward rates + base_rates: BaseRates, + /// Reputation multiplier + reputation_multiplier: ReputationMultiplier, + /// Uptime bonuses + uptime_bonus: UptimeBonus, +} + +pub struct BaseRates { + /// Per TFLOP-second (GPU) + pub gpu_tflops: f64, // 0.000001 SYNOR/TFLOP-s + /// Per GFLOP-second (CPU) + pub cpu_gflops: f64, // 0.00000001 SYNOR/GFLOP-s + /// Per GB transferred + pub bandwidth_gb: f64, // 0.001 SYNOR/GB + /// Per hour of availability + pub availability_hour: f64, // 0.0001 SYNOR/hour +} + +impl RewardCalculator { + pub fn calculate_reward(&self, contribution: &Contribution) -> Reward { + let base = match contribution.resource_type { + ResourceType::Gpu => { + contribution.tflops * contribution.duration.as_secs_f64() + * self.base_rates.gpu_tflops + } + ResourceType::Cpu => { + contribution.gflops * contribution.duration.as_secs_f64() + * self.base_rates.cpu_gflops + } + ResourceType::Bandwidth => { + contribution.gb_transferred * self.base_rates.bandwidth_gb + } + }; + + // Apply reputation multiplier (0.5x to 2x) + let reputation_mult = self.reputation_multiplier.get(contribution.reputation); + + // Apply uptime bonus (up to 20% extra) + let uptime_mult = self.uptime_bonus.get(contribution.uptime_percent); + + Reward { + base, + reputation_bonus: base * (reputation_mult - 1.0), + uptime_bonus: base * (uptime_mult - 1.0), + total: base * reputation_mult * uptime_mult, + } + } +} + +/// Expected monthly earnings by device type +pub struct EarningsEstimator; + +impl EarningsEstimator { + pub fn estimate_monthly(device: &DeviceType, hours_per_day: f64) -> MonthlyEarnings { + let hourly = match device { + DeviceType::DataCenterGpu { .. } => 0.50, // $0.50/hour + DeviceType::ConsumerGpu { .. } => 0.10, // $0.10/hour + DeviceType::AppleSilicon { .. } => 0.05, // $0.05/hour + DeviceType::Cpu { .. } => 0.01, // $0.01/hour + DeviceType::Mobile { .. } => 0.005, // $0.005/hour + DeviceType::Browser { .. 
} => 0.002, // $0.002/hour + _ => 0.01, + }; + + let daily = hourly * hours_per_day; + let monthly = daily * 30.0; + + MonthlyEarnings { + low: monthly * 0.5, // 50% utilization + medium: monthly * 0.7, // 70% utilization + high: monthly, // 100% utilization + } + } +} +``` + +--- + +## Part 4: Task Distribution Algorithm + +### 4.1 Optimal Task Router + +```rust +// synor-compute/src/scheduler/router.rs + +/// Routes tasks to optimal devices +pub struct TaskRouter { + /// Device registry + devices: Arc, + /// Cost optimizer + cost_optimizer: CostOptimizer, + /// Latency optimizer + latency_optimizer: LatencyOptimizer, + /// Load balancer + load_balancer: LoadBalancer, +} + +impl TaskRouter { + /// Find optimal device(s) for task + pub async fn route(&self, task: &ComputeTask) -> Result { + // 1. Filter devices that can handle this task + let capable_devices = self.devices + .find_capable(&task.requirements) + .await?; + + // 2. Score each device + let mut scored: Vec<(DeviceId, RoutingScore)> = Vec::new(); + + for device in capable_devices { + let score = self.score_device(&device, task).await?; + scored.push((device.device_id, score)); + } + + // 3. Sort by composite score + scored.sort_by(|a, b| b.1.composite.partial_cmp(&a.1.composite).unwrap()); + + // 4. 
Select best device(s) + let selected = if task.distributed { + // Select multiple devices for distributed task + self.select_distributed(&scored, task) + } else { + // Single best device + vec![scored[0].0.clone()] + }; + + Ok(RoutingDecision { + devices: selected, + estimated_cost: scored[0].1.cost, + estimated_latency: scored[0].1.latency, + estimated_duration: scored[0].1.duration, + }) + } + + async fn score_device(&self, device: &DeviceInfo, task: &ComputeTask) -> Result { + // Cost score (lower is better) + let cost = self.cost_optimizer.estimate_cost(device, task); + let cost_score = 1.0 / (1.0 + cost); + + // Latency score (lower is better) + let latency = self.latency_optimizer.estimate_latency(device, task); + let latency_score = 1.0 / (1.0 + latency.as_millis() as f64 / 1000.0); + + // Capability score (higher compute = better) + let capability_score = device.compute.fp16_gflops / task.requirements.min_gflops; + + // Reputation score + let reputation_score = device.reputation as f64 / 100.0; + + // Load score (less loaded = better) + let load = self.load_balancer.current_load(&device.device_id).await?; + let load_score = 1.0 - load; + + // Composite score with weights + let composite = + cost_score * 0.3 + + latency_score * 0.2 + + capability_score * 0.2 + + reputation_score * 0.15 + + load_score * 0.15; + + Ok(RoutingScore { + cost, + latency, + duration: self.estimate_duration(device, task), + composite, + }) + } +} +``` + +### 4.2 Distributed Task Sharding + +```rust +// synor-compute/src/scheduler/sharding.rs + +/// Shard large tasks across multiple devices +pub struct TaskSharder { + /// Sharding strategies + strategies: HashMap>, +} + +#[async_trait] +pub trait ShardingStrategy: Send + Sync { + /// Shard task into subtasks + async fn shard(&self, task: &ComputeTask, devices: &[DeviceInfo]) -> Result, Error>; + + /// Aggregate results from shards + async fn aggregate(&self, results: Vec) -> Result; +} + +/// Data parallel sharding (same model, 
different data) +pub struct DataParallelSharder; + +#[async_trait] +impl ShardingStrategy for DataParallelSharder { + async fn shard(&self, task: &ComputeTask, devices: &[DeviceInfo]) -> Result, Error> { + let data_size = task.input_data.len(); + let num_shards = devices.len(); + let shard_size = data_size / num_shards; + + let mut shards = Vec::new(); + for (i, device) in devices.iter().enumerate() { + let start = i * shard_size; + let end = if i == num_shards - 1 { data_size } else { start + shard_size }; + + shards.push(Shard { + shard_id: i as u32, + device_id: device.device_id.clone(), + model: task.model.clone(), + data_range: start..end, + }); + } + + Ok(shards) + } + + async fn aggregate(&self, results: Vec) -> Result { + // Concatenate results in order + let mut aggregated = Vec::new(); + for result in results.into_iter().sorted_by_key(|r| r.shard_id) { + aggregated.extend(result.output); + } + Ok(TaskResult { output: aggregated }) + } +} + +/// Model parallel sharding (different model layers on different devices) +pub struct ModelParallelSharder { + /// Layer assignments + layer_assignments: Vec<(usize, usize)>, // (start_layer, end_layer) +} + +#[async_trait] +impl ShardingStrategy for ModelParallelSharder { + async fn shard(&self, task: &ComputeTask, devices: &[DeviceInfo]) -> Result, Error> { + // Assign model layers to devices based on memory + let total_layers = task.model.num_layers(); + let mut assignments = Vec::new(); + let mut current_layer = 0; + + for device in devices { + let layers_for_device = self.calculate_layers_for_device( + device, + &task.model, + total_layers - current_layer, + ); + + assignments.push(Shard { + shard_id: assignments.len() as u32, + device_id: device.device_id.clone(), + model_layers: current_layer..(current_layer + layers_for_device), + data: task.input_data.clone(), + }); + + current_layer += layers_for_device; + } + + Ok(assignments) + } + + async fn aggregate(&self, results: Vec) -> Result { + // Pipeline results 
through layers + // Last shard result is the final output + Ok(results.into_iter().last().unwrap().into()) + } +} +``` + +--- + +## Summary: Achieving 90% Cost Reduction + 10x Speed + +### Cost Reduction Breakdown + +| Factor | Savings | How | +|--------|---------|-----| +| Zero cloud margin | 35% | Protocol-only, no corporate overhead | +| Distributed infra | 15% | No data center costs | +| Spot market | 20% | Fill idle capacity at discount | +| Geo arbitrage | 10% | Route to cheap electricity | +| Consumer devices | 10% | Free idle compute | +| **Total** | **90%** | Combined savings | + +### Speed Improvement Breakdown + +| Optimization | Speedup | How | +|--------------|---------|-----| +| Semantic caching | 10-100x | Reuse similar results | +| Speculative execution | 2-5x | Pre-compute likely requests | +| Quantization (INT4/INT8) | 4-8x | Reduced precision inference | +| Hardware compilation | 2-5x | TensorRT, custom kernels | +| Continuous batching | 3-10x | Maximum GPU utilization | +| Edge compute | 2-5x | Compute closer to user | +| **Combined** | **10-50x** | With all optimizations | + +### Consumer Device Contribution + +| Device Type | Contribution | Monthly Earnings | +|-------------|--------------|------------------| +| Data center GPU | Full training/inference | $100-500 | +| Consumer GPU | Inference, light training | $30-100 | +| Apple Silicon | Efficient inference | $15-50 | +| Desktop CPU | Data processing, embeddings | $5-20 | +| Mobile device | Edge inference | $2-10 | +| Browser | Light compute, idle cycles | $1-5 | + +This architecture creates a truly decentralized compute network that can undercut traditional cloud providers while providing competitive performance. 
diff --git a/docs/PLAN/PHASE11-Synor-Compute-L2-Part3-HeterogeneousCompute.md b/docs/PLAN/PHASE11-Synor-Compute-L2-Part3-HeterogeneousCompute.md new file mode 100644 index 0000000..628ca8f --- /dev/null +++ b/docs/PLAN/PHASE11-Synor-Compute-L2-Part3-HeterogeneousCompute.md @@ -0,0 +1,1564 @@ +# Phase 11 Part 3: Heterogeneous Multi-Processor Compute + +> **Goal**: Utilize ALL processor types simultaneously (CPU+GPU+TPU+NPU+LPU+Custom) with intelligent task scheduling to achieve maximum throughput and zero idle processors. + +--- + +## Executive Summary + +Modern compute workloads can be decomposed into subtasks that are optimal for different processor types: + +| Processor | Optimal For | Examples | +|-----------|-------------|----------| +| **CPU** | Sequential logic, control flow, I/O | Data loading, preprocessing, orchestration | +| **GPU** | Parallel matrix operations | Neural network layers, convolutions | +| **TPU** | Tensor operations, ML inference | Transformer attention, matrix multiply | +| **NPU** | Low-power inference | Edge inference, mobile AI | +| **LPU** | Sequential inference (Groq) | LLM token generation | +| **FPGA** | Custom bit-level operations | Cryptography, specialized kernels | +| **DSP** | Signal processing | Audio, video, sensor data | + +**Key Insight**: A single AI training job contains ALL these subtask types. By routing each subtask to the optimal processor, we achieve **2-5x speedup** over GPU-only execution. 
+ +--- + +## Architecture: Unified Heterogeneous Scheduler + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ SYNOR HETEROGENEOUS COMPUTE ENGINE │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ TASK DECOMPOSER │ │ +│ │ Analyzes workload → Identifies subtasks → Maps to optimal processors │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ HETEROGENEOUS SCHEDULER │ │ +│ │ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ │ +│ │ │ CPU │ │ GPU │ │ TPU │ │ NPU │ │ LPU │ │ FPGA │ │ DSP │ │ │ +│ │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │ │ +│ │ └──┬───┘ └──┬───┘ └──┬───┘ └──┬───┘ └──┬───┘ └──┬───┘ └──┬───┘ │ │ +│ └─────┼────────┼────────┼────────┼────────┼────────┼────────┼────────────┘ │ +│ │ │ │ │ │ │ │ │ +│ ▼ ▼ ▼ ▼ ▼ ▼ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ PROCESSOR FABRIC │ │ +│ │ │ │ +│ │ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ │ │ +│ │ │ CPU │ │ GPU │ │ TPU │ │ NPU │ │ LPU │ │ │ +│ │ │Cluster │ │Cluster │ │ Pods │ │ Array │ │ Rack │ │ │ +│ │ │ │ │ │ │ │ │ │ │ │ │ │ +│ │ │ ┌────┐ │ │ ┌────┐ │ │ ┌────┐ │ │ ┌────┐ │ │ ┌────┐ │ │ │ +│ │ │ │Core│ │ │ │CUDA│ │ │ │MXU │ │ │ │ NPE│ │ │ │TSP │ │ │ │ +│ │ │ │Core│ │ │ │CUDA│ │ │ │MXU │ │ │ │ NPE│ │ │ │TSP │ │ │ │ +│ │ │ │Core│ │ │ │CUDA│ │ │ │MXU │ │ │ │ NPE│ │ │ │TSP │ │ │ │ +│ │ │ └────┘ │ │ └────┘ │ │ └────┘ │ │ └────┘ │ │ └────┘ │ │ │ +│ │ └────────┘ └────────┘ └────────┘ └────────┘ └────────┘ │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ UNIFIED MEMORY FABRIC │ │ +│ │ Zero-copy 
data sharing │ Automatic placement │ Cache coherency │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Part 1: Processor Type Definitions + +### 1.1 Unified Processor Abstraction + +```rust +// synor-compute/src/heterogeneous/processor.rs + +/// Unified abstraction for any processor type +pub trait Processor: Send + Sync { + /// Processor type identifier + fn processor_type(&self) -> ProcessorType; + + /// Get capabilities + fn capabilities(&self) -> &ProcessorCapabilities; + + /// Check if processor can execute operation + fn can_execute(&self, op: &Operation) -> bool; + + /// Estimate execution time for operation + fn estimate_time(&self, op: &Operation) -> Duration; + + /// Estimate energy consumption for operation + fn estimate_energy(&self, op: &Operation) -> f64; // Joules + + /// Execute operation + async fn execute(&self, op: Operation) -> Result; + + /// Current utilization (0.0 - 1.0) + fn utilization(&self) -> f64; + + /// Available memory + fn available_memory(&self) -> u64; +} + +/// All supported processor types +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum ProcessorType { + /// Central Processing Unit + Cpu(CpuVariant), + /// Graphics Processing Unit + Gpu(GpuVariant), + /// Tensor Processing Unit (Google) + Tpu(TpuVersion), + /// Neural Processing Unit (various vendors) + Npu(NpuVariant), + /// Language Processing Unit (Groq) + Lpu, + /// Field Programmable Gate Array + Fpga(FpgaVendor), + /// Digital Signal Processor + Dsp(DspVariant), + /// Custom/Unknown Accelerator + Custom { vendor: String, model: String }, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum CpuVariant { + X86_64 { avx: AvxSupport }, + Arm64 { sve: bool }, + RiscV { vector: bool }, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum GpuVariant { + NvidiaCuda { compute_capability: (u8, u8) }, + 
AmdRocm { gfx_version: u32 },
+    IntelOneApi,
+    AppleMetal,
+    QualcommAdreno,
+    ArmMali,
+    WebGpu,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum TpuVersion {
+    V2, V3, V4, V4i, V5e, V5p,
+    EdgeTpu,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum NpuVariant {
+    AppleNeuralEngine { cores: u32 },
+    QualcommHexagon { version: u32 },
+    IntelVpu,
+    HuaweiAscend,
+    GoogleEdgeTpu,
+    Custom { tops: f32 },
+}
+```
+
+### 1.2 Processor Capabilities
+
+```rust
+// synor-compute/src/heterogeneous/capabilities.rs
+
+/// Detailed processor capabilities
+#[derive(Clone, Debug)]
+pub struct ProcessorCapabilities {
+    /// Compute throughput
+    pub compute: ComputeThroughput,
+    /// Memory specs
+    pub memory: MemorySpecs,
+    /// Supported operations
+    pub operations: HashSet<OperationType>,
+    /// Supported data types
+    pub data_types: HashSet<DataType>,
+    /// Power characteristics
+    pub power: PowerCharacteristics,
+    /// Optimal workload characteristics
+    pub optimal_for: Vec<WorkloadCharacteristic>,
+}
+
+#[derive(Clone, Debug)]
+pub struct ComputeThroughput {
+    /// FP64 TFLOPS
+    pub fp64_tflops: f64,
+    /// FP32 TFLOPS
+    pub fp32_tflops: f64,
+    /// FP16 TFLOPS
+    pub fp16_tflops: f64,
+    /// BF16 TFLOPS
+    pub bf16_tflops: f64,
+    /// INT8 TOPS
+    pub int8_tops: f64,
+    /// INT4 TOPS
+    pub int4_tops: f64,
+    /// Sparse operations multiplier
+    pub sparsity_speedup: f64,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum OperationType {
+    // Matrix operations
+    MatMul,
+    Conv2d,
+    Conv3d,
+    DepthwiseConv,
+    BatchNorm,
+    LayerNorm,
+    // Attention operations
+    SelfAttention,
+    CrossAttention,
+    FlashAttention,
+    // Activation functions
+    ReLU,
+    GeLU,
+    SiLU,
+    Softmax,
+    // Reduction operations
+    Sum,
+    Mean,
+    Max,
+    ArgMax,
+    // Data movement
+    Transpose,
+    Reshape,
+    Concat,
+    Split,
+    Gather,
+    Scatter,
+    // Special operations
+    Embedding,
+    RoPE, // Rotary Position Embedding
+    KVCache,
+    TopK,
+    Sampling,
+    // I/O operations
+    DataLoad,
+    DataPreprocess,
+    Tokenization,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum WorkloadCharacteristic { + /// High parallelism (GPU, TPU) + HighlyParallel, + /// Sequential dependencies (CPU, LPU) + Sequential, + /// Memory bandwidth bound (GPU) + MemoryBound, + /// Compute bound (TPU) + ComputeBound, + /// Low latency required (NPU, edge) + LowLatency, + /// Low power required (NPU, mobile) + LowPower, + /// Large batch sizes (GPU, TPU) + LargeBatch, + /// Small batch sizes (CPU, LPU) + SmallBatch, + /// Variable length sequences (LPU) + VariableLength, + /// Fixed tensor shapes (TPU) + FixedShape, +} +``` + +### 1.3 Processor Profiles by Type + +```rust +// synor-compute/src/heterogeneous/profiles.rs + +/// Pre-defined processor profiles +pub struct ProcessorProfiles; + +impl ProcessorProfiles { + /// NVIDIA H100 SXM profile + pub fn nvidia_h100() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 67.0, + fp32_tflops: 67.0, + fp16_tflops: 1979.0, // With sparsity + bf16_tflops: 1979.0, + int8_tops: 3958.0, + int4_tops: 7916.0, + sparsity_speedup: 2.0, + }, + memory: MemorySpecs { + capacity_gb: 80, + bandwidth_gbps: 3350, + type_: MemoryType::Hbm3, + }, + operations: [ + OperationType::MatMul, + OperationType::Conv2d, + OperationType::SelfAttention, + OperationType::FlashAttention, + // ... 
all GPU operations + ].into_iter().collect(), + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::LargeBatch, + WorkloadCharacteristic::ComputeBound, + ], + ..Default::default() + } + } + + /// Google TPU v5p profile + pub fn google_tpu_v5p() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp32_tflops: 459.0, + bf16_tflops: 918.0, + int8_tops: 1836.0, + ..Default::default() + }, + memory: MemorySpecs { + capacity_gb: 95, + bandwidth_gbps: 4800, + type_: MemoryType::Hbm2e, + }, + optimal_for: vec![ + WorkloadCharacteristic::HighlyParallel, + WorkloadCharacteristic::ComputeBound, + WorkloadCharacteristic::FixedShape, + WorkloadCharacteristic::LargeBatch, + ], + ..Default::default() + } + } + + /// Groq LPU profile + pub fn groq_lpu() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + int8_tops: 750.0, + ..Default::default() + }, + memory: MemorySpecs { + capacity_gb: 230, // SRAM! 
+ bandwidth_gbps: 80_000, // 80 TB/s internal + type_: MemoryType::Sram, + }, + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::SmallBatch, + WorkloadCharacteristic::VariableLength, + WorkloadCharacteristic::LowLatency, + ], + ..Default::default() + } + } + + /// Apple M3 Max Neural Engine profile + pub fn apple_neural_engine_m3() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + int8_tops: 18.0, + ..Default::default() + }, + memory: MemorySpecs { + capacity_gb: 0, // Uses unified memory + bandwidth_gbps: 400, + type_: MemoryType::Unified, + }, + optimal_for: vec![ + WorkloadCharacteristic::LowPower, + WorkloadCharacteristic::LowLatency, + WorkloadCharacteristic::SmallBatch, + ], + ..Default::default() + } + } + + /// AMD EPYC 9654 CPU profile + pub fn amd_epyc_9654() -> ProcessorCapabilities { + ProcessorCapabilities { + compute: ComputeThroughput { + fp64_tflops: 5.4, + fp32_tflops: 10.8, + ..Default::default() + }, + memory: MemorySpecs { + capacity_gb: 6144, // 6TB max + bandwidth_gbps: 460, + type_: MemoryType::Ddr5, + }, + operations: [ + OperationType::DataLoad, + OperationType::DataPreprocess, + OperationType::Tokenization, + // Sequential operations + ].into_iter().collect(), + optimal_for: vec![ + WorkloadCharacteristic::Sequential, + WorkloadCharacteristic::MemoryBound, + ], + ..Default::default() + } + } +} +``` + +--- + +## Part 2: Task Decomposition Engine + +### 2.1 Workload Analyzer + +```rust +// synor-compute/src/heterogeneous/analyzer.rs + +/// Analyzes workloads and decomposes into optimal subtasks +pub struct WorkloadAnalyzer { + /// Operation cost models for each processor type + cost_models: HashMap, + /// Dependency graph builder + graph_builder: DependencyGraphBuilder, + /// ML model for workload prediction + predictor: WorkloadPredictor, +} + +impl WorkloadAnalyzer { + /// Analyze a computation graph and decompose into subtasks + pub async fn analyze(&self, graph: 
&ComputationGraph) -> WorkloadAnalysis { + // 1. Build dependency graph + let deps = self.graph_builder.build(graph); + + // 2. Identify operation types + let operations = self.identify_operations(graph); + + // 3. Estimate costs for each processor type + let cost_matrix = self.estimate_costs(&operations); + + // 4. Find optimal assignment + let assignment = self.optimize_assignment(&deps, &cost_matrix); + + // 5. Create execution plan + WorkloadAnalysis { + operations, + dependencies: deps, + cost_matrix, + optimal_assignment: assignment, + estimated_speedup: self.calculate_speedup(&assignment), + } + } + + /// Estimate operation costs across all processor types + fn estimate_costs(&self, operations: &[Operation]) -> CostMatrix { + let mut matrix = CostMatrix::new(operations.len(), self.cost_models.len()); + + for (op_idx, op) in operations.iter().enumerate() { + for (proc_idx, (proc_type, model)) in self.cost_models.iter().enumerate() { + let cost = if model.can_execute(op) { + model.estimate_cost(op) + } else { + f64::INFINITY // Can't execute on this processor + }; + matrix.set(op_idx, proc_idx, cost); + } + } + + matrix + } + + /// Optimize task-to-processor assignment + fn optimize_assignment( + &self, + deps: &DependencyGraph, + costs: &CostMatrix, + ) -> TaskAssignment { + // Use ILP (Integer Linear Programming) or heuristic + // to minimize total execution time considering: + // 1. Operation costs on each processor + // 2. Data transfer costs between processors + // 3. Dependency constraints (ordering) + // 4. 
Processor capacity constraints + + let solver = HeterogeneousSchedulingSolver::new(); + solver.solve(deps, costs) + } +} + +/// Cost matrix: operations × processor types +pub struct CostMatrix { + /// Rows: operations, Cols: processor types + data: Vec>, + /// Data transfer costs between processor types + transfer_costs: HashMap<(ProcessorType, ProcessorType), f64>, +} + +impl CostMatrix { + /// Get cost of operation on processor + pub fn get(&self, op: usize, proc: usize) -> f64 { + self.data[op][proc] + } + + /// Get data transfer cost between processors + pub fn transfer_cost(&self, from: ProcessorType, to: ProcessorType, bytes: u64) -> f64 { + if from == to { + 0.0 // Same processor type, no transfer + } else { + let per_byte = self.transfer_costs + .get(&(from, to)) + .unwrap_or(&1e-9); // Default: 1ns per byte + *per_byte * bytes as f64 + } + } +} +``` + +### 2.2 AI Training Decomposition Example + +```rust +// synor-compute/src/heterogeneous/training.rs + +/// Decompose AI training into heterogeneous subtasks +pub struct TrainingDecomposer; + +impl TrainingDecomposer { + /// Decompose a training iteration into processor-specific tasks + pub fn decompose_iteration( + &self, + model: &Model, + batch: &Batch, + available_processors: &[ProcessorInfo], + ) -> DecomposedIteration { + let mut tasks = Vec::new(); + + // ═══════════════════════════════════════════════════════════════ + // PHASE 1: DATA LOADING & PREPROCESSING → CPU + // ═══════════════════════════════════════════════════════════════ + tasks.push(Task { + id: TaskId::new(), + operation: Operation::DataLoad { + batch_ids: batch.ids.clone(), + shuffle: true, + }, + optimal_processor: ProcessorType::Cpu(CpuVariant::X86_64 { avx: AvxSupport::Avx512 }), + priority: TaskPriority::High, + dependencies: vec![], + }); + + tasks.push(Task { + id: TaskId::new(), + operation: Operation::DataPreprocess { + transforms: vec![ + Transform::Normalize, + Transform::Augment, + Transform::ToTensor, + ], + }, + 
optimal_processor: ProcessorType::Cpu(CpuVariant::X86_64 { avx: AvxSupport::Avx512 }), + priority: TaskPriority::High, + dependencies: vec![tasks[0].id], + }); + + // ═══════════════════════════════════════════════════════════════ + // PHASE 2: TOKENIZATION (for LLMs) → CPU or NPU + // ═══════════════════════════════════════════════════════════════ + if model.model_type == ModelType::Llm { + tasks.push(Task { + id: TaskId::new(), + operation: Operation::Tokenization { + vocab_size: model.vocab_size, + max_length: model.max_seq_len, + }, + optimal_processor: ProcessorType::Cpu(CpuVariant::X86_64 { avx: AvxSupport::Avx512 }), + priority: TaskPriority::High, + dependencies: vec![tasks[1].id], + }); + } + + // ═══════════════════════════════════════════════════════════════ + // PHASE 3: EMBEDDING LOOKUP → GPU (memory bandwidth bound) + // ═══════════════════════════════════════════════════════════════ + tasks.push(Task { + id: TaskId::new(), + operation: Operation::Embedding { + vocab_size: model.vocab_size, + embedding_dim: model.embedding_dim, + }, + optimal_processor: ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), // H100 + }), + priority: TaskPriority::High, + dependencies: vec![tasks.last().unwrap().id], + }); + + // ═══════════════════════════════════════════════════════════════ + // PHASE 4: TRANSFORMER LAYERS → TPU or GPU (compute bound) + // ═══════════════════════════════════════════════════════════════ + let embedding_task_id = tasks.last().unwrap().id; + + for layer_idx in 0..model.num_layers { + // Self-attention → TPU optimal (large matrix multiplies) + tasks.push(Task { + id: TaskId::new(), + operation: Operation::SelfAttention { + layer: layer_idx, + num_heads: model.num_heads, + head_dim: model.head_dim, + use_flash: true, + }, + optimal_processor: ProcessorType::Tpu(TpuVersion::V5p), + fallback_processor: Some(ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + })), + priority: TaskPriority::Critical, 
+ dependencies: vec![ + if layer_idx == 0 { embedding_task_id } else { tasks.last().unwrap().id } + ], + }); + + // FFN (Feed-Forward Network) → GPU optimal + tasks.push(Task { + id: TaskId::new(), + operation: Operation::FeedForward { + layer: layer_idx, + hidden_dim: model.ffn_dim, + activation: Activation::SiLU, + }, + optimal_processor: ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + }), + priority: TaskPriority::Critical, + dependencies: vec![tasks.last().unwrap().id], + }); + } + + // ═══════════════════════════════════════════════════════════════ + // PHASE 5: OUTPUT PROJECTION & LOSS → GPU + // ═══════════════════════════════════════════════════════════════ + tasks.push(Task { + id: TaskId::new(), + operation: Operation::OutputProjection { + vocab_size: model.vocab_size, + }, + optimal_processor: ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + }), + priority: TaskPriority::High, + dependencies: vec![tasks.last().unwrap().id], + }); + + tasks.push(Task { + id: TaskId::new(), + operation: Operation::CrossEntropyLoss {}, + optimal_processor: ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + }), + priority: TaskPriority::High, + dependencies: vec![tasks.last().unwrap().id], + }); + + // ═══════════════════════════════════════════════════════════════ + // PHASE 6: BACKWARD PASS → Same as forward, reversed + // ═══════════════════════════════════════════════════════════════ + let forward_tasks = tasks.clone(); + for task in forward_tasks.iter().rev() { + if let Some(backward_op) = task.operation.backward() { + tasks.push(Task { + id: TaskId::new(), + operation: backward_op, + optimal_processor: task.optimal_processor, + priority: task.priority, + dependencies: vec![tasks.last().unwrap().id], + }); + } + } + + // ═══════════════════════════════════════════════════════════════ + // PHASE 7: GRADIENT AGGREGATION → CPU (network I/O) + GPU (compute) + // 
═══════════════════════════════════════════════════════════════ + tasks.push(Task { + id: TaskId::new(), + operation: Operation::AllReduce { + algorithm: AllReduceAlgorithm::RingAllReduce, + }, + optimal_processor: ProcessorType::Cpu(CpuVariant::X86_64 { avx: AvxSupport::Avx512 }), + priority: TaskPriority::Critical, + dependencies: vec![tasks.last().unwrap().id], + }); + + // ═══════════════════════════════════════════════════════════════ + // PHASE 8: OPTIMIZER STEP → GPU + // ═══════════════════════════════════════════════════════════════ + tasks.push(Task { + id: TaskId::new(), + operation: Operation::OptimizerStep { + optimizer: OptimizerType::AdamW, + learning_rate: 1e-4, + }, + optimal_processor: ProcessorType::Gpu(GpuVariant::NvidiaCuda { + compute_capability: (9, 0), + }), + priority: TaskPriority::High, + dependencies: vec![tasks.last().unwrap().id], + }); + + // ═══════════════════════════════════════════════════════════════ + // PHASE 9: CHECKPOINTING → CPU (I/O) + // ═══════════════════════════════════════════════════════════════ + tasks.push(Task { + id: TaskId::new(), + operation: Operation::Checkpoint { + async_: true, + }, + optimal_processor: ProcessorType::Cpu(CpuVariant::X86_64 { avx: AvxSupport::Avx512 }), + priority: TaskPriority::Low, + dependencies: vec![tasks.last().unwrap().id], + }); + + DecomposedIteration { + tasks, + estimated_time: self.estimate_total_time(&tasks), + processor_utilization: self.estimate_utilization(&tasks), + } + } +} +``` + +--- + +## Part 3: Heterogeneous Scheduler + +### 3.1 Multi-Queue Scheduler + +```rust +// synor-compute/src/heterogeneous/scheduler.rs + +/// Scheduler that manages tasks across all processor types +pub struct HeterogeneousScheduler { + /// Per-processor-type task queues + queues: HashMap, + /// Available processors + processors: Vec>, + /// Task dependency tracker + dependencies: DependencyTracker, + /// Load balancer + load_balancer: LoadBalancer, + /// Data placement optimizer + 
data_placement: DataPlacementOptimizer, +} + +impl HeterogeneousScheduler { + /// Schedule a decomposed workload + pub async fn schedule(&self, workload: DecomposedWorkload) -> Result<ScheduleResult> { + // 1. Build execution graph + let graph = self.dependencies.build_graph(&workload.tasks); + + // 2. Assign tasks to processors + let assignment = self.assign_tasks(&workload.tasks, &graph).await?; + + // 3. Optimize data placement + let data_plan = self.data_placement.optimize(&assignment).await?; + + // 4. Create execution schedule + let schedule = self.create_schedule(&assignment, &data_plan, &graph)?; + + Ok(ScheduleResult { + schedule, + data_plan, + estimated_makespan: self.estimate_makespan(&schedule), + processor_utilization: self.estimate_utilization(&schedule), + }) + } + + /// Assign tasks to optimal processors + async fn assign_tasks( + &self, + tasks: &[Task], + graph: &DependencyGraph, + ) -> Result<TaskAssignment> { + let mut assignment = TaskAssignment::new(); + + // Sort tasks by priority and dependencies (topological sort) + let sorted_tasks = graph.topological_sort(tasks); + + for task in sorted_tasks { + // Find best processor for this task + let best_processor = self.find_best_processor(&task).await?; + + // Check if we should steal work for load balancing + let final_processor = self.load_balancer + .maybe_rebalance(&task, best_processor, &assignment) + .await?; + + assignment.assign(task.id, final_processor); + } + + Ok(assignment) + } + + /// Find the best processor for a task + async fn find_best_processor(&self, task: &Task) -> Result<ProcessorId> { + let mut best_score = f64::NEG_INFINITY; + let mut best_processor = None; + + for processor in &self.processors { + if !processor.can_execute(&task.operation) { + continue; + } + + // Score = 1 / (execution_time + data_transfer_time) + let exec_time = processor.estimate_time(&task.operation); + let transfer_time = self.estimate_data_transfer_time(task, processor.as_ref()); + let total_time = exec_time + transfer_time; + + // Adjust for 
current load + let load_factor = 1.0 + processor.utilization(); + let adjusted_time = total_time.as_secs_f64() * load_factor; + + let score = 1.0 / adjusted_time; + + if score > best_score { + best_score = score; + best_processor = Some(processor.id()); + } + } + + best_processor.ok_or(Error::NoSuitableProcessor) + } + + /// Execute the schedule + pub async fn execute(&self, schedule: &Schedule) -> Result { + let mut handles = Vec::new(); + let results = Arc::new(Mutex::new(HashMap::new())); + let completed = Arc::new(AtomicUsize::new(0)); + + // Create execution contexts for each processor + let contexts: HashMap = self.processors + .iter() + .map(|p| (p.id(), ExecutionContext::new(p.clone()))) + .collect(); + + // Execute tasks in schedule order + for stage in &schedule.stages { + // Execute all tasks in this stage in parallel + let stage_handles: Vec<_> = stage.tasks + .iter() + .map(|task_id| { + let task = schedule.get_task(*task_id); + let processor_id = schedule.get_assignment(*task_id); + let context = contexts.get(&processor_id).unwrap().clone(); + let results = results.clone(); + let completed = completed.clone(); + + tokio::spawn(async move { + // Wait for dependencies + task.wait_for_dependencies(&results).await; + + // Execute on assigned processor + let result = context.execute(&task).await?; + + // Store result + results.lock().await.insert(task.id, result); + completed.fetch_add(1, Ordering::SeqCst); + + Ok::<_, Error>(()) + }) + }) + .collect(); + + // Wait for all tasks in stage to complete + for handle in stage_handles { + handle.await??; + } + } + + Ok(ExecutionResult { + results: Arc::try_unwrap(results).unwrap().into_inner(), + total_time: schedule.estimated_makespan, + processor_utilization: self.measure_utilization(&contexts), + }) + } +} +``` + +### 3.2 Work Stealing for Load Balancing + +```rust +// synor-compute/src/heterogeneous/work_stealing.rs + +/// Work stealing scheduler for load balancing +pub struct WorkStealingScheduler { + /// 
Per-processor work queues (deques for work stealing) + queues: HashMap, + /// Stealing policy + policy: StealingPolicy, +} + +impl WorkStealingScheduler { + /// Try to steal work for an idle processor + pub async fn try_steal(&self, idle_processor: ProcessorId) -> Option { + let idle_type = self.get_processor_type(idle_processor); + + // Find most loaded processor with compatible tasks + let mut best_victim = None; + let mut best_load = 0; + + for (proc_id, queue) in &self.queues { + if *proc_id == idle_processor { + continue; + } + + // Check if this queue has tasks compatible with idle processor + let compatible_count = queue.count_compatible(idle_type); + if compatible_count > best_load { + best_load = compatible_count; + best_victim = Some(*proc_id); + } + } + + // Steal from the most loaded compatible queue + if let Some(victim) = best_victim { + let victim_queue = self.queues.get(&victim)?; + + // Steal from the back of the queue (oldest tasks) + victim_queue.steal_compatible(idle_type).await + } else { + None + } + } + + /// Rebalance when processor utilization is uneven + pub async fn rebalance(&self) -> Vec { + let mut migrations = Vec::new(); + + // Calculate average utilization + let total_util: f64 = self.queues.values().map(|q| q.utilization()).sum(); + let avg_util = total_util / self.queues.len() as f64; + + // Find overloaded and underloaded processors + let mut overloaded: Vec<_> = self.queues.iter() + .filter(|(_, q)| q.utilization() > avg_util * 1.2) + .collect(); + let mut underloaded: Vec<_> = self.queues.iter() + .filter(|(_, q)| q.utilization() < avg_util * 0.8) + .collect(); + + // Sort by utilization + overloaded.sort_by(|a, b| b.1.utilization().partial_cmp(&a.1.utilization()).unwrap()); + underloaded.sort_by(|a, b| a.1.utilization().partial_cmp(&b.1.utilization()).unwrap()); + + // Migrate tasks from overloaded to underloaded + for (over_id, over_queue) in overloaded { + for (under_id, under_queue) in &underloaded { + if 
over_queue.utilization() <= avg_util { + break; + } + + let under_type = self.get_processor_type(**under_id); + + // Find tasks that can be migrated + if let Some(task) = over_queue.find_migratable(under_type) { + migrations.push(TaskMigration { + task_id: task.id, + from: *over_id, + to: **under_id, + }); + } + } + } + + migrations + } +} + +/// Work queue with lock-free deque for work stealing +pub struct WorkQueue { + /// Double-ended queue for work stealing + deque: crossbeam_deque::Injector, + /// Local queues per worker + local: Vec>, + /// Stealers for other workers + stealers: Vec>, + /// Current utilization + utilization: AtomicU64, +} + +impl WorkQueue { + /// Push task (owner pushes to front) + pub fn push(&self, task: Task) { + self.deque.push(task); + } + + /// Pop task (owner pops from front) + pub fn pop(&self) -> Option { + self.deque.steal().success() + } + + /// Steal task (thieves steal from back) + pub async fn steal_compatible(&self, processor_type: ProcessorType) -> Option { + // Try to steal a task compatible with the given processor type + loop { + match self.deque.steal() { + crossbeam_deque::Steal::Success(task) => { + if task.is_compatible_with(processor_type) { + return Some(task); + } else { + // Put it back and try again + self.deque.push(task); + } + } + crossbeam_deque::Steal::Empty => return None, + crossbeam_deque::Steal::Retry => continue, + } + } + } +} +``` + +### 3.3 Pipeline Parallelism Across Processors + +```rust +// synor-compute/src/heterogeneous/pipeline.rs + +/// Pipeline parallelism across heterogeneous processors +pub struct HeterogeneousPipeline { + /// Pipeline stages + stages: Vec, + /// Inter-stage buffers + buffers: Vec, + /// Synchronization + sync: PipelineSync, +} + +/// A stage in the pipeline assigned to a processor type +pub struct PipelineStage { + pub stage_id: usize, + pub operations: Vec, + pub processor_type: ProcessorType, + pub processors: Vec, // Multiple processors for parallelism +} + +impl 
HeterogeneousPipeline { + /// Create a pipeline for LLM inference + pub fn create_llm_pipeline( + model: &LlmModel, + available_processors: &ProcessorRegistry, + ) -> Self { + let mut stages = Vec::new(); + + // Stage 1: Tokenization → CPU + stages.push(PipelineStage { + stage_id: 0, + operations: vec![Operation::Tokenization { .. }], + processor_type: ProcessorType::Cpu(CpuVariant::X86_64 { .. }), + processors: available_processors.get_type(ProcessorType::Cpu(..)), + }); + + // Stage 2: Embedding → GPU (memory bound) + stages.push(PipelineStage { + stage_id: 1, + operations: vec![Operation::Embedding { .. }], + processor_type: ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. }), + processors: available_processors.get_type(ProcessorType::Gpu(..)), + }); + + // Stage 3: Transformer layers → TPU (if available) or GPU + let transformer_processor = if available_processors.has_tpu() { + ProcessorType::Tpu(TpuVersion::V5p) + } else { + ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (9, 0) }) + }; + + stages.push(PipelineStage { + stage_id: 2, + operations: model.layers.iter().flat_map(|l| l.operations()).collect(), + processor_type: transformer_processor, + processors: available_processors.get_type(transformer_processor), + }); + + // Stage 4: Token generation → LPU (if available, best for sequential) or GPU + let generation_processor = if available_processors.has_lpu() { + ProcessorType::Lpu + } else { + ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (9, 0) }) + }; + + stages.push(PipelineStage { + stage_id: 3, + operations: vec![ + Operation::OutputProjection { .. }, + Operation::Sampling { .. }, + ], + processor_type: generation_processor, + processors: available_processors.get_type(generation_processor), + }); + + // Stage 5: Detokenization → CPU + stages.push(PipelineStage { + stage_id: 4, + operations: vec![Operation::Detokenization { .. }], + processor_type: ProcessorType::Cpu(CpuVariant::X86_64 { .. 
}), + processors: available_processors.get_type(ProcessorType::Cpu(..)), + }); + + // Create inter-stage buffers + let buffers = (0..stages.len() - 1) + .map(|i| PipelineBuffer::new( + stages[i].processor_type, + stages[i + 1].processor_type, + )) + .collect(); + + Self { + stages, + buffers, + sync: PipelineSync::new(), + } + } + + /// Execute pipeline with micro-batching + pub async fn execute_stream( + &self, + input_stream: impl Stream, + ) -> impl Stream { + let (tx, rx) = mpsc::channel(1024); + + // Start pipeline stages + for (i, stage) in self.stages.iter().enumerate() { + let input_buffer = if i == 0 { + None + } else { + Some(self.buffers[i - 1].clone()) + }; + + let output_buffer = if i == self.stages.len() - 1 { + None + } else { + Some(self.buffers[i].clone()) + }; + + let stage = stage.clone(); + let tx = tx.clone(); + + tokio::spawn(async move { + stage.run(input_buffer, output_buffer, tx).await; + }); + } + + // Feed input stream to first stage + let first_buffer = self.buffers[0].clone(); + tokio::spawn(async move { + pin_mut!(input_stream); + while let Some(request) = input_stream.next().await { + first_buffer.push(request.into()).await; + } + }); + + ReceiverStream::new(rx) + } +} + +/// Buffer between pipeline stages with automatic data transfer +pub struct PipelineBuffer { + /// Source processor type + source_type: ProcessorType, + /// Destination processor type + dest_type: ProcessorType, + /// Data queue + queue: Arc>, + /// Transfer strategy + transfer: DataTransferStrategy, +} + +impl PipelineBuffer { + /// Push data from source stage + pub async fn push(&self, data: PipelineData) { + // Transfer data if processors have different memory spaces + let transferred = if self.needs_transfer() { + self.transfer.transfer(&data, self.source_type, self.dest_type).await + } else { + data + }; + + self.queue.push(transferred).unwrap(); + } + + /// Pop data for destination stage + pub async fn pop(&self) -> Option { + self.queue.pop() + } + + fn 
needs_transfer(&self) -> bool { + !self.source_type.shares_memory_with(&self.dest_type) + } +} +``` + +--- + +## Part 4: Data Movement Optimization + +### 4.1 Unified Memory Management + +```rust +// synor-compute/src/heterogeneous/memory.rs + +/// Unified memory manager across all processor types +pub struct UnifiedMemoryManager { + /// Memory allocators per processor type + allocators: HashMap>, + /// Data location tracker + locations: DataLocationTracker, + /// Transfer scheduler + transfer_scheduler: TransferScheduler, + /// Prefetch predictor + prefetcher: PrefetchPredictor, +} + +impl UnifiedMemoryManager { + /// Allocate tensor with optimal placement + pub async fn allocate_tensor( + &self, + shape: &[usize], + dtype: DataType, + hint: PlacementHint, + ) -> Result { + // Determine optimal initial placement + let location = match hint { + PlacementHint::Processor(proc_type) => proc_type, + PlacementHint::Operation(op) => self.optimal_location_for_op(&op), + PlacementHint::Auto => self.predict_optimal_location(shape, dtype), + }; + + // Allocate on chosen processor + let allocator = self.allocators.get(&location)?; + let ptr = allocator.allocate(shape.iter().product::() * dtype.size())?; + + // Register with location tracker + let handle = TensorHandle::new(ptr, shape.to_vec(), dtype); + self.locations.register(&handle, location); + + Ok(handle) + } + + /// Ensure tensor is available on specified processor + pub async fn ensure_on( + &self, + tensor: &TensorHandle, + target: ProcessorType, + ) -> Result { + let current_location = self.locations.get(tensor)?; + + if current_location == target { + // Already on target, return view + return Ok(TensorView::new(tensor, target)); + } + + // Check if already cached on target + if let Some(cached) = self.locations.get_cached(tensor, target) { + return Ok(cached); + } + + // Need to transfer + let transfer = self.transfer_scheduler.schedule_transfer( + tensor, + current_location, + target, + ).await?; + + // Execute 
transfer + transfer.execute().await?; + + // Register new location + self.locations.add_copy(tensor, target); + + Ok(TensorView::new(tensor, target)) + } + + /// Prefetch data before it's needed + pub async fn prefetch(&self, tensor: &TensorHandle, target: ProcessorType) { + // Don't wait, just schedule the transfer + let _ = self.transfer_scheduler.schedule_transfer_async( + tensor, + self.locations.get(tensor).unwrap_or(ProcessorType::Cpu(Default::default())), + target, + ).await; + } +} + +/// Optimized data transfer between processors +pub struct TransferScheduler { + /// Direct transfer paths (e.g., NVLink, PCIe P2P) + direct_paths: HashMap<(ProcessorType, ProcessorType), TransferPath>, + /// Transfer queue + queue: TransferQueue, +} + +impl TransferScheduler { + /// Schedule optimal transfer + pub async fn schedule_transfer( + &self, + tensor: &TensorHandle, + from: ProcessorType, + to: ProcessorType, + ) -> Result { + // Find optimal path + let path = self.find_optimal_path(from, to, tensor.size_bytes()); + + // Create transfer + let transfer = Transfer { + tensor: tensor.clone(), + path, + size: tensor.size_bytes(), + }; + + // Add to queue (batching similar transfers) + self.queue.enqueue(transfer.clone()).await; + + Ok(transfer) + } + + fn find_optimal_path( + &self, + from: ProcessorType, + to: ProcessorType, + size: usize, + ) -> TransferPath { + // Check for direct path first + if let Some(direct) = self.direct_paths.get(&(from, to)) { + return direct.clone(); + } + + // Check for direct path in reverse (bidirectional) + if let Some(direct) = self.direct_paths.get(&(to, from)) { + return direct.clone(); + } + + // Fall back to CPU-mediated transfer + TransferPath::CpuMediated { from, to } + } +} + +/// Available transfer paths +#[derive(Clone, Debug)] +pub enum TransferPath { + /// Direct GPU-to-GPU (NVLink, NVSwitch) + NvLink { bandwidth_gbps: u32 }, + /// PCIe peer-to-peer + PciePeerToPeer { gen: u8, lanes: u8 }, + /// Through CPU memory (slowest) + 
CpuMediated { from: ProcessorType, to: ProcessorType }, + /// Unified memory (Apple, some AMD APUs) + UnifiedMemory, + /// Network transfer (for distributed) + Network { protocol: NetworkProtocol }, +} +``` + +--- + +## Part 5: Example: Heterogeneous LLM Inference + +### 5.1 Complete Example Flow + +```rust +// synor-compute/src/examples/heterogeneous_llm.rs + +/// Example: Running LLM inference across CPU + GPU + TPU + LPU +pub async fn run_heterogeneous_inference( + prompt: &str, + model: &LlmModel, + processors: &ProcessorRegistry, +) -> Result { + let scheduler = HeterogeneousScheduler::new(processors); + + // ═══════════════════════════════════════════════════════════════ + // STEP 1: TOKENIZATION (CPU) + // CPU is optimal for string processing and variable-length operations + // ═══════════════════════════════════════════════════════════════ + let cpu = processors.get_best(ProcessorType::Cpu(..))?; + let tokens = cpu.execute(Operation::Tokenization { + text: prompt.to_string(), + vocab: model.vocab.clone(), + }).await?; + + println!("✓ Tokenization complete on CPU: {} tokens", tokens.len()); + + // ═══════════════════════════════════════════════════════════════ + // STEP 2: EMBEDDING LOOKUP (GPU) + // GPU is optimal for memory-bandwidth-bound operations + // ═══════════════════════════════════════════════════════════════ + let gpu = processors.get_best(ProcessorType::Gpu(..))?; + let embeddings = gpu.execute(Operation::Embedding { + tokens: tokens.clone(), + embedding_table: model.embedding_table.clone(), + }).await?; + + println!("✓ Embedding complete on GPU"); + + // ═══════════════════════════════════════════════════════════════ + // STEP 3: PREFILL (PARALLEL ATTENTION) → TPU or GPU + // TPU excels at large matrix multiplications with fixed shapes + // ═══════════════════════════════════════════════════════════════ + let prefill_processor = processors + .get_best(ProcessorType::Tpu(..)) + .or_else(|_| processors.get_best(ProcessorType::Gpu(..)))?; + + let 
mut hidden_states = embeddings; + + for layer_idx in 0..model.num_layers { + hidden_states = prefill_processor.execute(Operation::TransformerLayer { + layer: layer_idx, + input: hidden_states, + attention_mask: None, + kv_cache: None, // No cache for prefill + }).await?; + } + + println!("✓ Prefill complete on {:?}", prefill_processor.processor_type()); + + // ═══════════════════════════════════════════════════════════════ + // STEP 4: DECODE (SEQUENTIAL TOKEN GENERATION) → LPU or GPU + // LPU excels at sequential, low-batch operations (autoregressive) + // ═══════════════════════════════════════════════════════════════ + let decode_processor = processors + .get_best(ProcessorType::Lpu) + .or_else(|_| processors.get_best(ProcessorType::Gpu(..)))?; + + let mut generated_tokens = Vec::new(); + let mut kv_cache = KvCache::new(); + + for _ in 0..model.max_new_tokens { + // Run one decode step + let logits = decode_processor.execute(Operation::DecodeStep { + hidden_states: hidden_states.last_token(), + kv_cache: &mut kv_cache, + layers: &model.layers, + }).await?; + + // Sample next token + let next_token = decode_processor.execute(Operation::Sampling { + logits, + temperature: 0.7, + top_p: 0.9, + }).await?; + + if next_token == model.eos_token { + break; + } + + generated_tokens.push(next_token); + + // Get embedding for next iteration + hidden_states = gpu.execute(Operation::Embedding { + tokens: vec![next_token], + embedding_table: model.embedding_table.clone(), + }).await?; + } + + println!("✓ Decode complete on {:?}: {} tokens generated", + decode_processor.processor_type(), + generated_tokens.len()); + + // ═══════════════════════════════════════════════════════════════ + // STEP 5: DETOKENIZATION (CPU) + // CPU handles string operations and variable-length output + // ═══════════════════════════════════════════════════════════════ + let output = cpu.execute(Operation::Detokenization { + tokens: generated_tokens, + vocab: model.vocab.clone(), + }).await?; + + 
println!("✓ Detokenization complete on CPU"); + + Ok(output) +} +``` + +### 5.2 Utilization Report + +``` +╔═══════════════════════════════════════════════════════════════════════════╗ +║ HETEROGENEOUS INFERENCE REPORT ║ +╠═══════════════════════════════════════════════════════════════════════════╣ +║ ║ +║ Model: Llama-70B ║ +║ Input: 512 tokens ║ +║ Output: 256 tokens ║ +║ ║ +║ ┌────────────────────────────────────────────────────────────────────┐ ║ +║ │ PROCESSOR UTILIZATION │ ║ +║ ├────────────┬──────────┬──────────┬──────────┬────────────────────┤ ║ +║ │ Processor │ Time │ Util % │ Tasks │ Operations │ ║ +║ ├────────────┼──────────┼──────────┼──────────┼────────────────────┤ ║ +║ │ CPU │ 15ms │ 8% │ 2 │ Token, Detoken │ ║ +║ │ GPU (H100) │ 120ms │ 65% │ 257 │ Embedding (×257) │ ║ +║ │ TPU v5p │ 200ms │ 95% │ 80 │ Prefill layers │ ║ +║ │ LPU (Groq) │ 450ms │ 92% │ 256 │ Decode steps │ ║ +║ └────────────┴──────────┴──────────┴──────────┴────────────────────┘ ║ +║ ║ +║ Total Time: 785ms (vs 2400ms GPU-only = 3.1x speedup) ║ +║ Zero Idle Processors: ✓ ║ +║ ║ +║ ┌────────────────────────────────────────────────────────────────────┐ ║ +║ │ TIMELINE │ ║ +║ ├────────────────────────────────────────────────────────────────────┤ ║ +║ │ │ ║ +║ │ CPU ██░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██ │ ║ +║ │ │Tok Detok│ │ ║ +║ │ │ ║ +║ │ GPU ░░██████████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ │ ║ +║ │ │Embed×512 │ │ ║ +║ │ │ ║ +║ │ TPU ░░░░░░░░░░░░░░██████████████████████████░░░░░░░░░░░░░░░░░░░░ │ ║ +║ │ │Prefill (80 layers) │ │ ║ +║ │ │ ║ +║ │ LPU ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████████████████████████ │ ║ +║ │ │Decode (256 steps) │ │ ║ +║ │ │ ║ +║ │ 0ms 200ms 400ms 600ms 800ms │ ║ +║ └────────────────────────────────────────────────────────────────────┘ ║ +║ ║ +╚═══════════════════════════════════════════════════════════════════════════╝ +``` + +--- + +## Summary: Multi-Processor Advantages + +### Processor-Task Mapping + +| Task Type | Best 
Processor | Why | +|-----------|----------------|-----| +| Data loading, I/O | **CPU** | Sequential, system calls | +| Tokenization/Detokenization | **CPU** | String processing | +| Embedding lookup | **GPU** | Memory bandwidth | +| Matrix multiply (large) | **TPU** | Dedicated MXU units | +| Attention (prefill) | **TPU/GPU** | Parallel, compute-bound | +| Token generation (decode) | **LPU** | Sequential, low latency | +| On-device inference | **NPU** | Power efficient | +| Browser compute | **WebGPU** | Platform agnostic | +| Cryptography | **FPGA** | Custom bit operations | +| Signal processing | **DSP** | Specialized math | + +### Expected Speedups + +| Workload | GPU-Only | Heterogeneous | Speedup | +|----------|----------|---------------|---------| +| LLM Training | 1x | 1.5-2x | +50-100% | +| LLM Inference | 1x | 2-4x | +100-300% | +| Image Generation | 1x | 1.3-1.8x | +30-80% | +| RAG Pipeline | 1x | 2-3x | +100-200% | +| Real-time Video | 1x | 3-5x | +200-400% | + +### Zero Idle Guarantee + +The heterogeneous scheduler ensures: +1. **Parallel execution** across processor types +2. **Pipeline overlap** between stages +3. **Work stealing** when processors become idle +4. **Predictive prefetching** of data +5. **Dynamic rebalancing** based on actual throughput + +This architecture maximizes hardware utilization and minimizes total execution time by using EVERY available processor simultaneously. diff --git a/docs/PLAN/PHASE11-Synor-Compute-L2.md b/docs/PLAN/PHASE11-Synor-Compute-L2.md new file mode 100644 index 0000000..dea2dd9 --- /dev/null +++ b/docs/PLAN/PHASE11-Synor-Compute-L2.md @@ -0,0 +1,906 @@ +# Phase 11: Synor Compute L2 - Full-Stack Compute Platform + +> **Mission**: Build a decentralized compute platform capable of AI/ML training, inference, OS hosting, and general-purpose high-performance computing. 
+ +--- + +## Executive Summary + +Synor Compute L2 extends beyond the current WASM-only Synor VM to provide: +- **GPU Compute**: AI/ML training and inference with CUDA/ROCm support +- **Container Orchestration**: Docker-compatible workloads with Kubernetes-style scheduling +- **Persistent VMs**: Long-running virtual machines for OS hosting +- **Serverless Functions**: Short-lived compute for API backends and event processing +- **Edge Compute**: Low-latency compute at network edge nodes + +--- + +## Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ SYNOR COMPUTE L2 │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ APPLICATION LAYER │ │ +│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │ +│ │ AI/ML │ Serverless │ Containers │ Persistent │ Edge │ │ +│ │ Training │ Functions │ (Docker) │ VMs (Linux) │ Compute │ │ +│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ ORCHESTRATION LAYER │ │ +│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │ +│ │ Job │ Resource │ Network │ Storage │ Health │ │ +│ │ Scheduler │ Manager │ Fabric │ Orchestrator│ Monitor │ │ +│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ COMPUTE RUNTIME LAYER │ │ +│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │ +│ │ GPU │ Container │ MicroVM │ WASM │ Native │ │ +│ │ Runtime │ Runtime │ Runtime │ Runtime │ Runtime │ │ +│ │ (CUDA/ROCm)│ (containerd)│ (Firecracker)│ (Wasmtime) │ (gVisor) │ │ +│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │ +│ │ +│ 
┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ INFRASTRUCTURE LAYER │ │ +│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │ +│ │ Node │ Network │ Distributed │ Consensus │ Billing │ │ +│ │ Registry │ Overlay │ Storage │ (PoS+PoW) │ Metering │ │ +│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ SYNOR L1 BLOCKCHAIN (GHOSTDAG + DAG-RIDER) │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Milestone 1: GPU Compute Foundation (AI/ML Training & Inference) + +### 1.1 GPU Node Registration + +```rust +// synor-compute/src/gpu/node.rs + +/// GPU node capabilities +pub struct GpuNode { + /// Unique node ID + pub node_id: NodeId, + /// GPU specifications + pub gpus: Vec, + /// Total VRAM available (bytes) + pub total_vram: u64, + /// Available VRAM (bytes) + pub available_vram: u64, + /// CUDA compute capability (e.g., 8.6 for RTX 3090) + pub cuda_capability: Option<(u8, u8)>, + /// ROCm version (for AMD) + pub rocm_version: Option, + /// Network bandwidth (Gbps) + pub bandwidth_gbps: u32, + /// Geographic region + pub region: Region, + /// Stake amount (for PoS validation) + pub stake: u64, +} + +pub struct GpuSpec { + pub model: String, // "NVIDIA RTX 4090" + pub vram_gb: u32, // 24 + pub tensor_cores: u32, // 512 + pub cuda_cores: u32, // 16384 + pub memory_bandwidth: u32, // 1008 GB/s + pub fp32_tflops: f32, // 82.6 + pub fp16_tflops: f32, // 165.2 + pub int8_tops: f32, // 330.4 +} +``` + +### 1.2 AI/ML Job Specification + +```rust +// synor-compute/src/ai/job.rs + +/// AI/ML training job specification +pub struct TrainingJob { + /// Job ID + pub job_id: JobId, + /// Owner address + pub owner: Address, + /// Framework (PyTorch, TensorFlow, JAX) + pub 
framework: MlFramework, + /// Model specification + pub model: ModelSpec, + /// Dataset reference (Synor Storage CID) + pub dataset_cid: Cid, + /// Training configuration + pub config: TrainingConfig, + /// Resource requirements + pub resources: GpuResources, + /// Maximum budget (SYNOR tokens) + pub max_budget: u64, + /// Checkpoint interval (steps) + pub checkpoint_interval: u64, +} + +pub struct GpuResources { + pub min_gpus: u32, + pub max_gpus: u32, + pub min_vram_per_gpu: u64, + pub cuda_capability_min: Option<(u8, u8)>, + pub distributed: bool, // Multi-node training + pub priority: JobPriority, +} + +pub enum MlFramework { + PyTorch { version: String }, + TensorFlow { version: String }, + JAX { version: String }, + ONNX, + Custom { image: String }, +} + +pub struct TrainingConfig { + pub epochs: u32, + pub batch_size: u32, + pub learning_rate: f32, + pub optimizer: String, + pub mixed_precision: bool, + pub gradient_accumulation: u32, + pub distributed_strategy: DistributedStrategy, +} + +pub enum DistributedStrategy { + DataParallel, + ModelParallel, + PipelineParallel, + ZeRO { stage: u8 }, // DeepSpeed ZeRO stages 1-3 + FSDP, // Fully Sharded Data Parallel +} +``` + +### 1.3 Inference Service + +```rust +// synor-compute/src/ai/inference.rs + +/// Inference endpoint specification +pub struct InferenceEndpoint { + /// Endpoint ID + pub endpoint_id: EndpointId, + /// Model reference (Synor Storage CID) + pub model_cid: Cid, + /// Model format + pub format: ModelFormat, + /// Scaling configuration + pub scaling: AutoscaleConfig, + /// GPU requirements per replica + pub gpu_per_replica: GpuResources, + /// Request timeout + pub timeout_ms: u32, + /// Max batch size for batching inference + pub max_batch_size: u32, + /// Batching timeout + pub batch_timeout_ms: u32, +} + +pub enum ModelFormat { + PyTorch, + ONNX, + TensorRT, + Triton, + vLLM, // For LLM serving + TGI, // Text Generation Inference + Custom, +} + +pub struct AutoscaleConfig { + pub 
min_replicas: u32,
+    pub max_replicas: u32,
+    pub target_gpu_utilization: f32,
+    pub scale_up_threshold: f32,
+    pub scale_down_threshold: f32,
+    pub cooldown_seconds: u32,
+}
+```
+
+### 1.4 Pricing Model for GPU Compute
+
+| Resource | Unit | Price (SYNOR/unit) |
+|----------|------|-------------------|
+| GPU (RTX 4090 equivalent) | hour | 0.50 |
+| GPU (A100 80GB equivalent) | hour | 2.00 |
+| GPU (H100 equivalent) | hour | 4.00 |
+| VRAM | GB/hour | 0.01 |
+| Network egress | GB | 0.05 |
+| Storage (hot, NVMe) | GB/month | 0.10 |
+| Inference requests | 1M tokens | 0.10 |
+
+---
+
+## Milestone 2: Container Orchestration (Docker/Kubernetes-Compatible)
+
+### 2.1 Container Runtime
+
+```rust
+// synor-compute/src/container/runtime.rs
+
+/// Container specification (OCI-compatible)
+pub struct ContainerSpec {
+    /// Image reference
+    pub image: ImageRef,
+    /// Resource limits
+    pub resources: ContainerResources,
+    /// Environment variables
+    pub env: HashMap<String, String>,
+    /// Volume mounts
+    pub volumes: Vec<VolumeMount>,
+    /// Network configuration
+    pub network: NetworkConfig,
+    /// Security context
+    pub security: SecurityContext,
+    /// Health check
+    pub health_check: Option<HealthCheck>,
+}
+
+pub struct ContainerResources {
+    pub cpu_cores: f32, // 0.5, 1.0, 2.0, etc. 
+ pub memory_mb: u64, + pub gpu: Option, + pub ephemeral_storage_gb: u32, + pub network_bandwidth_mbps: u32, +} + +pub struct GpuAllocation { + pub count: u32, + pub vram_mb: u64, + pub shared: bool, // Allow GPU sharing via MPS/MIG +} +``` + +### 2.2 Service Mesh & Networking + +```rust +// synor-compute/src/network/mesh.rs + +/// Service definition for container orchestration +pub struct Service { + pub service_id: ServiceId, + pub name: String, + pub containers: Vec, + pub replicas: ReplicaConfig, + pub load_balancer: LoadBalancerConfig, + pub service_mesh: ServiceMeshConfig, +} + +pub struct ServiceMeshConfig { + pub mtls_enabled: bool, + pub traffic_policy: TrafficPolicy, + pub circuit_breaker: CircuitBreakerConfig, + pub retry_policy: RetryPolicy, + pub rate_limit: Option, +} + +pub struct LoadBalancerConfig { + pub algorithm: LoadBalancerAlgorithm, + pub health_check: HealthCheck, + pub sticky_sessions: bool, + pub ssl_termination: SslTermination, +} + +pub enum LoadBalancerAlgorithm { + RoundRobin, + LeastConnections, + WeightedRoundRobin { weights: Vec }, + IPHash, + Random, +} +``` + +### 2.3 Container Pricing + +| Resource | Unit | Price (SYNOR/unit) | +|----------|------|-------------------| +| CPU | core/hour | 0.02 | +| Memory | GB/hour | 0.005 | +| Ephemeral storage | GB/hour | 0.001 | +| Network ingress | GB | FREE | +| Network egress | GB | 0.05 | +| Load balancer | hour | 0.01 | +| Static IP | month | 2.00 | + +--- + +## Milestone 3: Persistent Virtual Machines (OS Hosting) + +### 3.1 MicroVM Architecture (Firecracker-based) + +```rust +// synor-compute/src/vm/microvm.rs + +/// Virtual machine specification +pub struct VmSpec { + /// VM ID + pub vm_id: VmId, + /// Owner address + pub owner: Address, + /// VM size + pub size: VmSize, + /// Boot image + pub image: VmImage, + /// Persistent volumes + pub volumes: Vec, + /// Network configuration + pub network: VmNetworkConfig, + /// SSH keys for access + pub ssh_keys: Vec, + /// Cloud-init user data 
+ pub user_data: Option, +} + +pub struct VmSize { + pub vcpus: u32, + pub memory_gb: u32, + pub gpu: Option, + pub network_bandwidth_gbps: u32, +} + +pub struct GpuPassthrough { + pub count: u32, + pub model: GpuModel, + pub vram_gb: u32, +} + +pub enum VmImage { + /// Pre-built images + Marketplace { image_id: String, version: String }, + /// Custom image from Synor Storage + Custom { cid: Cid, format: ImageFormat }, + /// Standard OS images + Ubuntu { version: String }, + Debian { version: String }, + AlmaLinux { version: String }, + Windows { version: String, license: WindowsLicense }, +} + +pub struct PersistentVolume { + pub volume_id: VolumeId, + pub size_gb: u32, + pub volume_type: VolumeType, + pub mount_path: String, + pub encrypted: bool, +} + +pub enum VolumeType { + /// High-performance NVMe SSD + NvmeSsd { iops: u32, throughput_mbps: u32 }, + /// Standard SSD + Ssd, + /// HDD for archival + Hdd, + /// Distributed storage (Synor Storage L2) + Distributed { replication: u8 }, +} +``` + +### 3.2 VM Lifecycle Management + +```rust +// synor-compute/src/vm/lifecycle.rs + +pub enum VmState { + Pending, + Provisioning, + Running, + Stopping, + Stopped, + Hibernating, + Hibernated, + Migrating, + Failed, + Terminated, +} + +pub struct VmManager { + /// Active VMs + vms: HashMap, + /// Node assignments + node_assignments: HashMap, + /// Live migration coordinator + migration_coordinator: MigrationCoordinator, +} + +impl VmManager { + /// Start a new VM + pub async fn create(&self, spec: VmSpec) -> Result; + + /// Stop a VM (preserves state) + pub async fn stop(&self, vm_id: &VmId) -> Result<(), VmError>; + + /// Start a stopped VM + pub async fn start(&self, vm_id: &VmId) -> Result<(), VmError>; + + /// Hibernate VM to storage (saves memory state) + pub async fn hibernate(&self, vm_id: &VmId) -> Result<(), VmError>; + + /// Live migrate VM to another node + pub async fn migrate(&self, vm_id: &VmId, target_node: NodeId) -> Result<(), VmError>; + + /// Resize VM 
(requires restart) + pub async fn resize(&self, vm_id: &VmId, new_size: VmSize) -> Result<(), VmError>; + + /// Snapshot VM state + pub async fn snapshot(&self, vm_id: &VmId) -> Result; + + /// Terminate and delete VM + pub async fn terminate(&self, vm_id: &VmId) -> Result<(), VmError>; +} +``` + +### 3.3 VM Pricing + +| VM Type | vCPUs | Memory | Storage | GPU | Price (SYNOR/month) | +|---------|-------|--------|---------|-----|---------------------| +| micro | 1 | 1 GB | 20 GB SSD | - | 5 | +| small | 2 | 4 GB | 50 GB SSD | - | 15 | +| medium | 4 | 8 GB | 100 GB SSD | - | 30 | +| large | 8 | 32 GB | 200 GB SSD | - | 80 | +| xlarge | 16 | 64 GB | 500 GB NVMe | - | 200 | +| gpu-small | 8 | 32 GB | 200 GB NVMe | 1x RTX 4090 | 400 | +| gpu-medium | 16 | 64 GB | 500 GB NVMe | 2x RTX 4090 | 750 | +| gpu-large | 32 | 128 GB | 1 TB NVMe | 4x A100 80GB | 2500 | +| gpu-xlarge | 64 | 256 GB | 2 TB NVMe | 8x H100 | 8000 | + +--- + +## Milestone 4: Serverless Functions (FaaS) + +### 4.1 Function Specification + +```rust +// synor-compute/src/serverless/function.rs + +/// Serverless function definition +pub struct Function { + pub function_id: FunctionId, + pub owner: Address, + pub name: String, + pub runtime: FunctionRuntime, + pub handler: String, + pub code: FunctionCode, + pub resources: FunctionResources, + pub triggers: Vec, + pub environment: HashMap, + pub timeout_ms: u32, + pub concurrency: ConcurrencyConfig, +} + +pub enum FunctionRuntime { + Node20, + Node22, + Python311, + Python312, + Rust, + Go122, + Java21, + Dotnet8, + Ruby33, + Custom { image: String }, +} + +pub struct FunctionCode { + /// Source code CID in Synor Storage + pub cid: Cid, + /// Entry point file + pub entry_point: String, + /// Dependencies (package.json, requirements.txt, etc.) 
+    pub dependencies: Option<Cid>,
+}
+
+pub struct FunctionResources {
+    pub memory_mb: u32, // 128, 256, 512, 1024, 2048, 4096, 8192
+    pub cpu_allocation: f32, // Proportional to memory
+    pub ephemeral_storage_mb: u32,
+    pub gpu: Option<GpuAllocation>,
+}
+
+pub enum FunctionTrigger {
+    /// HTTP endpoint
+    Http { path: String, methods: Vec<String> },
+    /// Scheduled execution (cron)
+    Schedule { cron: String },
+    /// Event from message queue
+    Queue { queue_name: String },
+    /// Storage events
+    Storage { bucket: String, events: Vec<String> },
+    /// Blockchain events
+    Blockchain { contract: Address, events: Vec<String> },
+    /// Webhook
+    Webhook { url: String },
+}
+```
+
+### 4.2 Cold Start Optimization
+
+```rust
+// synor-compute/src/serverless/warmup.rs
+
+/// Function warmup strategies
+pub struct WarmupConfig {
+    /// Minimum warm instances
+    pub min_instances: u32,
+    /// Provisioned concurrency
+    pub provisioned_concurrency: u32,
+    /// Warmup schedule
+    pub warmup_schedule: Option<String>,
+    /// Snapshot-based cold start (SnapStart)
+    pub snapstart_enabled: bool,
+}
+
+pub struct ColdStartOptimizer {
+    /// Pre-warmed function pools
+    pools: HashMap<FunctionRuntime, WarmPool>,
+    /// Snapshot cache
+    snapshots: LruCache<FunctionId, FunctionSnapshot>,
+    /// Prediction model for scaling
+    predictor: ScalingPredictor,
+}
+
+impl ColdStartOptimizer {
+    /// Get a warm instance or create one
+    pub async fn get_instance(&self, function: &Function) -> Result<FunctionInstance, FunctionError> {
+        // Try snapshot restore first (< 100ms)
+        if let Some(snapshot) = self.snapshots.get(&function.function_id) {
+            return self.restore_from_snapshot(snapshot).await;
+        }
+
+        // Try warm pool (< 50ms)
+        if let Some(instance) = self.pools.get(&function.runtime)?.get_warm() {
+            return Ok(instance);
+        }
+
+        // Cold start (1-5s depending on runtime)
+        self.cold_start(function).await
+    }
+}
+```
+
+### 4.3 Serverless Pricing
+
+| Resource | Unit | Price (SYNOR) |
+|----------|------|---------------|
+| Invocations | 1M requests | 0.20 |
+| Duration | GB-second | 0.00001 |
+| Provisioned concurrency | GB-hour | 0.01 |
+| HTTP 
Gateway | 1M requests | 0.10 |
+| Event bridge | 1M events | 0.50 |
+
+---
+
+## Milestone 5: Edge Compute
+
+### 5.1 Edge Node Architecture
+
+```rust
+// synor-compute/src/edge/node.rs
+
+/// Edge compute node
+pub struct EdgeNode {
+    pub node_id: NodeId,
+    pub location: GeoLocation,
+    pub capabilities: EdgeCapabilities,
+    pub latency_zones: Vec<LatencyZone>,
+    pub resources: EdgeResources,
+}
+
+pub struct EdgeCapabilities {
+    pub wasm_runtime: bool,
+    pub container_runtime: bool,
+    pub gpu_inference: bool,
+    pub video_transcoding: bool,
+    pub cdn_cache: bool,
+}
+
+pub struct EdgeResources {
+    pub cpu_cores: u32,
+    pub memory_gb: u32,
+    pub storage_gb: u32,
+    pub gpu: Option<GpuSpec>,
+    pub bandwidth_gbps: u32,
+}
+
+/// Edge function for low-latency compute
+pub struct EdgeFunction {
+    pub function_id: FunctionId,
+    pub code: WasmModule,
+    pub memory_limit: u32,
+    pub timeout_ms: u32,
+    pub allowed_regions: Vec<Region>,
+}
+```
+
+### 5.2 Edge Use Cases
+
+```rust
+// synor-compute/src/edge/usecases.rs
+
+/// CDN with compute at edge
+pub struct EdgeCdn {
+    /// Origin servers
+    origins: Vec<Origin>,
+    /// Cache rules
+    cache_rules: Vec<CacheRule>,
+    /// Edge workers for request/response transformation
+    workers: Vec<EdgeFunction>,
+}
+
+/// Real-time inference at edge
+pub struct EdgeInference {
+    /// Model optimized for edge (quantized, pruned)
+    model_id: ModelId,
+    /// Inference runtime (TensorRT, ONNX Runtime)
+    runtime: EdgeInferenceRuntime,
+    /// Max batch size
+    max_batch: u32,
+    /// Target latency
+    target_latency_ms: u32,
+}
+
+/// Video processing at edge
+pub struct EdgeVideoProcessor {
+    /// Transcoding profiles
+    profiles: Vec<TranscodeProfile>,
+    /// Real-time streaming
+    live_streaming: bool,
+    /// Adaptive bitrate
+    abr_enabled: bool,
+}
+```
+
+### 5.3 Edge Pricing
+
+| Resource | Unit | Price (SYNOR) |
+|----------|------|---------------|
+| Edge function invocations | 1M | 0.50 |
+| Edge function duration | GB-second | 0.00002 |
+| Edge bandwidth | GB | 0.08 |
+| Edge cache storage | GB/month | 0.02 |
+| Video 
transcoding | minute | 0.02 | + +--- + +## Milestone 6: Node Provider Economics + +### 6.1 Provider Registration + +```rust +// synor-compute/src/provider/registration.rs + +/// Compute provider registration +pub struct ProviderRegistration { + pub provider_id: ProviderId, + pub owner: Address, + /// Stake required to become provider + pub stake: u64, + /// Hardware specifications + pub hardware: HardwareManifest, + /// Network connectivity + pub network: NetworkManifest, + /// Geographic location + pub location: GeoLocation, + /// Availability SLA commitment + pub sla: SlaCommitment, +} + +pub struct HardwareManifest { + pub cpus: Vec, + pub memory_total_gb: u64, + pub gpus: Vec, + pub storage: Vec, + pub verified: bool, // Hardware attestation passed +} + +pub struct SlaCommitment { + pub uptime_percent: f32, // 99.9, 99.99, etc. + pub response_time_ms: u32, + pub data_durability: f32, + pub penalty_rate: f32, // Penalty for SLA violation +} +``` + +### 6.2 Provider Revenue Model + +| Revenue Source | Provider Share | Protocol Share | +|----------------|----------------|----------------| +| Compute fees | 85% | 15% | +| Storage fees | 80% | 20% | +| Network fees | 75% | 25% | +| SLA bonuses | 100% | 0% | +| Staking rewards | 100% | 0% | + +### 6.3 Slashing Conditions + +| Violation | Penalty | +|-----------|---------| +| Downtime > committed SLA | 1% stake per hour | +| Data loss | 10% stake + compensation | +| Malicious behavior | 100% stake | +| False hardware attestation | 50% stake | + +--- + +## Implementation Timeline + +### Phase 11.1: Foundation (Weeks 1-4) +- [ ] Node registration and hardware attestation +- [ ] Basic job scheduler +- [ ] WASM runtime integration (existing) +- [ ] Container runtime (containerd) +- [ ] Network overlay (WireGuard mesh) + +### Phase 11.2: GPU Compute (Weeks 5-8) +- [ ] GPU node registration +- [ ] NVIDIA driver integration +- [ ] CUDA runtime support +- [ ] Basic ML job execution +- [ ] Model storage integration + +### 
Phase 11.3: Container Orchestration (Weeks 9-12) +- [ ] OCI image support +- [ ] Service deployment +- [ ] Load balancing +- [ ] Auto-scaling +- [ ] Service mesh (mTLS) + +### Phase 11.4: Persistent VMs (Weeks 13-16) +- [ ] MicroVM runtime (Firecracker) +- [ ] VM lifecycle management +- [ ] Persistent storage +- [ ] Live migration +- [ ] Snapshot/restore + +### Phase 11.5: Serverless (Weeks 17-20) +- [ ] Function deployment +- [ ] Cold start optimization +- [ ] Event triggers +- [ ] API gateway +- [ ] Monitoring/logging + +### Phase 11.6: Edge Compute (Weeks 21-24) +- [ ] Edge node registration +- [ ] Edge function runtime +- [ ] CDN integration +- [ ] Edge inference +- [ ] Global anycast + +--- + +## Security Considerations + +### Isolation Levels + +| Workload Type | Isolation Technology | Security Level | +|---------------|---------------------|----------------| +| WASM | Wasmtime sandbox | High | +| Serverless | gVisor + seccomp | High | +| Containers | gVisor or Kata | Medium-High | +| VMs | Firecracker MicroVM | High | +| GPU | NVIDIA MIG/MPS | Medium | + +### Network Security + +- All inter-node traffic encrypted (WireGuard) +- mTLS for service-to-service communication +- Network policies for workload isolation +- DDoS protection at edge + +### Data Security + +- Encryption at rest (AES-256) +- Encryption in transit (TLS 1.3) +- Confidential computing support (AMD SEV, Intel SGX) +- Secure key management (HSM integration) + +--- + +## API Examples + +### Deploy AI Training Job + +```bash +synor compute train create \ + --framework pytorch \ + --model-config ./model.yaml \ + --dataset synor://datasets/imagenet \ + --gpus 8 \ + --gpu-type h100 \ + --distributed ddp \ + --epochs 100 \ + --checkpoint-interval 1000 \ + --max-budget 1000 +``` + +### Deploy Inference Endpoint + +```bash +synor compute inference deploy \ + --model synor://models/llama-70b \ + --format vllm \ + --min-replicas 2 \ + --max-replicas 10 \ + --gpu-per-replica 2 \ + --target-utilization 
0.7 +``` + +### Create Persistent VM + +```bash +synor compute vm create \ + --name my-dev-server \ + --image ubuntu:22.04 \ + --size gpu-small \ + --volume 100gb:nvme:/data \ + --ssh-key ~/.ssh/id_ed25519.pub \ + --region us-east +``` + +### Deploy Container Service + +```bash +synor compute service deploy \ + --name my-api \ + --image my-registry/my-api:latest \ + --replicas 3 \ + --cpu 2 \ + --memory 4gb \ + --port 8080 \ + --health-check /health \ + --autoscale 2-10 +``` + +### Deploy Serverless Function + +```bash +synor compute function deploy \ + --name process-image \ + --runtime python312 \ + --handler main.handler \ + --code ./function \ + --memory 1024 \ + --timeout 30000 \ + --trigger http:/api/process +``` + +--- + +## Comparison with Existing Synor VM + +| Feature | Current Synor VM | Synor Compute L2 | +|---------|------------------|------------------| +| Runtime | WASM only | WASM, Container, MicroVM | +| Timeout | 30 seconds | Unlimited (VMs) | +| Memory | 16 MB max | Up to 256 GB | +| GPU | ❌ | ✅ Full CUDA/ROCm | +| Networking | ❌ | ✅ Full TCP/UDP | +| File I/O | ❌ | ✅ Persistent volumes | +| Threading | ❌ | ✅ Multi-threaded | +| AI/ML | ❌ | ✅ Training + Inference | +| OS Hosting | ❌ | ✅ Full Linux/Windows | + +--- + +## Next Steps + +1. **Milestone 1**: Implement GPU node registration and attestation +2. **Milestone 2**: Build basic job scheduler with resource allocation +3. **Milestone 3**: Integrate containerd for container workloads +4. **Milestone 4**: Add Firecracker for MicroVM support +5. **Milestone 5**: Implement serverless function runtime +6. **Milestone 6**: Deploy edge nodes and CDN integration + +This plan transforms Synor from a smart contract platform into a full-stack decentralized cloud provider capable of competing with AWS/GCP/Azure while maintaining decentralization and censorship resistance.