feat(compute): add Phase 11 Synor Compute L2 heterogeneous compute layer

- Add synor-compute crate for heterogeneous compute orchestration
- Implement processor abstraction for CPU/GPU/TPU/NPU/LPU/FPGA/DSP
- Add device registry with cross-vendor capability tracking
- Implement task scheduler with work stealing and load balancing
- Add energy-aware and latency-aware balancing strategies
- Create spot market for compute resources with order matching
- Add memory manager with tensor handles and cross-device transfers
- Support processor capability profiles (H100, TPU v5p, Groq LPU, etc.)
- Implement priority work queues with task decomposition

Processor types supported:
- CPU (x86-64 AVX512, ARM64 SVE, RISC-V Vector)
- GPU (NVIDIA CUDA, AMD ROCm, Intel OneAPI, Apple Metal)
- TPU (v2-v5p, Edge TPU)
- NPU (Apple Neural Engine, Qualcomm Hexagon, Intel VPU)
- LPU (Groq Language Processing Unit)
- FPGA (Xilinx, Intel Altera)
- DSP (TI, Analog Devices)
- WebGPU and WASM runtimes
This commit is contained in:
Gulshan Yadav 2026-01-11 13:53:57 +05:30
parent 8da34bc73d
commit 4c36ddbdc2
19 changed files with 11219 additions and 0 deletions

View file

@@ -9,6 +9,7 @@ members = [
"crates/synor-storage",
"crates/synor-hosting",
"crates/synor-database",
"crates/synor-compute",
"crates/synor-governance",
"crates/synor-rpc",
"crates/synor-vm",

View file

@@ -0,0 +1,51 @@
# Manifest for the synor-compute crate (Phase 11 heterogeneous compute layer).
[package]
name = "synor-compute"
version.workspace = true
edition.workspace = true
description = "Heterogeneous multi-processor compute platform for Synor blockchain"
license.workspace = true
[dependencies]
# Internal crates
synor-types = { path = "../synor-types" }
synor-crypto = { path = "../synor-crypto" }
synor-storage = { path = "../synor-storage" }
# Serialization
serde.workspace = true
serde_json.workspace = true
borsh.workspace = true
# NOTE(review): bincode pinned to 1.x; 2.x has an incompatible API — confirm
# before upgrading.
bincode = "1.3"
# Async runtime
tokio = { workspace = true, features = ["sync", "rt-multi-thread", "time", "macros"] }
async-trait = "0.1"
futures = "0.3"
# Concurrency
parking_lot.workspace = true
crossbeam-deque = "0.8"
crossbeam-channel = "0.5"
dashmap = "5.5"
# Utilities
thiserror.workspace = true
tracing.workspace = true
hex.workspace = true
# Hashing
blake3.workspace = true
# Data structures
indexmap = "2.2"
priority-queue = "2.0"
# Time
chrono = { version = "0.4", features = ["serde"] }
# Random
rand = "0.8"
[dev-dependencies]
tempfile.workspace = true
tokio-test = "0.4"

View file

@@ -0,0 +1,377 @@
//! Device registry and management.
//!
//! Supports all device types:
//! - Data center servers
//! - Desktop workstations
//! - Laptops
//! - Mobile devices (iOS, Android)
//! - Browsers (WebGPU, WASM)
//! - IoT devices
use crate::error::ComputeError;
use crate::processor::{GenericProcessor, Processor, ProcessorCapabilities, ProcessorId, ProcessorType};
use crate::{NodeId, ProcessorInfo};
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
/// Unique 256-bit identifier for a registered device.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct DeviceId(pub [u8; 32]);

impl DeviceId {
    /// Generates a fresh identifier from 32 random bytes.
    pub fn new() -> Self {
        use rand::Rng;
        let mut buf = [0u8; 32];
        rand::thread_rng().fill(&mut buf);
        Self(buf)
    }

    /// Wraps an existing 32-byte value as a `DeviceId`.
    pub fn from_bytes(bytes: [u8; 32]) -> Self {
        Self(bytes)
    }
}

impl Default for DeviceId {
    fn default() -> Self {
        DeviceId::new()
    }
}

impl std::fmt::Display for DeviceId {
    /// Renders as `dev_` followed by the first 8 bytes, hex-encoded.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let short = hex::encode(&self.0[..8]);
        write!(f, "dev_{}", short)
    }
}
/// Broad classification of participating hardware.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DeviceType {
    /// Data center server.
    DataCenter,
    /// Desktop workstation.
    Desktop,
    /// Laptop.
    Laptop,
    /// Mobile phone.
    Mobile,
    /// Tablet.
    Tablet,
    /// IoT device.
    IoT,
    /// Browser (WebGPU/WASM).
    Browser,
    /// Edge server.
    Edge,
}

impl DeviceType {
    /// Typical reliability score on a 0-100 scale (heuristic constants).
    pub fn reliability(&self) -> u32 {
        match self {
            Self::Browser => 30,
            Self::Mobile => 40,
            Self::Tablet => 50,
            Self::Laptop => 60,
            Self::IoT => 70,
            Self::Desktop => 80,
            Self::Edge => 95,
            Self::DataCenter => 99,
        }
    }

    /// Typical hours per day the device is expected to be reachable.
    pub fn availability_hours(&self) -> f32 {
        match self {
            Self::DataCenter | Self::Edge | Self::IoT => 24.0,
            Self::Desktop => 8.0,
            Self::Laptop => 6.0,
            Self::Mobile | Self::Tablet => 4.0,
            Self::Browser => 2.0,
        }
    }
}
/// Device capabilities.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DeviceCapabilities {
    /// Device type.
    ///
    /// NOTE(review): duplicates `DeviceInfo::device_type`; confirm which one
    /// is authoritative before relying on either.
    pub device_type: DeviceType,
    /// Available processors.
    pub processors: Vec<ProcessorType>,
    /// Total memory (GB).
    pub memory_gb: f32,
    /// Network bandwidth (Mbps).
    pub bandwidth_mbps: f32,
    /// Storage available (GB).
    pub storage_gb: f32,
    /// Battery powered.
    pub battery_powered: bool,
    /// Supports background execution.
    pub background_execution: bool,
}
/// Device information.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DeviceInfo {
    /// Device ID.
    pub id: DeviceId,
    /// Device type.
    pub device_type: DeviceType,
    /// Owner address.
    pub owner: [u8; 32],
    /// Capabilities.
    pub capabilities: DeviceCapabilities,
    /// Current status.
    pub status: DeviceStatus,
    /// Reputation score (0-100).
    pub reputation: u32,
    /// Total earnings (atomic SYNOR).
    pub earnings: u64,
    /// Geographic region.
    pub region: String,
}
/// Device status.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum DeviceStatus {
    /// Online and available.
    Online,
    /// Online but busy.
    Busy,
    /// Idle but available.
    Idle,
    /// On battery (reduced capacity).
    OnBattery,
    /// Offline.
    Offline,
    /// Maintenance.
    Maintenance,
}
/// Device registry managing all devices and processors.
///
/// All maps are guarded by independent `parking_lot::RwLock`s; methods take
/// locks in a fixed order (`processors` before `processor_nodes`) where both
/// are needed.
pub struct DeviceRegistry {
    /// Registered devices.
    devices: RwLock<HashMap<DeviceId, DeviceInfo>>,
    /// Node to device mapping.
    ///
    /// NOTE(review): no method visible in this file ever writes this map —
    /// it is either populated elsewhere or dead state; verify.
    node_devices: RwLock<HashMap<NodeId, Vec<DeviceId>>>,
    /// All processors (across all nodes).
    processors: RwLock<HashMap<ProcessorId, Arc<dyn Processor>>>,
    /// Processor to node mapping.
    processor_nodes: RwLock<HashMap<ProcessorId, NodeId>>,
    /// Next processor ID.
    next_processor_id: std::sync::atomic::AtomicU64,
}
impl DeviceRegistry {
/// Creates a new device registry.
pub fn new() -> Self {
Self {
devices: RwLock::new(HashMap::new()),
node_devices: RwLock::new(HashMap::new()),
processors: RwLock::new(HashMap::new()),
processor_nodes: RwLock::new(HashMap::new()),
next_processor_id: std::sync::atomic::AtomicU64::new(0),
}
}
/// Registers a device.
pub fn register_device(&self, device: DeviceInfo) -> Result<DeviceId, ComputeError> {
let id = device.id;
self.devices.write().insert(id, device);
Ok(id)
}
/// Unregisters a device.
pub fn unregister_device(&self, device_id: DeviceId) -> Result<(), ComputeError> {
self.devices.write().remove(&device_id);
Ok(())
}
/// Gets a device by ID.
pub fn get_device(&self, device_id: DeviceId) -> Option<DeviceInfo> {
self.devices.read().get(&device_id).cloned()
}
/// Registers a processor for a node.
pub fn register_processor(
&self,
node_id: NodeId,
info: ProcessorInfo,
) -> Result<(), ComputeError> {
let processor_id = info.id;
// Create a generic processor from the info
let processor: Arc<dyn Processor> = Arc::new(GenericProcessor::new(
processor_id,
info.processor_type,
info.capabilities,
));
self.processors.write().insert(processor_id, processor);
self.processor_nodes.write().insert(processor_id, node_id);
Ok(())
}
/// Unregisters all processors for a node.
pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
let mut processors = self.processors.write();
let mut processor_nodes = self.processor_nodes.write();
// Find and remove all processors for this node
let to_remove: Vec<_> = processor_nodes
.iter()
.filter(|(_, n)| **n == node_id)
.map(|(p, _)| *p)
.collect();
for proc_id in to_remove {
processors.remove(&proc_id);
processor_nodes.remove(&proc_id);
}
Ok(())
}
/// Gets a processor by ID.
pub fn get_processor(&self, processor_id: ProcessorId) -> Result<Arc<dyn Processor>, ComputeError> {
self.processors
.read()
.get(&processor_id)
.cloned()
.ok_or(ComputeError::ProcessorNotFound(processor_id))
}
/// Gets all processors.
pub fn all_processors(&self) -> Vec<Arc<dyn Processor>> {
self.processors.read().values().cloned().collect()
}
/// Gets processors of a specific type.
pub fn processors_by_type(&self, proc_type: ProcessorType) -> Vec<Arc<dyn Processor>> {
self.processors
.read()
.values()
.filter(|p| p.processor_type() == proc_type)
.cloned()
.collect()
}
/// Gets the next processor ID.
pub fn next_processor_id(&self) -> ProcessorId {
ProcessorId(self.next_processor_id.fetch_add(1, std::sync::atomic::Ordering::SeqCst))
}
/// Gets total number of devices.
pub fn device_count(&self) -> usize {
self.devices.read().len()
}
/// Gets total number of processors.
pub fn processor_count(&self) -> usize {
self.processors.read().len()
}
/// Gets devices by type.
pub fn devices_by_type(&self, device_type: DeviceType) -> Vec<DeviceInfo> {
self.devices
.read()
.values()
.filter(|d| d.device_type == device_type)
.cloned()
.collect()
}
/// Gets online devices.
pub fn online_devices(&self) -> Vec<DeviceInfo> {
self.devices
.read()
.values()
.filter(|d| d.status == DeviceStatus::Online || d.status == DeviceStatus::Idle)
.cloned()
.collect()
}
/// Updates device status.
pub fn update_device_status(
&self,
device_id: DeviceId,
status: DeviceStatus,
) -> Result<(), ComputeError> {
if let Some(device) = self.devices.write().get_mut(&device_id) {
device.status = status;
Ok(())
} else {
Err(ComputeError::Internal(format!("Device not found: {}", device_id)))
}
}
}
impl Default for DeviceRegistry {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, AvxSupport};

    // Two freshly generated random IDs should (with overwhelming
    // probability) differ.
    #[test]
    fn test_device_id() {
        let id1 = DeviceId::new();
        let id2 = DeviceId::new();
        assert_ne!(id1.0, id2.0);
    }

    // Register → lookup → unregister round-trip through the registry.
    #[test]
    fn test_device_registry() {
        let registry = DeviceRegistry::new();
        let device = DeviceInfo {
            id: DeviceId::new(),
            device_type: DeviceType::Desktop,
            owner: [1u8; 32],
            capabilities: DeviceCapabilities {
                device_type: DeviceType::Desktop,
                processors: vec![ProcessorType::Cpu(CpuVariant::X86_64 {
                    avx: AvxSupport::Avx512,
                })],
                memory_gb: 64.0,
                bandwidth_mbps: 1000.0,
                storage_gb: 1000.0,
                battery_powered: false,
                background_execution: true,
            },
            status: DeviceStatus::Online,
            reputation: 100,
            earnings: 0,
            region: "us-east".to_string(),
        };
        let device_id = device.id;
        registry.register_device(device).unwrap();
        assert_eq!(registry.device_count(), 1);
        assert!(registry.get_device(device_id).is_some());
        registry.unregister_device(device_id).unwrap();
        assert_eq!(registry.device_count(), 0);
    }

    // Spot-checks the hard-coded per-device-type heuristic constants.
    #[test]
    fn test_device_type_properties() {
        assert_eq!(DeviceType::DataCenter.reliability(), 99);
        assert_eq!(DeviceType::Mobile.reliability(), 40);
        assert_eq!(DeviceType::DataCenter.availability_hours(), 24.0);
        assert_eq!(DeviceType::Browser.availability_hours(), 2.0);
    }
}

View file

@@ -0,0 +1,92 @@
//! Error types for Synor Compute.
use crate::{JobId, NodeId, ProcessorId, ProcessorType};
use thiserror::Error;
/// Compute errors.
///
/// One flat error enum for the whole crate; `thiserror` derives `Display`
/// from the `#[error]` attributes.
#[derive(Debug, Error)]
pub enum ComputeError {
    /// Job not found.
    #[error("Job not found: {0}")]
    JobNotFound(JobId),
    /// Node not found.
    #[error("Node not found: {0}")]
    NodeNotFound(NodeId),
    /// Processor not found.
    #[error("Processor not found: {0}")]
    ProcessorNotFound(ProcessorId),
    /// No suitable processor for operation.
    #[error("No suitable processor for operation: {0}")]
    NoSuitableProcessor(String),
    /// Insufficient resources.
    #[error("Insufficient resources: {0}")]
    InsufficientResources(String),
    /// Task execution failed.
    #[error("Task execution failed: {0}")]
    TaskExecutionFailed(String),
    /// Scheduling failed.
    #[error("Scheduling failed: {0}")]
    SchedulingFailed(String),
    /// Memory allocation failed.
    #[error("Memory allocation failed: {0}")]
    MemoryAllocationFailed(String),
    /// Data transfer failed.
    #[error("Data transfer failed: {0}")]
    DataTransferFailed(String),
    /// Processor type not supported.
    #[error("Processor type not supported: {0:?}")]
    ProcessorTypeNotSupported(ProcessorType),
    /// Operation not supported on processor.
    #[error("Operation not supported on {0:?}: {1}")]
    OperationNotSupported(ProcessorType, String),
    /// Timeout, carrying the elapsed budget in milliseconds.
    #[error("Operation timed out after {0}ms")]
    Timeout(u64),
    /// Budget exceeded. Both amounts are in atomic SYNOR.
    #[error("Budget exceeded: required {required}, available {available}")]
    BudgetExceeded { required: u64, available: u64 },
    /// Node already registered.
    #[error("Node already registered: {0}")]
    NodeAlreadyRegistered(NodeId),
    /// Invalid configuration.
    #[error("Invalid configuration: {0}")]
    InvalidConfiguration(String),
    /// Serialization error (see the `From` impls below for the sources).
    #[error("Serialization error: {0}")]
    Serialization(String),
    /// Network error.
    #[error("Network error: {0}")]
    Network(String),
    /// Internal error.
    #[error("Internal error: {0}")]
    Internal(String),
}
impl From<bincode::Error> for ComputeError {
    /// Folds bincode failures into the generic serialization error.
    fn from(err: bincode::Error) -> Self {
        Self::Serialization(err.to_string())
    }
}

impl From<serde_json::Error> for ComputeError {
    /// Folds serde_json failures into the generic serialization error.
    fn from(err: serde_json::Error) -> Self {
        Self::Serialization(err.to_string())
    }
}

View file

@@ -0,0 +1,631 @@
//! Synor Compute L2 - Heterogeneous Multi-Processor Compute Platform
//!
//! Provides decentralized compute services with:
//!
//! - **Heterogeneous Scheduling**: CPU + GPU + TPU + NPU + LPU working simultaneously
//! - **Consumer Device Mesh**: Mobile, browser, desktop devices contributing compute
//! - **90% Cost Reduction**: Zero margins, spot markets, electricity arbitrage
//! - **10x Speed**: Caching, speculative execution, optimal processor assignment
//!
//! # Architecture
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────────────────────┐
//! │ SYNOR COMPUTE L2 │
//! ├─────────────────────────────────────────────────────────────────────────────┤
//! │ │
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
//! │ │ TASK DECOMPOSER │ │
//! │ │ Analyzes workload → Identifies subtasks → Maps to optimal processors │ │
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
//! │ │ │
//! │ ▼ │
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
//! │ │ HETEROGENEOUS SCHEDULER │ │
//! │ │ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ │
//! │ │ │ CPU │ │ GPU │ │ TPU │ │ NPU │ │ LPU │ │Custom│ │ │
//! │ │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │ │
//! │ │ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ │ │
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
//! │ │
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
//! │ │ UNIFIED MEMORY FABRIC │ │
//! │ │ Zero-copy data sharing │ Automatic placement │ Cache coherency │ │
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
//! │ │
//! └─────────────────────────────────────────────────────────────────────────────┘
//! ```
//!
//! # Pricing
//!
//! | Resource | Unit | Price (SYNOR) |
//! |----------|------|---------------|
//! | GPU (consumer) | hour | 0.10 |
//! | GPU (datacenter) | hour | 0.50-4.00 |
//! | CPU | core/hour | 0.02 |
//! | Memory | GB/hour | 0.005 |
//! | Inference | 1M tokens | 0.10 |
#![allow(dead_code)]
pub mod device;
pub mod error;
pub mod market;
pub mod memory;
pub mod processor;
pub mod scheduler;
pub mod task;
pub use device::{
DeviceCapabilities, DeviceId, DeviceInfo, DeviceRegistry, DeviceStatus, DeviceType,
};
pub use error::ComputeError;
pub use market::{
Auction, AuctionId, CloudComparison, CpuTier as MarketCpuTier, GpuTier as MarketGpuTier,
MarketStats, Order, OrderBook, OrderId, OrderSide, OrderType, PricingEngine, ProviderListing,
ResourceType, SpotMarket, Trade,
};
pub use memory::{MemoryManager, TensorHandle, TransferPath, UnifiedMemory};
pub use processor::{
ComputeThroughput, CpuVariant, GpuVariant, NpuVariant, Operation, OperationType, Processor,
ProcessorCapabilities, ProcessorId, ProcessorType, TpuVersion,
};
pub use scheduler::{
HeterogeneousScheduler, LoadBalancer, Schedule, ScheduleResult, TaskAssignment, WorkQueue,
};
pub use task::{
ComputeTask, DecomposedWorkload, Task, TaskDecomposer, TaskId, TaskPriority, TaskResult,
TaskStatus,
};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use parking_lot::RwLock;
/// Compute node identifier (plain sequence number).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct NodeId(pub u64);

impl std::fmt::Display for NodeId {
    /// Renders as `node_<n>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "node_{}", self.0)
    }
}

/// Job identifier: 32 random bytes.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct JobId(pub [u8; 32]);

impl JobId {
    /// Draws a fresh random job ID.
    pub fn new() -> Self {
        use rand::Rng;
        let mut buf = [0u8; 32];
        rand::thread_rng().fill(&mut buf);
        Self(buf)
    }

    /// Wraps raw bytes as a job ID.
    pub fn from_bytes(bytes: [u8; 32]) -> Self {
        Self(bytes)
    }
}

impl Default for JobId {
    fn default() -> Self {
        JobId::new()
    }
}

impl std::fmt::Display for JobId {
    /// Renders as `job_` plus the first 8 bytes, hex-encoded.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let short = hex::encode(&self.0[..8]);
        write!(f, "job_{}", short)
    }
}
/// Compute job specification.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ComputeJob {
    /// Job ID.
    pub id: JobId,
    /// Owner address.
    pub owner: [u8; 32],
    /// Job type.
    pub job_type: JobType,
    /// Resource requirements.
    pub resources: ResourceRequirements,
    /// Input data reference (CID).
    pub input_cid: Option<String>,
    /// Maximum budget (in atomic SYNOR).
    pub max_budget: u64,
    /// Priority level.
    pub priority: JobPriority,
    /// Created timestamp.
    ///
    /// NOTE(review): unit (seconds vs. milliseconds) is not established
    /// anywhere in this file — confirm against the producer, especially
    /// before comparing with `deadline`.
    pub created_at: u64,
    /// Deadline (optional), in the same unit as `created_at`.
    pub deadline: Option<u64>,
}
/// Job type classification.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum JobType {
    /// AI/ML training job.
    Training {
        /// Framework to run the training loop under.
        framework: MlFramework,
        /// Content ID of the model definition.
        model_cid: String,
        /// Content ID of the training dataset.
        dataset_cid: String,
        /// Number of passes over the dataset.
        epochs: u32,
        /// Per-step batch size.
        batch_size: u32,
    },
    /// AI/ML inference job.
    Inference {
        /// Content ID of the model to serve.
        model_cid: String,
        /// Wire format of the inference inputs (e.g. "json").
        input_format: String,
        /// Maximum batch size per inference call.
        batch_size: u32,
    },
    /// Container workload.
    Container {
        /// Container image reference.
        image: String,
        /// Command and arguments to run in the container.
        command: Vec<String>,
        /// Environment variables passed to the container.
        env: HashMap<String, String>,
    },
    /// Serverless function.
    Serverless {
        /// Language runtime to execute under.
        runtime: FunctionRuntime,
        /// Content ID of the packaged function code.
        code_cid: String,
        /// Entry-point symbol within the package.
        handler: String,
    },
    /// General compute (WASM).
    Wasm {
        /// Content ID of the WASM module.
        module_cid: String,
        /// Exported function to invoke.
        entrypoint: String,
    },
}
/// ML framework specification.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum MlFramework {
    /// PyTorch at a pinned version string.
    PyTorch { version: String },
    /// TensorFlow at a pinned version string.
    TensorFlow { version: String },
    /// JAX at a pinned version string.
    JAX { version: String },
    /// ONNX runtime (no version pin).
    ONNX,
}
/// Function runtime.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum FunctionRuntime {
    /// Node.js 20.
    Node20,
    /// Python 3.12.
    Python312,
    /// Native Rust.
    Rust,
    /// Native Go.
    Go,
    /// User-supplied container image as the runtime.
    Custom { image: String },
}
/// Job priority levels.
///
/// Explicit discriminants give a total order (`Background` < `Normal` <
/// `High` < `Critical`) via the derived `Ord`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum JobPriority {
    /// Background job, can be preempted.
    Background = 0,
    /// Normal priority.
    Normal = 1,
    /// High priority, faster scheduling.
    High = 2,
    /// Critical, guaranteed resources.
    Critical = 3,
}
impl Default for JobPriority {
    // Jobs default to Normal unless the submitter opts in/out.
    fn default() -> Self {
        JobPriority::Normal
    }
}
/// Resource requirements for a job.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ResourceRequirements {
    /// Minimum CPU cores (fractional cores allowed).
    pub min_cpu_cores: f32,
    /// Minimum memory (GB).
    pub min_memory_gb: f32,
    /// GPU requirements (`None` means no GPU needed).
    pub gpu: Option<GpuRequirements>,
    /// Preferred processor types (in priority order).
    pub preferred_processors: Vec<ProcessorType>,
    /// Maximum latency (ms) - for inference.
    pub max_latency_ms: Option<u32>,
    /// Requires distributed execution.
    pub distributed: bool,
}
/// GPU resource requirements.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GpuRequirements {
    /// Minimum number of GPUs.
    pub min_count: u32,
    /// Maximum number of GPUs.
    pub max_count: u32,
    /// Minimum VRAM per GPU (GB).
    pub min_vram_gb: u32,
    /// Minimum compute capability as a (major, minor) pair.
    /// NOTE(review): presumably NVIDIA CUDA compute capability — confirm.
    pub min_compute_capability: Option<(u8, u8)>,
    /// Allow GPU sharing (MPS/MIG).
    pub allow_sharing: bool,
}
/// Job execution status.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum JobStatus {
    /// Queued, waiting for resources.
    Queued,
    /// Resources allocated, starting.
    Starting,
    /// Running.
    Running {
        /// Fractional progress. NOTE(review): range (0..1 vs. 0..100) is not
        /// established in this file — confirm with the producer.
        progress: f32,
        /// Nodes the job is currently placed on.
        assigned_nodes: Vec<NodeId>,
    },
    /// Completed successfully.
    Completed {
        /// Content ID of the result payload.
        result_cid: String,
        /// Wall-clock duration in milliseconds.
        duration_ms: u64,
        /// Final cost in atomic SYNOR.
        cost: u64,
    },
    /// Failed.
    Failed { error: String },
    /// Cancelled by user.
    Cancelled,
}
/// Compute node registration.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ComputeNode {
    /// Node ID.
    pub id: NodeId,
    /// Owner address.
    pub owner: [u8; 32],
    /// Available processors.
    pub processors: Vec<ProcessorInfo>,
    /// Total memory (GB).
    pub total_memory_gb: f32,
    /// Available memory (GB).
    pub available_memory_gb: f32,
    /// Network bandwidth (Gbps).
    pub bandwidth_gbps: f32,
    /// Geographic region.
    pub region: String,
    /// Stake amount (for PoS).
    pub stake: u64,
    /// Reputation score (0-100).
    pub reputation: u32,
    /// Current status.
    pub status: NodeStatus,
}
/// Processor information on a node.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorInfo {
    /// Processor ID (local to node).
    pub id: ProcessorId,
    /// Processor type.
    pub processor_type: ProcessorType,
    /// Capabilities.
    pub capabilities: ProcessorCapabilities,
    /// Current utilization (0.0 - 1.0).
    pub utilization: f32,
    /// Current temperature (Celsius); `None` if the sensor is unavailable.
    pub temperature: Option<f32>,
}
/// Node status.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum NodeStatus {
    /// Online and accepting jobs.
    Online,
    /// Online but not accepting new jobs.
    Draining,
    /// Offline.
    Offline,
    /// Maintenance mode.
    Maintenance,
}
/// Compute cluster manager.
///
/// Top-level facade wiring together the device registry, the heterogeneous
/// scheduler, the spot market, and the memory manager. The registry is shared
/// with the scheduler via `Arc` (see `ComputeCluster::new`).
pub struct ComputeCluster {
    /// Registered nodes.
    nodes: RwLock<HashMap<NodeId, ComputeNode>>,
    /// Device registry.
    device_registry: Arc<DeviceRegistry>,
    /// Heterogeneous scheduler.
    scheduler: Arc<HeterogeneousScheduler>,
    /// Spot market.
    spot_market: Arc<SpotMarket>,
    /// Memory manager.
    memory_manager: Arc<MemoryManager>,
    /// Active jobs.
    jobs: RwLock<HashMap<JobId, ComputeJob>>,
}
impl ComputeCluster {
    /// Creates a new compute cluster with empty registries and a fresh
    /// scheduler, spot market, and memory manager. The device registry is
    /// shared with the scheduler via `Arc`.
    pub fn new() -> Self {
        let device_registry = Arc::new(DeviceRegistry::new());
        let scheduler = Arc::new(HeterogeneousScheduler::new(device_registry.clone()));
        let spot_market = Arc::new(SpotMarket::new());
        let memory_manager = Arc::new(MemoryManager::new());
        Self {
            nodes: RwLock::new(HashMap::new()),
            device_registry,
            scheduler,
            spot_market,
            memory_manager,
            jobs: RwLock::new(HashMap::new()),
        }
    }

    /// Registers a compute node and all of its processors.
    ///
    /// # Errors
    ///
    /// Returns [`ComputeError::NodeAlreadyRegistered`] if a node with the
    /// same ID already exists (previously the node was silently replaced
    /// while the old processor entries leaked in the device registry), and
    /// propagates processor-registration failures after rolling back any
    /// processors that were already added for this node.
    pub fn register_node(&self, node: ComputeNode) -> Result<(), ComputeError> {
        let id = node.id;
        if self.nodes.read().contains_key(&id) {
            return Err(ComputeError::NodeAlreadyRegistered(id));
        }
        // Register processors with device registry.
        for proc in &node.processors {
            if let Err(e) = self.device_registry.register_processor(id, proc.clone()) {
                // Keep the registry consistent: drop any processors added so far.
                let _ = self.device_registry.unregister_node(id);
                return Err(e);
            }
        }
        self.nodes.write().insert(id, node);
        Ok(())
    }

    /// Unregisters a compute node and all of its processors.
    pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
        self.device_registry.unregister_node(node_id)?;
        self.nodes.write().remove(&node_id);
        Ok(())
    }

    /// Submits a job for execution: decomposes it into tasks, schedules
    /// them, stores the job, and spawns schedule execution in the background.
    pub async fn submit_job(&self, job: ComputeJob) -> Result<JobId, ComputeError> {
        let job_id = job.id;
        // Decompose job into tasks
        let tasks = self.decompose_job(&job)?;
        // Schedule tasks
        let schedule = self.scheduler.schedule(tasks).await?;
        // Store job
        self.jobs.write().insert(job_id, job);
        // Execute schedule asynchronously. Execution errors are currently
        // discarded — TODO(review): surface them into the job status.
        tokio::spawn({
            let scheduler = self.scheduler.clone();
            async move {
                let _ = scheduler.execute(&schedule.schedule).await;
            }
        });
        Ok(job_id)
    }

    /// Gets job status.
    ///
    /// NOTE: placeholder — any stored job reports `Queued`; real progress
    /// tracking is not wired up yet.
    pub fn get_job_status(&self, job_id: &JobId) -> Option<JobStatus> {
        self.jobs.read().get(job_id).map(|_| JobStatus::Queued)
    }

    /// Cancels a job, removing it from the active set.
    ///
    /// # Errors
    ///
    /// Returns [`ComputeError::JobNotFound`] if the job is not stored.
    pub fn cancel_job(&self, job_id: &JobId) -> Result<(), ComputeError> {
        if self.jobs.write().remove(job_id).is_some() {
            Ok(())
        } else {
            Err(ComputeError::JobNotFound(*job_id))
        }
    }

    /// Gets cluster statistics (node, GPU, memory, and job counts).
    pub fn stats(&self) -> ClusterStats {
        let nodes = self.nodes.read();
        let jobs = self.jobs.read();
        let total_nodes = nodes.len();
        let online_nodes = nodes.values().filter(|n| n.status == NodeStatus::Online).count();
        let total_gpus: usize = nodes
            .values()
            .flat_map(|n| &n.processors)
            .filter(|p| matches!(p.processor_type, ProcessorType::Gpu(_)))
            .count();
        let total_memory: f32 = nodes.values().map(|n| n.total_memory_gb).sum();
        ClusterStats {
            total_nodes,
            online_nodes,
            total_gpus,
            total_memory_gb: total_memory,
            active_jobs: jobs.len(),
            // No per-job state is tracked yet, so every stored job counts as
            // queued (previously written as `filter(|_| true).count()`).
            queued_jobs: jobs.len(),
        }
    }

    /// Decomposes a job into schedulable tasks.
    fn decompose_job(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        let decomposer = TaskDecomposer::new();
        decomposer.decompose(job)
    }
}
impl Default for ComputeCluster {
fn default() -> Self {
Self::new()
}
}
/// Cluster statistics.
///
/// Point-in-time snapshot produced by `ComputeCluster::stats`.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ClusterStats {
    /// Total registered nodes.
    pub total_nodes: usize,
    /// Online nodes.
    pub online_nodes: usize,
    /// Total GPUs across cluster.
    pub total_gpus: usize,
    /// Total memory (GB).
    pub total_memory_gb: f32,
    /// Active jobs.
    pub active_jobs: usize,
    /// Queued jobs.
    pub queued_jobs: usize,
}
/// Pricing calculator for compute operations.
///
/// All prices are in atomic SYNOR (the in-file defaults imply
/// 1 SYNOR = 1_000_000_000 atomic units).
#[derive(Clone, Debug)]
pub struct ComputePricing {
    /// GPU cost per hour by type.
    pub gpu_hourly: HashMap<GpuTier, u64>,
    /// CPU cost per core-hour.
    pub cpu_core_hour: u64,
    /// Memory cost per GB-hour.
    pub memory_gb_hour: u64,
    /// Network egress per GB.
    pub network_egress_gb: u64,
    /// Inference per million tokens.
    pub inference_per_million_tokens: u64,
}
/// GPU pricing tiers.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum GpuTier {
    /// Consumer GPUs (RTX 30xx, 40xx).
    Consumer,
    /// Professional GPUs (RTX A series).
    Professional,
    /// Data center GPUs (A100).
    DataCenter,
    /// Latest generation (H100).
    Premium,
}
impl Default for ComputePricing {
    /// Default price book in atomic SYNOR (0.10 SYNOR = 100_000_000).
    fn default() -> Self {
        let gpu_hourly = HashMap::from([
            (GpuTier::Consumer, 100_000_000),     // 0.10 SYNOR
            (GpuTier::Professional, 300_000_000), // 0.30 SYNOR
            (GpuTier::DataCenter, 2_000_000_000), // 2.00 SYNOR
            (GpuTier::Premium, 4_000_000_000),    // 4.00 SYNOR
        ]);
        Self {
            gpu_hourly,
            cpu_core_hour: 20_000_000,                 // 0.02 SYNOR
            memory_gb_hour: 5_000_000,                 // 0.005 SYNOR
            network_egress_gb: 50_000_000,             // 0.05 SYNOR
            inference_per_million_tokens: 100_000_000, // 0.10 SYNOR
        }
    }
}
impl ComputePricing {
    /// Estimates the total cost (atomic SYNOR) of running `job` for
    /// `duration_hours`.
    ///
    /// Arithmetic is done in `f64`: the previous `f32` intermediates have a
    /// 24-bit mantissa (exact only up to ~16.7M), which silently corrupted
    /// products of the atomic price constants (20M–4B range).
    pub fn estimate(&self, job: &ComputeJob, duration_hours: f32) -> u64 {
        let hours = f64::from(duration_hours);
        let mut cost = 0u64;
        // CPU cost: per core-hour, fractional cores allowed.
        cost += (self.cpu_core_hour as f64 * f64::from(job.resources.min_cpu_cores) * hours) as u64;
        // Memory cost: per GB-hour.
        cost += (self.memory_gb_hour as f64 * f64::from(job.resources.min_memory_gb) * hours) as u64;
        // GPU cost. Tier selection is still a placeholder: everything is
        // billed at the Consumer rate — TODO(review): derive the tier from
        // the GPU requirements.
        if let Some(gpu) = &job.resources.gpu {
            let rate = self
                .gpu_hourly
                .get(&GpuTier::Consumer)
                .copied()
                .unwrap_or(100_000_000);
            cost += (rate as f64 * f64::from(gpu.min_count) * hours) as u64;
        }
        cost
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Two freshly generated random job IDs should (with overwhelming
    // probability) differ.
    #[test]
    fn test_job_id() {
        let id1 = JobId::new();
        let id2 = JobId::new();
        assert_ne!(id1.0, id2.0);
    }

    // A new cluster starts empty.
    #[test]
    fn test_compute_cluster() {
        let cluster = ComputeCluster::new();
        let stats = cluster.stats();
        assert_eq!(stats.total_nodes, 0);
    }

    // An inference job with CPU + memory + one GPU must price above zero.
    #[test]
    fn test_pricing() {
        let pricing = ComputePricing::default();
        let job = ComputeJob {
            id: JobId::new(),
            owner: [0u8; 32],
            job_type: JobType::Inference {
                model_cid: "model123".to_string(),
                input_format: "json".to_string(),
                batch_size: 32,
            },
            resources: ResourceRequirements {
                min_cpu_cores: 4.0,
                min_memory_gb: 16.0,
                gpu: Some(GpuRequirements {
                    min_count: 1,
                    max_count: 1,
                    min_vram_gb: 16,
                    min_compute_capability: None,
                    allow_sharing: false,
                }),
                ..Default::default()
            },
            input_cid: None,
            max_budget: 1_000_000_000,
            priority: JobPriority::Normal,
            created_at: 0,
            deadline: None,
        };
        let cost = pricing.estimate(&job, 1.0);
        assert!(cost > 0);
    }

    // Registering a node makes it visible in cluster stats.
    #[test]
    fn test_node_registration() {
        let cluster = ComputeCluster::new();
        let node = ComputeNode {
            id: NodeId(1),
            owner: [1u8; 32],
            processors: vec![ProcessorInfo {
                id: ProcessorId(0),
                processor_type: ProcessorType::Cpu(CpuVariant::X86_64 {
                    avx: processor::AvxSupport::Avx512,
                }),
                capabilities: ProcessorCapabilities::default(),
                utilization: 0.0,
                temperature: Some(45.0),
            }],
            total_memory_gb: 64.0,
            available_memory_gb: 60.0,
            bandwidth_gbps: 10.0,
            region: "us-east".to_string(),
            stake: 1000,
            reputation: 100,
            status: NodeStatus::Online,
        };
        cluster.register_node(node).unwrap();
        assert_eq!(cluster.stats().total_nodes, 1);
    }
}

File diff suppressed because it is too large Load diff

View file

@@ -0,0 +1,370 @@
//! Unified memory management for heterogeneous compute.
use crate::error::ComputeError;
use crate::processor::ProcessorType;
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
/// Tensor handle for memory management.
///
/// Pure metadata: shape, element type, byte size, and the processor types
/// currently holding a copy. No buffer pointer lives here.
#[derive(Clone, Debug)]
pub struct TensorHandle {
    /// Unique ID.
    pub id: TensorId,
    /// Shape.
    pub shape: Vec<usize>,
    /// Data type.
    pub dtype: DataType,
    /// Size in bytes.
    pub size_bytes: u64,
    /// Current locations.
    pub locations: Vec<ProcessorType>,
}

impl TensorHandle {
    /// Builds a handle for a tensor of the given shape and element type,
    /// assigning a fresh random ID and no initial placement.
    pub fn new(shape: Vec<usize>, dtype: DataType) -> Self {
        let elems = shape.iter().product::<usize>() as u64;
        let size_bytes = elems * dtype.size_bytes() as u64;
        Self {
            id: TensorId::new(),
            shape,
            dtype,
            size_bytes,
            locations: Vec::new(),
        }
    }

    /// Total number of elements (product of the shape; 1 for a scalar).
    pub fn numel(&self) -> usize {
        self.shape.iter().copied().product()
    }
}
/// Tensor identifier (random 64-bit value).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TensorId(pub u64);

impl TensorId {
    /// Draws a fresh random tensor ID.
    pub fn new() -> Self {
        use rand::Rng;
        let raw: u64 = rand::thread_rng().gen();
        Self(raw)
    }
}

impl Default for TensorId {
    fn default() -> Self {
        TensorId::new()
    }
}
/// Element types supported for tensors.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DataType {
    /// 64-bit IEEE float.
    Float64,
    /// 32-bit IEEE float.
    Float32,
    /// 16-bit IEEE float.
    Float16,
    /// 16-bit brain float.
    BFloat16,
    /// 64-bit signed integer.
    Int64,
    /// 32-bit signed integer.
    Int32,
    /// 16-bit signed integer.
    Int16,
    /// 8-bit signed integer.
    Int8,
    /// 8-bit unsigned integer.
    UInt8,
    /// Boolean (stored as one byte).
    Bool,
}

impl DataType {
    /// Width of a single element in bytes.
    pub fn size_bytes(&self) -> usize {
        use DataType::*;
        match self {
            Float64 | Int64 => 8,
            Float32 | Int32 => 4,
            Float16 | BFloat16 | Int16 => 2,
            Int8 | UInt8 | Bool => 1,
        }
    }
}
/// Physical route used to move tensor data between two processors.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum TransferPath {
    /// Direct GPU-to-GPU via NVLink.
    NvLink,
    /// Direct GPU-to-GPU via PCIe P2P.
    PciePeerToPeer,
    /// Staged through host (CPU) memory.
    CpuMediated,
    /// Shared unified memory (Apple Silicon).
    UnifiedMemory,
    /// Over the network.
    Network,
    /// Source and destination share one memory space; no copy needed.
    SameMemory,
}

impl TransferPath {
    /// Rough sustained bandwidth of this path in GB/s (heuristic constants).
    pub fn bandwidth_gbps(&self) -> f64 {
        match self {
            Self::SameMemory => f64::INFINITY,
            Self::NvLink => 900.0,        // NVLink 4.0
            Self::UnifiedMemory => 400.0, // Apple unified
            Self::PciePeerToPeer => 64.0, // PCIe 5.0 x16
            Self::CpuMediated => 50.0,    // DDR5
            Self::Network => 10.0,        // 100Gbps network
        }
    }

    /// Predicts how long moving `bytes` over this path takes.
    pub fn estimate_transfer_time(&self, bytes: u64) -> std::time::Duration {
        if *self == TransferPath::SameMemory {
            // Shared memory space: no copy at all.
            std::time::Duration::ZERO
        } else {
            // bandwidth_gbps is GB/s; scale to bytes/s before dividing.
            let bytes_per_sec = self.bandwidth_gbps() * 1e9;
            std::time::Duration::from_secs_f64(bytes as f64 / bytes_per_sec)
        }
    }
}
/// Unified memory manager.
///
/// Tracks tensor handles and per-processor-type memory accounting. `limits`
/// is plain (unlocked) because `set_limit` takes `&mut self`, so limits can
/// only change while no shared references exist.
pub struct MemoryManager {
    /// Allocated tensors.
    tensors: RwLock<HashMap<TensorId, TensorHandle>>,
    /// Memory usage per processor type.
    usage: RwLock<HashMap<ProcessorType, u64>>,
    /// Memory limits per processor type.
    limits: HashMap<ProcessorType, u64>,
}
impl MemoryManager {
    /// Creates a new memory manager with no tensors, no recorded usage,
    /// and no per-processor limits (i.e. effectively unlimited).
    pub fn new() -> Self {
        Self {
            tensors: RwLock::new(HashMap::new()),
            usage: RwLock::new(HashMap::new()),
            limits: HashMap::new(),
        }
    }
    /// Sets memory limit for a processor type.
    ///
    /// Only consulted by [`MemoryManager::available`]; nothing currently
    /// rejects placements that exceed the limit.
    pub fn set_limit(&mut self, proc_type: ProcessorType, limit_bytes: u64) {
        self.limits.insert(proc_type, limit_bytes);
    }
    /// Allocates a tensor.
    ///
    /// This only registers a logical handle — no memory accounting happens
    /// until the tensor is placed on a processor via `ensure_on`.
    pub fn allocate(&self, shape: Vec<usize>, dtype: DataType) -> Result<TensorHandle, ComputeError> {
        let handle = TensorHandle::new(shape, dtype);
        self.tensors.write().insert(handle.id, handle.clone());
        Ok(handle)
    }
    /// Frees a tensor.
    ///
    /// Subtracts the tensor's size from every processor type it was placed
    /// on. Freeing an unknown ID is a silent no-op (idempotent free).
    pub fn free(&self, tensor_id: TensorId) -> Result<(), ComputeError> {
        if let Some(handle) = self.tensors.write().remove(&tensor_id) {
            // Update usage for all locations.
            // Lock order: tensors, then usage (matches `ensure_on`).
            let mut usage = self.usage.write();
            for loc in &handle.locations {
                if let Some(u) = usage.get_mut(loc) {
                    *u = u.saturating_sub(handle.size_bytes);
                }
            }
        }
        Ok(())
    }
    /// Gets a tensor handle (a clone of the stored one).
    pub fn get(&self, tensor_id: TensorId) -> Option<TensorHandle> {
        self.tensors.read().get(&tensor_id).cloned()
    }
    /// Ensures tensor is on specified processor.
    ///
    /// Returns the transfer path needed to get it there. On success the
    /// tensor's location list gains `target` and `usage` for `target` grows
    /// by the tensor's size; prior locations are kept, so this models
    /// replication rather than migration.
    pub fn ensure_on(
        &self,
        tensor_id: TensorId,
        target: ProcessorType,
    ) -> Result<TransferPath, ComputeError> {
        // Hold the tensors write lock across the whole placement decision so
        // a concurrent `free` cannot race between lookup and update.
        let mut tensors = self.tensors.write();
        if let Some(handle) = tensors.get_mut(&tensor_id) {
            // Check if already on target
            if handle.locations.contains(&target) {
                return Ok(TransferPath::SameMemory);
            }
            // Determine transfer path
            let path = if handle.locations.is_empty() {
                // New tensor, allocate on target
                TransferPath::SameMemory
            } else {
                // Find best transfer path from existing location
                // NOTE(review): only the *first* recorded location is tried
                // as a source; a faster path from a later replica is missed.
                self.find_best_path(&handle.locations[0], &target)
            };
            // Record new location
            handle.locations.push(target.clone());
            // Update usage (lock order: tensors, then usage, as in `free`).
            // NOTE(review): the per-type limit is not checked here — confirm
            // whether over-limit placement should fail instead.
            let mut usage = self.usage.write();
            *usage.entry(target).or_default() += handle.size_bytes;
            Ok(path)
        } else {
            Err(ComputeError::Internal("Tensor not found".to_string()))
        }
    }
    /// Finds best transfer path between processors.
    fn find_best_path(&self, from: &ProcessorType, to: &ProcessorType) -> TransferPath {
        // Check for unified memory (Apple Silicon)
        if self.shares_memory(from, to) {
            return TransferPath::UnifiedMemory;
        }
        // Check for NVLink between NVIDIA GPUs
        // (assumes NVLink exists whenever both ends are NVIDIA — TODO confirm)
        if matches!(from, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. }))
            && matches!(to, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. }))
        {
            return TransferPath::NvLink;
        }
        // Check for PCIe P2P between GPUs
        if from.is_gpu() && to.is_gpu() {
            return TransferPath::PciePeerToPeer;
        }
        // Default to CPU-mediated transfer
        TransferPath::CpuMediated
    }
    /// Checks if two processor types share memory.
    fn shares_memory(&self, a: &ProcessorType, b: &ProcessorType) -> bool {
        use crate::processor::{CpuVariant, GpuVariant, NpuVariant};
        match (a, b) {
            // Apple Silicon unified memory: ARM CPU, Metal GPU, and the
            // Neural Engine all sit on one pool.
            (ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal))
            | (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
            | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
            | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true,
            // Same type
            _ if a == b => true,
            _ => false,
        }
    }
    /// Gets current memory usage for a processor type.
    pub fn usage(&self, proc_type: ProcessorType) -> u64 {
        self.usage.read().get(&proc_type).copied().unwrap_or(0)
    }
    /// Gets available memory for a processor type.
    ///
    /// With no explicit limit this is effectively unbounded (`u64::MAX`).
    pub fn available(&self, proc_type: ProcessorType) -> u64 {
        let limit = self.limits.get(&proc_type).copied().unwrap_or(u64::MAX);
        let used = self.usage(proc_type);
        limit.saturating_sub(used)
    }
    /// Gets total allocated tensors.
    pub fn tensor_count(&self) -> usize {
        self.tensors.read().len()
    }
}
impl Default for MemoryManager {
fn default() -> Self {
Self::new()
}
}
/// Unified memory abstraction for zero-copy sharing.
pub struct UnifiedMemory {
    /// Base pointer (in unified address space).
    /// Currently always 0 — a placeholder until real mapping is implemented
    /// (see `UnifiedMemory::new`).
    pub base: u64,
    /// Size in bytes.
    pub size: u64,
    /// Accessible from these processor types.
    pub accessible_from: Vec<ProcessorType>,
}
impl UnifiedMemory {
/// Creates new unified memory region.
pub fn new(size: u64) -> Self {
Self {
base: 0, // Would be actual pointer in real implementation
size,
accessible_from: Vec::new(),
}
}
/// Checks if accessible from processor type.
pub fn is_accessible_from(&self, proc_type: &ProcessorType) -> bool {
self.accessible_from.contains(proc_type)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_tensor_handle() {
        // 1M elements of 4-byte f32.
        let t = TensorHandle::new(vec![1024, 1024], DataType::Float32);
        assert_eq!(t.numel(), 1024 * 1024);
        assert_eq!(t.size_bytes, 1024 * 1024 * 4);
    }
    #[test]
    fn test_data_type_sizes() {
        let expected = [
            (DataType::Float64, 8),
            (DataType::Float32, 4),
            (DataType::Float16, 2),
            (DataType::Int8, 1),
        ];
        for (dtype, size) in expected {
            assert_eq!(dtype.size_bytes(), size);
        }
    }
    #[test]
    fn test_transfer_path_bandwidth() {
        let nvlink = TransferPath::NvLink.bandwidth_gbps();
        let pcie = TransferPath::PciePeerToPeer.bandwidth_gbps();
        assert!(nvlink > pcie);
        assert!(TransferPath::SameMemory.bandwidth_gbps().is_infinite());
    }
    #[test]
    fn test_memory_manager() {
        let mgr = MemoryManager::new();
        let t = mgr.allocate(vec![1024, 1024], DataType::Float32).unwrap();
        assert_eq!(mgr.tensor_count(), 1);
        mgr.free(t.id).unwrap();
        assert_eq!(mgr.tensor_count(), 0);
    }
    #[test]
    fn test_ensure_on() {
        let mgr = MemoryManager::new();
        let t = mgr.allocate(vec![1024], DataType::Float32).unwrap();
        let gpu = || {
            ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda {
                compute_capability: (8, 0),
            })
        };
        // Initial placement allocates in place...
        assert_eq!(mgr.ensure_on(t.id, gpu()).unwrap(), TransferPath::SameMemory);
        // ...and a repeat request for the same location is already satisfied.
        assert_eq!(mgr.ensure_on(t.id, gpu()).unwrap(), TransferPath::SameMemory);
    }
}

View file

@ -0,0 +1,547 @@
//! Processor capability definitions.
use super::operation::OperationType;
use super::types::PowerTier;
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
/// Detailed processor capabilities.
///
/// Built via the constructor helpers (`cpu`, `nvidia_gpu`, `tpu`, `lpu`,
/// `apple_neural_engine`) or `Default`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorCapabilities {
    /// Compute throughput.
    pub compute: ComputeThroughput,
    /// Memory specifications.
    pub memory: MemorySpecs,
    /// Supported operations (consulted by schedulers via set membership).
    pub operations: HashSet<OperationType>,
    /// Power characteristics.
    pub power: PowerCharacteristics,
    /// Optimal workload characteristics.
    pub optimal_for: Vec<WorkloadCharacteristic>,
}
impl Default for ProcessorCapabilities {
fn default() -> Self {
Self {
compute: ComputeThroughput::default(),
memory: MemorySpecs::default(),
operations: Self::default_operations(),
power: PowerCharacteristics::default(),
optimal_for: vec![],
}
}
}
impl ProcessorCapabilities {
    /// Default operations supported by most processors.
    fn default_operations() -> HashSet<OperationType> {
        [
            OperationType::MatMul,
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::Softmax,
            OperationType::DataLoad,
            OperationType::DataPreprocess,
        ]
        .into_iter()
        .collect()
    }
    /// Creates CPU capabilities.
    ///
    /// FP32 throughput is derived as `cores * clock * FLOPs-per-cycle / 1000`
    /// (GFLOPS -> TFLOPS); other precisions are scaled from that.
    /// NOTE(review): 64/32 FLOPs per cycle assumes dual FMA pipes at full
    /// vector width on every core — optimistic for many parts; confirm.
    pub fn cpu(cores: u32, clock_ghz: f32, avx512: bool) -> Self {
        let flops_per_cycle = if avx512 { 64.0 } else { 32.0 }; // FP32 ops per cycle with AVX
        let fp32_tflops = (cores as f64 * clock_ghz as f64 * flops_per_cycle) / 1000.0;
        Self {
            compute: ComputeThroughput {
                fp64_tflops: fp32_tflops / 2.0,
                fp32_tflops,
                fp16_tflops: fp32_tflops * 2.0,
                bf16_tflops: fp32_tflops * 2.0,
                int8_tops: fp32_tflops * 4.0,
                int4_tops: fp32_tflops * 8.0,
                sparsity_speedup: 1.0, // no structured-sparsity acceleration
            },
            memory: MemorySpecs {
                capacity_bytes: 64 * 1024 * 1024 * 1024, // 64 GB typical
                bandwidth_gbps: 200, // DDR5
                type_: MemoryType::Ddr5,
            },
            operations: Self::cpu_operations(),
            power: PowerCharacteristics {
                tdp_watts: 125,
                efficiency: 0.8,
                power_tier: PowerTier::Medium,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::MemoryBound,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
    /// Operations typically supported by CPUs.
    fn cpu_operations() -> HashSet<OperationType> {
        [
            // Matrix operations (slow but supported)
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            // Element-wise
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::Softmax,
            // Data operations (optimal)
            OperationType::DataLoad,
            OperationType::DataPreprocess,
            OperationType::Tokenization,
            OperationType::Detokenization,
            // Memory operations
            OperationType::Transpose,
            OperationType::Reshape,
            OperationType::Concat,
            OperationType::Split,
            // I/O
            OperationType::Checkpoint,
        ]
        .into_iter()
        .collect()
    }
    /// Creates NVIDIA GPU capabilities.
    ///
    /// NOTE(review): the `tensor_cores` parameter is currently unused — the
    /// tensor-core contribution is approximated via `tensor_multiplier`
    /// keyed off the compute capability. Kept for API stability; confirm
    /// whether it should feed the throughput model.
    pub fn nvidia_gpu(
        cuda_cores: u32,
        tensor_cores: u32,
        vram_gb: u32,
        bandwidth_gbps: u32,
        compute_capability: (u8, u8),
    ) -> Self {
        // Approximate TFLOPS based on cores and typical clocks
        let base_clock_ghz = 1.5;
        let fp32_tflops = (cuda_cores as f64 * base_clock_ghz * 2.0) / 1000.0;
        // Ampere (SM 8.x) and newer get a larger reduced-precision boost.
        let tensor_multiplier = if compute_capability.0 >= 8 { 4.0 } else { 2.0 };
        Self {
            compute: ComputeThroughput {
                fp64_tflops: fp32_tflops / 2.0,
                fp32_tflops,
                fp16_tflops: fp32_tflops * tensor_multiplier,
                bf16_tflops: fp32_tflops * tensor_multiplier,
                int8_tops: fp32_tflops * tensor_multiplier * 2.0,
                int4_tops: fp32_tflops * tensor_multiplier * 4.0,
                sparsity_speedup: if compute_capability.0 >= 8 { 2.0 } else { 1.0 },
            },
            memory: MemorySpecs {
                capacity_bytes: vram_gb as u64 * 1024 * 1024 * 1024,
                bandwidth_gbps,
                type_: if compute_capability.0 >= 9 {
                    MemoryType::Hbm3
                } else {
                    MemoryType::Hbm2e
                },
            },
            operations: Self::gpu_operations(compute_capability),
            power: PowerCharacteristics {
                tdp_watts: if compute_capability.0 >= 9 { 700 } else { 350 },
                efficiency: 0.9,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::ComputeBound,
            ],
        }
    }
    /// Operations supported by GPUs.
    fn gpu_operations(compute_capability: (u8, u8)) -> HashSet<OperationType> {
        let mut ops: HashSet<OperationType> = [
            // Matrix operations (optimal)
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::Conv3d,
            OperationType::DepthwiseConv,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            // Attention
            OperationType::SelfAttention,
            OperationType::CrossAttention,
            // Element-wise
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::SiLU,
            OperationType::Softmax,
            // Reduction
            OperationType::Sum,
            OperationType::Mean,
            OperationType::Max,
            OperationType::ArgMax,
            // Memory operations
            OperationType::Transpose,
            OperationType::Reshape,
            OperationType::Concat,
            OperationType::Split,
            OperationType::Gather,
            OperationType::Scatter,
            // LLM specific
            OperationType::Embedding,
            OperationType::RoPE,
            OperationType::KVCache,
            OperationType::TopK,
            OperationType::Sampling,
        ]
        .into_iter()
        .collect();
        // FlashAttention for newer GPUs (Ampere / SM 8.x and later)
        if compute_capability.0 >= 8 {
            ops.insert(OperationType::FlashAttention);
        }
        ops
    }
    /// Creates TPU capabilities.
    ///
    /// Figures are per-chip vendor-published numbers per TPU generation.
    pub fn tpu(version: super::TpuVersion) -> Self {
        let (bf16_tflops, memory_gb, bandwidth_gbps) = match version {
            super::TpuVersion::V5p => (918.0, 95, 4800),
            super::TpuVersion::V5e => (197.0, 16, 1600),
            super::TpuVersion::V4 => (275.0, 32, 2400),
            super::TpuVersion::V4i => (138.0, 32, 1200),
            super::TpuVersion::V3 => (123.0, 16, 900),
            super::TpuVersion::V2 => (46.0, 8, 600),
            super::TpuVersion::Edge => (4.0, 0, 0), // Uses host memory
        };
        Self {
            compute: ComputeThroughput {
                fp64_tflops: 0.0, // TPUs don't support FP64
                fp32_tflops: bf16_tflops / 2.0,
                fp16_tflops: bf16_tflops,
                bf16_tflops,
                int8_tops: bf16_tflops * 2.0,
                int4_tops: bf16_tflops * 4.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: memory_gb as u64 * 1024 * 1024 * 1024,
                bandwidth_gbps,
                type_: MemoryType::Hbm2e,
            },
            operations: Self::tpu_operations(),
            power: PowerCharacteristics {
                tdp_watts: if matches!(version, super::TpuVersion::Edge) {
                    2
                } else {
                    400
                },
                efficiency: 0.95,
                power_tier: if matches!(version, super::TpuVersion::Edge) {
                    PowerTier::UltraLow
                } else {
                    PowerTier::High
                },
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::ComputeBound,
                WorkloadCharacteristic::FixedShape,
                WorkloadCharacteristic::LargeBatch,
            ],
        }
    }
    /// Operations supported by TPUs.
    ///
    /// Deliberately excludes host-side I/O ops (`DataLoad`, `Tokenization`,
    /// `Checkpoint`, ...) — the test suite relies on that.
    fn tpu_operations() -> HashSet<OperationType> {
        [
            // Matrix operations (optimal)
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            // Attention
            OperationType::SelfAttention,
            OperationType::CrossAttention,
            OperationType::FlashAttention,
            // Element-wise
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::SiLU,
            OperationType::Softmax,
            // Reduction
            OperationType::Sum,
            OperationType::Mean,
            OperationType::Max,
            // LLM specific
            OperationType::Embedding,
            OperationType::RoPE,
            OperationType::KVCache,
        ]
        .into_iter()
        .collect()
    }
    /// Creates LPU (Groq) capabilities.
    ///
    /// NOTE(review): per-chip Groq SRAM is ~230 MB; 230 GB here looks like a
    /// multi-chip-rack figure or a units slip — confirm the intended scale.
    pub fn lpu() -> Self {
        Self {
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: 0.0,
                fp16_tflops: 188.0,
                bf16_tflops: 188.0,
                int8_tops: 750.0,
                int4_tops: 1500.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 230 * 1024 * 1024 * 1024, // 230 GB SRAM!
                bandwidth_gbps: 80_000, // 80 TB/s internal
                type_: MemoryType::Sram,
            },
            operations: Self::lpu_operations(),
            power: PowerCharacteristics {
                tdp_watts: 300,
                efficiency: 0.98, // Very efficient for inference
                power_tier: PowerTier::Medium,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::SmallBatch,
                WorkloadCharacteristic::VariableLength,
                WorkloadCharacteristic::LowLatency,
            ],
        }
    }
    /// Operations supported by Groq LPU.
    fn lpu_operations() -> HashSet<OperationType> {
        [
            // Optimized for inference
            OperationType::MatMul,
            OperationType::LayerNorm,
            OperationType::SelfAttention,
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::SiLU,
            OperationType::Softmax,
            OperationType::Embedding,
            OperationType::RoPE,
            OperationType::KVCache,
            OperationType::TopK,
            OperationType::Sampling,
        ]
        .into_iter()
        .collect()
    }
    /// Creates Apple Neural Engine capabilities.
    ///
    /// Known core counts map to published TOPS; other counts are
    /// extrapolated linearly at ~1.1 TOPS/core.
    pub fn apple_neural_engine(cores: u32) -> Self {
        let int8_tops = match cores {
            16 => 18.0, // M3
            32 => 35.0, // M3 Max
            _ => cores as f64 * 1.1,
        };
        Self {
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: int8_tops / 4.0,
                fp16_tflops: int8_tops / 2.0,
                bf16_tflops: int8_tops / 2.0,
                int8_tops,
                int4_tops: int8_tops * 2.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses unified memory
                bandwidth_gbps: 400,
                type_: MemoryType::Unified,
            },
            operations: Self::npu_operations(),
            power: PowerCharacteristics {
                tdp_watts: 15,
                efficiency: 0.95,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
    /// Operations supported by NPUs.
    fn npu_operations() -> HashSet<OperationType> {
        [
            // Inference optimized
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::DepthwiseConv,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::Softmax,
            OperationType::Embedding,
        ]
        .into_iter()
        .collect()
    }
}
/// Compute throughput metrics.
///
/// All figures are peak theoretical rates; `Default` is all-zero.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ComputeThroughput {
    /// FP64 TFLOPS (0.0 when the precision is unsupported, e.g. TPU/LPU).
    pub fp64_tflops: f64,
    /// FP32 TFLOPS.
    pub fp32_tflops: f64,
    /// FP16 TFLOPS.
    pub fp16_tflops: f64,
    /// BF16 TFLOPS.
    pub bf16_tflops: f64,
    /// INT8 TOPS.
    pub int8_tops: f64,
    /// INT4 TOPS.
    pub int4_tops: f64,
    /// Speedup for sparse operations (1.0 = no benefit).
    pub sparsity_speedup: f64,
}
/// Memory specifications.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MemorySpecs {
    /// Total capacity (bytes). 0 means the device uses host/unified memory.
    pub capacity_bytes: u64,
    /// Bandwidth (GB/s).
    /// NOTE(review): despite the "gbps" suffix, the values used throughout
    /// this crate are giga*bytes*/s — consider renaming.
    pub bandwidth_gbps: u32,
    /// Memory type.
    pub type_: MemoryType,
}
impl Default for MemorySpecs {
fn default() -> Self {
Self {
capacity_bytes: 16 * 1024 * 1024 * 1024, // 16 GB
bandwidth_gbps: 500,
type_: MemoryType::Ddr5,
}
}
}
/// Memory types.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum MemoryType {
    /// DDR4 system RAM.
    Ddr4,
    /// DDR5 system RAM.
    Ddr5,
    /// GDDR6/6X video memory.
    Gddr6,
    /// HBM2 (high-bandwidth memory).
    Hbm2,
    /// HBM2e.
    Hbm2e,
    /// HBM3.
    Hbm3,
    /// SRAM (on-chip, e.g. Groq LPU).
    Sram,
    /// Unified memory (Apple Silicon).
    Unified,
    /// LPDDR (mobile).
    Lpddr,
}
/// Power characteristics.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PowerCharacteristics {
    /// TDP in watts.
    pub tdp_watts: u32,
    /// Efficiency factor (0.0 - 1.0).
    /// NOTE(review): energy estimates multiply TDP x time by this factor
    /// (see `GenericProcessor::estimate_energy`), i.e. it acts as the
    /// fraction of TDP actually drawn — confirm that interpretation.
    pub efficiency: f64,
    /// Power tier.
    pub power_tier: PowerTier,
}
impl Default for PowerCharacteristics {
fn default() -> Self {
Self {
tdp_watts: 100,
efficiency: 0.8,
power_tier: PowerTier::Medium,
}
}
}
/// Workload characteristics for processor matching.
///
/// Each processor profile lists the characteristics it is best suited for
/// (parenthesized examples below follow the constructor helpers).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum WorkloadCharacteristic {
    /// High parallelism (GPU, TPU).
    HighlyParallel,
    /// Sequential dependencies (CPU, LPU).
    Sequential,
    /// Memory bandwidth bound (GPU).
    MemoryBound,
    /// Compute bound (TPU).
    ComputeBound,
    /// Low latency required (NPU, edge).
    LowLatency,
    /// Low power required (NPU, mobile).
    LowPower,
    /// Large batch sizes (GPU, TPU).
    LargeBatch,
    /// Small batch sizes (CPU, LPU).
    SmallBatch,
    /// Variable length sequences (LPU).
    VariableLength,
    /// Fixed tensor shapes (TPU).
    FixedShape,
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_cpu_capabilities() {
        let cpu = ProcessorCapabilities::cpu(32, 3.5, true);
        assert!(cpu.compute.fp32_tflops > 0.0);
        // CPUs handle the data-pipeline operations.
        for op in [OperationType::DataLoad, OperationType::Tokenization] {
            assert!(cpu.operations.contains(&op));
        }
    }
    #[test]
    fn test_gpu_capabilities() {
        let gpu = ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1000, (8, 9));
        // Tensor cores make reduced precision faster than FP32.
        assert!(gpu.compute.fp16_tflops > gpu.compute.fp32_tflops);
        assert!(gpu.operations.contains(&OperationType::FlashAttention));
    }
    #[test]
    fn test_tpu_capabilities() {
        let tpu = ProcessorCapabilities::tpu(super::super::TpuVersion::V5p);
        assert!(tpu.compute.bf16_tflops > 900.0);
        // TPUs don't do host-side I/O.
        assert!(!tpu.operations.contains(&OperationType::DataLoad));
    }
    #[test]
    fn test_lpu_capabilities() {
        let lpu = ProcessorCapabilities::lpu();
        // On-chip SRAM gives the LPU very high internal bandwidth.
        assert!(lpu.memory.bandwidth_gbps > 10000);
        assert!(lpu.optimal_for.contains(&WorkloadCharacteristic::Sequential));
    }
}

View file

@ -0,0 +1,339 @@
//! Processor abstractions for heterogeneous compute.
//!
//! Supports all processor types:
//! - CPU (x86_64, ARM64, RISC-V)
//! - GPU (NVIDIA CUDA, AMD ROCm, Intel OneAPI, Apple Metal)
//! - TPU (Google TPU v2-v5)
//! - NPU (Apple Neural Engine, Qualcomm Hexagon, Intel VPU)
//! - LPU (Groq Language Processing Unit)
//! - FPGA (Xilinx, Intel/Altera)
//! - DSP (Digital Signal Processors)
//! - Custom accelerators
mod capabilities;
mod operation;
mod profiles;
mod types;
pub use capabilities::{ComputeThroughput, MemorySpecs, MemoryType, ProcessorCapabilities};
pub use operation::{Operation, OperationType};
pub use profiles::ProcessorProfiles;
pub use types::*;
use crate::error::ComputeError;
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::time::Duration;
/// Unique processor identifier (within a node).
///
/// IDs are only meaningful on the node that assigned them; they are not
/// globally unique across the cluster.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ProcessorId(pub u64);
impl std::fmt::Display for ProcessorId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "proc_{}", self.0)
}
}
/// Unified abstraction for any processor type.
///
/// Implementors provide identity, capability introspection, cost estimation
/// (time/energy) and asynchronous execution.
#[async_trait]
pub trait Processor: Send + Sync {
    /// Get processor ID.
    fn id(&self) -> ProcessorId;
    /// Get processor type.
    fn processor_type(&self) -> ProcessorType;
    /// Get capabilities.
    fn capabilities(&self) -> &ProcessorCapabilities;
    /// Check if processor can execute operation.
    fn can_execute(&self, op: &Operation) -> bool;
    /// Estimate execution time for operation.
    fn estimate_time(&self, op: &Operation) -> Duration;
    /// Estimate energy consumption for operation (Joules).
    fn estimate_energy(&self, op: &Operation) -> f64;
    /// Execute operation.
    async fn execute(&self, op: Operation) -> Result<OperationResult, ComputeError>;
    /// Current utilization (0.0 - 1.0).
    fn utilization(&self) -> f64;
    /// Available memory (bytes).
    fn available_memory(&self) -> u64;
    /// Check if this processor shares memory with another type.
    ///
    /// Default: only with its own exact type.
    fn shares_memory_with(&self, other: &ProcessorType) -> bool {
        // By default, processors don't share memory
        // Override for unified memory architectures (Apple Silicon, AMD APUs)
        self.processor_type() == *other
    }
}
/// Result of an operation execution.
#[derive(Clone, Debug)]
pub struct OperationResult {
    /// Output data (opaque bytes; may be empty for simulated execution).
    pub output: Vec<u8>,
    /// Execution time.
    pub duration: Duration,
    /// Energy consumed (Joules).
    pub energy: f64,
    /// Peak memory used (bytes).
    pub peak_memory: u64,
}
/// Generic processor implementation for simulation/testing.
pub struct GenericProcessor {
    // Stable identity within the node.
    id: ProcessorId,
    processor_type: ProcessorType,
    capabilities: ProcessorCapabilities,
    // Utilization as an integer percentage (0-100); read via `utilization()`.
    utilization: std::sync::atomic::AtomicU64,
    // Remaining memory in bytes, initialized to the full capacity.
    available_memory: std::sync::atomic::AtomicU64,
}
impl GenericProcessor {
    /// Creates a new generic processor, starting idle with its full memory
    /// capacity free.
    pub fn new(
        id: ProcessorId,
        processor_type: ProcessorType,
        capabilities: ProcessorCapabilities,
    ) -> Self {
        let free_memory =
            std::sync::atomic::AtomicU64::new(capabilities.memory.capacity_bytes);
        Self {
            id,
            processor_type,
            capabilities,
            utilization: std::sync::atomic::AtomicU64::new(0),
            available_memory: free_memory,
        }
    }
    /// Creates a CPU processor with the default CPU profile.
    pub fn cpu(id: ProcessorId, variant: CpuVariant) -> Self {
        Self::new(id, ProcessorType::Cpu(variant), ProcessorProfiles::cpu_default())
    }
    /// Creates an NVIDIA GPU processor; the profile is keyed off the CUDA
    /// compute capability (known SKUs get exact profiles).
    pub fn nvidia_gpu(id: ProcessorId, compute_capability: (u8, u8)) -> Self {
        let profile = match compute_capability {
            (9, 0) => ProcessorProfiles::nvidia_h100(),
            (8, 9) => ProcessorProfiles::nvidia_rtx_4090(),
            (8, 6) => ProcessorProfiles::nvidia_rtx_3090(),
            _ => ProcessorProfiles::nvidia_default(),
        };
        Self::new(
            id,
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability }),
            profile,
        )
    }
    /// Creates a TPU processor for the given hardware generation.
    pub fn tpu(id: ProcessorId, version: TpuVersion) -> Self {
        let profile = match version {
            TpuVersion::V5p => ProcessorProfiles::google_tpu_v5p(),
            TpuVersion::V4 => ProcessorProfiles::google_tpu_v4(),
            _ => ProcessorProfiles::google_tpu_default(),
        };
        Self::new(id, ProcessorType::Tpu(version), profile)
    }
    /// Creates a Groq LPU processor.
    pub fn lpu(id: ProcessorId) -> Self {
        Self::new(id, ProcessorType::Lpu, ProcessorProfiles::groq_lpu())
    }
    /// Creates an Apple Neural Engine processor with the given core count.
    pub fn apple_neural_engine(id: ProcessorId, cores: u32) -> Self {
        let variant = NpuVariant::AppleNeuralEngine { cores };
        Self::new(
            id,
            ProcessorType::Npu(variant),
            ProcessorProfiles::apple_neural_engine(cores),
        )
    }
}
#[async_trait]
impl Processor for GenericProcessor {
fn id(&self) -> ProcessorId {
self.id
}
fn processor_type(&self) -> ProcessorType {
self.processor_type.clone()
}
fn capabilities(&self) -> &ProcessorCapabilities {
&self.capabilities
}
fn can_execute(&self, op: &Operation) -> bool {
self.capabilities.operations.contains(&op.op_type())
}
fn estimate_time(&self, op: &Operation) -> Duration {
// Estimate based on FLOPS and operation complexity
let flops_needed = op.estimated_flops();
let throughput = match op.precision() {
Precision::Fp32 => self.capabilities.compute.fp32_tflops,
Precision::Fp16 => self.capabilities.compute.fp16_tflops,
Precision::Bf16 => self.capabilities.compute.bf16_tflops,
Precision::Int8 => self.capabilities.compute.int8_tops,
Precision::Int4 => self.capabilities.compute.int4_tops,
Precision::Fp64 => self.capabilities.compute.fp64_tflops,
};
if throughput > 0.0 {
let tflops = throughput;
let flops_per_second = tflops * 1e12;
let seconds = flops_needed / flops_per_second;
Duration::from_secs_f64(seconds)
} else {
Duration::from_secs(1) // Fallback
}
}
fn estimate_energy(&self, op: &Operation) -> f64 {
// Estimate based on TDP and execution time
let duration = self.estimate_time(op);
let watts = self.capabilities.power.tdp_watts as f64;
let efficiency = self.capabilities.power.efficiency;
watts * duration.as_secs_f64() * efficiency
}
async fn execute(&self, op: Operation) -> Result<OperationResult, ComputeError> {
// Check if we can execute
if !self.can_execute(&op) {
return Err(ComputeError::OperationNotSupported(
self.processor_type.clone(),
format!("{:?}", op.op_type()),
));
}
// Simulate execution
let duration = self.estimate_time(&op);
let energy = self.estimate_energy(&op);
// Update utilization
self.utilization
.store(50, std::sync::atomic::Ordering::Relaxed);
// Simulate work
tokio::time::sleep(Duration::from_micros(100)).await;
// Reset utilization
self.utilization
.store(0, std::sync::atomic::Ordering::Relaxed);
Ok(OperationResult {
output: vec![],
duration,
energy,
peak_memory: op.estimated_memory(),
})
}
fn utilization(&self) -> f64 {
self.utilization.load(std::sync::atomic::Ordering::Relaxed) as f64 / 100.0
}
fn available_memory(&self) -> u64 {
self.available_memory
.load(std::sync::atomic::Ordering::Relaxed)
}
fn shares_memory_with(&self, other: &ProcessorType) -> bool {
match (&self.processor_type, other) {
// Apple Silicon has unified memory
(ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal))
| (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
| (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
| (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true,
// Same type always shares
(a, b) if a == b => true,
_ => false,
}
}
}
/// Precision for operations.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Precision {
    /// 64-bit float.
    Fp64,
    /// 32-bit float (the default reported by `Operation::precision`).
    Fp32,
    /// 16-bit IEEE float.
    Fp16,
    /// 16-bit brain float.
    Bf16,
    /// 8-bit integer.
    Int8,
    /// 4-bit integer.
    Int4,
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_processor_creation() {
        let cpu = GenericProcessor::cpu(
            ProcessorId(0),
            CpuVariant::X86_64 {
                avx: AvxSupport::Avx512,
            },
        );
        assert_eq!(cpu.id(), ProcessorId(0));
        assert!(matches!(cpu.processor_type(), ProcessorType::Cpu(_)));
    }
    #[test]
    fn test_gpu_creation() {
        let gpu = GenericProcessor::nvidia_gpu(ProcessorId(1), (9, 0));
        assert_eq!(gpu.id(), ProcessorId(1));
        assert!(matches!(
            gpu.processor_type(),
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. })
        ));
    }
    #[test]
    fn test_unified_memory() {
        let apple_cpu = GenericProcessor::new(
            ProcessorId(0),
            ProcessorType::Cpu(CpuVariant::Arm64 { sve: false }),
            ProcessorCapabilities::default(),
        );
        assert!(apple_cpu.shares_memory_with(&ProcessorType::Gpu(GpuVariant::AppleMetal)));
    }
    #[tokio::test]
    async fn test_operation_execution() {
        let cpu = GenericProcessor::cpu(
            ProcessorId(0),
            CpuVariant::X86_64 {
                avx: AvxSupport::Avx512,
            },
        );
        let op = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };
        // `execute` must succeed exactly when the processor advertises
        // support for the operation. (The previous assertion,
        // `result.is_ok() || result.is_err()`, was a tautology that could
        // never fail and therefore tested nothing.)
        let supported = cpu.can_execute(&op);
        let result = cpu.execute(op).await;
        assert_eq!(result.is_ok(), supported);
        if let Ok(res) = result {
            // Simulated execution reports a non-negative energy cost.
            assert!(res.energy >= 0.0);
        }
    }
}

View file

@ -0,0 +1,543 @@
//! Operation definitions for heterogeneous compute.
use super::Precision;
use serde::{Deserialize, Serialize};
/// Operation types for processor matching.
///
/// Used as a bare tag (via `Operation::op_type`) for capability-set lookups;
/// the parameterized counterpart is the [`Operation`] enum.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum OperationType {
    // Matrix operations
    MatMul,
    Conv2d,
    Conv3d,
    DepthwiseConv,
    BatchNorm,
    LayerNorm,
    // Attention operations
    SelfAttention,
    CrossAttention,
    FlashAttention,
    // Element-wise operations
    Add,
    Mul,
    ReLU,
    GeLU,
    SiLU,
    Softmax,
    // Reduction operations
    Sum,
    Mean,
    Max,
    ArgMax,
    // Data movement
    Transpose,
    Reshape,
    Concat,
    Split,
    Gather,
    Scatter,
    // LLM specific
    Embedding,
    RoPE, // Rotary Position Embedding
    KVCache,
    TopK,
    Sampling,
    // I/O operations (typically CPU-only; see capability profiles)
    DataLoad,
    DataPreprocess,
    Tokenization,
    Detokenization,
    Checkpoint,
    // Distributed operations
    AllReduce,
    AllGather,
    ReduceScatter,
    // Training specific
    Backward,
    OptimizerStep,
    GradientClip,
}
/// Concrete operation with parameters.
///
/// Each variant carries the shape/precision information needed by the cost
/// model (`estimated_flops`, `precision`); `op_type` maps a variant to its
/// bare [`OperationType`] tag.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum Operation {
    /// Matrix multiplication.
    MatMul {
        m: usize,
        n: usize,
        k: usize,
        precision: Precision,
    },
    /// 2D Convolution.
    Conv2d {
        batch: usize,
        in_channels: usize,
        out_channels: usize,
        height: usize,
        width: usize,
        kernel_size: usize,
        precision: Precision,
    },
    /// Batch normalization.
    BatchNorm {
        batch: usize,
        channels: usize,
        spatial: usize,
        precision: Precision,
    },
    /// Layer normalization.
    LayerNorm {
        batch: usize,
        seq_len: usize,
        hidden: usize,
        precision: Precision,
    },
    /// Self-attention.
    SelfAttention {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },
    /// Flash attention (fused, memory efficient).
    FlashAttention {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },
    /// Element-wise addition.
    Add {
        elements: usize,
        precision: Precision,
    },
    /// Element-wise multiplication.
    Mul {
        elements: usize,
        precision: Precision,
    },
    /// ReLU activation.
    ReLU { elements: usize },
    /// GeLU activation.
    GeLU { elements: usize },
    /// SiLU (Swish) activation.
    SiLU { elements: usize },
    /// Softmax.
    Softmax {
        batch: usize,
        seq_len: usize,
        precision: Precision,
    },
    /// Embedding lookup.
    Embedding {
        batch: usize,
        seq_len: usize,
        vocab_size: usize,
        embed_dim: usize,
        precision: Precision,
    },
    /// Rotary Position Embedding.
    RoPE {
        batch: usize,
        seq_len: usize,
        head_dim: usize,
        precision: Precision,
    },
    /// KV Cache update.
    KVCache {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },
    /// Top-K sampling.
    TopK {
        batch: usize,
        vocab_size: usize,
        k: usize,
    },
    /// Token sampling.
    Sampling {
        batch: usize,
        vocab_size: usize,
        temperature: f32,
    },
    /// Data loading from storage.
    DataLoad {
        bytes: usize,
        async_: bool,
    },
    /// Data preprocessing.
    DataPreprocess {
        batch: usize,
        transforms: Vec<String>,
    },
    /// Tokenization.
    Tokenization {
        text_bytes: usize,
        vocab_size: usize,
    },
    /// Detokenization.
    Detokenization {
        tokens: usize,
        vocab_size: usize,
    },
    /// Checkpoint save.
    Checkpoint {
        bytes: usize,
        async_: bool,
    },
    /// All-reduce across devices.
    AllReduce {
        elements: usize,
        precision: Precision,
        devices: usize,
    },
    /// Backward pass for a layer.
    Backward {
        // Boxed to keep this variant (and the enum) small; also breaks
        // the recursive type.
        forward_op: Box<Operation>,
    },
    /// Optimizer step.
    OptimizerStep {
        parameters: usize,
        optimizer: String,
        precision: Precision,
    },
    /// Transpose.
    Transpose {
        shape: Vec<usize>,
        axes: Vec<usize>,
    },
    /// Reshape.
    Reshape {
        from: Vec<usize>,
        to: Vec<usize>,
    },
    /// Concatenate tensors.
    Concat {
        shapes: Vec<Vec<usize>>,
        axis: usize,
    },
    /// Generic operation with caller-supplied cost figures.
    Generic {
        op_type: OperationType,
        flops: f64,
        memory: u64,
    },
}
impl Operation {
    /// Returns the operation type.
    ///
    /// A pure one-to-one mapping from the parameterized variant to its bare
    /// [`OperationType`] tag, used for capability-set lookups.
    pub fn op_type(&self) -> OperationType {
        match self {
            Operation::MatMul { .. } => OperationType::MatMul,
            Operation::Conv2d { .. } => OperationType::Conv2d,
            Operation::BatchNorm { .. } => OperationType::BatchNorm,
            Operation::LayerNorm { .. } => OperationType::LayerNorm,
            Operation::SelfAttention { .. } => OperationType::SelfAttention,
            Operation::FlashAttention { .. } => OperationType::FlashAttention,
            Operation::Add { .. } => OperationType::Add,
            Operation::Mul { .. } => OperationType::Mul,
            Operation::ReLU { .. } => OperationType::ReLU,
            Operation::GeLU { .. } => OperationType::GeLU,
            Operation::SiLU { .. } => OperationType::SiLU,
            Operation::Softmax { .. } => OperationType::Softmax,
            Operation::Embedding { .. } => OperationType::Embedding,
            Operation::RoPE { .. } => OperationType::RoPE,
            Operation::KVCache { .. } => OperationType::KVCache,
            Operation::TopK { .. } => OperationType::TopK,
            Operation::Sampling { .. } => OperationType::Sampling,
            Operation::DataLoad { .. } => OperationType::DataLoad,
            Operation::DataPreprocess { .. } => OperationType::DataPreprocess,
            Operation::Tokenization { .. } => OperationType::Tokenization,
            Operation::Detokenization { .. } => OperationType::Detokenization,
            Operation::Checkpoint { .. } => OperationType::Checkpoint,
            Operation::AllReduce { .. } => OperationType::AllReduce,
            // Note: reported as Backward even though the boxed forward op
            // carries its own type.
            Operation::Backward { .. } => OperationType::Backward,
            Operation::OptimizerStep { .. } => OperationType::OptimizerStep,
            Operation::Transpose { .. } => OperationType::Transpose,
            Operation::Reshape { .. } => OperationType::Reshape,
            Operation::Concat { .. } => OperationType::Concat,
            Operation::Generic { op_type, .. } => *op_type,
        }
    }
/// Returns the precision used.
pub fn precision(&self) -> Precision {
match self {
Operation::MatMul { precision, .. }
| Operation::Conv2d { precision, .. }
| Operation::BatchNorm { precision, .. }
| Operation::LayerNorm { precision, .. }
| Operation::SelfAttention { precision, .. }
| Operation::FlashAttention { precision, .. }
| Operation::Add { precision, .. }
| Operation::Mul { precision, .. }
| Operation::Softmax { precision, .. }
| Operation::Embedding { precision, .. }
| Operation::RoPE { precision, .. }
| Operation::KVCache { precision, .. }
| Operation::AllReduce { precision, .. }
| Operation::OptimizerStep { precision, .. } => *precision,
Operation::Backward { forward_op } => forward_op.precision(),
_ => Precision::Fp32, // Default
}
}
/// Estimates FLOPS for the operation.
pub fn estimated_flops(&self) -> f64 {
match self {
// MatMul: 2 * M * N * K (multiply-add)
Operation::MatMul { m, n, k, .. } => 2.0 * (*m as f64) * (*n as f64) * (*k as f64),
// Conv2d: 2 * batch * out * H * W * in * K * K
Operation::Conv2d {
batch,
in_channels,
out_channels,
height,
width,
kernel_size,
..
} => {
2.0 * (*batch as f64)
* (*out_channels as f64)
* (*height as f64)
* (*width as f64)
* (*in_channels as f64)
* (*kernel_size as f64)
* (*kernel_size as f64)
}
// Self-attention: 4 * batch * seq * seq * head_dim * heads
Operation::SelfAttention {
batch,
seq_len,
num_heads,
head_dim,
..
}
| Operation::FlashAttention {
batch,
seq_len,
num_heads,
head_dim,
..
} => {
4.0 * (*batch as f64)
* (*seq_len as f64)
* (*seq_len as f64)
* (*head_dim as f64)
* (*num_heads as f64)
}
// Element-wise: 1 FLOP per element
Operation::Add { elements, .. }
| Operation::Mul { elements, .. }
| Operation::ReLU { elements }
| Operation::GeLU { elements }
| Operation::SiLU { elements } => *elements as f64,
// Softmax: ~5 ops per element (exp, sum, div)
Operation::Softmax {
batch, seq_len, ..
} => 5.0 * (*batch as f64) * (*seq_len as f64),
// Embedding: just lookup, minimal FLOPS
Operation::Embedding {
batch,
seq_len,
embed_dim,
..
} => (*batch as f64) * (*seq_len as f64) * (*embed_dim as f64) * 0.1,
// Backward: ~2x forward
Operation::Backward { forward_op } => forward_op.estimated_flops() * 2.0,
// Generic
Operation::Generic { flops, .. } => *flops,
// I/O operations: minimal compute
_ => 1000.0,
}
}
/// Estimates memory usage (bytes).
pub fn estimated_memory(&self) -> u64 {
let precision_bytes = match self.precision() {
Precision::Fp64 => 8,
Precision::Fp32 => 4,
Precision::Fp16 | Precision::Bf16 => 2,
Precision::Int8 => 1,
Precision::Int4 => 1, // Rounded up
};
match self {
Operation::MatMul { m, n, k, .. } => {
// Input A (m×k) + Input B (k×n) + Output (m×n)
((*m * *k) + (*k * *n) + (*m * *n)) as u64 * precision_bytes
}
Operation::SelfAttention {
batch,
seq_len,
num_heads,
head_dim,
..
} => {
// Q, K, V, Output, intermediate attention
5 * (*batch as u64)
* (*seq_len as u64)
* (*num_heads as u64)
* (*head_dim as u64)
* precision_bytes
}
Operation::FlashAttention {
batch,
seq_len,
num_heads,
head_dim,
..
} => {
// FlashAttention uses much less memory
2 * (*batch as u64)
* (*seq_len as u64)
* (*num_heads as u64)
* (*head_dim as u64)
* precision_bytes
}
Operation::KVCache {
batch,
seq_len,
num_heads,
head_dim,
..
} => {
// K and V caches
2 * (*batch as u64)
* (*seq_len as u64)
* (*num_heads as u64)
* (*head_dim as u64)
* precision_bytes
}
Operation::Generic { memory, .. } => *memory,
_ => 1024 * 1024, // 1 MB default
}
}
/// Creates the backward operation for this operation.
pub fn backward(&self) -> Option<Operation> {
match self {
Operation::MatMul { .. }
| Operation::Conv2d { .. }
| Operation::SelfAttention { .. }
| Operation::FlashAttention { .. }
| Operation::LayerNorm { .. }
| Operation::BatchNorm { .. } => Some(Operation::Backward {
forward_op: Box::new(self.clone()),
}),
_ => None,
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_matmul_flops() {
        let matmul = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };
        // 2 * 1024^3 ≈ 2.147e9 multiply-adds.
        let estimate = matmul.estimated_flops();
        assert!(estimate > 2e9 && estimate < 2.2e9);
    }

    #[test]
    fn test_attention_memory() {
        let dense = Operation::SelfAttention {
            batch: 1,
            seq_len: 4096,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };
        let tiled = Operation::FlashAttention {
            batch: 1,
            seq_len: 4096,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };
        // The tiled FlashAttention estimate must come in below the
        // materialized self-attention estimate for identical shapes.
        assert!(tiled.estimated_memory() < dense.estimated_memory());
    }

    #[test]
    fn test_backward_creation() {
        let forward = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };
        match forward.backward() {
            Some(Operation::Backward { forward_op }) => {
                assert!(matches!(*forward_op, Operation::MatMul { .. }));
            }
            _ => panic!("MatMul must produce a Backward wrapper"),
        }
    }
}

View file

@ -0,0 +1,513 @@
//! Pre-defined processor profiles for common hardware.
use super::capabilities::{
ComputeThroughput, MemorySpecs, MemoryType, PowerCharacteristics, ProcessorCapabilities,
WorkloadCharacteristic,
};
use super::operation::OperationType;
use super::types::PowerTier;
use super::TpuVersion;
use std::collections::HashSet;
/// Pre-defined processor profiles.
///
/// A zero-sized namespace type: every profile is an associated function
/// returning a fully populated `ProcessorCapabilities` for a specific SKU.
pub struct ProcessorProfiles;
impl ProcessorProfiles {
    // ═══════════════════════════════════════════════════════════════
    // CPU PROFILES
    // ═══════════════════════════════════════════════════════════════
    /// Default CPU profile.
    pub fn cpu_default() -> ProcessorCapabilities {
        ProcessorCapabilities::cpu(8, 3.5, false)
    }
    /// AMD EPYC 9654 (96 cores).
    // NOTE(review): throughput numbers here and below look like vendor
    // peak figures — confirm against datasheets before relying on them.
    pub fn amd_epyc_9654() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 2.7,
                fp32_tflops: 5.4,
                fp16_tflops: 10.8,
                bf16_tflops: 10.8,
                int8_tops: 21.6,
                int4_tops: 43.2,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 6 * 1024 * 1024 * 1024 * 1024, // 6 TB max
                bandwidth_gbps: 460,
                type_: MemoryType::Ddr5,
            },
            // Reuse the generic CPU constructor's supported-op set;
            // only throughput/memory/power specs differ per SKU.
            operations: ProcessorCapabilities::cpu(96, 2.4, false)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 360,
                efficiency: 0.85,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::MemoryBound,
            ],
        }
    }
    /// Intel Xeon w9-3595X (56 cores).
    pub fn intel_xeon_w9_3595x() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 3.2,
                fp32_tflops: 6.4,
                fp16_tflops: 12.8,
                bf16_tflops: 12.8,
                int8_tops: 25.6,
                int4_tops: 51.2,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 4 * 1024 * 1024 * 1024 * 1024, // 4 TB max
                bandwidth_gbps: 307,
                type_: MemoryType::Ddr5,
            },
            operations: ProcessorCapabilities::cpu(56, 2.9, true)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 350,
                efficiency: 0.80,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::MemoryBound,
            ],
        }
    }
    /// Apple M3 Max CPU cores.
    pub fn apple_m3_max_cpu() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.3,
                fp32_tflops: 0.6,
                fp16_tflops: 1.2,
                bf16_tflops: 1.2,
                int8_tops: 2.4,
                int4_tops: 4.8,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 128 * 1024 * 1024 * 1024, // 128 GB unified
                bandwidth_gbps: 400,
                type_: MemoryType::Unified,
            },
            operations: ProcessorCapabilities::cpu(16, 4.0, false)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 40,
                efficiency: 0.95,
                power_tier: PowerTier::Low,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::LowPower,
            ],
        }
    }
    // ═══════════════════════════════════════════════════════════════
    // NVIDIA GPU PROFILES
    // ═══════════════════════════════════════════════════════════════
    /// Default NVIDIA GPU profile.
    // Args: CUDA cores, tensor cores, memory GB, bandwidth GB/s,
    // compute capability (major, minor) — assumed from usage below.
    pub fn nvidia_default() -> ProcessorCapabilities {
        ProcessorCapabilities::nvidia_gpu(8192, 256, 12, 600, (8, 0))
    }
    /// NVIDIA H100 SXM (80GB).
    pub fn nvidia_h100() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 67.0,
                fp32_tflops: 67.0,
                fp16_tflops: 1979.0, // With sparsity
                bf16_tflops: 1979.0,
                int8_tops: 3958.0,
                int4_tops: 7916.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 80 * 1024 * 1024 * 1024,
                bandwidth_gbps: 3350,
                type_: MemoryType::Hbm3,
            },
            operations: ProcessorCapabilities::nvidia_gpu(16896, 528, 80, 3350, (9, 0))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 700,
                efficiency: 0.90,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::ComputeBound,
            ],
        }
    }
    /// NVIDIA A100 (80GB).
    pub fn nvidia_a100() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 19.5,
                fp32_tflops: 19.5,
                fp16_tflops: 624.0, // With sparsity
                bf16_tflops: 624.0,
                int8_tops: 1248.0,
                int4_tops: 2496.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 80 * 1024 * 1024 * 1024,
                bandwidth_gbps: 2039,
                type_: MemoryType::Hbm2e,
            },
            operations: ProcessorCapabilities::nvidia_gpu(6912, 432, 80, 2039, (8, 0))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 400,
                efficiency: 0.88,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::ComputeBound,
            ],
        }
    }
    /// NVIDIA RTX 4090.
    pub fn nvidia_rtx_4090() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 1.3,
                fp32_tflops: 82.6,
                fp16_tflops: 330.4, // With sparsity
                bf16_tflops: 330.4,
                int8_tops: 660.8,
                int4_tops: 1321.6,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 24 * 1024 * 1024 * 1024,
                bandwidth_gbps: 1008,
                type_: MemoryType::Gddr6,
            },
            operations: ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1008, (8, 9))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 450,
                efficiency: 0.85,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
            ],
        }
    }
    /// NVIDIA RTX 3090.
    pub fn nvidia_rtx_3090() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.6,
                fp32_tflops: 35.6,
                fp16_tflops: 71.2,
                bf16_tflops: 71.2,
                int8_tops: 142.4,
                int4_tops: 284.8,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 24 * 1024 * 1024 * 1024,
                bandwidth_gbps: 936,
                type_: MemoryType::Gddr6,
            },
            operations: ProcessorCapabilities::nvidia_gpu(10496, 328, 24, 936, (8, 6))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 350,
                efficiency: 0.82,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
            ],
        }
    }
    // ═══════════════════════════════════════════════════════════════
    // AMD GPU PROFILES
    // ═══════════════════════════════════════════════════════════════
    /// AMD MI300X.
    pub fn amd_mi300x() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 163.4,
                fp32_tflops: 163.4,
                fp16_tflops: 1307.0,
                bf16_tflops: 1307.0,
                int8_tops: 2614.0,
                int4_tops: 5228.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 192 * 1024 * 1024 * 1024, // 192 GB HBM3
                bandwidth_gbps: 5300,
                type_: MemoryType::Hbm3,
            },
            // NOTE(review): AMD parts borrow the NVIDIA constructor's op
            // set as an approximation, then drop ops that differ on ROCm.
            operations: {
                let mut ops = ProcessorCapabilities::nvidia_gpu(16384, 512, 80, 5300, (9, 0))
                    .operations;
                ops.remove(&OperationType::FlashAttention); // Different implementation
                ops
            },
            power: PowerCharacteristics {
                tdp_watts: 750,
                efficiency: 0.88,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::MemoryBound, // High memory bandwidth
            ],
        }
    }
    /// AMD RX 7900 XTX.
    pub fn amd_rx_7900_xtx() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 1.9,
                fp32_tflops: 61.0,
                fp16_tflops: 122.0,
                bf16_tflops: 122.0,
                int8_tops: 244.0,
                int4_tops: 488.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 24 * 1024 * 1024 * 1024,
                bandwidth_gbps: 960,
                type_: MemoryType::Gddr6,
            },
            operations: {
                // Zero tensor cores passed here — consumer RDNA part.
                let mut ops = ProcessorCapabilities::nvidia_gpu(6144, 0, 24, 960, (8, 0))
                    .operations;
                ops.remove(&OperationType::FlashAttention);
                ops
            },
            power: PowerCharacteristics {
                tdp_watts: 355,
                efficiency: 0.80,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
            ],
        }
    }
    // ═══════════════════════════════════════════════════════════════
    // GOOGLE TPU PROFILES
    // ═══════════════════════════════════════════════════════════════
    /// Default TPU profile.
    pub fn google_tpu_default() -> ProcessorCapabilities {
        ProcessorCapabilities::tpu(TpuVersion::V4)
    }
    /// Google TPU v5p.
    pub fn google_tpu_v5p() -> ProcessorCapabilities {
        ProcessorCapabilities::tpu(TpuVersion::V5p)
    }
    /// Google TPU v4.
    pub fn google_tpu_v4() -> ProcessorCapabilities {
        ProcessorCapabilities::tpu(TpuVersion::V4)
    }
    /// Google Edge TPU.
    pub fn google_edge_tpu() -> ProcessorCapabilities {
        ProcessorCapabilities {
            // Integer-only accelerator: all floating-point rates are zero.
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: 0.0,
                fp16_tflops: 0.0,
                bf16_tflops: 0.0,
                int8_tops: 4.0,
                int4_tops: 8.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses host memory
                bandwidth_gbps: 0,
                type_: MemoryType::Unified,
            },
            // Hand-built op set: the Edge TPU supports only a small
            // fixed menu of inference primitives.
            operations: {
                let mut ops = HashSet::new();
                ops.insert(OperationType::MatMul);
                ops.insert(OperationType::Conv2d);
                ops.insert(OperationType::DepthwiseConv);
                ops.insert(OperationType::Add);
                ops.insert(OperationType::Mul);
                ops.insert(OperationType::ReLU);
                ops.insert(OperationType::Softmax);
                ops
            },
            power: PowerCharacteristics {
                tdp_watts: 2,
                efficiency: 0.95,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
    // ═══════════════════════════════════════════════════════════════
    // GROQ LPU PROFILE
    // ═══════════════════════════════════════════════════════════════
    /// Groq LPU.
    pub fn groq_lpu() -> ProcessorCapabilities {
        ProcessorCapabilities::lpu()
    }
    // ═══════════════════════════════════════════════════════════════
    // APPLE NEURAL ENGINE PROFILES
    // ═══════════════════════════════════════════════════════════════
    /// Apple Neural Engine (generic).
    pub fn apple_neural_engine(cores: u32) -> ProcessorCapabilities {
        ProcessorCapabilities::apple_neural_engine(cores)
    }
    /// Apple M3 Neural Engine (16 cores).
    pub fn apple_m3_neural_engine() -> ProcessorCapabilities {
        ProcessorCapabilities::apple_neural_engine(16)
    }
    /// Apple M3 Max Neural Engine (16 cores).
    pub fn apple_m3_max_neural_engine() -> ProcessorCapabilities {
        ProcessorCapabilities::apple_neural_engine(16) // Same as M3
    }
    /// Apple A17 Pro Neural Engine (35 TOPS).
    pub fn apple_a17_pro_neural_engine() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: 4.4,
                fp16_tflops: 8.8,
                bf16_tflops: 8.8,
                int8_tops: 35.0,
                int4_tops: 70.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses unified memory
                bandwidth_gbps: 200,
                type_: MemoryType::Unified,
            },
            operations: ProcessorCapabilities::apple_neural_engine(16)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 8,
                efficiency: 0.98,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
    // ═══════════════════════════════════════════════════════════════
    // QUALCOMM NPU PROFILES
    // ═══════════════════════════════════════════════════════════════
    /// Qualcomm Hexagon NPU (Snapdragon 8 Gen 3).
    pub fn qualcomm_hexagon_8g3() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: 3.0,
                fp16_tflops: 6.0,
                bf16_tflops: 6.0,
                int8_tops: 73.0, // 73 TOPS
                int4_tops: 146.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses system memory
                bandwidth_gbps: 77,
                type_: MemoryType::Lpddr,
            },
            // NOTE(review): borrows the Apple ANE op set as an
            // approximation of Hexagon's supported operations — verify.
            operations: ProcessorCapabilities::apple_neural_engine(16)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 10,
                efficiency: 0.95,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_h100_profile() {
        let profile = ProcessorProfiles::nvidia_h100();
        // Sparse FP16 throughput is quoted at nearly 2 PFLOPS.
        assert!(profile.compute.fp16_tflops > 1000.0);
        assert_eq!(profile.memory.capacity_bytes, 80 * 1024 * 1024 * 1024);
    }

    #[test]
    fn test_tpu_v5p_profile() {
        assert!(ProcessorProfiles::google_tpu_v5p().compute.bf16_tflops > 900.0);
    }

    #[test]
    fn test_groq_lpu_profile() {
        // The LPU's on-chip SRAM bandwidth dwarfs any HBM part.
        assert!(ProcessorProfiles::groq_lpu().memory.bandwidth_gbps > 50000);
    }

    #[test]
    fn test_apple_ane_profile() {
        let profile = ProcessorProfiles::apple_m3_neural_engine();
        assert!(profile.power.tdp_watts < 20);
        assert!(profile
            .optimal_for
            .contains(&WorkloadCharacteristic::LowPower));
    }
}

View file

@ -0,0 +1,367 @@
//! Processor type definitions.
use serde::{Deserialize, Serialize};
/// All supported processor types.
///
/// The variant identifies the vendor/architecture family; the payload
/// (where present) carries the family-specific detail needed for
/// capability and routing decisions.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ProcessorType {
    /// Central Processing Unit.
    Cpu(CpuVariant),
    /// Graphics Processing Unit.
    Gpu(GpuVariant),
    /// Tensor Processing Unit (Google).
    Tpu(TpuVersion),
    /// Neural Processing Unit (various vendors).
    Npu(NpuVariant),
    /// Language Processing Unit (Groq).
    Lpu,
    /// Field Programmable Gate Array.
    Fpga(FpgaVendor),
    /// Digital Signal Processor.
    Dsp(DspVariant),
    /// WebGPU (browser).
    WebGpu,
    /// WebAssembly runtime.
    Wasm,
    /// Custom/Unknown accelerator.
    Custom {
        /// Free-form vendor name.
        vendor: String,
        /// Free-form model identifier.
        model: String,
    },
}
impl Default for ProcessorType {
    /// Defaults to a CPU with the default [`CpuVariant`]
    /// (x86-64 with AVX2).
    fn default() -> Self {
        Self::Cpu(CpuVariant::default())
    }
}
/// CPU architecture variants.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum CpuVariant {
    /// x86-64 architecture.
    X86_64 {
        /// Highest supported AVX instruction-set level.
        avx: AvxSupport,
    },
    /// ARM 64-bit architecture.
    Arm64 {
        /// Whether SVE (Scalable Vector Extension) is available.
        sve: bool,
    },
    /// RISC-V architecture.
    RiscV {
        /// Whether the "V" vector extension is available.
        vector: bool,
    },
}
impl Default for CpuVariant {
fn default() -> Self {
CpuVariant::X86_64 {
avx: AvxSupport::Avx2,
}
}
}
/// AVX instruction set support levels.
///
/// Variants are declared from least to most capable, so the derived
/// `PartialOrd`/`Ord` compare support levels directly
/// (e.g. `Avx512 > Avx2`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AvxSupport {
    /// No AVX.
    None,
    /// AVX (Sandy Bridge+).
    Avx,
    /// AVX2 (Haswell+).
    Avx2,
    /// AVX-512 (Skylake-X+).
    Avx512,
    /// AVX10 (future).
    Avx10,
}
/// GPU vendor variants.
///
/// The variant encodes the vendor/API family; payloads carry the
/// version or model detail where it matters for capability checks.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum GpuVariant {
    /// NVIDIA CUDA GPU.
    NvidiaCuda {
        /// Compute capability (major, minor).
        compute_capability: (u8, u8),
    },
    /// AMD ROCm GPU.
    AmdRocm {
        /// GFX version (e.g., 1100 for RDNA3).
        gfx_version: u32,
    },
    /// Intel OneAPI GPU.
    IntelOneApi,
    /// Apple Metal GPU.
    AppleMetal,
    /// Qualcomm Adreno GPU.
    QualcommAdreno {
        /// Adreno model number.
        model: u32,
    },
    /// ARM Mali GPU.
    ArmMali {
        /// Mali generation (e.g., G710).
        model: u32,
    },
    /// IMG PowerVR GPU.
    ImgPowerVr,
}
/// Google TPU versions.
///
/// `Edge` is the low-power inference part (see
/// `ProcessorType::is_low_power`); the numbered versions are data-center
/// hardware.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum TpuVersion {
    /// TPU v2.
    V2,
    /// TPU v3.
    V3,
    /// TPU v4.
    V4,
    /// TPU v4i (inference).
    V4i,
    /// TPU v5e (efficiency).
    V5e,
    /// TPU v5p (performance).
    V5p,
    /// Edge TPU.
    Edge,
}
/// NPU (Neural Processing Unit) variants.
///
/// Covers the major mobile/embedded AI accelerators; anything not
/// listed can be described via `Custom { tops }`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum NpuVariant {
    /// Apple Neural Engine.
    AppleNeuralEngine {
        /// Number of cores.
        cores: u32,
    },
    /// Qualcomm Hexagon DSP/NPU.
    QualcommHexagon {
        /// Version number.
        version: u32,
    },
    /// Intel VPU (Movidius).
    IntelVpu,
    /// Huawei Ascend.
    HuaweiAscend {
        /// Model (310, 910, etc.).
        model: u32,
    },
    /// Google Edge TPU.
    GoogleEdgeTpu,
    /// Samsung NPU.
    SamsungNpu,
    /// MediaTek APU.
    MediaTekApu {
        /// Version.
        version: u32,
    },
    /// Custom NPU.
    Custom {
        /// TOPS (Tera Operations Per Second).
        tops: u32,
    },
}
/// FPGA vendors.
///
/// Vendor identity only — no family/part detail is tracked here.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FpgaVendor {
    /// Xilinx (AMD).
    Xilinx,
    /// Intel (Altera).
    Intel,
    /// Lattice.
    Lattice,
    /// Microchip.
    Microchip,
}
/// DSP (Digital Signal Processor) variants.
///
/// Note Qualcomm Hexagon also appears in [`NpuVariant`]; this variant
/// is for the part acting as a plain DSP rather than an NPU.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DspVariant {
    /// Texas Instruments DSP.
    TexasInstruments,
    /// Analog Devices DSP.
    AnalogDevices,
    /// Qualcomm Hexagon DSP.
    QualcommHexagon,
    /// Custom DSP.
    Custom,
}
impl ProcessorType {
/// Returns whether this processor type supports CUDA.
pub fn supports_cuda(&self) -> bool {
matches!(self, ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. }))
}
/// Returns whether this processor type supports ROCm.
pub fn supports_rocm(&self) -> bool {
matches!(self, ProcessorType::Gpu(GpuVariant::AmdRocm { .. }))
}
/// Returns whether this processor type supports Metal.
pub fn supports_metal(&self) -> bool {
matches!(self, ProcessorType::Gpu(GpuVariant::AppleMetal))
}
/// Returns whether this processor type is a GPU.
pub fn is_gpu(&self) -> bool {
matches!(self, ProcessorType::Gpu(_))
}
/// Returns whether this processor type is a CPU.
pub fn is_cpu(&self) -> bool {
matches!(self, ProcessorType::Cpu(_))
}
/// Returns whether this processor type is suitable for parallel workloads.
pub fn is_parallel(&self) -> bool {
matches!(
self,
ProcessorType::Gpu(_) | ProcessorType::Tpu(_) | ProcessorType::Fpga(_)
)
}
/// Returns whether this processor type is suitable for sequential workloads.
pub fn is_sequential(&self) -> bool {
matches!(self, ProcessorType::Cpu(_) | ProcessorType::Lpu)
}
/// Returns whether this processor type is power-efficient.
pub fn is_low_power(&self) -> bool {
matches!(
self,
ProcessorType::Npu(_) | ProcessorType::Tpu(TpuVersion::Edge) | ProcessorType::Wasm
)
}
/// Returns the typical power consumption tier.
pub fn power_tier(&self) -> PowerTier {
match self {
ProcessorType::Npu(_) | ProcessorType::Wasm => PowerTier::UltraLow,
ProcessorType::Cpu(CpuVariant::Arm64 { .. }) => PowerTier::Low,
ProcessorType::Cpu(_) => PowerTier::Medium,
ProcessorType::Gpu(GpuVariant::AppleMetal) => PowerTier::Medium,
ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability })
if compute_capability.0 >= 8 =>
{
PowerTier::High
}
ProcessorType::Gpu(_) => PowerTier::Medium,
ProcessorType::Tpu(TpuVersion::Edge) => PowerTier::UltraLow,
ProcessorType::Tpu(_) => PowerTier::High,
ProcessorType::Lpu => PowerTier::Medium,
ProcessorType::Fpga(_) => PowerTier::Medium,
ProcessorType::Dsp(_) => PowerTier::Low,
ProcessorType::WebGpu => PowerTier::Low,
ProcessorType::Custom { .. } => PowerTier::Medium,
}
}
}
/// Power consumption tiers.
///
/// Declared from lowest to highest draw, so the derived
/// `PartialOrd`/`Ord` give `UltraLow < Low < Medium < High`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PowerTier {
    /// < 5W (mobile, IoT).
    UltraLow,
    /// 5-30W (laptop, tablet).
    Low,
    /// 30-150W (desktop, workstation).
    Medium,
    /// > 150W (server, data center).
    High,
}
/// Device class for routing decisions.
///
/// Drives the availability and reliability heuristics below
/// (`typical_availability_hours`, `reliability_score`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DeviceClass {
    /// Data center equipment.
    DataCenter,
    /// Desktop/workstation.
    Desktop,
    /// Laptop.
    Laptop,
    /// Mobile phone.
    Mobile,
    /// Tablet.
    Tablet,
    /// IoT device.
    IoT,
    /// Browser (WebGPU/WASM).
    Browser,
    /// Edge server.
    Edge,
}
impl DeviceClass {
/// Returns typical available compute hours per day.
pub fn typical_availability_hours(&self) -> f32 {
match self {
DeviceClass::DataCenter => 24.0,
DeviceClass::Desktop => 8.0,
DeviceClass::Laptop => 6.0,
DeviceClass::Mobile => 4.0,
DeviceClass::Tablet => 4.0,
DeviceClass::IoT => 24.0,
DeviceClass::Browser => 2.0,
DeviceClass::Edge => 24.0,
}
}
/// Returns reliability score (0-100).
pub fn reliability_score(&self) -> u32 {
match self {
DeviceClass::DataCenter => 99,
DeviceClass::Edge => 95,
DeviceClass::Desktop => 80,
DeviceClass::Laptop => 60,
DeviceClass::Mobile => 40,
DeviceClass::Tablet => 50,
DeviceClass::IoT => 70,
DeviceClass::Browser => 30,
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_processor_type_properties() {
        let cuda_gpu = ProcessorType::Gpu(GpuVariant::NvidiaCuda {
            compute_capability: (9, 0),
        });
        assert!(cuda_gpu.supports_cuda());
        assert!(cuda_gpu.is_gpu());
        assert!(cuda_gpu.is_parallel());

        let x86 = ProcessorType::Cpu(CpuVariant::X86_64 {
            avx: AvxSupport::Avx512,
        });
        assert!(x86.is_cpu());
        assert!(x86.is_sequential());

        // LPUs are sequential; NPUs are power-efficient.
        assert!(ProcessorType::Lpu.is_sequential());
        assert!(ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 }).is_low_power());
    }

    #[test]
    fn test_power_tiers() {
        let hopper = ProcessorType::Gpu(GpuVariant::NvidiaCuda {
            compute_capability: (9, 0),
        });
        assert_eq!(hopper.power_tier(), PowerTier::High);

        let ane = ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 });
        assert_eq!(ane.power_tier(), PowerTier::UltraLow);

        let arm_cpu = ProcessorType::Cpu(CpuVariant::Arm64 { sve: false });
        assert_eq!(arm_cpu.power_tier(), PowerTier::Low);
    }

    #[test]
    fn test_device_class() {
        assert_eq!(DeviceClass::DataCenter.typical_availability_hours(), 24.0);
        assert_eq!(DeviceClass::Mobile.typical_availability_hours(), 4.0);
        assert_eq!(DeviceClass::DataCenter.reliability_score(), 99);
        assert_eq!(DeviceClass::Browser.reliability_score(), 30);
    }
}

View file

@ -0,0 +1,810 @@
//! Load balancer with work stealing for heterogeneous compute.
//!
//! Supports:
//! - Cross-processor-type work migration
//! - Energy-aware balancing
//! - Latency-aware scheduling
//! - Real-time utilization metrics
use crate::device::{DeviceInfo, DeviceRegistry};
use crate::processor::{Operation, OperationType, ProcessorId, ProcessorType};
use crate::task::{Task, TaskId, TaskPriority};
use super::TaskAssignment;
use parking_lot::RwLock;
use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};
/// Balancing strategy for the load balancer.
///
/// The `Default` impl selects `Balanced`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BalancingStrategy {
    /// Optimize for speed (minimize execution time).
    Speed,
    /// Optimize for energy efficiency.
    Energy,
    /// Balance speed and energy.
    Balanced,
    /// Optimize for cost (spot pricing).
    Cost,
    /// Optimize for latency (inference workloads).
    Latency,
}
impl Default for BalancingStrategy {
fn default() -> Self {
BalancingStrategy::Balanced
}
}
/// Real-time processor metrics.
///
/// A plain snapshot struct; values are supplied by callers of
/// `LoadBalancer::update_metrics`, which stamps `last_updated`.
#[derive(Clone, Debug, Default)]
pub struct ProcessorMetrics {
    /// Current utilization (0.0 - 1.0).
    pub utilization: f64,
    /// Queue depth (pending tasks).
    pub queue_depth: u64,
    /// Average task completion time (ms).
    pub avg_completion_ms: f64,
    /// Tasks completed in last minute.
    pub throughput_per_min: u64,
    /// Current power draw (watts).
    pub power_watts: f64,
    /// Temperature (celsius).
    pub temperature: f64,
    /// Last updated timestamp.
    ///
    /// `None` until the first metrics update is recorded.
    pub last_updated: Option<Instant>,
}
/// Load balancer for heterogeneous compute environments.
///
/// All mutable state lives behind `parking_lot::RwLock`s so a single
/// instance can be shared (e.g. in an `Arc`) across scheduler threads.
pub struct LoadBalancer {
    /// Device registry for processor info.
    device_registry: Option<Arc<DeviceRegistry>>,
    /// Current load per processor (task count).
    ///
    /// `AtomicU64` values let counters be bumped while holding only the
    /// map's *read* lock (see `increment_load`/`decrement_load`).
    loads: RwLock<HashMap<ProcessorId, AtomicU64>>,
    /// Real-time metrics per processor.
    metrics: RwLock<HashMap<ProcessorId, ProcessorMetrics>>,
    /// Processor type mapping.
    processor_types: RwLock<HashMap<ProcessorId, ProcessorType>>,
    /// Work stealing threshold (0.0 - 1.0).
    steal_threshold: f64,
    /// Rebalance threshold (0.0 - 1.0).
    rebalance_threshold: f64,
    /// Current balancing strategy.
    strategy: RwLock<BalancingStrategy>,
    /// Migration history (to prevent thrashing).
    migration_history: RwLock<Vec<MigrationRecord>>,
}
/// Record of a task migration.
#[derive(Clone, Debug)]
struct MigrationRecord {
    /// The task that was migrated.
    task_id: TaskId,
    /// Processor the task was taken from.
    from: ProcessorId,
    /// Processor the task was moved to.
    to: ProcessorId,
    /// When the migration happened.
    timestamp: Instant,
}
impl LoadBalancer {
/// Creates a standalone load balancer: no device registry, empty
/// state maps, the default (`Balanced`) strategy, and default
/// thresholds (steal 0.3, rebalance 0.2).
pub fn new() -> Self {
    Self {
        device_registry: None,
        strategy: RwLock::new(BalancingStrategy::default()),
        loads: RwLock::new(HashMap::new()),
        metrics: RwLock::new(HashMap::new()),
        processor_types: RwLock::new(HashMap::new()),
        migration_history: RwLock::new(Vec::new()),
        steal_threshold: 0.3,
        rebalance_threshold: 0.2,
    }
}
/// Creates a load balancer with device registry.
pub fn with_registry(device_registry: Arc<DeviceRegistry>) -> Self {
Self {
device_registry: Some(device_registry),
loads: RwLock::new(HashMap::new()),
metrics: RwLock::new(HashMap::new()),
processor_types: RwLock::new(HashMap::new()),
steal_threshold: 0.3,
rebalance_threshold: 0.2,
strategy: RwLock::new(BalancingStrategy::default()),
migration_history: RwLock::new(Vec::new()),
}
}
/// Sets the balancing strategy used for subsequent decisions.
pub fn set_strategy(&self, strategy: BalancingStrategy) {
    let mut guard = self.strategy.write();
    *guard = strategy;
}
/// Gets the current balancing strategy (a cheap `Copy`).
pub fn strategy(&self) -> BalancingStrategy {
    let guard = self.strategy.read();
    *guard
}
/// Register a processor with its type.
///
/// Seeds a zero load counter and default metrics for the processor.
pub fn register_processor(&self, processor_id: ProcessorId, processor_type: ProcessorType) {
    self.processor_types
        .write()
        .insert(processor_id, processor_type);
    self.metrics
        .write()
        .insert(processor_id, ProcessorMetrics::default());
    self.loads.write().insert(processor_id, AtomicU64::new(0));
}
/// Unregister a processor, dropping its load counter, metrics, and
/// type mapping.
pub fn unregister_processor(&self, processor_id: ProcessorId) {
    self.processor_types.write().remove(&processor_id);
    self.metrics.write().remove(&processor_id);
    self.loads.write().remove(&processor_id);
}
/// Update real-time metrics for a processor.
pub fn update_metrics(&self, processor_id: ProcessorId, metrics: ProcessorMetrics) {
if let Some(existing) = self.metrics.write().get_mut(&processor_id) {
*existing = ProcessorMetrics {
last_updated: Some(Instant::now()),
..metrics
};
}
}
/// Get current load (pending task count) for a processor.
///
/// Unregistered processors report zero.
pub fn get_load(&self, processor_id: ProcessorId) -> u64 {
    match self.loads.read().get(&processor_id) {
        Some(counter) => counter.load(Ordering::Relaxed),
        None => 0,
    }
}
/// Increment the load counter for a processor.
///
/// No-op for unregistered processors. Only the map's read lock is
/// taken; the counter itself is atomic.
pub fn increment_load(&self, processor_id: ProcessorId) {
    let guard = self.loads.read();
    if let Some(counter) = guard.get(&processor_id) {
        counter.fetch_add(1, Ordering::Relaxed);
    }
}
/// Decrement the load counter for a processor.
///
/// Saturates at zero: the previous `fetch_sub(1)` would wrap to
/// `u64::MAX` on a stray extra decrement (e.g. a duplicated completion
/// callback), making the processor look infinitely loaded forever.
pub fn decrement_load(&self, processor_id: ProcessorId) {
    if let Some(load) = self.loads.read().get(&processor_id) {
        // `fetch_update` retries on contention; Relaxed suffices — this
        // is a scheduling heuristic, not a synchronization primitive.
        let _ = load.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |v| {
            Some(v.saturating_sub(1))
        });
    }
}
    /// Check if an operation can run on a processor type.
    ///
    /// This is a static capability table keyed on the operation's
    /// [`OperationType`]; it does not consider tensor sizes, precision, or
    /// current device load. Unknown/custom hardware (FPGA, `Custom`) is
    /// assumed fully capable.
    pub fn can_execute(&self, op: &Operation, processor_type: &ProcessorType) -> bool {
        let op_type = op.op_type();
        match processor_type {
            // CPUs can handle most sequential operations.
            // Note: attention kernels (Self/Cross/FlashAttention) are
            // deliberately excluded here — see the unit test below.
            ProcessorType::Cpu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::Conv3d
                    | OperationType::DepthwiseConv
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Max
                    | OperationType::ArgMax
                    | OperationType::Embedding
                    | OperationType::TopK
                    | OperationType::Sampling
                    | OperationType::Tokenization
                    | OperationType::Detokenization
                    | OperationType::DataLoad
                    | OperationType::DataPreprocess
                    | OperationType::Transpose
                    | OperationType::Reshape
                    | OperationType::Concat
                    | OperationType::Split
            ),
            // GPUs excel at parallel operations, including attention,
            // collectives and training ops (Backward/OptimizerStep).
            ProcessorType::Gpu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::Conv3d
                    | OperationType::DepthwiseConv
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::CrossAttention
                    | OperationType::FlashAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Max
                    | OperationType::ArgMax
                    | OperationType::Embedding
                    | OperationType::RoPE
                    | OperationType::KVCache
                    | OperationType::TopK
                    | OperationType::Sampling
                    | OperationType::Transpose
                    | OperationType::Reshape
                    | OperationType::Concat
                    | OperationType::Split
                    | OperationType::Gather
                    | OperationType::Scatter
                    | OperationType::AllReduce
                    | OperationType::AllGather
                    | OperationType::ReduceScatter
                    | OperationType::Backward
                    | OperationType::OptimizerStep
                    | OperationType::GradientClip
            ),
            // TPUs optimized for ML: dense linear algebra, attention,
            // collectives, and training — but no generic layout ops.
            ProcessorType::Tpu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::CrossAttention
                    | OperationType::FlashAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Embedding
                    | OperationType::RoPE
                    | OperationType::KVCache
                    | OperationType::AllReduce
                    | OperationType::AllGather
                    | OperationType::ReduceScatter
                    | OperationType::Backward
                    | OperationType::OptimizerStep
            ),
            // NPUs for neural network inference: forward-pass kernels only.
            ProcessorType::Npu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::DepthwiseConv
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
            ),
            // LPUs for sequential inference (optimized for LLMs):
            // transformer-decode ops plus sampling.
            ProcessorType::Lpu => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::FlashAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Embedding
                    | OperationType::RoPE
                    | OperationType::KVCache
                    | OperationType::TopK
                    | OperationType::Sampling
            ),
            // FPGAs can be programmed for anything.
            ProcessorType::Fpga(_) => true,
            // DSPs for signal processing: convolutions and elementwise math.
            ProcessorType::Dsp(_) => matches!(
                op_type,
                OperationType::Conv2d
                    | OperationType::DepthwiseConv
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Max
            ),
            // WebGPU has limited operations.
            ProcessorType::WebGpu => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Transpose
                    | OperationType::Reshape
            ),
            // WASM for portable compute, including CPU-side tokenization.
            ProcessorType::Wasm => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Tokenization
                    | OperationType::Detokenization
            ),
            // Custom processors - assume they can handle anything.
            ProcessorType::Custom { .. } => true,
        }
    }
    /// Calculate a score for assigning a task to a processor.
    ///
    /// Higher is better. Returns `f64::NEG_INFINITY` when the processor
    /// type cannot run the task's operation at all. When no metrics have
    /// been reported for the processor, utilization falls back to
    /// `load / 100`, power to 100 W and average completion to 100 ms.
    fn calculate_score(
        &self,
        task: &Task,
        processor_id: ProcessorId,
        processor_type: &ProcessorType,
    ) -> f64 {
        let strategy = *self.strategy.read();
        let load = self.get_load(processor_id);
        let metrics = self.metrics.read();
        let proc_metrics = metrics.get(&processor_id);
        // Base score from compatibility: incompatible processors can never win.
        if !self.can_execute(&task.operation, processor_type) {
            return f64::NEG_INFINITY;
        }
        // Get utilization and metrics (with the fallbacks described above).
        let utilization = proc_metrics.map(|m| m.utilization).unwrap_or(load as f64 / 100.0);
        let power = proc_metrics.map(|m| m.power_watts).unwrap_or(100.0);
        let avg_completion = proc_metrics.map(|m| m.avg_completion_ms).unwrap_or(100.0);
        // Calculate score based on strategy.
        match strategy {
            BalancingStrategy::Speed => {
                // Prioritize low utilization and fast completion.
                let speed_score = 1.0 / (avg_completion.max(1.0) * (1.0 + utilization));
                // Bonus for powerful processor types.
                let type_bonus = match processor_type {
                    ProcessorType::Gpu(_) => 2.0,
                    ProcessorType::Tpu(_) => 2.5,
                    ProcessorType::Lpu => 3.0, // Fastest for inference
                    ProcessorType::Npu(_) => 1.5,
                    _ => 1.0,
                };
                speed_score * type_bonus
            }
            BalancingStrategy::Energy => {
                // Prioritize low power consumption.
                let energy_score = 1.0 / power.max(1.0);
                // Bonus for efficient processor types.
                let efficiency_bonus = match processor_type {
                    ProcessorType::Npu(_) => 3.0, // Most efficient
                    ProcessorType::Lpu => 2.0,
                    ProcessorType::Cpu(_) => 1.5,
                    ProcessorType::Wasm => 2.0, // Low overhead
                    _ => 1.0,
                };
                energy_score * efficiency_bonus * (1.0 - utilization * 0.5)
            }
            BalancingStrategy::Balanced => {
                // Balance speed and energy with fixed 40/30/30 weights.
                let speed = 1.0 / avg_completion.max(1.0);
                let efficiency = 1.0 / power.max(1.0);
                let load_factor = 1.0 - utilization;
                (speed * 0.4 + efficiency * 0.3 + load_factor * 0.3)
            }
            BalancingStrategy::Cost => {
                // Prioritize cheaper resources (consumer devices).
                let cost_factor = match processor_type {
                    ProcessorType::Wasm => 0.1, // Cheapest (browser)
                    ProcessorType::WebGpu => 0.15,
                    ProcessorType::Cpu(_) => 0.2,
                    ProcessorType::Npu(_) => 0.3, // Mobile NPUs
                    ProcessorType::Gpu(_) => 0.5,
                    ProcessorType::Lpu => 0.8,
                    ProcessorType::Tpu(_) => 1.0, // Most expensive
                    _ => 0.5,
                };
                (1.0 - cost_factor) * (1.0 - utilization)
            }
            BalancingStrategy::Latency => {
                // Prioritize low latency for inference.
                let latency_score = 1.0 / avg_completion.max(0.1);
                // Bonus for low-latency processors.
                let latency_bonus = match processor_type {
                    ProcessorType::Lpu => 5.0, // Designed for low latency
                    ProcessorType::Npu(_) => 3.0,
                    ProcessorType::Gpu(_) => 2.0,
                    ProcessorType::Tpu(_) => 1.5,
                    _ => 1.0,
                };
                // Priority boost for critical tasks.
                let priority_boost = match task.priority {
                    TaskPriority::Critical => 2.0,
                    TaskPriority::High => 1.5,
                    TaskPriority::Normal => 1.0,
                    TaskPriority::Background => 0.5,
                };
                latency_score * latency_bonus * priority_boost * (1.0 - utilization)
            }
        }
    }
    /// Maybe rebalance a task to a different processor.
    ///
    /// Scores the suggested processor against every other registered
    /// processor and only switches when an alternative beats it by more
    /// than `rebalance_threshold` (hysteresis against thrashing). A switch
    /// is recorded in `migration_history`.
    ///
    /// NOTE(review): `current_assignment` is currently unused — confirm
    /// whether existing assignments were meant to influence the decision.
    pub fn maybe_rebalance(
        &self,
        task: &Task,
        suggested_processor: ProcessorId,
        current_assignment: &TaskAssignment,
    ) -> ProcessorId {
        // Get all registered processors.
        let processor_types = self.processor_types.read();
        // If we don't have processor info, use suggested.
        let suggested_type = match processor_types.get(&suggested_processor) {
            Some(t) => t.clone(),
            None => return suggested_processor,
        };
        // Calculate score for suggested processor.
        let suggested_score = self.calculate_score(task, suggested_processor, &suggested_type);
        // Find best alternative.
        let mut best_processor = suggested_processor;
        let mut best_score = suggested_score;
        for (proc_id, proc_type) in processor_types.iter() {
            if *proc_id == suggested_processor {
                continue;
            }
            let score = self.calculate_score(task, *proc_id, proc_type);
            // Only switch if significantly better (prevents thrashing).
            // Note: the bar compounds as `best_score` is raised.
            if score > best_score * (1.0 + self.rebalance_threshold) {
                best_score = score;
                best_processor = *proc_id;
            }
        }
        // Record migration if different.
        if best_processor != suggested_processor {
            self.migration_history.write().push(MigrationRecord {
                task_id: task.id,
                from: suggested_processor,
                to: best_processor,
                timestamp: Instant::now(),
            });
        }
        best_processor
    }
/// Check if work stealing should happen between two processors.
pub fn should_steal(&self, from: ProcessorId, to: ProcessorId) -> bool {
let from_load = self.get_load(from) as f64;
let to_load = self.get_load(to) as f64;
if from_load == 0.0 {
return false;
}
// Check if processor types are compatible for the queued work
let processor_types = self.processor_types.read();
let from_type = processor_types.get(&from);
let to_type = processor_types.get(&to);
// Only steal between same processor types by default
// (cross-type stealing requires operation compatibility check)
match (from_type, to_type) {
(Some(ft), Some(tt)) if std::mem::discriminant(ft) == std::mem::discriminant(tt) => {
let diff = (from_load - to_load) / from_load;
diff > self.steal_threshold
}
_ => false,
}
}
    /// Get rebalancing suggestions based on current load.
    ///
    /// Returns `(overloaded, underloaded)` processor pairs where both sides
    /// deviate from the mean load by more than `rebalance_threshold` and
    /// share the same processor type. Every compatible overloaded ×
    /// underloaded pair is emitted, so one underloaded processor may appear
    /// as the target of several suggestions.
    pub fn get_rebalance_suggestions(&self) -> Vec<(ProcessorId, ProcessorId)> {
        let mut suggestions = Vec::new();
        let loads = self.loads.read();
        // Snapshot the atomic counters so the thresholds below are computed
        // against one consistent set of values.
        let load_values: Vec<_> = loads.iter()
            .map(|(id, load)| (*id, load.load(Ordering::Relaxed)))
            .collect();
        if load_values.is_empty() {
            return suggestions;
        }
        let avg_load: f64 = load_values.iter().map(|(_, l)| *l as f64).sum::<f64>()
            / load_values.len() as f64;
        let processor_types = self.processor_types.read();
        let overloaded: Vec<_> = load_values.iter()
            .filter(|(_, l)| *l as f64 > avg_load * (1.0 + self.rebalance_threshold))
            .collect();
        let underloaded: Vec<_> = load_values.iter()
            .filter(|(_, l)| (*l as f64) < avg_load * (1.0 - self.rebalance_threshold))
            .collect();
        // Only suggest migrations between compatible processor types.
        for (over_id, _) in overloaded {
            let over_type = processor_types.get(over_id);
            for (under_id, _) in &underloaded {
                let under_type = processor_types.get(under_id);
                // Check type compatibility (same enum variant).
                if let (Some(ot), Some(ut)) = (over_type, under_type) {
                    if std::mem::discriminant(ot) == std::mem::discriminant(ut) {
                        suggestions.push((*over_id, *under_id));
                    }
                }
            }
        }
        suggestions
    }
/// Get load statistics.
pub fn get_stats(&self) -> LoadBalancerStats {
let loads = self.loads.read();
let metrics = self.metrics.read();
let total_load: u64 = loads.values().map(|l| l.load(Ordering::Relaxed)).sum();
let processor_count = loads.len();
let avg_load = if processor_count > 0 {
total_load as f64 / processor_count as f64
} else {
0.0
};
let total_utilization: f64 = metrics.values().map(|m| m.utilization).sum();
let avg_utilization = if processor_count > 0 {
total_utilization / processor_count as f64
} else {
0.0
};
let total_power: f64 = metrics.values().map(|m| m.power_watts).sum();
let migrations = self.migration_history.read().len();
LoadBalancerStats {
total_load,
avg_load,
processor_count,
avg_utilization,
total_power_watts: total_power,
total_migrations: migrations,
strategy: *self.strategy.read(),
}
}
/// Clean up old migration history.
pub fn cleanup_history(&self, max_age: Duration) {
let cutoff = Instant::now() - max_age;
self.migration_history.write().retain(|r| r.timestamp > cutoff);
}
}
impl Default for LoadBalancer {
    fn default() -> Self {
        // A default balancer is identical to `LoadBalancer::new()`.
        Self::new()
    }
}
/// Load balancer statistics.
///
/// Point-in-time snapshot produced by `LoadBalancer::get_stats`.
#[derive(Clone, Debug)]
pub struct LoadBalancerStats {
    /// Total tasks across all processors.
    pub total_load: u64,
    /// Average load per processor (0.0 when no processors are registered).
    pub avg_load: f64,
    /// Number of registered processors.
    pub processor_count: usize,
    /// Average utilization (0.0 - 1.0).
    pub avg_utilization: f64,
    /// Total power consumption (watts).
    pub total_power_watts: f64,
    /// Total migrations performed since the last history cleanup.
    pub total_migrations: usize,
    /// Current balancing strategy.
    pub strategy: BalancingStrategy,
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, GpuVariant, Operation, Precision};
    use crate::task::TaskStatus;

    /// Builds a minimal 1024³ fp32 MatMul task with the given priority.
    fn create_test_task(priority: TaskPriority) -> Task {
        Task {
            id: TaskId::new(),
            operation: Operation::MatMul {
                m: 1024,
                n: 1024,
                k: 1024,
                precision: Precision::Fp32,
            },
            priority,
            dependencies: vec![],
            status: TaskStatus::Pending,
            deadline: None,
        }
    }

    #[test]
    fn test_load_tracking() {
        let balancer = LoadBalancer::new();
        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));
        assert_eq!(balancer.get_load(ProcessorId(0)), 0);
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(1));
        assert_eq!(balancer.get_load(ProcessorId(0)), 2);
        assert_eq!(balancer.get_load(ProcessorId(1)), 1);
        balancer.decrement_load(ProcessorId(0));
        assert_eq!(balancer.get_load(ProcessorId(0)), 1);
    }

    #[test]
    fn test_should_steal_same_type() {
        let balancer = LoadBalancer::new();
        // Register two CPUs (same discriminant => stealing permitted).
        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));
        // Give processor 0 high load.
        for _ in 0..10 {
            balancer.increment_load(ProcessorId(0));
        }
        balancer.increment_load(ProcessorId(1));
        // Should steal between same types, but only from the loaded side.
        assert!(balancer.should_steal(ProcessorId(0), ProcessorId(1)));
        assert!(!balancer.should_steal(ProcessorId(1), ProcessorId(0)));
    }

    #[test]
    fn test_should_not_steal_different_types() {
        let balancer = LoadBalancer::new();
        // Register CPU and GPU (different discriminants).
        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(
            ProcessorId(1),
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) }),
        );
        // Give CPU high load.
        for _ in 0..10 {
            balancer.increment_load(ProcessorId(0));
        }
        // Should NOT steal between different types.
        assert!(!balancer.should_steal(ProcessorId(0), ProcessorId(1)));
    }

    #[test]
    fn test_can_execute() {
        let balancer = LoadBalancer::new();
        let matmul = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };
        let flash_attention = Operation::FlashAttention {
            batch: 32,
            seq_len: 2048,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };
        let cpu = ProcessorType::Cpu(CpuVariant::default());
        let gpu = ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) });
        let lpu = ProcessorType::Lpu;
        // MatMul can run on all.
        assert!(balancer.can_execute(&matmul, &cpu));
        assert!(balancer.can_execute(&matmul, &gpu));
        assert!(balancer.can_execute(&matmul, &lpu));
        // FlashAttention only on GPU/TPU/LPU — the CPU table excludes it.
        assert!(!balancer.can_execute(&flash_attention, &cpu));
        assert!(balancer.can_execute(&flash_attention, &gpu));
    }

    #[test]
    fn test_strategy_affects_scoring() {
        let balancer = LoadBalancer::new();
        let cpu_id = ProcessorId(0);
        let npu_id = ProcessorId(1);
        balancer.register_processor(cpu_id, ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(npu_id, ProcessorType::Npu(crate::processor::NpuVariant::AppleNeuralEngine { cores: 16 }));
        let task = create_test_task(TaskPriority::Normal);
        // Energy strategy should prefer NPU (3.0 efficiency bonus vs 1.5).
        balancer.set_strategy(BalancingStrategy::Energy);
        let assignment = TaskAssignment::new();
        let result = balancer.maybe_rebalance(&task, cpu_id, &assignment);
        // NPU should be preferred for energy efficiency.
        assert_eq!(result, npu_id);
    }

    #[test]
    fn test_stats() {
        let balancer = LoadBalancer::new();
        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(1));
        let stats = balancer.get_stats();
        assert_eq!(stats.total_load, 3);
        assert_eq!(stats.processor_count, 2);
        assert!((stats.avg_load - 1.5).abs() < 0.01);
    }
}

View file

@ -0,0 +1,559 @@
//! Heterogeneous scheduler for multi-processor task assignment.
//!
//! Features:
//! - Optimal task-to-processor assignment
//! - Work stealing for load balancing
//! - Pipeline parallelism across processor types
//! - Dynamic rebalancing based on actual throughput
mod load_balancer;
mod work_queue;
pub use load_balancer::LoadBalancer;
pub use work_queue::WorkQueue;
use crate::device::DeviceRegistry;
use crate::error::ComputeError;
use crate::processor::{Operation, Processor, ProcessorId, ProcessorType};
use crate::task::{Task, TaskId, TaskPriority};
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
/// Heterogeneous scheduler that manages tasks across all processor types.
///
/// Produces staged [`Schedule`]s (dependency-respecting parallel waves),
/// assigns tasks to processors via the [`LoadBalancer`], and executes
/// stages with one spawned task per operation.
pub struct HeterogeneousScheduler {
    /// Device registry.
    device_registry: Arc<DeviceRegistry>,
    /// Per-processor-type task queues.
    queues: RwLock<HashMap<ProcessorType, WorkQueue>>,
    /// Load balancer.
    load_balancer: LoadBalancer,
    /// Active schedules, keyed by the id minted in `schedule()`.
    active_schedules: RwLock<HashMap<ScheduleId, Schedule>>,
}
impl HeterogeneousScheduler {
/// Creates a new heterogeneous scheduler.
pub fn new(device_registry: Arc<DeviceRegistry>) -> Self {
Self {
device_registry,
queues: RwLock::new(HashMap::new()),
load_balancer: LoadBalancer::new(),
active_schedules: RwLock::new(HashMap::new()),
}
}
    /// Schedule a set of tasks for execution.
    ///
    /// Builds the dependency graph, assigns each task to a processor,
    /// groups tasks into parallel stages, and stores the result in
    /// `active_schedules`. An empty input yields an empty schedule.
    ///
    /// # Errors
    /// Fails when no processor can run an operation or when a circular
    /// dependency is detected while staging.
    pub async fn schedule(&self, tasks: Vec<Task>) -> Result<ScheduleResult, ComputeError> {
        if tasks.is_empty() {
            return Ok(ScheduleResult {
                schedule: Schedule::empty(),
                estimated_makespan: Duration::ZERO,
                processor_utilization: HashMap::new(),
            });
        }
        // 1. Build dependency graph
        let deps = self.build_dependency_graph(&tasks);
        // 2. Assign tasks to optimal processors
        let assignment = self.assign_tasks(&tasks, &deps).await?;
        // 3. Create execution schedule with stages
        let schedule = self.create_schedule(&tasks, &assignment, &deps)?;
        // 4. Estimate metrics
        let makespan = self.estimate_makespan(&schedule);
        let utilization = self.estimate_utilization(&schedule);
        // 5. Store active schedule
        self.active_schedules.write().insert(schedule.id, schedule.clone());
        Ok(ScheduleResult {
            schedule,
            estimated_makespan: makespan,
            processor_utilization: utilization,
        })
    }
    /// Execute a schedule.
    ///
    /// Stages run sequentially; tasks within a stage run concurrently,
    /// each on its own spawned tokio task. Per-task failures (operation
    /// errors or join errors) are captured as
    /// [`TaskExecutionResult::Failed`] rather than aborting the run.
    ///
    /// # Errors
    /// Fails only when the schedule itself is inconsistent (missing task,
    /// missing assignment, or unknown processor).
    pub async fn execute(&self, schedule: &Schedule) -> Result<ExecutionResult, ComputeError> {
        let mut results = HashMap::new();
        let start = std::time::Instant::now();
        // Execute stages in order.
        for stage in &schedule.stages {
            // Execute all tasks in this stage in parallel.
            let mut handles = Vec::new();
            for task_id in &stage.tasks {
                let task = schedule.tasks.get(task_id)
                    .ok_or_else(|| ComputeError::Internal(format!("Task not found: {:?}", task_id)))?;
                let processor_id = schedule.assignment.get(task_id)
                    .ok_or_else(|| ComputeError::Internal(format!("No assignment for task: {:?}", task_id)))?;
                let processor = self.device_registry.get_processor(processor_id)?;
                // Clone so the spawned future owns its task outright.
                let task_clone = task.clone();
                handles.push(tokio::spawn(async move {
                    processor.execute(task_clone.operation).await
                }));
            }
            // Wait for all tasks in the stage; handles[i] belongs to stage.tasks[i].
            for (i, handle) in handles.into_iter().enumerate() {
                let task_id = stage.tasks[i];
                match handle.await {
                    Ok(Ok(result)) => {
                        results.insert(task_id, TaskExecutionResult::Success(result));
                    }
                    Ok(Err(e)) => {
                        // The operation itself failed.
                        results.insert(task_id, TaskExecutionResult::Failed(e.to_string()));
                    }
                    Err(e) => {
                        // The spawned task panicked or was cancelled.
                        results.insert(task_id, TaskExecutionResult::Failed(e.to_string()));
                    }
                }
            }
        }
        let total_time = start.elapsed();
        Ok(ExecutionResult {
            results,
            total_time,
            actual_utilization: self.measure_utilization(),
        })
    }
    /// Assign tasks to optimal processors.
    ///
    /// Visits tasks in dependency/priority order, picks the best-scoring
    /// processor for each, then gives the load balancer a chance to
    /// override the choice.
    ///
    /// # Errors
    /// Fails when some task's operation has no capable processor.
    async fn assign_tasks(
        &self,
        tasks: &[Task],
        deps: &DependencyGraph,
    ) -> Result<TaskAssignment, ComputeError> {
        let mut assignment = TaskAssignment::new();
        // Sort tasks by priority and dependencies (topological sort).
        let sorted_tasks = self.topological_sort(tasks, deps);
        for task in sorted_tasks {
            // Find best processor for this task.
            let best_processor = self.find_best_processor(&task).await?;
            // Check if we should rebalance.
            let final_processor = self.load_balancer
                .maybe_rebalance(&task, best_processor, &assignment);
            assignment.assign(task.id, final_processor);
        }
        Ok(assignment)
    }
/// Find the best processor for a task.
async fn find_best_processor(&self, task: &Task) -> Result<ProcessorId, ComputeError> {
let mut best_score = f64::NEG_INFINITY;
let mut best_processor = None;
// Get all available processors
let processors = self.device_registry.all_processors();
for processor in processors {
if !processor.can_execute(&task.operation) {
continue;
}
// Calculate score based on multiple factors
let exec_time = processor.estimate_time(&task.operation);
let energy = processor.estimate_energy(&task.operation);
let load = processor.utilization();
// Score = 1 / (time * (1 + load) * energy_factor)
let time_factor = exec_time.as_secs_f64().max(0.001);
let load_factor = 1.0 + load;
let energy_factor = 1.0 + (energy / 1000.0); // Normalize energy
let score = 1.0 / (time_factor * load_factor * energy_factor);
if score > best_score {
best_score = score;
best_processor = Some(processor.id());
}
}
best_processor.ok_or_else(|| {
ComputeError::NoSuitableProcessor(format!("{:?}", task.operation.op_type()))
})
}
/// Build dependency graph from tasks.
fn build_dependency_graph(&self, tasks: &[Task]) -> DependencyGraph {
let mut graph = DependencyGraph::new();
for task in tasks {
graph.add_node(task.id);
for dep in &task.dependencies {
graph.add_edge(*dep, task.id);
}
}
graph
}
/// Topological sort of tasks respecting dependencies.
fn topological_sort(&self, tasks: &[Task], deps: &DependencyGraph) -> Vec<Task> {
let mut sorted = Vec::new();
let mut visited = std::collections::HashSet::new();
let task_map: HashMap<TaskId, Task> = tasks.iter()
.map(|t| (t.id, t.clone()))
.collect();
fn visit(
task_id: TaskId,
task_map: &HashMap<TaskId, Task>,
deps: &DependencyGraph,
visited: &mut std::collections::HashSet<TaskId>,
sorted: &mut Vec<Task>,
) {
if visited.contains(&task_id) {
return;
}
visited.insert(task_id);
// Visit dependencies first
if let Some(task_deps) = deps.dependencies.get(&task_id) {
for dep in task_deps {
visit(*dep, task_map, deps, visited, sorted);
}
}
if let Some(task) = task_map.get(&task_id) {
sorted.push(task.clone());
}
}
for task in tasks {
visit(task.id, &task_map, deps, &mut visited, &mut sorted);
}
// Sort by priority within dependency constraints
sorted.sort_by(|a, b| b.priority.cmp(&a.priority));
sorted
}
    /// Create execution schedule with parallel stages.
    ///
    /// Repeatedly sweeps the task list, placing every task whose declared
    /// `dependencies` are all in earlier stages into the next stage
    /// (O(stages * tasks) sweep).
    ///
    /// NOTE(review): the `deps` parameter is currently unused — stage
    /// construction reads `task.dependencies` directly; confirm whether the
    /// graph was meant to drive this instead.
    ///
    /// # Errors
    /// Returns `ComputeError::SchedulingFailed` when a sweep makes no
    /// progress, i.e. the remaining tasks form a dependency cycle (or depend
    /// on tasks not in the batch).
    fn create_schedule(
        &self,
        tasks: &[Task],
        assignment: &TaskAssignment,
        deps: &DependencyGraph,
    ) -> Result<Schedule, ComputeError> {
        let mut stages = Vec::new();
        let mut scheduled = std::collections::HashSet::new();
        let task_map: HashMap<TaskId, Task> = tasks.iter()
            .map(|t| (t.id, t.clone()))
            .collect();
        while scheduled.len() < tasks.len() {
            let mut stage_tasks = Vec::new();
            for task in tasks {
                if scheduled.contains(&task.id) {
                    continue;
                }
                // Check if all dependencies are satisfied by earlier stages.
                let deps_satisfied = task.dependencies.iter()
                    .all(|dep| scheduled.contains(dep));
                if deps_satisfied {
                    stage_tasks.push(task.id);
                }
            }
            if stage_tasks.is_empty() {
                // No progress in a full sweep => unsatisfiable dependencies.
                return Err(ComputeError::SchedulingFailed(
                    "Circular dependency detected".to_string()
                ));
            }
            for task_id in &stage_tasks {
                scheduled.insert(*task_id);
            }
            stages.push(ScheduleStage {
                stage_id: stages.len(),
                tasks: stage_tasks,
            });
        }
        Ok(Schedule {
            id: ScheduleId::new(),
            tasks: task_map,
            assignment: assignment.clone(),
            stages,
        })
    }
/// Estimate makespan (total execution time).
fn estimate_makespan(&self, schedule: &Schedule) -> Duration {
let mut total = Duration::ZERO;
for stage in &schedule.stages {
let mut max_stage_time = Duration::ZERO;
for task_id in &stage.tasks {
if let (Some(task), Some(proc_id)) = (
schedule.tasks.get(task_id),
schedule.assignment.get(task_id),
) {
if let Ok(processor) = self.device_registry.get_processor(proc_id) {
let time = processor.estimate_time(&task.operation);
max_stage_time = max_stage_time.max(time);
}
}
}
total += max_stage_time;
}
total
}
    /// Estimate processor utilization.
    ///
    /// For each processor type, sums the estimated busy time of its
    /// assigned tasks and divides by the overall makespan, clamped to 1.0.
    /// Returns an empty map for an empty schedule.
    fn estimate_utilization(&self, schedule: &Schedule) -> HashMap<ProcessorType, f64> {
        let mut work_time: HashMap<ProcessorType, Duration> = HashMap::new();
        let makespan = self.estimate_makespan(schedule);
        for task_id in schedule.assignment.assignments.keys() {
            if let (Some(task), Some(proc_id)) = (
                schedule.tasks.get(task_id),
                schedule.assignment.get(task_id),
            ) {
                if let Ok(processor) = self.device_registry.get_processor(proc_id) {
                    let proc_type = processor.processor_type();
                    let time = processor.estimate_time(&task.operation);
                    *work_time.entry(proc_type).or_default() += time;
                }
            }
        }
        work_time
            .into_iter()
            .map(|(proc_type, time)| {
                // Guard against a zero makespan (e.g. all estimates zero).
                let utilization = if makespan.as_secs_f64() > 0.0 {
                    time.as_secs_f64() / makespan.as_secs_f64()
                } else {
                    0.0
                };
                (proc_type, utilization.min(1.0))
            })
            .collect()
    }
/// Measure actual current utilization.
fn measure_utilization(&self) -> HashMap<ProcessorType, f64> {
let mut utilization = HashMap::new();
for processor in self.device_registry.all_processors() {
let proc_type = processor.processor_type();
let util = processor.utilization();
utilization
.entry(proc_type)
.and_modify(|u| *u = (*u + util) / 2.0)
.or_insert(util);
}
utilization
}
}
/// Schedule identifier.
///
/// A random 64-bit id; collisions are possible in principle but
/// negligibly likely for the expected number of concurrent schedules.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ScheduleId(pub u64);

impl ScheduleId {
    /// Creates a new schedule ID from a thread-local RNG.
    pub fn new() -> Self {
        use rand::Rng;
        ScheduleId(rand::thread_rng().gen())
    }
}

impl Default for ScheduleId {
    fn default() -> Self {
        Self::new()
    }
}
/// Task-to-processor assignment.
#[derive(Clone, Debug, Default)]
pub struct TaskAssignment {
    /// Map from task ID to processor ID.
    pub assignments: HashMap<TaskId, ProcessorId>,
}

impl TaskAssignment {
    /// Creates a new empty assignment.
    pub fn new() -> Self {
        Self {
            assignments: HashMap::new(),
        }
    }

    /// Assigns a task to a processor, replacing any previous assignment.
    pub fn assign(&mut self, task_id: TaskId, processor_id: ProcessorId) {
        self.assignments.insert(task_id, processor_id);
    }

    /// Gets the assigned processor for a task, or `None` if unassigned.
    pub fn get(&self, task_id: &TaskId) -> Option<ProcessorId> {
        self.assignments.get(task_id).copied()
    }
}
/// Dependency graph for tasks.
///
/// Maintains both directions of every edge so forward (who do I wait on?)
/// and reverse (who waits on me?) lookups are O(1).
#[derive(Clone, Debug, Default)]
pub struct DependencyGraph {
    /// Dependencies: task -> list of tasks it depends on.
    pub dependencies: HashMap<TaskId, Vec<TaskId>>,
    /// Dependents: task -> list of tasks that depend on it.
    pub dependents: HashMap<TaskId, Vec<TaskId>>,
}

impl DependencyGraph {
    /// Creates a new empty dependency graph.
    pub fn new() -> Self {
        Self {
            dependencies: HashMap::new(),
            dependents: HashMap::new(),
        }
    }

    /// Adds a node (task) to the graph.
    ///
    /// Idempotent: existing adjacency lists are left untouched.
    pub fn add_node(&mut self, task_id: TaskId) {
        self.dependencies.entry(task_id).or_default();
        self.dependents.entry(task_id).or_default();
    }

    /// Adds a dependency edge: `to` depends on `from`.
    ///
    /// (The previous doc comment stated the direction backwards.) Both
    /// endpoints are also registered as nodes, so the two maps stay
    /// consistent even when callers add edges without calling
    /// [`DependencyGraph::add_node`] first.
    pub fn add_edge(&mut self, from: TaskId, to: TaskId) {
        self.add_node(from);
        self.add_node(to);
        self.dependencies.entry(to).or_default().push(from);
        self.dependents.entry(from).or_default().push(to);
    }
}
/// Execution schedule.
///
/// Stages are executed in order; tasks inside a stage have no mutual
/// dependencies and may run in parallel.
#[derive(Clone, Debug)]
pub struct Schedule {
    /// Schedule ID.
    pub id: ScheduleId,
    /// All tasks, keyed by task id.
    pub tasks: HashMap<TaskId, Task>,
    /// Task assignments.
    pub assignment: TaskAssignment,
    /// Execution stages (tasks within a stage can run in parallel).
    pub stages: Vec<ScheduleStage>,
}

impl Schedule {
    /// Creates an empty schedule (fresh random id, no tasks or stages).
    pub fn empty() -> Self {
        Self {
            id: ScheduleId::new(),
            tasks: HashMap::new(),
            assignment: TaskAssignment::new(),
            stages: Vec::new(),
        }
    }
}
/// A stage of parallel tasks.
#[derive(Clone, Debug)]
pub struct ScheduleStage {
    /// Stage index (position within `Schedule::stages`).
    pub stage_id: usize,
    /// Tasks in this stage (no mutual dependencies; can run in parallel).
    pub tasks: Vec<TaskId>,
}
/// Result of scheduling, returned by `HeterogeneousScheduler::schedule`.
#[derive(Clone, Debug)]
pub struct ScheduleResult {
    /// The schedule.
    pub schedule: Schedule,
    /// Estimated total execution time (sum of slowest task per stage).
    pub estimated_makespan: Duration,
    /// Estimated processor utilization by type (0.0 - 1.0).
    pub processor_utilization: HashMap<ProcessorType, f64>,
}
/// Result of execution, returned by `HeterogeneousScheduler::execute`.
#[derive(Clone, Debug)]
pub struct ExecutionResult {
    /// Results per task (success or failure; failures do not abort the run).
    pub results: HashMap<TaskId, TaskExecutionResult>,
    /// Total wall-clock execution time.
    pub total_time: Duration,
    /// Actual processor utilization measured after the run.
    pub actual_utilization: HashMap<ProcessorType, f64>,
}
/// Result of a single task execution.
#[derive(Clone, Debug)]
pub enum TaskExecutionResult {
    /// Task completed successfully.
    Success(crate::processor::OperationResult),
    /// Task failed; carries the operation error or join/panic message.
    Failed(String),
}
#[cfg(test)]
mod tests {
    use super::*;

    // The previous version carried an unused `create_test_task` helper and
    // unused `Precision`/`TaskStatus` imports, which produced dead-code
    // warnings; only what the tests actually exercise is kept.

    #[test]
    fn test_dependency_graph() {
        let mut graph = DependencyGraph::new();
        graph.add_node(TaskId(1));
        graph.add_node(TaskId(2));
        graph.add_node(TaskId(3));
        graph.add_edge(TaskId(1), TaskId(2)); // 2 depends on 1
        graph.add_edge(TaskId(1), TaskId(3)); // 3 depends on 1
        graph.add_edge(TaskId(2), TaskId(3)); // 3 depends on 2
        assert_eq!(graph.dependencies[&TaskId(2)], vec![TaskId(1)]);
        assert_eq!(graph.dependencies[&TaskId(3)], vec![TaskId(1), TaskId(2)]);
    }

    #[test]
    fn test_task_assignment() {
        let mut assignment = TaskAssignment::new();
        assignment.assign(TaskId(1), ProcessorId(0));
        assignment.assign(TaskId(2), ProcessorId(1));
        assert_eq!(assignment.get(&TaskId(1)), Some(ProcessorId(0)));
        assert_eq!(assignment.get(&TaskId(2)), Some(ProcessorId(1)));
        assert_eq!(assignment.get(&TaskId(3)), None);
    }
}

View file

@ -0,0 +1,271 @@
//! Work queue with thread-safe task management.
use crate::processor::ProcessorType;
use crate::task::{Task, TaskId, TaskPriority};
use crossbeam_channel::{bounded, Receiver, Sender, TryRecvError};
use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
/// Work queue for a specific processor type.
///
/// Backed by a bounded crossbeam channel; `size` and `processed` are
/// relaxed atomic counters maintained alongside channel operations and
/// should be treated as approximate under concurrency.
pub struct WorkQueue {
    /// Task sender (for producers).
    sender: Sender<Task>,
    /// Task receiver (for consumers).
    receiver: Receiver<Task>,
    /// Processor type this queue is for.
    processor_type: ProcessorType,
    /// Current queue size (approximate).
    size: AtomicU64,
    /// Total tasks popped from this queue.
    processed: AtomicU64,
}
impl WorkQueue {
/// Creates a new work queue for a processor type.
pub fn new(processor_type: ProcessorType, capacity: usize) -> Self {
let (sender, receiver) = bounded(capacity.max(1024));
Self {
sender,
receiver,
processor_type,
size: AtomicU64::new(0),
processed: AtomicU64::new(0),
}
}
/// Push a task to the queue.
pub fn push(&self, task: Task) {
if self.sender.try_send(task).is_ok() {
self.size.fetch_add(1, Ordering::Relaxed);
}
}
/// Pop a task from the queue (ignores worker_id for compatibility).
pub fn pop(&self, _worker_id: usize) -> Option<Task> {
self.pop_any()
}
/// Pop any task from the queue.
pub fn pop_any(&self) -> Option<Task> {
match self.receiver.try_recv() {
Ok(task) => {
self.size.fetch_sub(1, Ordering::Relaxed);
self.processed.fetch_add(1, Ordering::Relaxed);
Some(task)
}
Err(TryRecvError::Empty) | Err(TryRecvError::Disconnected) => None,
}
}
/// Pop from global queue (alias for pop_any).
pub fn pop_global(&self) -> Option<Task> {
self.pop_any()
}
/// Steal a batch of tasks from another queue.
pub fn steal_batch_from(&self, other: &WorkQueue, max_tasks: usize) -> Vec<Task> {
let mut stolen = Vec::new();
while stolen.len() < max_tasks {
if let Some(task) = other.pop_any() {
stolen.push(task);
} else {
break;
}
}
// Push stolen tasks to this queue
for task in &stolen {
// Tasks are already accounted for in `other`, just push to self
if self.sender.try_send(task.clone()).is_ok() {
self.size.fetch_add(1, Ordering::Relaxed);
}
}
stolen
}
/// Get current queue size.
pub fn len(&self) -> usize {
self.size.load(Ordering::Relaxed) as usize
}
/// Check if queue is empty.
pub fn is_empty(&self) -> bool {
self.len() == 0
}
/// Get number of tasks processed.
pub fn processed_count(&self) -> u64 {
self.processed.load(Ordering::Relaxed)
}
/// Get processor type for this queue.
pub fn processor_type(&self) -> ProcessorType {
self.processor_type.clone()
}
/// Get utilization estimate (0.0 - 1.0).
pub fn utilization(&self) -> f64 {
let size = self.size.load(Ordering::Relaxed) as f64;
let capacity = self.sender.capacity().unwrap_or(1024) as f64;
(size / capacity).min(1.0)
}
/// Get a stealer for cross-queue work stealing.
pub fn get_stealer(&self) -> QueueStealer {
QueueStealer {
receiver: self.receiver.clone(),
}
}
}
/// Stealer handle for cross-queue work stealing.
///
/// NOTE(review): stealing through this handle bypasses the owning
/// `WorkQueue`'s `size`/`processed` counters, so those metrics drift after
/// a steal — confirm whether any caller relies on them being exact.
#[derive(Clone)]
pub struct QueueStealer {
    receiver: Receiver<Task>,
}

impl QueueStealer {
    /// Try to steal a task.
    ///
    /// Returns `None` when the source queue is empty or disconnected.
    pub fn steal(&self) -> Option<Task> {
        self.receiver.try_recv().ok()
    }
}
/// Priority queue wrapper for tasks.
///
/// Maintains one bounded [`WorkQueue`] per priority level; `pop` always
/// drains more urgent levels first.
pub struct PriorityWorkQueue {
    /// Queues by priority level.
    queues: HashMap<TaskPriority, WorkQueue>,
    /// Processor type.
    processor_type: ProcessorType,
}

impl PriorityWorkQueue {
    /// Priority levels ordered from most to least urgent; `pop` scans in
    /// this order.
    const PRIORITY_ORDER: [TaskPriority; 4] = [
        TaskPriority::Critical,
        TaskPriority::High,
        TaskPriority::Normal,
        TaskPriority::Background,
    ];

    /// Creates a new priority work queue with one inner queue per level.
    pub fn new(processor_type: ProcessorType, capacity_per_priority: usize) -> Self {
        let queues = Self::PRIORITY_ORDER
            .into_iter()
            .map(|p| (p, WorkQueue::new(processor_type.clone(), capacity_per_priority)))
            .collect();
        Self {
            queues,
            processor_type,
        }
    }

    /// Push a task onto the queue matching its priority.
    pub fn push(&self, task: Task) {
        if let Some(queue) = self.queues.get(&task.priority) {
            queue.push(task);
        }
    }

    /// Pop the highest-priority task available
    /// (Critical > High > Normal > Background).
    pub fn pop(&self, worker_id: usize) -> Option<Task> {
        Self::PRIORITY_ORDER
            .into_iter()
            .filter_map(|priority| self.queues.get(&priority))
            .find_map(|queue| queue.pop(worker_id))
    }

    /// Get total queue size across all priority levels.
    pub fn len(&self) -> usize {
        self.queues.values().map(WorkQueue::len).sum()
    }

    /// Check if all queues are empty.
    pub fn is_empty(&self) -> bool {
        self.queues.values().all(WorkQueue::is_empty)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, Operation, Precision};
    use crate::task::TaskStatus;

    /// Builds a pending 1024x1024x1024 FP32 MatMul task with a fixed id.
    fn create_test_task(id: u64, priority: TaskPriority) -> Task {
        Task {
            id: TaskId(id),
            operation: Operation::MatMul {
                m: 1024,
                n: 1024,
                k: 1024,
                precision: Precision::Fp32,
            },
            priority,
            dependencies: vec![],
            status: TaskStatus::Pending,
            deadline: None,
        }
    }

    #[test]
    fn test_work_queue_basic() {
        let queue = WorkQueue::new(ProcessorType::Cpu(CpuVariant::default()), 100);
        assert!(queue.is_empty());

        for id in [1, 2] {
            queue.push(create_test_task(id, TaskPriority::Normal));
        }
        assert_eq!(queue.len(), 2);

        // Pops shrink the queue one at a time until it is empty again.
        assert!(queue.pop(0).is_some());
        assert_eq!(queue.len(), 1);
        assert!(queue.pop(0).is_some());
        assert!(queue.is_empty());
    }

    #[test]
    fn test_priority_queue() {
        let queue = PriorityWorkQueue::new(ProcessorType::Cpu(CpuVariant::default()), 100);
        queue.push(create_test_task(1, TaskPriority::Background));
        queue.push(create_test_task(2, TaskPriority::Critical));
        queue.push(create_test_task(3, TaskPriority::Normal));

        // Critical drains first, and keeps its priority tag.
        let first = queue.pop(0).unwrap();
        assert_eq!(first.id, TaskId(2));
        assert_eq!(first.priority, TaskPriority::Critical);

        // Then Normal, then Background.
        assert_eq!(queue.pop(0).unwrap().id, TaskId(3));
        assert_eq!(queue.pop(0).unwrap().id, TaskId(1));
    }
}

View file

@ -0,0 +1,543 @@
//! Task definitions and decomposition.
use crate::error::ComputeError;
use crate::processor::{Operation, OperationType, Precision, ProcessorType};
use crate::{ComputeJob, JobType};
use serde::{Deserialize, Serialize};
use std::time::Duration;
/// Unique task identifier.
///
/// Thin newtype over a `u64`. Values produced by `TaskId::new` are
/// random, so uniqueness is probabilistic rather than guaranteed.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TaskId(pub u64);
impl TaskId {
/// Creates a new task ID.
pub fn new() -> Self {
use rand::Rng;
TaskId(rand::thread_rng().gen())
}
}
impl Default for TaskId {
fn default() -> Self {
Self::new()
}
}
impl std::fmt::Display for TaskId {
    /// Renders as `task_<raw u64>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("task_{}", self.0))
    }
}
/// Task priority levels.
///
/// Discriminants are assigned so the derived `Ord` ranks
/// `Background < Normal < High < Critical`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum TaskPriority {
    /// Background, can be preempted.
    Background = 0,
    /// Normal priority.
    Normal = 1,
    /// High priority.
    High = 2,
    /// Critical, must complete.
    Critical = 3,
}
impl Default for TaskPriority {
fn default() -> Self {
TaskPriority::Normal
}
}
/// Task execution status.
///
/// Variant comments suggest a lifecycle of Pending -> Queued ->
/// Running -> Completed/Failed/Cancelled.
// NOTE(review): state transitions are driven outside this module —
// confirm against the scheduler.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum TaskStatus {
    /// Waiting to be scheduled.
    Pending,
    /// Queued for execution.
    Queued,
    /// Currently executing.
    Running,
    /// Completed successfully.
    Completed,
    /// Failed.
    Failed,
    /// Cancelled.
    Cancelled,
}
/// A schedulable task.
///
/// The smallest unit the scheduler places on a work queue; built
/// directly or via `TaskDecomposer` from a `ComputeJob`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Task {
    /// Task ID.
    pub id: TaskId,
    /// Operation to execute.
    pub operation: Operation,
    /// Priority level.
    pub priority: TaskPriority,
    /// Dependencies (tasks that must complete first).
    pub dependencies: Vec<TaskId>,
    /// Current status.
    pub status: TaskStatus,
    /// Deadline (optional).
    // NOTE(review): the unit (unix timestamp? milliseconds?) is not
    // specified here — confirm against the scheduler's deadline checks.
    pub deadline: Option<u64>,
}
impl Task {
/// Creates a new task.
pub fn new(operation: Operation) -> Self {
Self {
id: TaskId::new(),
operation,
priority: TaskPriority::Normal,
dependencies: Vec::new(),
status: TaskStatus::Pending,
deadline: None,
}
}
/// Sets the priority.
pub fn with_priority(mut self, priority: TaskPriority) -> Self {
self.priority = priority;
self
}
/// Adds dependencies.
pub fn with_dependencies(mut self, deps: Vec<TaskId>) -> Self {
self.dependencies = deps;
self
}
/// Sets deadline.
pub fn with_deadline(mut self, deadline: u64) -> Self {
self.deadline = Some(deadline);
self
}
/// Checks if task is compatible with a processor type.
pub fn is_compatible_with(&self, proc_type: ProcessorType) -> bool {
// Check based on operation type
let op_type = self.operation.op_type();
match proc_type {
ProcessorType::Cpu(_) => {
// CPUs can do most things, but slowly
true
}
ProcessorType::Gpu(_) => {
// GPUs are good for parallel operations
matches!(
op_type,
OperationType::MatMul
| OperationType::Conv2d
| OperationType::SelfAttention
| OperationType::FlashAttention
| OperationType::Embedding
| OperationType::Add
| OperationType::Mul
| OperationType::Softmax
)
}
ProcessorType::Tpu(_) => {
// TPUs are good for large matrix ops
matches!(
op_type,
OperationType::MatMul
| OperationType::Conv2d
| OperationType::SelfAttention
| OperationType::FlashAttention
)
}
ProcessorType::Lpu => {
// LPUs are good for sequential inference
matches!(
op_type,
OperationType::MatMul
| OperationType::SelfAttention
| OperationType::KVCache
| OperationType::Sampling
)
}
ProcessorType::Npu(_) => {
// NPUs are good for inference
matches!(
op_type,
OperationType::MatMul
| OperationType::Conv2d
| OperationType::Add
| OperationType::Softmax
)
}
_ => true, // Default to compatible
}
}
}
/// Result of task execution.
///
/// Unlike `Task`, this carries no serde derives — results stay
/// in-process as produced by an executor.
#[derive(Clone, Debug)]
pub struct TaskResult {
    /// Task ID.
    pub task_id: TaskId,
    /// Output data (raw bytes; format depends on the operation).
    pub output: Vec<u8>,
    /// Execution duration.
    pub duration: Duration,
    /// Energy consumed (Joules).
    pub energy: f64,
}
/// Compute task for job execution.
///
/// Pairs a raw `Task` with scheduling hints: hard resource
/// requirements plus preferred/fallback processor placement.
#[derive(Clone, Debug)]
pub struct ComputeTask {
    /// Task.
    pub task: Task,
    /// Resource requirements.
    pub requirements: TaskRequirements,
    /// Preferred processor type (scheduler tries this first).
    pub preferred_processor: Option<ProcessorType>,
    /// Fallback processor type (used when the preferred one is unavailable).
    pub fallback_processor: Option<ProcessorType>,
}
/// Task resource requirements.
///
/// `Default` yields the least restrictive profile: zero minimums and
/// no latency or precision constraint.
#[derive(Clone, Debug, Default)]
pub struct TaskRequirements {
    /// Minimum memory (bytes).
    pub min_memory: u64,
    /// Minimum TFLOPS.
    pub min_tflops: f64,
    /// Maximum latency (ms); `None` means no latency bound.
    pub max_latency_ms: Option<u32>,
    /// Requires specific precision; `None` means any precision is fine.
    pub precision: Option<Precision>,
}
/// Decomposed workload.
///
/// Flat list of tasks produced from a job, with aggregate cost
/// estimates for scheduling/budgeting.
#[derive(Clone, Debug)]
pub struct DecomposedWorkload {
    /// All tasks (dependency edges live inside each `Task`).
    pub tasks: Vec<Task>,
    /// Total estimated FLOPS.
    pub estimated_flops: f64,
    /// Total estimated memory (bytes).
    pub estimated_memory: u64,
}
/// Task decomposer that breaks jobs into schedulable tasks.
pub struct TaskDecomposer {
    /// Default batch size for inference.
    // NOTE(review): currently unused — `decompose_inference` takes the
    // batch size from the job itself. Confirm whether this field should
    // act as a fallback or be removed.
    inference_batch_size: usize,
    /// Default precision applied to generated compute ops (FP16 by default).
    default_precision: Precision,
}
impl TaskDecomposer {
    /// Creates a new task decomposer with default settings
    /// (inference batch size 32, FP16 precision).
    pub fn new() -> Self {
        Self {
            inference_batch_size: 32,
            default_precision: Precision::Fp16,
        }
    }

    /// Decomposes a job into schedulable tasks.
    ///
    /// Training and inference jobs become a dependency chain of
    /// pipeline-stage tasks; container, serverless, and WASM jobs are
    /// modelled as a single generic task.
    pub fn decompose(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        match &job.job_type {
            JobType::Training { .. } => self.decompose_training(job),
            JobType::Inference { .. } => self.decompose_inference(job),
            JobType::Container { .. } => self.decompose_container(job),
            JobType::Serverless { .. } => self.decompose_serverless(job),
            JobType::Wasm { .. } => self.decompose_wasm(job),
        }
    }

    /// Decompose a training job into load -> preprocess -> forward ->
    /// backward -> optimizer-step, chained via dependencies.
    ///
    /// NOTE(review): `epochs` is intentionally not bound (it previously
    /// produced an unused-variable warning) — only a single pipeline
    /// pass is modelled regardless of epoch count. Confirm whether
    /// per-epoch expansion is intended.
    fn decompose_training(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        let mut tasks = Vec::new();
        if let JobType::Training { batch_size, .. } = &job.job_type {
            let batch = *batch_size as usize;

            // Data loading task (placeholder 100 MB payload).
            let load = Task::new(Operation::DataLoad {
                bytes: 1024 * 1024 * 100, // 100MB
                async_: true,
            })
            .with_priority(TaskPriority::High);
            let data_load_id = load.id;
            tasks.push(load);

            // Preprocessing task, gated on the data load.
            let preprocess = Task::new(Operation::DataPreprocess {
                batch,
                transforms: vec!["normalize".to_string(), "augment".to_string()],
            })
            .with_dependencies(vec![data_load_id])
            .with_priority(TaskPriority::High);
            let preprocess_id = preprocess.id;
            tasks.push(preprocess);

            // Forward pass (simplified as a single 4096x4096 MatMul).
            let forward = Task::new(Operation::MatMul {
                m: batch,
                n: 4096,
                k: 4096,
                precision: self.default_precision,
            })
            .with_dependencies(vec![preprocess_id])
            .with_priority(TaskPriority::Critical);
            let forward_id = forward.id;
            tasks.push(forward);

            // Backward pass mirroring the forward op.
            let backward = Task::new(Operation::Backward {
                forward_op: Box::new(Operation::MatMul {
                    m: batch,
                    n: 4096,
                    k: 4096,
                    precision: self.default_precision,
                }),
            })
            .with_dependencies(vec![forward_id])
            .with_priority(TaskPriority::Critical);
            let backward_id = backward.id;
            tasks.push(backward);

            // Optimizer step (placeholder 1M-parameter AdamW update).
            tasks.push(
                Task::new(Operation::OptimizerStep {
                    parameters: 1_000_000,
                    optimizer: "adamw".to_string(),
                    precision: self.default_precision,
                })
                .with_dependencies(vec![backward_id])
                .with_priority(TaskPriority::High),
            );
        }
        Ok(tasks)
    }

    /// Decompose an inference job into tokenize -> embed -> attention
    /// -> sample -> detokenize, chained via dependencies.
    ///
    /// Stage sizes (seq_len 512, vocab 32000, 32 heads, head_dim 128)
    /// are placeholder defaults; only the batch size comes from the job.
    fn decompose_inference(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        let mut tasks = Vec::new();
        if let JobType::Inference { batch_size, .. } = &job.job_type {
            let batch = *batch_size as usize;

            // Tokenization (CPU optimal).
            let tokenize = Task::new(Operation::Tokenization {
                text_bytes: 4096,
                vocab_size: 32000,
            })
            .with_priority(TaskPriority::High);
            let token_id = tokenize.id;
            tasks.push(tokenize);

            // Embedding (GPU optimal).
            let embed = Task::new(Operation::Embedding {
                batch,
                seq_len: 512,
                vocab_size: 32000,
                embed_dim: 4096,
                precision: self.default_precision,
            })
            .with_dependencies(vec![token_id])
            .with_priority(TaskPriority::Critical);
            let embed_id = embed.id;
            tasks.push(embed);

            // Self-attention (TPU/GPU optimal).
            let attention = Task::new(Operation::SelfAttention {
                batch,
                seq_len: 512,
                num_heads: 32,
                head_dim: 128,
                precision: self.default_precision,
            })
            .with_dependencies(vec![embed_id])
            .with_priority(TaskPriority::Critical);
            let attention_id = attention.id;
            tasks.push(attention);

            // Sampling (LPU optimal).
            let sample = Task::new(Operation::Sampling {
                batch,
                vocab_size: 32000,
                temperature: 0.7,
            })
            .with_dependencies(vec![attention_id])
            .with_priority(TaskPriority::High);
            let sample_id = sample.id;
            tasks.push(sample);

            // Detokenization (CPU optimal).
            tasks.push(
                Task::new(Operation::Detokenization {
                    tokens: 256,
                    vocab_size: 32000,
                })
                .with_dependencies(vec![sample_id])
                .with_priority(TaskPriority::Normal),
            );
        }
        Ok(tasks)
    }

    /// Decompose container job: a single generic task with a 1 GFLOP /
    /// 1 GiB placeholder cost profile.
    fn decompose_container(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        Ok(vec![Task::new(Operation::Generic {
            op_type: OperationType::DataLoad,
            flops: 1e9,
            memory: 1024 * 1024 * 1024,
        })
        .with_priority(TaskPriority::Normal)])
    }

    /// Decompose serverless function: a single lightweight task,
    /// High priority to keep invocation latency low.
    fn decompose_serverless(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        Ok(vec![Task::new(Operation::Generic {
            op_type: OperationType::DataPreprocess,
            flops: 1e6,
            memory: 256 * 1024 * 1024,
        })
        .with_priority(TaskPriority::High)])
    }

    /// Decompose WASM job: a single small task (16 MiB placeholder
    /// memory budget).
    fn decompose_wasm(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        Ok(vec![Task::new(Operation::Generic {
            op_type: OperationType::DataPreprocess,
            flops: 1e6,
            memory: 16 * 1024 * 1024,
        })
        .with_priority(TaskPriority::Normal)])
    }
}
impl Default for TaskDecomposer {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_task_creation() {
        let task = Task::new(Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        })
        .with_priority(TaskPriority::High);

        // Builder sets priority; everything else keeps `new` defaults.
        assert_eq!(task.priority, TaskPriority::High);
        assert!(task.dependencies.is_empty());
        assert_eq!(task.status, TaskStatus::Pending);
    }

    #[test]
    fn test_task_dependencies() {
        let upstream = Task::new(Operation::DataLoad {
            bytes: 1000,
            async_: true,
        });
        let upstream_id = upstream.id;

        let downstream = Task::new(Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        })
        .with_dependencies(vec![upstream_id]);

        assert_eq!(downstream.dependencies, vec![upstream_id]);
    }

    #[test]
    fn test_task_compatibility() {
        let matmul = Task::new(Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        });

        // MatMul runs on GPUs and TPUs.
        let gpu = ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda {
            compute_capability: (8, 0),
        });
        assert!(matmul.is_compatible_with(gpu));
        let tpu = ProcessorType::Tpu(crate::processor::TpuVersion::V5p);
        assert!(matmul.is_compatible_with(tpu));

        // DataLoad runs on CPUs.
        let load = Task::new(Operation::DataLoad {
            bytes: 1000,
            async_: true,
        });
        let cpu = ProcessorType::Cpu(crate::processor::CpuVariant::default());
        assert!(load.is_compatible_with(cpu));
    }

    #[test]
    fn test_task_decomposer() {
        let job = ComputeJob {
            id: crate::JobId::new(),
            owner: [0u8; 32],
            job_type: JobType::Inference {
                model_cid: "model".to_string(),
                input_format: "json".to_string(),
                batch_size: 1,
            },
            resources: crate::ResourceRequirements::default(),
            input_cid: None,
            max_budget: 1_000_000,
            priority: crate::JobPriority::Normal,
            created_at: 0,
            deadline: None,
        };

        let tasks = TaskDecomposer::new().decompose(&job).unwrap();
        assert!(!tasks.is_empty());
        // Every task after the first depends on something, i.e. the
        // stages form a dependency chain.
        assert!(tasks.iter().skip(1).all(|t| !t.dependencies.is_empty()));
    }
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,906 @@
# Phase 11: Synor Compute L2 - Full-Stack Compute Platform
> **Mission**: Build a decentralized compute platform capable of AI/ML training, inference, OS hosting, and general-purpose high-performance computing.
---
## Executive Summary
Synor Compute L2 extends beyond the current WASM-only Synor VM to provide:
- **GPU Compute**: AI/ML training and inference with CUDA/ROCm support
- **Container Orchestration**: Docker-compatible workloads with Kubernetes-style scheduling
- **Persistent VMs**: Long-running virtual machines for OS hosting
- **Serverless Functions**: Short-lived compute for API backends and event processing
- **Edge Compute**: Low-latency compute at network edge nodes
---
## Architecture Overview
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ SYNOR COMPUTE L2 │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
│ │ APPLICATION LAYER │ │
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
│ │ AI/ML │ Serverless │ Containers │ Persistent │ Edge │ │
│ │ Training │ Functions │ (Docker) │ VMs (Linux) │ Compute │ │
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
│ │ ORCHESTRATION LAYER │ │
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
│ │ Job │ Resource │ Network │ Storage │ Health │ │
│ │ Scheduler │ Manager │ Fabric │ Orchestrator│ Monitor │ │
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
│ │ COMPUTE RUNTIME LAYER │ │
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
│ │ GPU │ Container │ MicroVM │ WASM │ Native │ │
│ │ Runtime │ Runtime │ Runtime │ Runtime │ Runtime │ │
│ │ (CUDA/ROCm)│ (containerd)│ (Firecracker)│ (Wasmtime) │ (gVisor) │ │
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
│ │ INFRASTRUCTURE LAYER │ │
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
│ │ Node │ Network │ Distributed │ Consensus │ Billing │ │
│ │ Registry │ Overlay │ Storage │ (PoS+PoW) │ Metering │ │
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
│ │ SYNOR L1 BLOCKCHAIN (GHOSTDAG + DAG-RIDER) │ │
│ └─────────────────────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
---
## Milestone 1: GPU Compute Foundation (AI/ML Training & Inference)
### 1.1 GPU Node Registration
```rust
// synor-compute/src/gpu/node.rs
/// GPU node capabilities
pub struct GpuNode {
/// Unique node ID
pub node_id: NodeId,
/// GPU specifications
pub gpus: Vec<GpuSpec>,
/// Total VRAM available (bytes)
pub total_vram: u64,
/// Available VRAM (bytes)
pub available_vram: u64,
/// CUDA compute capability (e.g., 8.6 for RTX 3090)
pub cuda_capability: Option<(u8, u8)>,
/// ROCm version (for AMD)
pub rocm_version: Option<String>,
/// Network bandwidth (Gbps)
pub bandwidth_gbps: u32,
/// Geographic region
pub region: Region,
/// Stake amount (for PoS validation)
pub stake: u64,
}
pub struct GpuSpec {
pub model: String, // "NVIDIA RTX 4090"
pub vram_gb: u32, // 24
pub tensor_cores: u32, // 512
pub cuda_cores: u32, // 16384
pub memory_bandwidth: u32, // 1008 GB/s
pub fp32_tflops: f32, // 82.6
pub fp16_tflops: f32, // 165.2
pub int8_tops: f32, // 330.4
}
```
### 1.2 AI/ML Job Specification
```rust
// synor-compute/src/ai/job.rs
/// AI/ML training job specification
pub struct TrainingJob {
/// Job ID
pub job_id: JobId,
/// Owner address
pub owner: Address,
/// Framework (PyTorch, TensorFlow, JAX)
pub framework: MlFramework,
/// Model specification
pub model: ModelSpec,
/// Dataset reference (Synor Storage CID)
pub dataset_cid: Cid,
/// Training configuration
pub config: TrainingConfig,
/// Resource requirements
pub resources: GpuResources,
/// Maximum budget (SYNOR tokens)
pub max_budget: u64,
/// Checkpoint interval (steps)
pub checkpoint_interval: u64,
}
pub struct GpuResources {
pub min_gpus: u32,
pub max_gpus: u32,
pub min_vram_per_gpu: u64,
pub cuda_capability_min: Option<(u8, u8)>,
pub distributed: bool, // Multi-node training
pub priority: JobPriority,
}
pub enum MlFramework {
PyTorch { version: String },
TensorFlow { version: String },
JAX { version: String },
ONNX,
Custom { image: String },
}
pub struct TrainingConfig {
pub epochs: u32,
pub batch_size: u32,
pub learning_rate: f32,
pub optimizer: String,
pub mixed_precision: bool,
pub gradient_accumulation: u32,
pub distributed_strategy: DistributedStrategy,
}
pub enum DistributedStrategy {
DataParallel,
ModelParallel,
PipelineParallel,
ZeRO { stage: u8 }, // DeepSpeed ZeRO stages 1-3
FSDP, // Fully Sharded Data Parallel
}
```
### 1.3 Inference Service
```rust
// synor-compute/src/ai/inference.rs
/// Inference endpoint specification
pub struct InferenceEndpoint {
/// Endpoint ID
pub endpoint_id: EndpointId,
/// Model reference (Synor Storage CID)
pub model_cid: Cid,
/// Model format
pub format: ModelFormat,
/// Scaling configuration
pub scaling: AutoscaleConfig,
/// GPU requirements per replica
pub gpu_per_replica: GpuResources,
/// Request timeout
pub timeout_ms: u32,
/// Max batch size for batching inference
pub max_batch_size: u32,
/// Batching timeout
pub batch_timeout_ms: u32,
}
pub enum ModelFormat {
PyTorch,
ONNX,
TensorRT,
Triton,
vLLM, // For LLM serving
TGI, // Text Generation Inference
Custom,
}
pub struct AutoscaleConfig {
pub min_replicas: u32,
pub max_replicas: u32,
pub target_gpu_utilization: f32,
pub scale_up_threshold: f32,
pub scale_down_threshold: f32,
pub cooldown_seconds: u32,
}
```
### 1.4 Pricing Model for GPU Compute
| Resource | Unit | Price (SYNOR/unit) |
|----------|------|-------------------|
| GPU (RTX 4090 equivalent) | hour | 0.50 |
| GPU (A100 80GB equivalent) | hour | 2.00 |
| GPU (H100 equivalent) | hour | 4.00 |
| VRAM | GB/hour | 0.01 |
| Network egress | GB | 0.05 |
| Storage (hot, NVMe) | GB/month | 0.10 |
| Inference requests | 1M tokens | 0.10 |
---
## Milestone 2: Container Orchestration (Docker/Kubernetes-Compatible)
### 2.1 Container Runtime
```rust
// synor-compute/src/container/runtime.rs
/// Container specification (OCI-compatible)
pub struct ContainerSpec {
/// Image reference
pub image: ImageRef,
/// Resource limits
pub resources: ContainerResources,
/// Environment variables
pub env: HashMap<String, String>,
/// Volume mounts
pub volumes: Vec<VolumeMount>,
/// Network configuration
pub network: NetworkConfig,
/// Security context
pub security: SecurityContext,
/// Health check
pub health_check: Option<HealthCheck>,
}
pub struct ContainerResources {
pub cpu_cores: f32, // 0.5, 1.0, 2.0, etc.
pub memory_mb: u64,
pub gpu: Option<GpuAllocation>,
pub ephemeral_storage_gb: u32,
pub network_bandwidth_mbps: u32,
}
pub struct GpuAllocation {
pub count: u32,
pub vram_mb: u64,
pub shared: bool, // Allow GPU sharing via MPS/MIG
}
```
### 2.2 Service Mesh & Networking
```rust
// synor-compute/src/network/mesh.rs
/// Service definition for container orchestration
pub struct Service {
pub service_id: ServiceId,
pub name: String,
pub containers: Vec<ContainerSpec>,
pub replicas: ReplicaConfig,
pub load_balancer: LoadBalancerConfig,
pub service_mesh: ServiceMeshConfig,
}
pub struct ServiceMeshConfig {
pub mtls_enabled: bool,
pub traffic_policy: TrafficPolicy,
pub circuit_breaker: CircuitBreakerConfig,
pub retry_policy: RetryPolicy,
pub rate_limit: Option<RateLimitConfig>,
}
pub struct LoadBalancerConfig {
pub algorithm: LoadBalancerAlgorithm,
pub health_check: HealthCheck,
pub sticky_sessions: bool,
pub ssl_termination: SslTermination,
}
pub enum LoadBalancerAlgorithm {
RoundRobin,
LeastConnections,
WeightedRoundRobin { weights: Vec<u32> },
IPHash,
Random,
}
```
### 2.3 Container Pricing
| Resource | Unit | Price (SYNOR/unit) |
|----------|------|-------------------|
| CPU | core/hour | 0.02 |
| Memory | GB/hour | 0.005 |
| Ephemeral storage | GB/hour | 0.001 |
| Network ingress | GB | FREE |
| Network egress | GB | 0.05 |
| Load balancer | hour | 0.01 |
| Static IP | month | 2.00 |
---
## Milestone 3: Persistent Virtual Machines (OS Hosting)
### 3.1 MicroVM Architecture (Firecracker-based)
```rust
// synor-compute/src/vm/microvm.rs
/// Virtual machine specification
pub struct VmSpec {
/// VM ID
pub vm_id: VmId,
/// Owner address
pub owner: Address,
/// VM size
pub size: VmSize,
/// Boot image
pub image: VmImage,
/// Persistent volumes
pub volumes: Vec<PersistentVolume>,
/// Network configuration
pub network: VmNetworkConfig,
/// SSH keys for access
pub ssh_keys: Vec<SshPublicKey>,
/// Cloud-init user data
pub user_data: Option<String>,
}
pub struct VmSize {
pub vcpus: u32,
pub memory_gb: u32,
pub gpu: Option<GpuPassthrough>,
pub network_bandwidth_gbps: u32,
}
pub struct GpuPassthrough {
pub count: u32,
pub model: GpuModel,
pub vram_gb: u32,
}
pub enum VmImage {
/// Pre-built images
Marketplace { image_id: String, version: String },
/// Custom image from Synor Storage
Custom { cid: Cid, format: ImageFormat },
/// Standard OS images
Ubuntu { version: String },
Debian { version: String },
AlmaLinux { version: String },
Windows { version: String, license: WindowsLicense },
}
pub struct PersistentVolume {
pub volume_id: VolumeId,
pub size_gb: u32,
pub volume_type: VolumeType,
pub mount_path: String,
pub encrypted: bool,
}
pub enum VolumeType {
/// High-performance NVMe SSD
NvmeSsd { iops: u32, throughput_mbps: u32 },
/// Standard SSD
Ssd,
/// HDD for archival
Hdd,
/// Distributed storage (Synor Storage L2)
Distributed { replication: u8 },
}
```
### 3.2 VM Lifecycle Management
```rust
// synor-compute/src/vm/lifecycle.rs
pub enum VmState {
Pending,
Provisioning,
Running,
Stopping,
Stopped,
Hibernating,
Hibernated,
Migrating,
Failed,
Terminated,
}
pub struct VmManager {
/// Active VMs
vms: HashMap<VmId, VmInstance>,
/// Node assignments
node_assignments: HashMap<VmId, NodeId>,
/// Live migration coordinator
migration_coordinator: MigrationCoordinator,
}
impl VmManager {
/// Start a new VM
pub async fn create(&self, spec: VmSpec) -> Result<VmId, VmError>;
/// Stop a VM (preserves state)
pub async fn stop(&self, vm_id: &VmId) -> Result<(), VmError>;
/// Start a stopped VM
pub async fn start(&self, vm_id: &VmId) -> Result<(), VmError>;
/// Hibernate VM to storage (saves memory state)
pub async fn hibernate(&self, vm_id: &VmId) -> Result<(), VmError>;
/// Live migrate VM to another node
pub async fn migrate(&self, vm_id: &VmId, target_node: NodeId) -> Result<(), VmError>;
/// Resize VM (requires restart)
pub async fn resize(&self, vm_id: &VmId, new_size: VmSize) -> Result<(), VmError>;
/// Snapshot VM state
pub async fn snapshot(&self, vm_id: &VmId) -> Result<SnapshotId, VmError>;
/// Terminate and delete VM
pub async fn terminate(&self, vm_id: &VmId) -> Result<(), VmError>;
}
```
### 3.3 VM Pricing
| VM Type | vCPUs | Memory | Storage | GPU | Price (SYNOR/month) |
|---------|-------|--------|---------|-----|---------------------|
| micro | 1 | 1 GB | 20 GB SSD | - | 5 |
| small | 2 | 4 GB | 50 GB SSD | - | 15 |
| medium | 4 | 8 GB | 100 GB SSD | - | 30 |
| large | 8 | 32 GB | 200 GB SSD | - | 80 |
| xlarge | 16 | 64 GB | 500 GB NVMe | - | 200 |
| gpu-small | 8 | 32 GB | 200 GB NVMe | 1x RTX 4090 | 400 |
| gpu-medium | 16 | 64 GB | 500 GB NVMe | 2x RTX 4090 | 750 |
| gpu-large | 32 | 128 GB | 1 TB NVMe | 4x A100 80GB | 2500 |
| gpu-xlarge | 64 | 256 GB | 2 TB NVMe | 8x H100 | 8000 |
---
## Milestone 4: Serverless Functions (FaaS)
### 4.1 Function Specification
```rust
// synor-compute/src/serverless/function.rs
/// Serverless function definition
pub struct Function {
pub function_id: FunctionId,
pub owner: Address,
pub name: String,
pub runtime: FunctionRuntime,
pub handler: String,
pub code: FunctionCode,
pub resources: FunctionResources,
pub triggers: Vec<FunctionTrigger>,
pub environment: HashMap<String, String>,
pub timeout_ms: u32,
pub concurrency: ConcurrencyConfig,
}
pub enum FunctionRuntime {
Node20,
Node22,
Python311,
Python312,
Rust,
Go122,
Java21,
Dotnet8,
Ruby33,
Custom { image: String },
}
pub struct FunctionCode {
/// Source code CID in Synor Storage
pub cid: Cid,
/// Entry point file
pub entry_point: String,
/// Dependencies (package.json, requirements.txt, etc.)
pub dependencies: Option<Cid>,
}
pub struct FunctionResources {
pub memory_mb: u32, // 128, 256, 512, 1024, 2048, 4096, 8192
pub cpu_allocation: f32, // Proportional to memory
pub ephemeral_storage_mb: u32,
pub gpu: Option<GpuAllocation>,
}
pub enum FunctionTrigger {
/// HTTP endpoint
Http { path: String, methods: Vec<HttpMethod> },
/// Scheduled execution (cron)
Schedule { cron: String },
/// Event from message queue
Queue { queue_name: String },
/// Storage events
Storage { bucket: String, events: Vec<StorageEvent> },
/// Blockchain events
Blockchain { contract: Address, events: Vec<String> },
/// Webhook
Webhook { url: String },
}
```
### 4.2 Cold Start Optimization
```rust
// synor-compute/src/serverless/warmup.rs
/// Function warmup strategies
pub struct WarmupConfig {
/// Minimum warm instances
pub min_instances: u32,
/// Provisioned concurrency
pub provisioned_concurrency: u32,
/// Warmup schedule
pub warmup_schedule: Option<String>,
/// Snapshot-based cold start (SnapStart)
pub snapstart_enabled: bool,
}
pub struct ColdStartOptimizer {
/// Pre-warmed function pools
pools: HashMap<FunctionRuntime, WarmPool>,
/// Snapshot cache
snapshots: LruCache<FunctionId, FunctionSnapshot>,
/// Prediction model for scaling
predictor: ScalingPredictor,
}
impl ColdStartOptimizer {
/// Get a warm instance or create one
pub async fn get_instance(&self, function: &Function) -> Result<FunctionInstance, Error> {
// Try snapshot restore first (< 100ms)
if let Some(snapshot) = self.snapshots.get(&function.function_id) {
return self.restore_from_snapshot(snapshot).await;
}
// Try warm pool (< 50ms)
if let Some(instance) = self.pools.get(&function.runtime).and_then(|pool| pool.get_warm()) {
    return Ok(instance);
}
// Cold start (1-5s depending on runtime)
self.cold_start(function).await
}
}
```
### 4.3 Serverless Pricing
| Resource | Unit | Price (SYNOR) |
|----------|------|---------------|
| Invocations | 1M requests | 0.20 |
| Duration | GB-second | 0.00001 |
| Provisioned concurrency | GB-hour | 0.01 |
| HTTP Gateway | 1M requests | 0.10 |
| Event bridge | 1M events | 0.50 |
---
## Milestone 5: Edge Compute
### 5.1 Edge Node Architecture
```rust
// synor-compute/src/edge/node.rs
/// Edge compute node
pub struct EdgeNode {
pub node_id: NodeId,
pub location: GeoLocation,
pub capabilities: EdgeCapabilities,
pub latency_zones: Vec<LatencyZone>,
pub resources: EdgeResources,
}
pub struct EdgeCapabilities {
pub wasm_runtime: bool,
pub container_runtime: bool,
pub gpu_inference: bool,
pub video_transcoding: bool,
pub cdn_cache: bool,
}
pub struct EdgeResources {
pub cpu_cores: u32,
pub memory_gb: u32,
pub storage_gb: u32,
pub gpu: Option<EdgeGpu>,
pub bandwidth_gbps: u32,
}
/// Edge function for low-latency compute
pub struct EdgeFunction {
pub function_id: FunctionId,
pub code: WasmModule,
pub memory_limit: u32,
pub timeout_ms: u32,
pub allowed_regions: Vec<Region>,
}
```
### 5.2 Edge Use Cases
```rust
// synor-compute/src/edge/usecases.rs
/// CDN with compute at edge
pub struct EdgeCdn {
/// Origin servers
origins: Vec<Origin>,
/// Cache rules
cache_rules: Vec<CacheRule>,
/// Edge workers for request/response transformation
workers: Vec<EdgeWorker>,
}
/// Real-time inference at edge
pub struct EdgeInference {
/// Model optimized for edge (quantized, pruned)
model_id: ModelId,
/// Inference runtime (TensorRT, ONNX Runtime)
runtime: EdgeInferenceRuntime,
/// Max batch size
max_batch: u32,
/// Target latency
target_latency_ms: u32,
}
/// Video processing at edge
pub struct EdgeVideoProcessor {
/// Transcoding profiles
profiles: Vec<TranscodingProfile>,
/// Real-time streaming
live_streaming: bool,
/// Adaptive bitrate
abr_enabled: bool,
}
```
### 5.3 Edge Pricing
| Resource | Unit | Price (SYNOR) |
|----------|------|---------------|
| Edge function invocations | 1M | 0.50 |
| Edge function duration | GB-second | 0.00002 |
| Edge bandwidth | GB | 0.08 |
| Edge cache storage | GB/month | 0.02 |
| Video transcoding | minute | 0.02 |
---
## Milestone 6: Node Provider Economics
### 6.1 Provider Registration
```rust
// synor-compute/src/provider/registration.rs
/// Compute provider registration.
///
/// Submitted by a node operator to join the network as a compute provider.
/// Binds an owner address to staked funds, declared hardware and network
/// capabilities, a geographic location, and an SLA commitment.
pub struct ProviderRegistration {
/// Unique identifier assigned to this provider.
pub provider_id: ProviderId,
/// On-chain address of the provider's owner.
pub owner: Address,
/// Stake required to become a provider (token units — presumably base SYNOR units; confirm).
pub stake: u64,
/// Declared hardware specifications (see [`HardwareManifest`]).
pub hardware: HardwareManifest,
/// Declared network connectivity.
pub network: NetworkManifest,
/// Geographic location of the node.
pub location: GeoLocation,
/// Availability SLA commitment (see [`SlaCommitment`]).
pub sla: SlaCommitment,
}
/// Declared hardware inventory of a provider node.
pub struct HardwareManifest {
/// CPUs installed on the node.
pub cpus: Vec<CpuSpec>,
/// Total system memory, in gigabytes.
pub memory_total_gb: u64,
/// GPUs installed on the node (empty for CPU-only providers).
pub gpus: Vec<GpuSpec>,
/// Storage devices attached to the node.
pub storage: Vec<StorageSpec>,
pub verified: bool, // Hardware attestation passed
}
/// Service-level commitment made by a provider at registration time.
///
/// Violations are penalized at `penalty_rate` (see the slashing table in
/// this document's provider-economics section).
pub struct SlaCommitment {
pub uptime_percent: f32, // 99.9, 99.99, etc.
/// Committed maximum response time, in milliseconds.
pub response_time_ms: u32,
/// Committed data durability (presumably a fraction such as 0.999999 — confirm units).
pub data_durability: f32,
pub penalty_rate: f32, // Penalty for SLA violation
}
```
### 6.2 Provider Revenue Model
| Revenue Source | Provider Share | Protocol Share |
|----------------|----------------|----------------|
| Compute fees | 85% | 15% |
| Storage fees | 80% | 20% |
| Network fees | 75% | 25% |
| SLA bonuses | 100% | 0% |
| Staking rewards | 100% | 0% |
### 6.3 Slashing Conditions
| Violation | Penalty |
|-----------|---------|
| Downtime > committed SLA | 1% stake per hour |
| Data loss | 10% stake + compensation |
| Malicious behavior | 100% stake |
| False hardware attestation | 50% stake |
---
## Implementation Timeline
### Phase 11.1: Foundation (Weeks 1-4)
- [ ] Node registration and hardware attestation
- [ ] Basic job scheduler
- [ ] WASM runtime integration (existing)
- [ ] Container runtime (containerd)
- [ ] Network overlay (WireGuard mesh)
### Phase 11.2: GPU Compute (Weeks 5-8)
- [ ] GPU node registration
- [ ] NVIDIA driver integration
- [ ] CUDA runtime support
- [ ] Basic ML job execution
- [ ] Model storage integration
### Phase 11.3: Container Orchestration (Weeks 9-12)
- [ ] OCI image support
- [ ] Service deployment
- [ ] Load balancing
- [ ] Auto-scaling
- [ ] Service mesh (mTLS)
### Phase 11.4: Persistent VMs (Weeks 13-16)
- [ ] MicroVM runtime (Firecracker)
- [ ] VM lifecycle management
- [ ] Persistent storage
- [ ] Live migration
- [ ] Snapshot/restore
### Phase 11.5: Serverless (Weeks 17-20)
- [ ] Function deployment
- [ ] Cold start optimization
- [ ] Event triggers
- [ ] API gateway
- [ ] Monitoring/logging
### Phase 11.6: Edge Compute (Weeks 21-24)
- [ ] Edge node registration
- [ ] Edge function runtime
- [ ] CDN integration
- [ ] Edge inference
- [ ] Global anycast
---
## Security Considerations
### Isolation Levels
| Workload Type | Isolation Technology | Security Level |
|---------------|---------------------|----------------|
| WASM | Wasmtime sandbox | High |
| Serverless | gVisor + seccomp | High |
| Containers | gVisor or Kata | Medium-High |
| VMs | Firecracker MicroVM | High |
| GPU | NVIDIA MIG/MPS | Medium |
### Network Security
- All inter-node traffic encrypted (WireGuard)
- mTLS for service-to-service communication
- Network policies for workload isolation
- DDoS protection at edge
### Data Security
- Encryption at rest (AES-256)
- Encryption in transit (TLS 1.3)
- Confidential computing support (AMD SEV, Intel SGX)
- Secure key management (HSM integration)
---
## API Examples
### Deploy AI Training Job
```bash
synor compute train create \
--framework pytorch \
--model-config ./model.yaml \
--dataset synor://datasets/imagenet \
--gpus 8 \
--gpu-type h100 \
--distributed ddp \
--epochs 100 \
--checkpoint-interval 1000 \
--max-budget 1000
```
### Deploy Inference Endpoint
```bash
synor compute inference deploy \
--model synor://models/llama-70b \
--format vllm \
--min-replicas 2 \
--max-replicas 10 \
--gpu-per-replica 2 \
--target-utilization 0.7
```
### Create Persistent VM
```bash
synor compute vm create \
--name my-dev-server \
--image ubuntu:22.04 \
--size gpu-small \
--volume 100gb:nvme:/data \
--ssh-key ~/.ssh/id_ed25519.pub \
--region us-east
```
### Deploy Container Service
```bash
synor compute service deploy \
--name my-api \
--image my-registry/my-api:latest \
--replicas 3 \
--cpu 2 \
--memory 4gb \
--port 8080 \
--health-check /health \
--autoscale 2-10
```
### Deploy Serverless Function
```bash
synor compute function deploy \
--name process-image \
--runtime python312 \
--handler main.handler \
--code ./function \
--memory 1024 \
--timeout 30000 \
--trigger http:/api/process
```
---
## Comparison with Existing Synor VM
| Feature | Current Synor VM | Synor Compute L2 |
|---------|------------------|------------------|
| Runtime | WASM only | WASM, Container, MicroVM |
| Timeout | 30 seconds | Unlimited (VMs) |
| Memory | 16 MB max | Up to 256 GB |
| GPU | ❌ | ✅ Full CUDA/ROCm |
| Networking | ❌ | ✅ Full TCP/UDP |
| File I/O | ❌ | ✅ Persistent volumes |
| Threading | ❌ | ✅ Multi-threaded |
| AI/ML | ❌ | ✅ Training + Inference |
| OS Hosting | ❌ | ✅ Full Linux/Windows |
---
## Next Steps
1. Implement GPU node registration and attestation
2. Build a basic job scheduler with resource allocation
3. Integrate containerd for container workloads
4. Add Firecracker for MicroVM support
5. Implement the serverless function runtime
6. Deploy edge nodes and CDN integration
This plan transforms Synor from a smart contract platform into a full-stack decentralized cloud provider capable of competing with AWS/GCP/Azure while maintaining decentralization and censorship resistance.