feat(compute): add Phase 11 Synor Compute L2 heterogeneous compute layer
- Add synor-compute crate for heterogeneous compute orchestration - Implement processor abstraction for CPU/GPU/TPU/NPU/LPU/FPGA/DSP - Add device registry with cross-vendor capability tracking - Implement task scheduler with work stealing and load balancing - Add energy-aware and latency-aware balancing strategies - Create spot market for compute resources with order matching - Add memory manager with tensor handles and cross-device transfers - Support processor capability profiles (H100, TPU v5p, Groq LPU, etc.) - Implement priority work queues with task decomposition Processor types supported: - CPU (x86-64 AVX512, ARM64 SVE, RISC-V Vector) - GPU (NVIDIA CUDA, AMD ROCm, Intel OneAPI, Apple Metal) - TPU (v2-v5p, Edge TPU) - NPU (Apple Neural Engine, Qualcomm Hexagon, Intel VPU) - LPU (Groq Language Processing Unit) - FPGA (Xilinx, Intel Altera) - DSP (TI, Analog Devices) - WebGPU and WASM runtimes
This commit is contained in:
parent
8da34bc73d
commit
4c36ddbdc2
19 changed files with 11219 additions and 0 deletions
|
|
@ -9,6 +9,7 @@ members = [
|
|||
"crates/synor-storage",
|
||||
"crates/synor-hosting",
|
||||
"crates/synor-database",
|
||||
"crates/synor-compute",
|
||||
"crates/synor-governance",
|
||||
"crates/synor-rpc",
|
||||
"crates/synor-vm",
|
||||
|
|
|
|||
51
crates/synor-compute/Cargo.toml
Normal file
51
crates/synor-compute/Cargo.toml
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
[package]
|
||||
name = "synor-compute"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
description = "Heterogeneous multi-processor compute platform for Synor blockchain"
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
# Internal crates
|
||||
synor-types = { path = "../synor-types" }
|
||||
synor-crypto = { path = "../synor-crypto" }
|
||||
synor-storage = { path = "../synor-storage" }
|
||||
|
||||
# Serialization
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
borsh.workspace = true
|
||||
bincode = "1.3"
|
||||
|
||||
# Async runtime
|
||||
tokio = { workspace = true, features = ["sync", "rt-multi-thread", "time", "macros"] }
|
||||
async-trait = "0.1"
|
||||
futures = "0.3"
|
||||
|
||||
# Concurrency
|
||||
parking_lot.workspace = true
|
||||
crossbeam-deque = "0.8"
|
||||
crossbeam-channel = "0.5"
|
||||
dashmap = "5.5"
|
||||
|
||||
# Utilities
|
||||
thiserror.workspace = true
|
||||
tracing.workspace = true
|
||||
hex.workspace = true
|
||||
|
||||
# Hashing
|
||||
blake3.workspace = true
|
||||
|
||||
# Data structures
|
||||
indexmap = "2.2"
|
||||
priority-queue = "2.0"
|
||||
|
||||
# Time
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
|
||||
# Random
|
||||
rand = "0.8"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile.workspace = true
|
||||
tokio-test = "0.4"
|
||||
377
crates/synor-compute/src/device/mod.rs
Normal file
377
crates/synor-compute/src/device/mod.rs
Normal file
|
|
@ -0,0 +1,377 @@
|
|||
//! Device registry and management.
|
||||
//!
|
||||
//! Supports all device types:
|
||||
//! - Data center servers
|
||||
//! - Desktop workstations
|
||||
//! - Laptops
|
||||
//! - Mobile devices (iOS, Android)
|
||||
//! - Browsers (WebGPU, WASM)
|
||||
//! - IoT devices
|
||||
|
||||
use crate::error::ComputeError;
|
||||
use crate::processor::{GenericProcessor, Processor, ProcessorCapabilities, ProcessorId, ProcessorType};
|
||||
use crate::{NodeId, ProcessorInfo};
|
||||
use parking_lot::RwLock;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Unique device identifier.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct DeviceId(pub [u8; 32]);
|
||||
|
||||
impl DeviceId {
|
||||
/// Creates a new random device ID.
|
||||
pub fn new() -> Self {
|
||||
use rand::Rng;
|
||||
let mut bytes = [0u8; 32];
|
||||
rand::thread_rng().fill(&mut bytes);
|
||||
DeviceId(bytes)
|
||||
}
|
||||
|
||||
/// Creates from bytes.
|
||||
pub fn from_bytes(bytes: [u8; 32]) -> Self {
|
||||
DeviceId(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for DeviceId {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DeviceId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "dev_{}", hex::encode(&self.0[..8]))
|
||||
}
|
||||
}
|
||||
|
||||
/// Device type classification.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum DeviceType {
|
||||
/// Data center server.
|
||||
DataCenter,
|
||||
/// Desktop workstation.
|
||||
Desktop,
|
||||
/// Laptop.
|
||||
Laptop,
|
||||
/// Mobile phone.
|
||||
Mobile,
|
||||
/// Tablet.
|
||||
Tablet,
|
||||
/// IoT device.
|
||||
IoT,
|
||||
/// Browser (WebGPU/WASM).
|
||||
Browser,
|
||||
/// Edge server.
|
||||
Edge,
|
||||
}
|
||||
|
||||
impl DeviceType {
|
||||
/// Returns typical reliability score (0-100).
|
||||
pub fn reliability(&self) -> u32 {
|
||||
match self {
|
||||
DeviceType::DataCenter => 99,
|
||||
DeviceType::Edge => 95,
|
||||
DeviceType::Desktop => 80,
|
||||
DeviceType::Laptop => 60,
|
||||
DeviceType::Mobile => 40,
|
||||
DeviceType::Tablet => 50,
|
||||
DeviceType::IoT => 70,
|
||||
DeviceType::Browser => 30,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns typical availability hours per day.
|
||||
pub fn availability_hours(&self) -> f32 {
|
||||
match self {
|
||||
DeviceType::DataCenter => 24.0,
|
||||
DeviceType::Edge => 24.0,
|
||||
DeviceType::Desktop => 8.0,
|
||||
DeviceType::Laptop => 6.0,
|
||||
DeviceType::Mobile => 4.0,
|
||||
DeviceType::Tablet => 4.0,
|
||||
DeviceType::IoT => 24.0,
|
||||
DeviceType::Browser => 2.0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Static capability description for a device, supplied at registration time.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DeviceCapabilities {
    /// Device type (also present on `DeviceInfo`; kept here so capability
    /// records are self-contained).
    pub device_type: DeviceType,
    /// Processor types available on this device.
    pub processors: Vec<ProcessorType>,
    /// Total memory (GB).
    pub memory_gb: f32,
    /// Network bandwidth (Mbps).
    pub bandwidth_mbps: f32,
    /// Storage available (GB).
    pub storage_gb: f32,
    /// Whether the device runs on battery (may throttle or drop offline).
    pub battery_powered: bool,
    /// Whether the device supports executing work in the background.
    pub background_execution: bool,
}

/// Full record for a registered device: identity, capabilities, and
/// runtime state.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DeviceInfo {
    /// Device ID.
    pub id: DeviceId,
    /// Device type.
    pub device_type: DeviceType,
    /// Owner address (raw 32-byte account key).
    pub owner: [u8; 32],
    /// Static capabilities reported at registration.
    pub capabilities: DeviceCapabilities,
    /// Current status.
    pub status: DeviceStatus,
    /// Reputation score (0-100).
    pub reputation: u32,
    /// Total earnings (atomic SYNOR).
    pub earnings: u64,
    /// Geographic region, e.g. "us-east".
    pub region: String,
}

/// Runtime status of a device. `Online` and `Idle` both count as
/// available for scheduling (see `DeviceRegistry::online_devices`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum DeviceStatus {
    /// Online and available.
    Online,
    /// Online but busy.
    Busy,
    /// Idle but available.
    Idle,
    /// On battery (reduced capacity).
    OnBattery,
    /// Offline.
    Offline,
    /// Maintenance.
    Maintenance,
}
|
||||
|
||||
/// Device registry managing all devices and processors.
///
/// Each map is guarded by its own `RwLock`; methods take short-lived
/// read/write guards so concurrent lookups stay cheap.
pub struct DeviceRegistry {
    /// Registered devices, keyed by device ID.
    devices: RwLock<HashMap<DeviceId, DeviceInfo>>,
    /// Node -> devices mapping.
    // NOTE(review): never written by any method in this module — either
    // populated elsewhere or dead state. Confirm before relying on it.
    node_devices: RwLock<HashMap<NodeId, Vec<DeviceId>>>,
    /// All processors (across all nodes), keyed by processor ID.
    processors: RwLock<HashMap<ProcessorId, Arc<dyn Processor>>>,
    /// Processor -> owning node mapping; kept in sync with `processors`.
    processor_nodes: RwLock<HashMap<ProcessorId, NodeId>>,
    /// Monotonic counter backing `next_processor_id()`.
    next_processor_id: std::sync::atomic::AtomicU64,
}
|
||||
|
||||
impl DeviceRegistry {
|
||||
/// Creates a new device registry.
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
devices: RwLock::new(HashMap::new()),
|
||||
node_devices: RwLock::new(HashMap::new()),
|
||||
processors: RwLock::new(HashMap::new()),
|
||||
processor_nodes: RwLock::new(HashMap::new()),
|
||||
next_processor_id: std::sync::atomic::AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Registers a device.
|
||||
pub fn register_device(&self, device: DeviceInfo) -> Result<DeviceId, ComputeError> {
|
||||
let id = device.id;
|
||||
self.devices.write().insert(id, device);
|
||||
Ok(id)
|
||||
}
|
||||
|
||||
/// Unregisters a device.
|
||||
pub fn unregister_device(&self, device_id: DeviceId) -> Result<(), ComputeError> {
|
||||
self.devices.write().remove(&device_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets a device by ID.
|
||||
pub fn get_device(&self, device_id: DeviceId) -> Option<DeviceInfo> {
|
||||
self.devices.read().get(&device_id).cloned()
|
||||
}
|
||||
|
||||
/// Registers a processor for a node.
|
||||
pub fn register_processor(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
info: ProcessorInfo,
|
||||
) -> Result<(), ComputeError> {
|
||||
let processor_id = info.id;
|
||||
|
||||
// Create a generic processor from the info
|
||||
let processor: Arc<dyn Processor> = Arc::new(GenericProcessor::new(
|
||||
processor_id,
|
||||
info.processor_type,
|
||||
info.capabilities,
|
||||
));
|
||||
|
||||
self.processors.write().insert(processor_id, processor);
|
||||
self.processor_nodes.write().insert(processor_id, node_id);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Unregisters all processors for a node.
|
||||
pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
|
||||
let mut processors = self.processors.write();
|
||||
let mut processor_nodes = self.processor_nodes.write();
|
||||
|
||||
// Find and remove all processors for this node
|
||||
let to_remove: Vec<_> = processor_nodes
|
||||
.iter()
|
||||
.filter(|(_, n)| **n == node_id)
|
||||
.map(|(p, _)| *p)
|
||||
.collect();
|
||||
|
||||
for proc_id in to_remove {
|
||||
processors.remove(&proc_id);
|
||||
processor_nodes.remove(&proc_id);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets a processor by ID.
|
||||
pub fn get_processor(&self, processor_id: ProcessorId) -> Result<Arc<dyn Processor>, ComputeError> {
|
||||
self.processors
|
||||
.read()
|
||||
.get(&processor_id)
|
||||
.cloned()
|
||||
.ok_or(ComputeError::ProcessorNotFound(processor_id))
|
||||
}
|
||||
|
||||
/// Gets all processors.
|
||||
pub fn all_processors(&self) -> Vec<Arc<dyn Processor>> {
|
||||
self.processors.read().values().cloned().collect()
|
||||
}
|
||||
|
||||
/// Gets processors of a specific type.
|
||||
pub fn processors_by_type(&self, proc_type: ProcessorType) -> Vec<Arc<dyn Processor>> {
|
||||
self.processors
|
||||
.read()
|
||||
.values()
|
||||
.filter(|p| p.processor_type() == proc_type)
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Gets the next processor ID.
|
||||
pub fn next_processor_id(&self) -> ProcessorId {
|
||||
ProcessorId(self.next_processor_id.fetch_add(1, std::sync::atomic::Ordering::SeqCst))
|
||||
}
|
||||
|
||||
/// Gets total number of devices.
|
||||
pub fn device_count(&self) -> usize {
|
||||
self.devices.read().len()
|
||||
}
|
||||
|
||||
/// Gets total number of processors.
|
||||
pub fn processor_count(&self) -> usize {
|
||||
self.processors.read().len()
|
||||
}
|
||||
|
||||
/// Gets devices by type.
|
||||
pub fn devices_by_type(&self, device_type: DeviceType) -> Vec<DeviceInfo> {
|
||||
self.devices
|
||||
.read()
|
||||
.values()
|
||||
.filter(|d| d.device_type == device_type)
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Gets online devices.
|
||||
pub fn online_devices(&self) -> Vec<DeviceInfo> {
|
||||
self.devices
|
||||
.read()
|
||||
.values()
|
||||
.filter(|d| d.status == DeviceStatus::Online || d.status == DeviceStatus::Idle)
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Updates device status.
|
||||
pub fn update_device_status(
|
||||
&self,
|
||||
device_id: DeviceId,
|
||||
status: DeviceStatus,
|
||||
) -> Result<(), ComputeError> {
|
||||
if let Some(device) = self.devices.write().get_mut(&device_id) {
|
||||
device.status = status;
|
||||
Ok(())
|
||||
} else {
|
||||
Err(ComputeError::Internal(format!("Device not found: {}", device_id)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for DeviceRegistry {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, AvxSupport};

    // Two freshly generated IDs differ with overwhelming probability
    // (collision chance on 32 random bytes is negligible).
    #[test]
    fn test_device_id() {
        let id1 = DeviceId::new();
        let id2 = DeviceId::new();
        assert_ne!(id1.0, id2.0);
    }

    // Round-trips a device through register / lookup / unregister.
    #[test]
    fn test_device_registry() {
        let registry = DeviceRegistry::new();

        let device = DeviceInfo {
            id: DeviceId::new(),
            device_type: DeviceType::Desktop,
            owner: [1u8; 32],
            capabilities: DeviceCapabilities {
                device_type: DeviceType::Desktop,
                processors: vec![ProcessorType::Cpu(CpuVariant::X86_64 {
                    avx: AvxSupport::Avx512,
                })],
                memory_gb: 64.0,
                bandwidth_mbps: 1000.0,
                storage_gb: 1000.0,
                battery_powered: false,
                background_execution: true,
            },
            status: DeviceStatus::Online,
            reputation: 100,
            earnings: 0,
            region: "us-east".to_string(),
        };

        let device_id = device.id;
        registry.register_device(device).unwrap();

        assert_eq!(registry.device_count(), 1);
        assert!(registry.get_device(device_id).is_some());

        registry.unregister_device(device_id).unwrap();
        assert_eq!(registry.device_count(), 0);
    }

    // Spot-checks the static per-device-type heuristics.
    #[test]
    fn test_device_type_properties() {
        assert_eq!(DeviceType::DataCenter.reliability(), 99);
        assert_eq!(DeviceType::Mobile.reliability(), 40);
        assert_eq!(DeviceType::DataCenter.availability_hours(), 24.0);
        assert_eq!(DeviceType::Browser.availability_hours(), 2.0);
    }
}
|
||||
92
crates/synor-compute/src/error.rs
Normal file
92
crates/synor-compute/src/error.rs
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
//! Error types for Synor Compute.
|
||||
|
||||
use crate::{JobId, NodeId, ProcessorId, ProcessorType};
|
||||
use thiserror::Error;
|
||||
|
||||
/// Compute errors.
///
/// Every fallible operation in this crate returns `Result<_, ComputeError>`;
/// display strings come from the `thiserror` attributes on each variant.
#[derive(Debug, Error)]
pub enum ComputeError {
    /// Job not found.
    #[error("Job not found: {0}")]
    JobNotFound(JobId),

    /// Node not found.
    #[error("Node not found: {0}")]
    NodeNotFound(NodeId),

    /// Processor not found.
    #[error("Processor not found: {0}")]
    ProcessorNotFound(ProcessorId),

    /// No suitable processor for operation.
    #[error("No suitable processor for operation: {0}")]
    NoSuitableProcessor(String),

    /// Insufficient resources.
    #[error("Insufficient resources: {0}")]
    InsufficientResources(String),

    /// Task execution failed.
    #[error("Task execution failed: {0}")]
    TaskExecutionFailed(String),

    /// Scheduling failed.
    #[error("Scheduling failed: {0}")]
    SchedulingFailed(String),

    /// Memory allocation failed.
    #[error("Memory allocation failed: {0}")]
    MemoryAllocationFailed(String),

    /// Data transfer failed.
    #[error("Data transfer failed: {0}")]
    DataTransferFailed(String),

    /// Processor type not supported.
    #[error("Processor type not supported: {0:?}")]
    ProcessorTypeNotSupported(ProcessorType),

    /// Operation not supported on the given processor type.
    #[error("Operation not supported on {0:?}: {1}")]
    OperationNotSupported(ProcessorType, String),

    /// Timeout; payload is the elapsed time budget in milliseconds.
    #[error("Operation timed out after {0}ms")]
    Timeout(u64),

    /// Budget exceeded; amounts are in atomic SYNOR units.
    #[error("Budget exceeded: required {required}, available {available}")]
    BudgetExceeded { required: u64, available: u64 },

    /// Node already registered.
    #[error("Node already registered: {0}")]
    NodeAlreadyRegistered(NodeId),

    /// Invalid configuration.
    #[error("Invalid configuration: {0}")]
    InvalidConfiguration(String),

    /// Serialization error; also produced via the `From` impls for
    /// `bincode::Error` and `serde_json::Error`.
    #[error("Serialization error: {0}")]
    Serialization(String),

    /// Network error.
    #[error("Network error: {0}")]
    Network(String),

    /// Catch-all for internal invariant violations and lookups that lack a
    /// dedicated variant (e.g. unknown devices in the device registry).
    #[error("Internal error: {0}")]
    Internal(String),
}
|
||||
|
||||
impl From<bincode::Error> for ComputeError {
|
||||
fn from(err: bincode::Error) -> Self {
|
||||
ComputeError::Serialization(err.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<serde_json::Error> for ComputeError {
|
||||
fn from(err: serde_json::Error) -> Self {
|
||||
ComputeError::Serialization(err.to_string())
|
||||
}
|
||||
}
|
||||
631
crates/synor-compute/src/lib.rs
Normal file
631
crates/synor-compute/src/lib.rs
Normal file
|
|
@ -0,0 +1,631 @@
|
|||
//! Synor Compute L2 - Heterogeneous Multi-Processor Compute Platform
|
||||
//!
|
||||
//! Provides decentralized compute services with:
|
||||
//!
|
||||
//! - **Heterogeneous Scheduling**: CPU + GPU + TPU + NPU + LPU working simultaneously
|
||||
//! - **Consumer Device Mesh**: Mobile, browser, desktop devices contributing compute
|
||||
//! - **90% Cost Reduction**: Zero margins, spot markets, electricity arbitrage
|
||||
//! - **10x Speed**: Caching, speculative execution, optimal processor assignment
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
//! │ SYNOR COMPUTE L2 │
|
||||
//! ├─────────────────────────────────────────────────────────────────────────────┤
|
||||
//! │ │
|
||||
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||
//! │ │ TASK DECOMPOSER │ │
|
||||
//! │ │ Analyzes workload → Identifies subtasks → Maps to optimal processors │ │
|
||||
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
|
||||
//! │ │ │
|
||||
//! │ ▼ │
|
||||
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||
//! │ │ HETEROGENEOUS SCHEDULER │ │
|
||||
//! │ │ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ │
|
||||
//! │ │ │ CPU │ │ GPU │ │ TPU │ │ NPU │ │ LPU │ │Custom│ │ │
|
||||
//! │ │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │ │
|
||||
//! │ │ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ │ │
|
||||
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
|
||||
//! │ │
|
||||
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||
//! │ │ UNIFIED MEMORY FABRIC │ │
|
||||
//! │ │ Zero-copy data sharing │ Automatic placement │ Cache coherency │ │
|
||||
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
|
||||
//! │ │
|
||||
//! └─────────────────────────────────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Pricing
|
||||
//!
|
||||
//! | Resource | Unit | Price (SYNOR) |
|
||||
//! |----------|------|---------------|
|
||||
//! | GPU (consumer) | hour | 0.10 |
|
||||
//! | GPU (datacenter) | hour | 0.50-4.00 |
|
||||
//! | CPU | core/hour | 0.02 |
|
||||
//! | Memory | GB/hour | 0.005 |
|
||||
//! | Inference | 1M tokens | 0.10 |
|
||||
|
||||
#![allow(dead_code)]
|
||||
|
||||
pub mod device;
|
||||
pub mod error;
|
||||
pub mod market;
|
||||
pub mod memory;
|
||||
pub mod processor;
|
||||
pub mod scheduler;
|
||||
pub mod task;
|
||||
|
||||
pub use device::{
|
||||
DeviceCapabilities, DeviceId, DeviceInfo, DeviceRegistry, DeviceStatus, DeviceType,
|
||||
};
|
||||
pub use error::ComputeError;
|
||||
pub use market::{
|
||||
Auction, AuctionId, CloudComparison, CpuTier as MarketCpuTier, GpuTier as MarketGpuTier,
|
||||
MarketStats, Order, OrderBook, OrderId, OrderSide, OrderType, PricingEngine, ProviderListing,
|
||||
ResourceType, SpotMarket, Trade,
|
||||
};
|
||||
pub use memory::{MemoryManager, TensorHandle, TransferPath, UnifiedMemory};
|
||||
pub use processor::{
|
||||
ComputeThroughput, CpuVariant, GpuVariant, NpuVariant, Operation, OperationType, Processor,
|
||||
ProcessorCapabilities, ProcessorId, ProcessorType, TpuVersion,
|
||||
};
|
||||
pub use scheduler::{
|
||||
HeterogeneousScheduler, LoadBalancer, Schedule, ScheduleResult, TaskAssignment, WorkQueue,
|
||||
};
|
||||
pub use task::{
|
||||
ComputeTask, DecomposedWorkload, Task, TaskDecomposer, TaskId, TaskPriority, TaskResult,
|
||||
TaskStatus,
|
||||
};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use parking_lot::RwLock;
|
||||
|
||||
/// Compute node identifier.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct NodeId(pub u64);
|
||||
|
||||
impl std::fmt::Display for NodeId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "node_{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Job identifier.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct JobId(pub [u8; 32]);
|
||||
|
||||
impl JobId {
|
||||
/// Creates a new job ID.
|
||||
pub fn new() -> Self {
|
||||
use rand::Rng;
|
||||
let mut bytes = [0u8; 32];
|
||||
rand::thread_rng().fill(&mut bytes);
|
||||
JobId(bytes)
|
||||
}
|
||||
|
||||
/// Creates from bytes.
|
||||
pub fn from_bytes(bytes: [u8; 32]) -> Self {
|
||||
JobId(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for JobId {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for JobId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "job_{}", hex::encode(&self.0[..8]))
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute job specification, as submitted by a user to `ComputeCluster`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ComputeJob {
    /// Job ID.
    pub id: JobId,
    /// Owner address (raw 32-byte account key).
    pub owner: [u8; 32],
    /// Job type (training, inference, container, serverless, or WASM).
    pub job_type: JobType,
    /// Resource requirements.
    pub resources: ResourceRequirements,
    /// Input data reference (content identifier), if any.
    pub input_cid: Option<String>,
    /// Maximum budget (in atomic SYNOR).
    pub max_budget: u64,
    /// Priority level.
    pub priority: JobPriority,
    /// Created timestamp.
    // NOTE(review): unit (seconds vs millis since epoch) is not established
    // anywhere in this file — confirm against the caller that fills it.
    pub created_at: u64,
    /// Deadline (optional), same unit as `created_at`.
    pub deadline: Option<u64>,
}

/// Job type classification. Payload data is referenced by CID rather than
/// embedded inline.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum JobType {
    /// AI/ML training job.
    Training {
        /// Framework and version to train with.
        framework: MlFramework,
        /// CID of the initial model weights/definition.
        model_cid: String,
        /// CID of the training dataset.
        dataset_cid: String,
        /// Number of training epochs.
        epochs: u32,
        /// Per-step batch size.
        batch_size: u32,
    },
    /// AI/ML inference job.
    Inference {
        /// CID of the model to serve.
        model_cid: String,
        /// Description of the expected input format.
        input_format: String,
        /// Per-request batch size.
        batch_size: u32,
    },
    /// Container workload.
    Container {
        /// Container image reference.
        image: String,
        /// Command and arguments to run.
        command: Vec<String>,
        /// Environment variables for the container.
        env: HashMap<String, String>,
    },
    /// Serverless function.
    Serverless {
        /// Runtime the function executes under.
        runtime: FunctionRuntime,
        /// CID of the function code bundle.
        code_cid: String,
        /// Entry-point handler name.
        handler: String,
    },
    /// General compute (WASM).
    Wasm {
        /// CID of the WASM module.
        module_cid: String,
        /// Exported function to invoke.
        entrypoint: String,
    },
}

/// ML framework specification for training jobs.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum MlFramework {
    /// PyTorch at a pinned version.
    PyTorch { version: String },
    /// TensorFlow at a pinned version.
    TensorFlow { version: String },
    /// JAX at a pinned version.
    JAX { version: String },
    /// Framework-agnostic ONNX model.
    ONNX,
}

/// Execution runtime for serverless functions.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum FunctionRuntime {
    /// Node.js 20.
    Node20,
    /// Python 3.12.
    Python312,
    /// Rust.
    Rust,
    /// Go.
    Go,
    /// Arbitrary custom runtime image.
    Custom { image: String },
}

/// Job priority levels. Discriminants are ordered so `Ord` compares
/// Background < Normal < High < Critical.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum JobPriority {
    /// Background job, can be preempted.
    Background = 0,
    /// Normal priority.
    Normal = 1,
    /// High priority, faster scheduling.
    High = 2,
    /// Critical, guaranteed resources.
    Critical = 3,
}

impl Default for JobPriority {
    // Jobs default to normal priority unless the submitter opts in/out.
    fn default() -> Self {
        JobPriority::Normal
    }
}
|
||||
|
||||
/// Resource requirements for a job. `Default` yields zero minimums, no GPU,
/// no processor preference, and non-distributed execution.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ResourceRequirements {
    /// Minimum CPU cores (fractional cores allowed).
    pub min_cpu_cores: f32,
    /// Minimum memory (GB).
    pub min_memory_gb: f32,
    /// GPU requirements; `None` means no GPU needed.
    pub gpu: Option<GpuRequirements>,
    /// Preferred processor types (in priority order).
    pub preferred_processors: Vec<ProcessorType>,
    /// Maximum latency (ms) - intended for inference workloads.
    pub max_latency_ms: Option<u32>,
    /// Whether the job requires distributed execution across nodes.
    pub distributed: bool,
}

/// GPU resource requirements for jobs that need accelerators.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GpuRequirements {
    /// Minimum number of GPUs.
    pub min_count: u32,
    /// Maximum number of GPUs.
    pub max_count: u32,
    /// Minimum VRAM per GPU (GB).
    pub min_vram_gb: u32,
    /// Minimum compute capability as a (major, minor) pair, if required.
    pub min_compute_capability: Option<(u8, u8)>,
    /// Allow GPU sharing (MPS/MIG).
    pub allow_sharing: bool,
}
|
||||
|
||||
/// Job execution status, modelled as a state machine:
/// Queued -> Starting -> Running -> (Completed | Failed | Cancelled).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum JobStatus {
    /// Queued, waiting for resources.
    Queued,
    /// Resources allocated, starting.
    Starting,
    /// Running.
    Running {
        /// Completion fraction.
        // NOTE(review): range (0.0-1.0 vs percent) is not established in this
        // file — confirm against whoever sets it.
        progress: f32,
        /// Nodes currently executing this job.
        assigned_nodes: Vec<NodeId>,
    },
    /// Completed successfully.
    Completed {
        /// CID of the result data.
        result_cid: String,
        /// Wall-clock duration in milliseconds.
        duration_ms: u64,
        /// Total cost charged (atomic SYNOR).
        cost: u64,
    },
    /// Failed with an error description.
    Failed { error: String },
    /// Cancelled by user.
    Cancelled,
}
|
||||
|
||||
/// Compute node registration record: a node is a provider machine offering
/// one or more processors to the cluster.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ComputeNode {
    /// Node ID.
    pub id: NodeId,
    /// Owner address (raw 32-byte account key).
    pub owner: [u8; 32],
    /// Available processors on this node.
    pub processors: Vec<ProcessorInfo>,
    /// Total memory (GB).
    pub total_memory_gb: f32,
    /// Available memory (GB); at most `total_memory_gb`.
    pub available_memory_gb: f32,
    /// Network bandwidth (Gbps). Note: devices report Mbps, nodes Gbps.
    pub bandwidth_gbps: f32,
    /// Geographic region, e.g. "us-east".
    pub region: String,
    /// Stake amount (for PoS).
    pub stake: u64,
    /// Reputation score (0-100).
    pub reputation: u32,
    /// Current status.
    pub status: NodeStatus,
}

/// Information about a single processor on a node.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorInfo {
    /// Processor ID (local to the node).
    pub id: ProcessorId,
    /// Processor type.
    pub processor_type: ProcessorType,
    /// Static capability profile.
    pub capabilities: ProcessorCapabilities,
    /// Current utilization (0.0 - 1.0).
    pub utilization: f32,
    /// Current temperature (Celsius), if the hardware reports one.
    pub temperature: Option<f32>,
}

/// Node status. Only `Online` nodes count toward the cluster's online
/// statistics (see `ComputeCluster::stats`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum NodeStatus {
    /// Online and accepting jobs.
    Online,
    /// Online but not accepting new jobs (existing work drains out).
    Draining,
    /// Offline.
    Offline,
    /// Maintenance mode.
    Maintenance,
}
|
||||
|
||||
/// Compute cluster manager: the top-level entry point tying together the
/// device registry, scheduler, spot market, and memory manager.
pub struct ComputeCluster {
    /// Registered nodes, keyed by node ID.
    nodes: RwLock<HashMap<NodeId, ComputeNode>>,
    /// Device registry; also receives each node's processors on registration.
    device_registry: Arc<DeviceRegistry>,
    /// Heterogeneous scheduler; shares the device registry.
    scheduler: Arc<HeterogeneousScheduler>,
    /// Spot market for compute resources.
    // NOTE(review): held but not exercised by any method in this block —
    // presumably consumed elsewhere; confirm.
    spot_market: Arc<SpotMarket>,
    /// Cross-device memory manager.
    // NOTE(review): same — held but unused in this block.
    memory_manager: Arc<MemoryManager>,
    /// Active jobs, keyed by job ID.
    jobs: RwLock<HashMap<JobId, ComputeJob>>,
}
|
||||
|
||||
impl ComputeCluster {
|
||||
/// Creates a new compute cluster.
|
||||
pub fn new() -> Self {
|
||||
let device_registry = Arc::new(DeviceRegistry::new());
|
||||
let scheduler = Arc::new(HeterogeneousScheduler::new(device_registry.clone()));
|
||||
let spot_market = Arc::new(SpotMarket::new());
|
||||
let memory_manager = Arc::new(MemoryManager::new());
|
||||
|
||||
Self {
|
||||
nodes: RwLock::new(HashMap::new()),
|
||||
device_registry,
|
||||
scheduler,
|
||||
spot_market,
|
||||
memory_manager,
|
||||
jobs: RwLock::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Registers a compute node.
|
||||
pub fn register_node(&self, node: ComputeNode) -> Result<(), ComputeError> {
|
||||
let id = node.id;
|
||||
|
||||
// Register processors with device registry
|
||||
for proc in &node.processors {
|
||||
self.device_registry.register_processor(id, proc.clone())?;
|
||||
}
|
||||
|
||||
self.nodes.write().insert(id, node);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Unregisters a compute node.
|
||||
pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
|
||||
self.device_registry.unregister_node(node_id)?;
|
||||
self.nodes.write().remove(&node_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Submits a job for execution.
|
||||
pub async fn submit_job(&self, job: ComputeJob) -> Result<JobId, ComputeError> {
|
||||
let job_id = job.id;
|
||||
|
||||
// Decompose job into tasks
|
||||
let tasks = self.decompose_job(&job)?;
|
||||
|
||||
// Schedule tasks
|
||||
let schedule = self.scheduler.schedule(tasks).await?;
|
||||
|
||||
// Store job
|
||||
self.jobs.write().insert(job_id, job);
|
||||
|
||||
// Execute schedule (async)
|
||||
tokio::spawn({
|
||||
let scheduler = self.scheduler.clone();
|
||||
async move {
|
||||
let _ = scheduler.execute(&schedule.schedule).await;
|
||||
}
|
||||
});
|
||||
|
||||
Ok(job_id)
|
||||
}
|
||||
|
||||
/// Gets job status.
|
||||
pub fn get_job_status(&self, job_id: &JobId) -> Option<JobStatus> {
|
||||
self.jobs.read().get(job_id).map(|_| JobStatus::Queued)
|
||||
}
|
||||
|
||||
/// Cancels a job.
|
||||
pub fn cancel_job(&self, job_id: &JobId) -> Result<(), ComputeError> {
|
||||
if self.jobs.write().remove(job_id).is_some() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(ComputeError::JobNotFound(*job_id))
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets cluster statistics.
|
||||
pub fn stats(&self) -> ClusterStats {
|
||||
let nodes = self.nodes.read();
|
||||
let jobs = self.jobs.read();
|
||||
|
||||
let total_nodes = nodes.len();
|
||||
let online_nodes = nodes.values().filter(|n| n.status == NodeStatus::Online).count();
|
||||
|
||||
let total_gpus: usize = nodes
|
||||
.values()
|
||||
.flat_map(|n| &n.processors)
|
||||
.filter(|p| matches!(p.processor_type, ProcessorType::Gpu(_)))
|
||||
.count();
|
||||
|
||||
let total_memory: f32 = nodes.values().map(|n| n.total_memory_gb).sum();
|
||||
|
||||
ClusterStats {
|
||||
total_nodes,
|
||||
online_nodes,
|
||||
total_gpus,
|
||||
total_memory_gb: total_memory,
|
||||
active_jobs: jobs.len(),
|
||||
queued_jobs: jobs.values().filter(|_| true).count(), // Simplified
|
||||
}
|
||||
}
|
||||
|
||||
/// Decomposes a job into schedulable tasks.
|
||||
fn decompose_job(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||
let decomposer = TaskDecomposer::new();
|
||||
decomposer.decompose(job)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ComputeCluster {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Cluster statistics.
///
/// A point-in-time snapshot produced by [`ComputeCluster::stats`].
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ClusterStats {
    /// Total registered nodes.
    pub total_nodes: usize,
    /// Online nodes.
    pub online_nodes: usize,
    /// Total GPUs across cluster.
    pub total_gpus: usize,
    /// Total memory (GB).
    pub total_memory_gb: f32,
    /// Active jobs.
    pub active_jobs: usize,
    /// Queued jobs.
    // NOTE(review): with the current stats() implementation this equals
    // active_jobs, since per-job state is not tracked yet.
    pub queued_jobs: usize,
}
|
||||
|
||||
/// Pricing calculator for compute operations.
///
/// All amounts are in base units; from the defaults, 1 SYNOR
/// corresponds to 1_000_000_000 units.
#[derive(Clone, Debug)]
pub struct ComputePricing {
    /// GPU cost per hour by type.
    pub gpu_hourly: HashMap<GpuTier, u64>,
    /// CPU cost per core-hour.
    pub cpu_core_hour: u64,
    /// Memory cost per GB-hour.
    pub memory_gb_hour: u64,
    /// Network egress per GB.
    pub network_egress_gb: u64,
    /// Inference per million tokens.
    pub inference_per_million_tokens: u64,
}
|
||||
|
||||
/// GPU pricing tiers.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum GpuTier {
    /// Consumer GPUs (RTX 30xx, 40xx).
    Consumer,
    /// Professional GPUs (RTX A series).
    Professional,
    /// Data center GPUs (A100).
    DataCenter,
    /// Latest generation (H100).
    Premium,
}
|
||||
|
||||
impl Default for ComputePricing {
|
||||
fn default() -> Self {
|
||||
let mut gpu_hourly = HashMap::new();
|
||||
gpu_hourly.insert(GpuTier::Consumer, 100_000_000); // 0.10 SYNOR
|
||||
gpu_hourly.insert(GpuTier::Professional, 300_000_000); // 0.30 SYNOR
|
||||
gpu_hourly.insert(GpuTier::DataCenter, 2_000_000_000); // 2.00 SYNOR
|
||||
gpu_hourly.insert(GpuTier::Premium, 4_000_000_000); // 4.00 SYNOR
|
||||
|
||||
Self {
|
||||
gpu_hourly,
|
||||
cpu_core_hour: 20_000_000, // 0.02 SYNOR
|
||||
memory_gb_hour: 5_000_000, // 0.005 SYNOR
|
||||
network_egress_gb: 50_000_000, // 0.05 SYNOR
|
||||
inference_per_million_tokens: 100_000_000, // 0.10 SYNOR
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ComputePricing {
|
||||
/// Estimates cost for a job.
|
||||
pub fn estimate(&self, job: &ComputeJob, duration_hours: f32) -> u64 {
|
||||
let mut cost = 0u64;
|
||||
|
||||
// CPU cost
|
||||
cost += (self.cpu_core_hour as f32 * job.resources.min_cpu_cores * duration_hours) as u64;
|
||||
|
||||
// Memory cost
|
||||
cost += (self.memory_gb_hour as f32 * job.resources.min_memory_gb * duration_hours) as u64;
|
||||
|
||||
// GPU cost
|
||||
if let Some(gpu) = &job.resources.gpu {
|
||||
let tier = GpuTier::Consumer; // Simplified
|
||||
let gpu_cost = self.gpu_hourly.get(&tier).unwrap_or(&100_000_000);
|
||||
cost += (*gpu_cost as f32 * gpu.min_count as f32 * duration_hours) as u64;
|
||||
}
|
||||
|
||||
cost
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Fresh job IDs must be distinct (randomly generated u64s).
    #[test]
    fn test_job_id() {
        let id1 = JobId::new();
        let id2 = JobId::new();
        assert_ne!(id1.0, id2.0);
    }

    // A new cluster starts with no registered nodes.
    #[test]
    fn test_compute_cluster() {
        let cluster = ComputeCluster::new();
        let stats = cluster.stats();
        assert_eq!(stats.total_nodes, 0);
    }

    // Default pricing yields a non-zero estimate for a GPU inference job.
    #[test]
    fn test_pricing() {
        let pricing = ComputePricing::default();

        let job = ComputeJob {
            id: JobId::new(),
            owner: [0u8; 32],
            job_type: JobType::Inference {
                model_cid: "model123".to_string(),
                input_format: "json".to_string(),
                batch_size: 32,
            },
            resources: ResourceRequirements {
                min_cpu_cores: 4.0,
                min_memory_gb: 16.0,
                gpu: Some(GpuRequirements {
                    min_count: 1,
                    max_count: 1,
                    min_vram_gb: 16,
                    min_compute_capability: None,
                    allow_sharing: false,
                }),
                ..Default::default()
            },
            input_cid: None,
            max_budget: 1_000_000_000,
            priority: JobPriority::Normal,
            created_at: 0,
            deadline: None,
        };

        let cost = pricing.estimate(&job, 1.0);
        assert!(cost > 0);
    }

    // Registering a node makes it visible in cluster stats.
    #[test]
    fn test_node_registration() {
        let cluster = ComputeCluster::new();

        let node = ComputeNode {
            id: NodeId(1),
            owner: [1u8; 32],
            processors: vec![ProcessorInfo {
                id: ProcessorId(0),
                processor_type: ProcessorType::Cpu(CpuVariant::X86_64 {
                    avx: processor::AvxSupport::Avx512,
                }),
                capabilities: ProcessorCapabilities::default(),
                utilization: 0.0,
                temperature: Some(45.0),
            }],
            total_memory_gb: 64.0,
            available_memory_gb: 60.0,
            bandwidth_gbps: 10.0,
            region: "us-east".to_string(),
            stake: 1000,
            reputation: 100,
            status: NodeStatus::Online,
        };

        cluster.register_node(node).unwrap();
        assert_eq!(cluster.stats().total_nodes, 1);
    }
}
|
||||
1151
crates/synor-compute/src/market/mod.rs
Normal file
1151
crates/synor-compute/src/market/mod.rs
Normal file
File diff suppressed because it is too large
Load diff
370
crates/synor-compute/src/memory/mod.rs
Normal file
370
crates/synor-compute/src/memory/mod.rs
Normal file
|
|
@ -0,0 +1,370 @@
|
|||
//! Unified memory management for heterogeneous compute.
|
||||
|
||||
use crate::error::ComputeError;
|
||||
use crate::processor::ProcessorType;
|
||||
use parking_lot::RwLock;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Tensor handle for memory management.
///
/// Lightweight descriptor of a tensor; holds metadata only, not the
/// data itself.
#[derive(Clone, Debug)]
pub struct TensorHandle {
    /// Unique ID.
    pub id: TensorId,
    /// Shape.
    pub shape: Vec<usize>,
    /// Data type.
    pub dtype: DataType,
    /// Size in bytes.
    pub size_bytes: u64,
    /// Current locations (every processor type holding a copy).
    pub locations: Vec<ProcessorType>,
}
|
||||
|
||||
impl TensorHandle {
|
||||
/// Creates a new tensor handle.
|
||||
pub fn new(shape: Vec<usize>, dtype: DataType) -> Self {
|
||||
let size_bytes = shape.iter().product::<usize>() as u64 * dtype.size_bytes() as u64;
|
||||
Self {
|
||||
id: TensorId::new(),
|
||||
shape,
|
||||
dtype,
|
||||
size_bytes,
|
||||
locations: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the number of elements.
|
||||
pub fn numel(&self) -> usize {
|
||||
self.shape.iter().product()
|
||||
}
|
||||
}
|
||||
|
||||
/// Tensor identifier (randomly generated 64-bit value).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TensorId(pub u64);
|
||||
|
||||
impl TensorId {
|
||||
/// Creates a new tensor ID.
|
||||
pub fn new() -> Self {
|
||||
use rand::Rng;
|
||||
TensorId(rand::thread_rng().gen())
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TensorId {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Data types for tensors.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DataType {
    /// 64-bit IEEE float.
    Float64,
    /// 32-bit IEEE float.
    Float32,
    /// 16-bit IEEE float.
    Float16,
    /// 16-bit brain float.
    BFloat16,
    /// 64-bit signed integer.
    Int64,
    /// 32-bit signed integer.
    Int32,
    /// 16-bit signed integer.
    Int16,
    /// 8-bit signed integer.
    Int8,
    /// 8-bit unsigned integer.
    UInt8,
    /// Boolean (stored as one byte).
    Bool,
}
|
||||
|
||||
impl DataType {
|
||||
/// Returns size in bytes.
|
||||
pub fn size_bytes(&self) -> usize {
|
||||
match self {
|
||||
DataType::Float64 | DataType::Int64 => 8,
|
||||
DataType::Float32 | DataType::Int32 => 4,
|
||||
DataType::Float16 | DataType::BFloat16 | DataType::Int16 => 2,
|
||||
DataType::Int8 | DataType::UInt8 | DataType::Bool => 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Data transfer path between processors.
///
/// Ordered conceptually by bandwidth; see `bandwidth_gbps` for the
/// numbers this module assumes.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum TransferPath {
    /// Direct GPU-to-GPU via NVLink.
    NvLink,
    /// Direct GPU-to-GPU via PCIe P2P.
    PciePeerToPeer,
    /// Through CPU memory.
    CpuMediated,
    /// Unified memory (Apple Silicon).
    UnifiedMemory,
    /// Network transfer.
    Network,
    /// Same memory space (no transfer needed).
    SameMemory,
}
|
||||
|
||||
impl TransferPath {
|
||||
/// Returns approximate bandwidth in GB/s.
|
||||
pub fn bandwidth_gbps(&self) -> f64 {
|
||||
match self {
|
||||
TransferPath::NvLink => 900.0, // NVLink 4.0
|
||||
TransferPath::PciePeerToPeer => 64.0, // PCIe 5.0 x16
|
||||
TransferPath::CpuMediated => 50.0, // DDR5
|
||||
TransferPath::UnifiedMemory => 400.0, // Apple unified
|
||||
TransferPath::Network => 10.0, // 100Gbps network
|
||||
TransferPath::SameMemory => f64::INFINITY,
|
||||
}
|
||||
}
|
||||
|
||||
/// Estimates transfer time for given bytes.
|
||||
pub fn estimate_transfer_time(&self, bytes: u64) -> std::time::Duration {
|
||||
if matches!(self, TransferPath::SameMemory) {
|
||||
return std::time::Duration::ZERO;
|
||||
}
|
||||
|
||||
let bytes_f64 = bytes as f64;
|
||||
let bandwidth = self.bandwidth_gbps() * 1e9; // Convert to bytes/s
|
||||
let seconds = bytes_f64 / bandwidth;
|
||||
std::time::Duration::from_secs_f64(seconds)
|
||||
}
|
||||
}
|
||||
|
||||
/// Unified memory manager.
///
/// Tracks tensor handles and per-processor-type memory accounting.
/// Interior mutability via `parking_lot::RwLock`; lock order is always
/// `tensors` before `usage`.
pub struct MemoryManager {
    /// Allocated tensors.
    tensors: RwLock<HashMap<TensorId, TensorHandle>>,
    /// Memory usage per processor type.
    usage: RwLock<HashMap<ProcessorType, u64>>,
    /// Memory limits per processor type (absent = unlimited).
    limits: HashMap<ProcessorType, u64>,
}
|
||||
|
||||
impl MemoryManager {
    /// Creates a new memory manager with no tensors, no recorded usage,
    /// and no per-processor limits (unlimited by default).
    pub fn new() -> Self {
        Self {
            tensors: RwLock::new(HashMap::new()),
            usage: RwLock::new(HashMap::new()),
            limits: HashMap::new(),
        }
    }

    /// Sets memory limit for a processor type.
    ///
    /// Takes `&mut self`: limits can only be configured before the
    /// manager is shared.
    pub fn set_limit(&mut self, proc_type: ProcessorType, limit_bytes: u64) {
        self.limits.insert(proc_type, limit_bytes);
    }

    /// Allocates a tensor.
    ///
    /// NOTE(review): this only registers the handle — it does not check
    /// `limits` or record `usage`; usage is recorded by `ensure_on` when
    /// the tensor first lands on a processor. Confirm that skipping
    /// limit enforcement here is intended.
    pub fn allocate(&self, shape: Vec<usize>, dtype: DataType) -> Result<TensorHandle, ComputeError> {
        let handle = TensorHandle::new(shape, dtype);
        self.tensors.write().insert(handle.id, handle.clone());
        Ok(handle)
    }

    /// Frees a tensor.
    ///
    /// Releases recorded usage for every location the tensor occupied.
    /// Freeing an unknown ID is a silent no-op (still returns `Ok`).
    pub fn free(&self, tensor_id: TensorId) -> Result<(), ComputeError> {
        if let Some(handle) = self.tensors.write().remove(&tensor_id) {
            // Update usage for all locations
            let mut usage = self.usage.write();
            for loc in &handle.locations {
                if let Some(u) = usage.get_mut(loc) {
                    *u = u.saturating_sub(handle.size_bytes);
                }
            }
        }
        Ok(())
    }

    /// Gets a tensor handle (a clone of the stored metadata).
    pub fn get(&self, tensor_id: TensorId) -> Option<TensorHandle> {
        self.tensors.read().get(&tensor_id).cloned()
    }

    /// Ensures tensor is on specified processor.
    ///
    /// Returns the transfer path used to place the data there. The
    /// target is appended to the tensor's location list and its usage
    /// counter is incremented. Lock order: `tensors` then `usage`,
    /// matching `free`.
    pub fn ensure_on(
        &self,
        tensor_id: TensorId,
        target: ProcessorType,
    ) -> Result<TransferPath, ComputeError> {
        let mut tensors = self.tensors.write();

        if let Some(handle) = tensors.get_mut(&tensor_id) {
            // Check if already on target
            if handle.locations.contains(&target) {
                return Ok(TransferPath::SameMemory);
            }

            // Determine transfer path
            let path = if handle.locations.is_empty() {
                // New tensor, allocate on target
                TransferPath::SameMemory
            } else {
                // Find best transfer path from existing location
                self.find_best_path(&handle.locations[0], &target)
            };

            // Record new location
            handle.locations.push(target.clone());

            // Update usage
            // NOTE(review): the configured `limits` are not checked here
            // either — usage can exceed the limit.
            let mut usage = self.usage.write();
            *usage.entry(target).or_default() += handle.size_bytes;

            Ok(path)
        } else {
            Err(ComputeError::Internal("Tensor not found".to_string()))
        }
    }

    /// Finds best transfer path between processors.
    ///
    /// Preference order: shared/unified memory, NVLink (NVIDIA↔NVIDIA),
    /// PCIe peer-to-peer (any GPU pair), then a CPU-mediated copy.
    fn find_best_path(&self, from: &ProcessorType, to: &ProcessorType) -> TransferPath {
        // Check for unified memory (Apple Silicon)
        if self.shares_memory(from, to) {
            return TransferPath::UnifiedMemory;
        }

        // Check for NVLink between NVIDIA GPUs
        // NOTE(review): assumes every NVIDIA GPU pair is NVLink-connected;
        // actual interconnect topology is not probed.
        if matches!(from, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. }))
            && matches!(to, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. }))
        {
            return TransferPath::NvLink;
        }

        // Check for PCIe P2P between GPUs
        if from.is_gpu() && to.is_gpu() {
            return TransferPath::PciePeerToPeer;
        }

        // Default to CPU-mediated transfer
        TransferPath::CpuMediated
    }

    /// Checks if two processor types share memory.
    ///
    /// Recognizes Apple Silicon unified memory (CPU/GPU/ANE) and treats
    /// identical processor types as sharing one memory space.
    fn shares_memory(&self, a: &ProcessorType, b: &ProcessorType) -> bool {
        use crate::processor::{CpuVariant, GpuVariant, NpuVariant};

        match (a, b) {
            // Apple Silicon unified memory
            (ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal))
            | (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
            | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
            | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true,
            // Same type
            _ if a == b => true,
            _ => false,
        }
    }

    /// Gets current memory usage for a processor type.
    pub fn usage(&self, proc_type: ProcessorType) -> u64 {
        self.usage.read().get(&proc_type).copied().unwrap_or(0)
    }

    /// Gets available memory for a processor type.
    ///
    /// With no configured limit this is effectively unlimited
    /// (`u64::MAX` minus current usage).
    pub fn available(&self, proc_type: ProcessorType) -> u64 {
        let limit = self.limits.get(&proc_type).copied().unwrap_or(u64::MAX);
        let used = self.usage(proc_type);
        limit.saturating_sub(used)
    }

    /// Gets total allocated tensors.
    pub fn tensor_count(&self) -> usize {
        self.tensors.read().len()
    }
}
|
||||
|
||||
impl Default for MemoryManager {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Unified memory abstraction for zero-copy sharing.
#[derive(Debug)]
pub struct UnifiedMemory {
    /// Base pointer (in unified address space).
    // NOTE(review): currently always 0 — `new` does not map real memory.
    pub base: u64,
    /// Size in bytes.
    pub size: u64,
    /// Accessible from these processor types.
    pub accessible_from: Vec<ProcessorType>,
}
|
||||
|
||||
impl UnifiedMemory {
|
||||
/// Creates new unified memory region.
|
||||
pub fn new(size: u64) -> Self {
|
||||
Self {
|
||||
base: 0, // Would be actual pointer in real implementation
|
||||
size,
|
||||
accessible_from: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks if accessible from processor type.
|
||||
pub fn is_accessible_from(&self, proc_type: &ProcessorType) -> bool {
|
||||
self.accessible_from.contains(proc_type)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Size accounting: numel = product of dims, bytes = numel × width.
    #[test]
    fn test_tensor_handle() {
        let handle = TensorHandle::new(vec![1024, 1024], DataType::Float32);
        assert_eq!(handle.numel(), 1024 * 1024);
        assert_eq!(handle.size_bytes, 1024 * 1024 * 4);
    }

    #[test]
    fn test_data_type_sizes() {
        assert_eq!(DataType::Float64.size_bytes(), 8);
        assert_eq!(DataType::Float32.size_bytes(), 4);
        assert_eq!(DataType::Float16.size_bytes(), 2);
        assert_eq!(DataType::Int8.size_bytes(), 1);
    }

    // Relative ordering of interconnect speeds.
    #[test]
    fn test_transfer_path_bandwidth() {
        assert!(TransferPath::NvLink.bandwidth_gbps() > TransferPath::PciePeerToPeer.bandwidth_gbps());
        assert!(TransferPath::SameMemory.bandwidth_gbps().is_infinite());
    }

    // Allocate then free round-trips the tensor count.
    #[test]
    fn test_memory_manager() {
        let manager = MemoryManager::new();

        let handle = manager.allocate(vec![1024, 1024], DataType::Float32).unwrap();
        assert_eq!(manager.tensor_count(), 1);

        manager.free(handle.id).unwrap();
        assert_eq!(manager.tensor_count(), 0);
    }

    // Both the first placement and a repeat placement on the same device
    // report `SameMemory` (no transfer needed).
    #[test]
    fn test_ensure_on() {
        let manager = MemoryManager::new();

        let handle = manager.allocate(vec![1024], DataType::Float32).unwrap();

        // First ensure should allocate
        let path = manager.ensure_on(
            handle.id,
            ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda {
                compute_capability: (8, 0),
            }),
        ).unwrap();

        assert_eq!(path, TransferPath::SameMemory);

        // Second ensure to same location should be same memory
        let path = manager.ensure_on(
            handle.id,
            ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda {
                compute_capability: (8, 0),
            }),
        ).unwrap();

        assert_eq!(path, TransferPath::SameMemory);
    }
}
|
||||
547
crates/synor-compute/src/processor/capabilities.rs
Normal file
547
crates/synor-compute/src/processor/capabilities.rs
Normal file
|
|
@ -0,0 +1,547 @@
|
|||
//! Processor capability definitions.
|
||||
|
||||
use super::operation::OperationType;
|
||||
use super::types::PowerTier;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
|
||||
/// Detailed processor capabilities.
///
/// Built via the constructor helpers (`cpu`, `nvidia_gpu`, `tpu`, `lpu`,
/// `apple_neural_engine`) on the impl block below.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorCapabilities {
    /// Compute throughput.
    pub compute: ComputeThroughput,
    /// Memory specifications.
    pub memory: MemorySpecs,
    /// Supported operations.
    pub operations: HashSet<OperationType>,
    /// Power characteristics.
    pub power: PowerCharacteristics,
    /// Optimal workload characteristics.
    pub optimal_for: Vec<WorkloadCharacteristic>,
}
|
||||
|
||||
impl Default for ProcessorCapabilities {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
compute: ComputeThroughput::default(),
|
||||
memory: MemorySpecs::default(),
|
||||
operations: Self::default_operations(),
|
||||
power: PowerCharacteristics::default(),
|
||||
optimal_for: vec![],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ProcessorCapabilities {
|
||||
/// Default operations supported by most processors.
|
||||
fn default_operations() -> HashSet<OperationType> {
|
||||
[
|
||||
OperationType::MatMul,
|
||||
OperationType::Add,
|
||||
OperationType::Mul,
|
||||
OperationType::ReLU,
|
||||
OperationType::Softmax,
|
||||
OperationType::DataLoad,
|
||||
OperationType::DataPreprocess,
|
||||
]
|
||||
.into_iter()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Creates CPU capabilities.
|
||||
pub fn cpu(cores: u32, clock_ghz: f32, avx512: bool) -> Self {
|
||||
let flops_per_cycle = if avx512 { 64.0 } else { 32.0 }; // FP32 ops per cycle with AVX
|
||||
let fp32_tflops = (cores as f64 * clock_ghz as f64 * flops_per_cycle) / 1000.0;
|
||||
|
||||
Self {
|
||||
compute: ComputeThroughput {
|
||||
fp64_tflops: fp32_tflops / 2.0,
|
||||
fp32_tflops,
|
||||
fp16_tflops: fp32_tflops * 2.0,
|
||||
bf16_tflops: fp32_tflops * 2.0,
|
||||
int8_tops: fp32_tflops * 4.0,
|
||||
int4_tops: fp32_tflops * 8.0,
|
||||
sparsity_speedup: 1.0,
|
||||
},
|
||||
memory: MemorySpecs {
|
||||
capacity_bytes: 64 * 1024 * 1024 * 1024, // 64 GB typical
|
||||
bandwidth_gbps: 200, // DDR5
|
||||
type_: MemoryType::Ddr5,
|
||||
},
|
||||
operations: Self::cpu_operations(),
|
||||
power: PowerCharacteristics {
|
||||
tdp_watts: 125,
|
||||
efficiency: 0.8,
|
||||
power_tier: PowerTier::Medium,
|
||||
},
|
||||
optimal_for: vec![
|
||||
WorkloadCharacteristic::Sequential,
|
||||
WorkloadCharacteristic::MemoryBound,
|
||||
WorkloadCharacteristic::SmallBatch,
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/// Operations typically supported by CPUs.
|
||||
fn cpu_operations() -> HashSet<OperationType> {
|
||||
[
|
||||
// Matrix operations (slow but supported)
|
||||
OperationType::MatMul,
|
||||
OperationType::Conv2d,
|
||||
OperationType::BatchNorm,
|
||||
OperationType::LayerNorm,
|
||||
// Element-wise
|
||||
OperationType::Add,
|
||||
OperationType::Mul,
|
||||
OperationType::ReLU,
|
||||
OperationType::GeLU,
|
||||
OperationType::Softmax,
|
||||
// Data operations (optimal)
|
||||
OperationType::DataLoad,
|
||||
OperationType::DataPreprocess,
|
||||
OperationType::Tokenization,
|
||||
OperationType::Detokenization,
|
||||
// Memory operations
|
||||
OperationType::Transpose,
|
||||
OperationType::Reshape,
|
||||
OperationType::Concat,
|
||||
OperationType::Split,
|
||||
// I/O
|
||||
OperationType::Checkpoint,
|
||||
]
|
||||
.into_iter()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Creates NVIDIA GPU capabilities.
|
||||
pub fn nvidia_gpu(
|
||||
cuda_cores: u32,
|
||||
tensor_cores: u32,
|
||||
vram_gb: u32,
|
||||
bandwidth_gbps: u32,
|
||||
compute_capability: (u8, u8),
|
||||
) -> Self {
|
||||
// Approximate TFLOPS based on cores and typical clocks
|
||||
let base_clock_ghz = 1.5;
|
||||
let fp32_tflops = (cuda_cores as f64 * base_clock_ghz * 2.0) / 1000.0;
|
||||
let tensor_multiplier = if compute_capability.0 >= 8 { 4.0 } else { 2.0 };
|
||||
|
||||
Self {
|
||||
compute: ComputeThroughput {
|
||||
fp64_tflops: fp32_tflops / 2.0,
|
||||
fp32_tflops,
|
||||
fp16_tflops: fp32_tflops * tensor_multiplier,
|
||||
bf16_tflops: fp32_tflops * tensor_multiplier,
|
||||
int8_tops: fp32_tflops * tensor_multiplier * 2.0,
|
||||
int4_tops: fp32_tflops * tensor_multiplier * 4.0,
|
||||
sparsity_speedup: if compute_capability.0 >= 8 { 2.0 } else { 1.0 },
|
||||
},
|
||||
memory: MemorySpecs {
|
||||
capacity_bytes: vram_gb as u64 * 1024 * 1024 * 1024,
|
||||
bandwidth_gbps,
|
||||
type_: if compute_capability.0 >= 9 {
|
||||
MemoryType::Hbm3
|
||||
} else {
|
||||
MemoryType::Hbm2e
|
||||
},
|
||||
},
|
||||
operations: Self::gpu_operations(compute_capability),
|
||||
power: PowerCharacteristics {
|
||||
tdp_watts: if compute_capability.0 >= 9 { 700 } else { 350 },
|
||||
efficiency: 0.9,
|
||||
power_tier: PowerTier::High,
|
||||
},
|
||||
optimal_for: vec![
|
||||
WorkloadCharacteristic::HighlyParallel,
|
||||
WorkloadCharacteristic::LargeBatch,
|
||||
WorkloadCharacteristic::ComputeBound,
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/// Operations supported by GPUs.
|
||||
fn gpu_operations(compute_capability: (u8, u8)) -> HashSet<OperationType> {
|
||||
let mut ops: HashSet<OperationType> = [
|
||||
// Matrix operations (optimal)
|
||||
OperationType::MatMul,
|
||||
OperationType::Conv2d,
|
||||
OperationType::Conv3d,
|
||||
OperationType::DepthwiseConv,
|
||||
OperationType::BatchNorm,
|
||||
OperationType::LayerNorm,
|
||||
// Attention
|
||||
OperationType::SelfAttention,
|
||||
OperationType::CrossAttention,
|
||||
// Element-wise
|
||||
OperationType::Add,
|
||||
OperationType::Mul,
|
||||
OperationType::ReLU,
|
||||
OperationType::GeLU,
|
||||
OperationType::SiLU,
|
||||
OperationType::Softmax,
|
||||
// Reduction
|
||||
OperationType::Sum,
|
||||
OperationType::Mean,
|
||||
OperationType::Max,
|
||||
OperationType::ArgMax,
|
||||
// Memory operations
|
||||
OperationType::Transpose,
|
||||
OperationType::Reshape,
|
||||
OperationType::Concat,
|
||||
OperationType::Split,
|
||||
OperationType::Gather,
|
||||
OperationType::Scatter,
|
||||
// LLM specific
|
||||
OperationType::Embedding,
|
||||
OperationType::RoPE,
|
||||
OperationType::KVCache,
|
||||
OperationType::TopK,
|
||||
OperationType::Sampling,
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
// FlashAttention for newer GPUs
|
||||
if compute_capability.0 >= 8 {
|
||||
ops.insert(OperationType::FlashAttention);
|
||||
}
|
||||
|
||||
ops
|
||||
}
|
||||
|
||||
/// Creates TPU capabilities.
|
||||
pub fn tpu(version: super::TpuVersion) -> Self {
|
||||
let (bf16_tflops, memory_gb, bandwidth_gbps) = match version {
|
||||
super::TpuVersion::V5p => (918.0, 95, 4800),
|
||||
super::TpuVersion::V5e => (197.0, 16, 1600),
|
||||
super::TpuVersion::V4 => (275.0, 32, 2400),
|
||||
super::TpuVersion::V4i => (138.0, 32, 1200),
|
||||
super::TpuVersion::V3 => (123.0, 16, 900),
|
||||
super::TpuVersion::V2 => (46.0, 8, 600),
|
||||
super::TpuVersion::Edge => (4.0, 0, 0), // Uses host memory
|
||||
};
|
||||
|
||||
Self {
|
||||
compute: ComputeThroughput {
|
||||
fp64_tflops: 0.0, // TPUs don't support FP64
|
||||
fp32_tflops: bf16_tflops / 2.0,
|
||||
fp16_tflops: bf16_tflops,
|
||||
bf16_tflops,
|
||||
int8_tops: bf16_tflops * 2.0,
|
||||
int4_tops: bf16_tflops * 4.0,
|
||||
sparsity_speedup: 2.0,
|
||||
},
|
||||
memory: MemorySpecs {
|
||||
capacity_bytes: memory_gb as u64 * 1024 * 1024 * 1024,
|
||||
bandwidth_gbps,
|
||||
type_: MemoryType::Hbm2e,
|
||||
},
|
||||
operations: Self::tpu_operations(),
|
||||
power: PowerCharacteristics {
|
||||
tdp_watts: if matches!(version, super::TpuVersion::Edge) {
|
||||
2
|
||||
} else {
|
||||
400
|
||||
},
|
||||
efficiency: 0.95,
|
||||
power_tier: if matches!(version, super::TpuVersion::Edge) {
|
||||
PowerTier::UltraLow
|
||||
} else {
|
||||
PowerTier::High
|
||||
},
|
||||
},
|
||||
optimal_for: vec![
|
||||
WorkloadCharacteristic::HighlyParallel,
|
||||
WorkloadCharacteristic::ComputeBound,
|
||||
WorkloadCharacteristic::FixedShape,
|
||||
WorkloadCharacteristic::LargeBatch,
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/// Operations supported by TPUs.
|
||||
fn tpu_operations() -> HashSet<OperationType> {
|
||||
[
|
||||
// Matrix operations (optimal)
|
||||
OperationType::MatMul,
|
||||
OperationType::Conv2d,
|
||||
OperationType::BatchNorm,
|
||||
OperationType::LayerNorm,
|
||||
// Attention
|
||||
OperationType::SelfAttention,
|
||||
OperationType::CrossAttention,
|
||||
OperationType::FlashAttention,
|
||||
// Element-wise
|
||||
OperationType::Add,
|
||||
OperationType::Mul,
|
||||
OperationType::ReLU,
|
||||
OperationType::GeLU,
|
||||
OperationType::SiLU,
|
||||
OperationType::Softmax,
|
||||
// Reduction
|
||||
OperationType::Sum,
|
||||
OperationType::Mean,
|
||||
OperationType::Max,
|
||||
// LLM specific
|
||||
OperationType::Embedding,
|
||||
OperationType::RoPE,
|
||||
OperationType::KVCache,
|
||||
]
|
||||
.into_iter()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Creates LPU (Groq) capabilities.
|
||||
pub fn lpu() -> Self {
|
||||
Self {
|
||||
compute: ComputeThroughput {
|
||||
fp64_tflops: 0.0,
|
||||
fp32_tflops: 0.0,
|
||||
fp16_tflops: 188.0,
|
||||
bf16_tflops: 188.0,
|
||||
int8_tops: 750.0,
|
||||
int4_tops: 1500.0,
|
||||
sparsity_speedup: 1.0,
|
||||
},
|
||||
memory: MemorySpecs {
|
||||
capacity_bytes: 230 * 1024 * 1024 * 1024, // 230 GB SRAM!
|
||||
bandwidth_gbps: 80_000, // 80 TB/s internal
|
||||
type_: MemoryType::Sram,
|
||||
},
|
||||
operations: Self::lpu_operations(),
|
||||
power: PowerCharacteristics {
|
||||
tdp_watts: 300,
|
||||
efficiency: 0.98, // Very efficient for inference
|
||||
power_tier: PowerTier::Medium,
|
||||
},
|
||||
optimal_for: vec![
|
||||
WorkloadCharacteristic::Sequential,
|
||||
WorkloadCharacteristic::SmallBatch,
|
||||
WorkloadCharacteristic::VariableLength,
|
||||
WorkloadCharacteristic::LowLatency,
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/// Operations supported by Groq LPU.
|
||||
fn lpu_operations() -> HashSet<OperationType> {
|
||||
[
|
||||
// Optimized for inference
|
||||
OperationType::MatMul,
|
||||
OperationType::LayerNorm,
|
||||
OperationType::SelfAttention,
|
||||
OperationType::Add,
|
||||
OperationType::Mul,
|
||||
OperationType::ReLU,
|
||||
OperationType::GeLU,
|
||||
OperationType::SiLU,
|
||||
OperationType::Softmax,
|
||||
OperationType::Embedding,
|
||||
OperationType::RoPE,
|
||||
OperationType::KVCache,
|
||||
OperationType::TopK,
|
||||
OperationType::Sampling,
|
||||
]
|
||||
.into_iter()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Creates Apple Neural Engine capabilities.
|
||||
pub fn apple_neural_engine(cores: u32) -> Self {
|
||||
let int8_tops = match cores {
|
||||
16 => 18.0, // M3
|
||||
32 => 35.0, // M3 Max
|
||||
_ => cores as f64 * 1.1,
|
||||
};
|
||||
|
||||
Self {
|
||||
compute: ComputeThroughput {
|
||||
fp64_tflops: 0.0,
|
||||
fp32_tflops: int8_tops / 4.0,
|
||||
fp16_tflops: int8_tops / 2.0,
|
||||
bf16_tflops: int8_tops / 2.0,
|
||||
int8_tops,
|
||||
int4_tops: int8_tops * 2.0,
|
||||
sparsity_speedup: 1.0,
|
||||
},
|
||||
memory: MemorySpecs {
|
||||
capacity_bytes: 0, // Uses unified memory
|
||||
bandwidth_gbps: 400,
|
||||
type_: MemoryType::Unified,
|
||||
},
|
||||
operations: Self::npu_operations(),
|
||||
power: PowerCharacteristics {
|
||||
tdp_watts: 15,
|
||||
efficiency: 0.95,
|
||||
power_tier: PowerTier::UltraLow,
|
||||
},
|
||||
optimal_for: vec![
|
||||
WorkloadCharacteristic::LowPower,
|
||||
WorkloadCharacteristic::LowLatency,
|
||||
WorkloadCharacteristic::SmallBatch,
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/// Operations supported by NPUs.
|
||||
fn npu_operations() -> HashSet<OperationType> {
|
||||
[
|
||||
// Inference optimized
|
||||
OperationType::MatMul,
|
||||
OperationType::Conv2d,
|
||||
OperationType::DepthwiseConv,
|
||||
OperationType::BatchNorm,
|
||||
OperationType::LayerNorm,
|
||||
OperationType::Add,
|
||||
OperationType::Mul,
|
||||
OperationType::ReLU,
|
||||
OperationType::Softmax,
|
||||
OperationType::Embedding,
|
||||
]
|
||||
.into_iter()
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute throughput metrics.
///
/// All figures are peak theoretical rates for the given numeric format.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ComputeThroughput {
    /// FP64 TFLOPS.
    pub fp64_tflops: f64,
    /// FP32 TFLOPS.
    pub fp32_tflops: f64,
    /// FP16 TFLOPS.
    pub fp16_tflops: f64,
    /// BF16 TFLOPS.
    pub bf16_tflops: f64,
    /// INT8 TOPS.
    pub int8_tops: f64,
    /// INT4 TOPS.
    pub int4_tops: f64,
    /// Speedup for sparse operations (1.0 = none).
    pub sparsity_speedup: f64,
}
|
||||
|
||||
/// Memory specifications.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MemorySpecs {
    /// Total capacity (bytes). Zero means the device uses host/unified
    /// memory rather than dedicated memory.
    pub capacity_bytes: u64,
    /// Bandwidth (GB/s).
    pub bandwidth_gbps: u32,
    /// Memory type.
    pub type_: MemoryType,
}
|
||||
|
||||
impl Default for MemorySpecs {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
capacity_bytes: 16 * 1024 * 1024 * 1024, // 16 GB
|
||||
bandwidth_gbps: 500,
|
||||
type_: MemoryType::Ddr5,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Memory types.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum MemoryType {
    /// DDR4 RAM.
    Ddr4,
    /// DDR5 RAM.
    Ddr5,
    /// GDDR6/6X video memory.
    Gddr6,
    /// HBM2.
    Hbm2,
    /// HBM2e.
    Hbm2e,
    /// HBM3.
    Hbm3,
    /// SRAM (on-chip).
    Sram,
    /// Unified memory (Apple Silicon).
    Unified,
    /// LPDDR (mobile).
    Lpddr,
}
|
||||
|
||||
/// Power characteristics.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PowerCharacteristics {
    /// TDP in watts.
    pub tdp_watts: u32,
    /// Efficiency factor (0.0 - 1.0).
    pub efficiency: f64,
    /// Power tier.
    pub power_tier: PowerTier,
}
|
||||
|
||||
impl Default for PowerCharacteristics {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
tdp_watts: 100,
|
||||
efficiency: 0.8,
|
||||
power_tier: PowerTier::Medium,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Workload characteristics for processor matching.
///
/// Parenthesised hints name the processor families that typically suit
/// each characteristic; the scheduler uses these to rank candidates.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum WorkloadCharacteristic {
    /// High parallelism (GPU, TPU).
    HighlyParallel,
    /// Sequential dependencies (CPU, LPU).
    Sequential,
    /// Memory bandwidth bound (GPU).
    MemoryBound,
    /// Compute bound (TPU).
    ComputeBound,
    /// Low latency required (NPU, edge).
    LowLatency,
    /// Low power required (NPU, mobile).
    LowPower,
    /// Large batch sizes (GPU, TPU).
    LargeBatch,
    /// Small batch sizes (CPU, LPU).
    SmallBatch,
    /// Variable length sequences (LPU).
    VariableLength,
    /// Fixed tensor shapes (TPU).
    FixedShape,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A CPU profile must report nonzero compute and support the I/O-side
    /// operations (data loading, tokenization) GPUs/TPUs delegate to it.
    #[test]
    fn test_cpu_capabilities() {
        let caps = ProcessorCapabilities::cpu(32, 3.5, true);
        assert!(caps.compute.fp32_tflops > 0.0);
        assert!(caps.operations.contains(&OperationType::DataLoad));
        assert!(caps.operations.contains(&OperationType::Tokenization));
    }

    /// GPUs run reduced precision faster than FP32 and support fused
    /// attention kernels.
    #[test]
    fn test_gpu_capabilities() {
        let caps = ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1000, (8, 9));
        assert!(caps.compute.fp16_tflops > caps.compute.fp32_tflops);
        assert!(caps.operations.contains(&OperationType::FlashAttention));
    }

    /// TPU v5p is a ~1 PFLOP-class BF16 part and does no host-side I/O.
    #[test]
    fn test_tpu_capabilities() {
        let caps = ProcessorCapabilities::tpu(super::super::TpuVersion::V5p);
        assert!(caps.compute.bf16_tflops > 900.0);
        assert!(!caps.operations.contains(&OperationType::DataLoad)); // TPUs don't do I/O
    }

    /// The Groq LPU profile advertises on-chip SRAM bandwidth and is
    /// flagged as optimal for sequential (token-by-token) workloads.
    #[test]
    fn test_lpu_capabilities() {
        let caps = ProcessorCapabilities::lpu();
        assert!(caps.memory.bandwidth_gbps > 10000); // Very high internal bandwidth
        assert!(caps.optimal_for.contains(&WorkloadCharacteristic::Sequential));
    }
}
|
||||
339
crates/synor-compute/src/processor/mod.rs
Normal file
339
crates/synor-compute/src/processor/mod.rs
Normal file
|
|
@ -0,0 +1,339 @@
|
|||
//! Processor abstractions for heterogeneous compute.
|
||||
//!
|
||||
//! Supports all processor types:
|
||||
//! - CPU (x86_64, ARM64, RISC-V)
|
||||
//! - GPU (NVIDIA CUDA, AMD ROCm, Intel OneAPI, Apple Metal)
|
||||
//! - TPU (Google TPU v2-v5)
|
||||
//! - NPU (Apple Neural Engine, Qualcomm Hexagon, Intel VPU)
|
||||
//! - LPU (Groq Language Processing Unit)
|
||||
//! - FPGA (Xilinx, Intel/Altera)
|
||||
//! - DSP (Digital Signal Processors)
|
||||
//! - Custom accelerators
|
||||
|
||||
mod capabilities;
|
||||
mod operation;
|
||||
mod profiles;
|
||||
mod types;
|
||||
|
||||
pub use capabilities::{ComputeThroughput, MemorySpecs, MemoryType, ProcessorCapabilities};
|
||||
pub use operation::{Operation, OperationType};
|
||||
pub use profiles::ProcessorProfiles;
|
||||
pub use types::*;
|
||||
|
||||
use crate::error::ComputeError;
|
||||
use async_trait::async_trait;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::time::Duration;
|
||||
|
||||
/// Unique processor identifier (within a node).
///
/// Newtype over `u64`; uniqueness is only guaranteed per node, not
/// cluster-wide.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ProcessorId(pub u64);
|
||||
|
||||
impl std::fmt::Display for ProcessorId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "proc_{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Unified abstraction for any processor type.
///
/// Implemented by every device the scheduler can dispatch to (CPU, GPU,
/// TPU, NPU, LPU, FPGA, DSP). Cost estimates (`estimate_time`,
/// `estimate_energy`) are advisory inputs to the scheduler, not
/// guarantees.
#[async_trait]
pub trait Processor: Send + Sync {
    /// Get processor ID.
    fn id(&self) -> ProcessorId;

    /// Get processor type.
    fn processor_type(&self) -> ProcessorType;

    /// Get capabilities.
    fn capabilities(&self) -> &ProcessorCapabilities;

    /// Check if processor can execute operation.
    fn can_execute(&self, op: &Operation) -> bool;

    /// Estimate execution time for operation.
    fn estimate_time(&self, op: &Operation) -> Duration;

    /// Estimate energy consumption for operation (Joules).
    fn estimate_energy(&self, op: &Operation) -> f64;

    /// Execute operation.
    ///
    /// # Errors
    /// Returns a [`ComputeError`] if the operation is unsupported or fails.
    async fn execute(&self, op: Operation) -> Result<OperationResult, ComputeError>;

    /// Current utilization (0.0 - 1.0).
    fn utilization(&self) -> f64;

    /// Available memory (bytes).
    fn available_memory(&self) -> u64;

    /// Check if this processor shares memory with another type.
    fn shares_memory_with(&self, other: &ProcessorType) -> bool {
        // By default, processors don't share memory
        // Override for unified memory architectures (Apple Silicon, AMD APUs)
        self.processor_type() == *other
    }
}
|
||||
|
||||
/// Result of an operation execution.
#[derive(Clone, Debug)]
pub struct OperationResult {
    /// Output data (raw bytes; interpretation depends on the operation).
    pub output: Vec<u8>,
    /// Wall-clock execution time.
    pub duration: Duration,
    /// Energy consumed (Joules).
    pub energy: f64,
    /// Peak memory used during execution (bytes).
    pub peak_memory: u64,
}
|
||||
|
||||
/// Generic processor implementation for simulation/testing.
///
/// Does no real computation: `execute` estimates cost from the capability
/// profile and simulates the work with a short sleep.
pub struct GenericProcessor {
    // Node-local identifier.
    id: ProcessorId,
    // Hardware family/variant (CPU, GPU, TPU, ...).
    processor_type: ProcessorType,
    // Static capability profile used for cost estimation.
    capabilities: ProcessorCapabilities,
    // Utilization as an integer percentage (0-100); atomic so it can be
    // updated from the async `execute` path via `&self`.
    utilization: std::sync::atomic::AtomicU64,
    // Remaining device memory in bytes.
    available_memory: std::sync::atomic::AtomicU64,
}
|
||||
|
||||
impl GenericProcessor {
|
||||
/// Creates a new generic processor.
|
||||
pub fn new(
|
||||
id: ProcessorId,
|
||||
processor_type: ProcessorType,
|
||||
capabilities: ProcessorCapabilities,
|
||||
) -> Self {
|
||||
let available_memory = capabilities.memory.capacity_bytes;
|
||||
Self {
|
||||
id,
|
||||
processor_type,
|
||||
capabilities,
|
||||
utilization: std::sync::atomic::AtomicU64::new(0),
|
||||
available_memory: std::sync::atomic::AtomicU64::new(available_memory),
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a CPU processor.
|
||||
pub fn cpu(id: ProcessorId, variant: CpuVariant) -> Self {
|
||||
Self::new(
|
||||
id,
|
||||
ProcessorType::Cpu(variant),
|
||||
ProcessorProfiles::cpu_default(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Creates an NVIDIA GPU processor.
|
||||
pub fn nvidia_gpu(id: ProcessorId, compute_capability: (u8, u8)) -> Self {
|
||||
let capabilities = match compute_capability {
|
||||
(9, 0) => ProcessorProfiles::nvidia_h100(),
|
||||
(8, 9) => ProcessorProfiles::nvidia_rtx_4090(),
|
||||
(8, 6) => ProcessorProfiles::nvidia_rtx_3090(),
|
||||
_ => ProcessorProfiles::nvidia_default(),
|
||||
};
|
||||
Self::new(
|
||||
id,
|
||||
ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability }),
|
||||
capabilities,
|
||||
)
|
||||
}
|
||||
|
||||
/// Creates a TPU processor.
|
||||
pub fn tpu(id: ProcessorId, version: TpuVersion) -> Self {
|
||||
let capabilities = match version {
|
||||
TpuVersion::V5p => ProcessorProfiles::google_tpu_v5p(),
|
||||
TpuVersion::V4 => ProcessorProfiles::google_tpu_v4(),
|
||||
_ => ProcessorProfiles::google_tpu_default(),
|
||||
};
|
||||
Self::new(id, ProcessorType::Tpu(version), capabilities)
|
||||
}
|
||||
|
||||
/// Creates a Groq LPU processor.
|
||||
pub fn lpu(id: ProcessorId) -> Self {
|
||||
Self::new(id, ProcessorType::Lpu, ProcessorProfiles::groq_lpu())
|
||||
}
|
||||
|
||||
/// Creates an Apple Neural Engine processor.
|
||||
pub fn apple_neural_engine(id: ProcessorId, cores: u32) -> Self {
|
||||
Self::new(
|
||||
id,
|
||||
ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores }),
|
||||
ProcessorProfiles::apple_neural_engine(cores),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
impl Processor for GenericProcessor {
    fn id(&self) -> ProcessorId {
        self.id
    }

    fn processor_type(&self) -> ProcessorType {
        self.processor_type.clone()
    }

    fn capabilities(&self) -> &ProcessorCapabilities {
        &self.capabilities
    }

    // An operation is supported iff its type appears in the capability
    // profile's operation set.
    fn can_execute(&self, op: &Operation) -> bool {
        self.capabilities.operations.contains(&op.op_type())
    }

    fn estimate_time(&self, op: &Operation) -> Duration {
        // Estimate based on FLOPS and operation complexity
        // Pick the throughput figure matching the operation's precision.
        let flops_needed = op.estimated_flops();
        let throughput = match op.precision() {
            Precision::Fp32 => self.capabilities.compute.fp32_tflops,
            Precision::Fp16 => self.capabilities.compute.fp16_tflops,
            Precision::Bf16 => self.capabilities.compute.bf16_tflops,
            Precision::Int8 => self.capabilities.compute.int8_tops,
            Precision::Int4 => self.capabilities.compute.int4_tops,
            Precision::Fp64 => self.capabilities.compute.fp64_tflops,
        };

        // time = work / rate; guard against a zero-throughput profile
        // (e.g. FP64 on a part with no FP64 units) to avoid division by zero.
        if throughput > 0.0 {
            let tflops = throughput;
            let flops_per_second = tflops * 1e12;
            let seconds = flops_needed / flops_per_second;
            Duration::from_secs_f64(seconds)
        } else {
            Duration::from_secs(1) // Fallback
        }
    }

    fn estimate_energy(&self, op: &Operation) -> f64 {
        // Estimate based on TDP and execution time
        // NOTE(review): multiplying by `efficiency` makes a *more* efficient
        // part report *more* energy; dividing may have been intended —
        // confirm the intended semantics of the efficiency factor.
        let duration = self.estimate_time(op);
        let watts = self.capabilities.power.tdp_watts as f64;
        let efficiency = self.capabilities.power.efficiency;
        watts * duration.as_secs_f64() * efficiency
    }

    async fn execute(&self, op: Operation) -> Result<OperationResult, ComputeError> {
        // Check if we can execute
        if !self.can_execute(&op) {
            return Err(ComputeError::OperationNotSupported(
                self.processor_type.clone(),
                format!("{:?}", op.op_type()),
            ));
        }

        // Simulate execution
        let duration = self.estimate_time(&op);
        let energy = self.estimate_energy(&op);

        // Update utilization
        // (stored as an integer percentage; see `utilization()` below)
        self.utilization
            .store(50, std::sync::atomic::Ordering::Relaxed);

        // Simulate work
        // The reported `duration` is the *estimate*; the actual sleep is a
        // fixed short delay so tests stay fast.
        tokio::time::sleep(Duration::from_micros(100)).await;

        // Reset utilization
        self.utilization
            .store(0, std::sync::atomic::Ordering::Relaxed);

        Ok(OperationResult {
            output: vec![],
            duration,
            energy,
            peak_memory: op.estimated_memory(),
        })
    }

    fn utilization(&self) -> f64 {
        // Convert the stored integer percentage back to the 0.0-1.0 range.
        self.utilization.load(std::sync::atomic::Ordering::Relaxed) as f64 / 100.0
    }

    fn available_memory(&self) -> u64 {
        self.available_memory
            .load(std::sync::atomic::Ordering::Relaxed)
    }

    fn shares_memory_with(&self, other: &ProcessorType) -> bool {
        match (&self.processor_type, other) {
            // Apple Silicon has unified memory
            // (CPU <-> Metal GPU, and the Neural Engine with either)
            (ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal))
            | (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
            | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
            | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true,
            // Same type always shares
            (a, b) if a == b => true,
            _ => false,
        }
    }
}
|
||||
|
||||
/// Numeric precision for operations.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Precision {
    /// 64-bit IEEE float.
    Fp64,
    /// 32-bit IEEE float.
    Fp32,
    /// 16-bit IEEE float.
    Fp16,
    /// bfloat16 (8-bit exponent, 7-bit mantissa).
    Bf16,
    /// 8-bit integer (quantized).
    Int8,
    /// 4-bit integer (quantized).
    Int4,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A default-profile CPU is created with the requested id and type.
    #[test]
    fn test_processor_creation() {
        let cpu = GenericProcessor::cpu(
            ProcessorId(0),
            CpuVariant::X86_64 {
                avx: AvxSupport::Avx512,
            },
        );

        assert_eq!(cpu.id(), ProcessorId(0));
        assert!(matches!(cpu.processor_type(), ProcessorType::Cpu(_)));
    }

    /// Compute capability (9, 0) maps to an NVIDIA CUDA GPU variant.
    #[test]
    fn test_gpu_creation() {
        let gpu = GenericProcessor::nvidia_gpu(ProcessorId(1), (9, 0));

        assert_eq!(gpu.id(), ProcessorId(1));
        assert!(matches!(
            gpu.processor_type(),
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. })
        ));
    }

    /// ARM64 CPUs and Apple Metal GPUs share unified memory.
    #[test]
    fn test_unified_memory() {
        let apple_cpu = GenericProcessor::new(
            ProcessorId(0),
            ProcessorType::Cpu(CpuVariant::Arm64 { sve: false }),
            ProcessorCapabilities::default(),
        );

        assert!(apple_cpu.shares_memory_with(&ProcessorType::Gpu(GpuVariant::AppleMetal)));
    }

    /// `execute` must succeed exactly when `can_execute` reports the
    /// operation as supported.
    ///
    /// (The previous version asserted `result.is_ok() || result.is_err()`,
    /// which is a tautology and could never fail.)
    #[tokio::test]
    async fn test_operation_execution() {
        let cpu = GenericProcessor::cpu(
            ProcessorId(0),
            CpuVariant::X86_64 {
                avx: AvxSupport::Avx512,
            },
        );

        let op = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };

        let supported = cpu.can_execute(&op);
        let result = cpu.execute(op).await;
        // `execute` rejects unsupported ops up front and otherwise simulates
        // the run, so the outcome must agree with the capability check.
        assert_eq!(result.is_ok(), supported);
        if let Ok(res) = result {
            assert!(res.energy >= 0.0);
        }
    }
}
|
||||
543
crates/synor-compute/src/processor/operation.rs
Normal file
543
crates/synor-compute/src/processor/operation.rs
Normal file
|
|
@ -0,0 +1,543 @@
|
|||
//! Operation definitions for heterogeneous compute.
|
||||
|
||||
use super::Precision;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Operation types for processor matching.
///
/// A processor's capability profile lists the `OperationType`s it
/// supports; the scheduler matches tasks against that set.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum OperationType {
    // Matrix operations
    MatMul,
    Conv2d,
    Conv3d,
    DepthwiseConv,
    BatchNorm,
    LayerNorm,

    // Attention operations
    SelfAttention,
    CrossAttention,
    FlashAttention,

    // Element-wise operations
    Add,
    Mul,
    ReLU,
    GeLU,
    SiLU,
    Softmax,

    // Reduction operations
    Sum,
    Mean,
    Max,
    ArgMax,

    // Data movement
    Transpose,
    Reshape,
    Concat,
    Split,
    Gather,
    Scatter,

    // LLM specific
    Embedding,
    RoPE, // Rotary Position Embedding
    KVCache,
    TopK,
    Sampling,

    // I/O operations
    DataLoad,
    DataPreprocess,
    Tokenization,
    Detokenization,
    Checkpoint,

    // Distributed operations
    AllReduce,
    AllGather,
    ReduceScatter,

    // Training specific
    Backward,
    OptimizerStep,
    GradientClip,
}
|
||||
|
||||
/// Concrete operation with parameters.
///
/// Each variant carries the dimensions needed by the cost model
/// (`estimated_flops` / `estimated_memory`); `Generic` is an escape hatch
/// with caller-supplied costs.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum Operation {
    /// Matrix multiplication (m×k · k×n → m×n).
    MatMul {
        m: usize,
        n: usize,
        k: usize,
        precision: Precision,
    },

    /// 2D Convolution.
    Conv2d {
        batch: usize,
        in_channels: usize,
        out_channels: usize,
        height: usize,
        width: usize,
        kernel_size: usize,
        precision: Precision,
    },

    /// Batch normalization.
    BatchNorm {
        batch: usize,
        channels: usize,
        spatial: usize,
        precision: Precision,
    },

    /// Layer normalization.
    LayerNorm {
        batch: usize,
        seq_len: usize,
        hidden: usize,
        precision: Precision,
    },

    /// Self-attention.
    SelfAttention {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },

    /// Flash attention (fused, memory efficient).
    FlashAttention {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },

    /// Element-wise addition.
    Add {
        elements: usize,
        precision: Precision,
    },

    /// Element-wise multiplication.
    Mul {
        elements: usize,
        precision: Precision,
    },

    /// ReLU activation.
    ReLU { elements: usize },

    /// GeLU activation.
    GeLU { elements: usize },

    /// SiLU (Swish) activation.
    SiLU { elements: usize },

    /// Softmax.
    Softmax {
        batch: usize,
        seq_len: usize,
        precision: Precision,
    },

    /// Embedding lookup.
    Embedding {
        batch: usize,
        seq_len: usize,
        vocab_size: usize,
        embed_dim: usize,
        precision: Precision,
    },

    /// Rotary Position Embedding.
    RoPE {
        batch: usize,
        seq_len: usize,
        head_dim: usize,
        precision: Precision,
    },

    /// KV Cache update.
    KVCache {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },

    /// Top-K sampling.
    TopK {
        batch: usize,
        vocab_size: usize,
        k: usize,
    },

    /// Token sampling.
    Sampling {
        batch: usize,
        vocab_size: usize,
        temperature: f32,
    },

    /// Data loading from storage.
    DataLoad {
        bytes: usize,
        async_: bool,
    },

    /// Data preprocessing.
    DataPreprocess {
        batch: usize,
        transforms: Vec<String>,
    },

    /// Tokenization.
    Tokenization {
        text_bytes: usize,
        vocab_size: usize,
    },

    /// Detokenization.
    Detokenization {
        tokens: usize,
        vocab_size: usize,
    },

    /// Checkpoint save.
    Checkpoint {
        bytes: usize,
        async_: bool,
    },

    /// All-reduce across devices.
    AllReduce {
        elements: usize,
        precision: Precision,
        devices: usize,
    },

    /// Backward pass for a layer (wraps the forward op it differentiates).
    Backward {
        forward_op: Box<Operation>,
    },

    /// Optimizer step.
    OptimizerStep {
        parameters: usize,
        optimizer: String,
        precision: Precision,
    },

    /// Transpose.
    Transpose {
        shape: Vec<usize>,
        axes: Vec<usize>,
    },

    /// Reshape.
    Reshape {
        from: Vec<usize>,
        to: Vec<usize>,
    },

    /// Concatenate tensors.
    Concat {
        shapes: Vec<Vec<usize>>,
        axis: usize,
    },

    /// Generic operation with caller-supplied cost estimates.
    Generic {
        op_type: OperationType,
        flops: f64,
        memory: u64,
    },
}
|
||||
|
||||
impl Operation {
    /// Returns the operation type used for capability matching.
    pub fn op_type(&self) -> OperationType {
        match self {
            Operation::MatMul { .. } => OperationType::MatMul,
            Operation::Conv2d { .. } => OperationType::Conv2d,
            Operation::BatchNorm { .. } => OperationType::BatchNorm,
            Operation::LayerNorm { .. } => OperationType::LayerNorm,
            Operation::SelfAttention { .. } => OperationType::SelfAttention,
            Operation::FlashAttention { .. } => OperationType::FlashAttention,
            Operation::Add { .. } => OperationType::Add,
            Operation::Mul { .. } => OperationType::Mul,
            Operation::ReLU { .. } => OperationType::ReLU,
            Operation::GeLU { .. } => OperationType::GeLU,
            Operation::SiLU { .. } => OperationType::SiLU,
            Operation::Softmax { .. } => OperationType::Softmax,
            Operation::Embedding { .. } => OperationType::Embedding,
            Operation::RoPE { .. } => OperationType::RoPE,
            Operation::KVCache { .. } => OperationType::KVCache,
            Operation::TopK { .. } => OperationType::TopK,
            Operation::Sampling { .. } => OperationType::Sampling,
            Operation::DataLoad { .. } => OperationType::DataLoad,
            Operation::DataPreprocess { .. } => OperationType::DataPreprocess,
            Operation::Tokenization { .. } => OperationType::Tokenization,
            Operation::Detokenization { .. } => OperationType::Detokenization,
            Operation::Checkpoint { .. } => OperationType::Checkpoint,
            Operation::AllReduce { .. } => OperationType::AllReduce,
            Operation::Backward { .. } => OperationType::Backward,
            Operation::OptimizerStep { .. } => OperationType::OptimizerStep,
            Operation::Transpose { .. } => OperationType::Transpose,
            Operation::Reshape { .. } => OperationType::Reshape,
            Operation::Concat { .. } => OperationType::Concat,
            Operation::Generic { op_type, .. } => *op_type,
        }
    }

    /// Returns the numeric precision used by the operation.
    ///
    /// Variants without a `precision` field (data movement, I/O, sampling)
    /// default to FP32; `Backward` inherits from its forward op.
    pub fn precision(&self) -> Precision {
        match self {
            Operation::MatMul { precision, .. }
            | Operation::Conv2d { precision, .. }
            | Operation::BatchNorm { precision, .. }
            | Operation::LayerNorm { precision, .. }
            | Operation::SelfAttention { precision, .. }
            | Operation::FlashAttention { precision, .. }
            | Operation::Add { precision, .. }
            | Operation::Mul { precision, .. }
            | Operation::Softmax { precision, .. }
            | Operation::Embedding { precision, .. }
            | Operation::RoPE { precision, .. }
            | Operation::KVCache { precision, .. }
            | Operation::AllReduce { precision, .. }
            | Operation::OptimizerStep { precision, .. } => *precision,
            Operation::Backward { forward_op } => forward_op.precision(),
            _ => Precision::Fp32, // Default
        }
    }

    /// Estimates FLOPS for the operation.
    ///
    /// These are rough analytical counts for scheduling, not exact kernel
    /// costs (e.g. Conv2d ignores stride/padding).
    pub fn estimated_flops(&self) -> f64 {
        match self {
            // MatMul: 2 * M * N * K (multiply-add)
            Operation::MatMul { m, n, k, .. } => 2.0 * (*m as f64) * (*n as f64) * (*k as f64),

            // Conv2d: 2 * batch * out * H * W * in * K * K
            Operation::Conv2d {
                batch,
                in_channels,
                out_channels,
                height,
                width,
                kernel_size,
                ..
            } => {
                2.0 * (*batch as f64)
                    * (*out_channels as f64)
                    * (*height as f64)
                    * (*width as f64)
                    * (*in_channels as f64)
                    * (*kernel_size as f64)
                    * (*kernel_size as f64)
            }

            // Self-attention: 4 * batch * seq * seq * head_dim * heads
            // (FlashAttention does the same math, just with less memory)
            Operation::SelfAttention {
                batch,
                seq_len,
                num_heads,
                head_dim,
                ..
            }
            | Operation::FlashAttention {
                batch,
                seq_len,
                num_heads,
                head_dim,
                ..
            } => {
                4.0 * (*batch as f64)
                    * (*seq_len as f64)
                    * (*seq_len as f64)
                    * (*head_dim as f64)
                    * (*num_heads as f64)
            }

            // Element-wise: 1 FLOP per element
            Operation::Add { elements, .. }
            | Operation::Mul { elements, .. }
            | Operation::ReLU { elements }
            | Operation::GeLU { elements }
            | Operation::SiLU { elements } => *elements as f64,

            // Softmax: ~5 ops per element (exp, sum, div)
            Operation::Softmax {
                batch, seq_len, ..
            } => 5.0 * (*batch as f64) * (*seq_len as f64),

            // Embedding: just lookup, minimal FLOPS
            Operation::Embedding {
                batch,
                seq_len,
                embed_dim,
                ..
            } => (*batch as f64) * (*seq_len as f64) * (*embed_dim as f64) * 0.1,

            // Backward: ~2x forward
            Operation::Backward { forward_op } => forward_op.estimated_flops() * 2.0,

            // Generic
            Operation::Generic { flops, .. } => *flops,

            // I/O operations: minimal compute
            _ => 1000.0,
        }
    }

    /// Estimates memory usage (bytes).
    ///
    /// Variants without an explicit model fall through to a 1 MB default.
    pub fn estimated_memory(&self) -> u64 {
        // Bytes per element for the operation's precision.
        let precision_bytes = match self.precision() {
            Precision::Fp64 => 8,
            Precision::Fp32 => 4,
            Precision::Fp16 | Precision::Bf16 => 2,
            Precision::Int8 => 1,
            Precision::Int4 => 1, // Rounded up
        };

        match self {
            Operation::MatMul { m, n, k, .. } => {
                // Input A (m×k) + Input B (k×n) + Output (m×n)
                ((*m * *k) + (*k * *n) + (*m * *n)) as u64 * precision_bytes
            }

            Operation::SelfAttention {
                batch,
                seq_len,
                num_heads,
                head_dim,
                ..
            } => {
                // Q, K, V, Output, intermediate attention
                5 * (*batch as u64)
                    * (*seq_len as u64)
                    * (*num_heads as u64)
                    * (*head_dim as u64)
                    * precision_bytes
            }

            Operation::FlashAttention {
                batch,
                seq_len,
                num_heads,
                head_dim,
                ..
            } => {
                // FlashAttention uses much less memory
                2 * (*batch as u64)
                    * (*seq_len as u64)
                    * (*num_heads as u64)
                    * (*head_dim as u64)
                    * precision_bytes
            }

            Operation::KVCache {
                batch,
                seq_len,
                num_heads,
                head_dim,
                ..
            } => {
                // K and V caches
                2 * (*batch as u64)
                    * (*seq_len as u64)
                    * (*num_heads as u64)
                    * (*head_dim as u64)
                    * precision_bytes
            }

            Operation::Generic { memory, .. } => *memory,

            _ => 1024 * 1024, // 1 MB default
        }
    }

    /// Creates the backward operation for this operation.
    ///
    /// Returns `None` for operations with no gradient (I/O, data movement,
    /// sampling, etc.).
    pub fn backward(&self) -> Option<Operation> {
        match self {
            Operation::MatMul { .. }
            | Operation::Conv2d { .. }
            | Operation::SelfAttention { .. }
            | Operation::FlashAttention { .. }
            | Operation::LayerNorm { .. }
            | Operation::BatchNorm { .. } => Some(Operation::Backward {
                forward_op: Box::new(self.clone()),
            }),
            _ => None,
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// MatMul cost model: 2*M*N*K multiply-adds.
    #[test]
    fn test_matmul_flops() {
        let op = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };

        let flops = op.estimated_flops();
        // 2 * 1024^3 = ~2.1 billion FLOPS
        assert!(flops > 2e9 && flops < 2.2e9);
    }

    /// FlashAttention's memory model must be strictly cheaper than the
    /// regular self-attention model for identical dimensions.
    #[test]
    fn test_attention_memory() {
        let regular = Operation::SelfAttention {
            batch: 1,
            seq_len: 4096,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };

        let flash = Operation::FlashAttention {
            batch: 1,
            seq_len: 4096,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };

        // FlashAttention should use less memory
        assert!(flash.estimated_memory() < regular.estimated_memory());
    }

    /// `backward()` on a differentiable op wraps the forward op.
    #[test]
    fn test_backward_creation() {
        let forward = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };

        let backward = forward.backward();
        assert!(backward.is_some());

        if let Some(Operation::Backward { forward_op }) = backward {
            assert!(matches!(*forward_op, Operation::MatMul { .. }));
        }
    }
}
|
||||
513
crates/synor-compute/src/processor/profiles.rs
Normal file
513
crates/synor-compute/src/processor/profiles.rs
Normal file
|
|
@ -0,0 +1,513 @@
|
|||
//! Pre-defined processor profiles for common hardware.
|
||||
|
||||
use super::capabilities::{
|
||||
ComputeThroughput, MemorySpecs, MemoryType, PowerCharacteristics, ProcessorCapabilities,
|
||||
WorkloadCharacteristic,
|
||||
};
|
||||
use super::operation::OperationType;
|
||||
use super::types::PowerTier;
|
||||
use super::TpuVersion;
|
||||
use std::collections::HashSet;
|
||||
|
||||
/// Pre-defined processor profiles.
///
/// Unit struct used purely as a namespace for the associated constructor
/// functions (e.g. `ProcessorProfiles::nvidia_h100()`).
pub struct ProcessorProfiles;
|
||||
|
||||
impl ProcessorProfiles {
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
// CPU PROFILES
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
|
||||
/// Default CPU profile.
|
||||
pub fn cpu_default() -> ProcessorCapabilities {
|
||||
ProcessorCapabilities::cpu(8, 3.5, false)
|
||||
}
|
||||
|
||||
/// AMD EPYC 9654 (96 cores).
|
||||
pub fn amd_epyc_9654() -> ProcessorCapabilities {
|
||||
ProcessorCapabilities {
|
||||
compute: ComputeThroughput {
|
||||
fp64_tflops: 2.7,
|
||||
fp32_tflops: 5.4,
|
||||
fp16_tflops: 10.8,
|
||||
bf16_tflops: 10.8,
|
||||
int8_tops: 21.6,
|
||||
int4_tops: 43.2,
|
||||
sparsity_speedup: 1.0,
|
||||
},
|
||||
memory: MemorySpecs {
|
||||
capacity_bytes: 6 * 1024 * 1024 * 1024 * 1024, // 6 TB max
|
||||
bandwidth_gbps: 460,
|
||||
type_: MemoryType::Ddr5,
|
||||
},
|
||||
operations: ProcessorCapabilities::cpu(96, 2.4, false)
|
||||
.operations,
|
||||
power: PowerCharacteristics {
|
||||
tdp_watts: 360,
|
||||
efficiency: 0.85,
|
||||
power_tier: PowerTier::High,
|
||||
},
|
||||
optimal_for: vec![
|
||||
WorkloadCharacteristic::Sequential,
|
||||
WorkloadCharacteristic::MemoryBound,
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/// Intel Xeon w9-3595X (56 cores).
|
||||
pub fn intel_xeon_w9_3595x() -> ProcessorCapabilities {
|
||||
ProcessorCapabilities {
|
||||
compute: ComputeThroughput {
|
||||
fp64_tflops: 3.2,
|
||||
fp32_tflops: 6.4,
|
||||
fp16_tflops: 12.8,
|
||||
bf16_tflops: 12.8,
|
||||
int8_tops: 25.6,
|
||||
int4_tops: 51.2,
|
||||
sparsity_speedup: 1.0,
|
||||
},
|
||||
memory: MemorySpecs {
|
||||
capacity_bytes: 4 * 1024 * 1024 * 1024 * 1024, // 4 TB max
|
||||
bandwidth_gbps: 307,
|
||||
type_: MemoryType::Ddr5,
|
||||
},
|
||||
operations: ProcessorCapabilities::cpu(56, 2.9, true)
|
||||
.operations,
|
||||
power: PowerCharacteristics {
|
||||
tdp_watts: 350,
|
||||
efficiency: 0.80,
|
||||
power_tier: PowerTier::High,
|
||||
},
|
||||
optimal_for: vec![
|
||||
WorkloadCharacteristic::Sequential,
|
||||
WorkloadCharacteristic::MemoryBound,
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/// Apple M3 Max CPU cores.
|
||||
pub fn apple_m3_max_cpu() -> ProcessorCapabilities {
|
||||
ProcessorCapabilities {
|
||||
compute: ComputeThroughput {
|
||||
fp64_tflops: 0.3,
|
||||
fp32_tflops: 0.6,
|
||||
fp16_tflops: 1.2,
|
||||
bf16_tflops: 1.2,
|
||||
int8_tops: 2.4,
|
||||
int4_tops: 4.8,
|
||||
sparsity_speedup: 1.0,
|
||||
},
|
||||
memory: MemorySpecs {
|
||||
capacity_bytes: 128 * 1024 * 1024 * 1024, // 128 GB unified
|
||||
bandwidth_gbps: 400,
|
||||
type_: MemoryType::Unified,
|
||||
},
|
||||
operations: ProcessorCapabilities::cpu(16, 4.0, false)
|
||||
.operations,
|
||||
power: PowerCharacteristics {
|
||||
tdp_watts: 40,
|
||||
efficiency: 0.95,
|
||||
power_tier: PowerTier::Low,
|
||||
},
|
||||
optimal_for: vec![
|
||||
WorkloadCharacteristic::Sequential,
|
||||
WorkloadCharacteristic::LowPower,
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
// NVIDIA GPU PROFILES
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
|
||||
/// Default NVIDIA GPU profile.
|
||||
pub fn nvidia_default() -> ProcessorCapabilities {
|
||||
ProcessorCapabilities::nvidia_gpu(8192, 256, 12, 600, (8, 0))
|
||||
}
|
||||
|
||||
/// NVIDIA H100 SXM (80GB).
|
||||
pub fn nvidia_h100() -> ProcessorCapabilities {
|
||||
ProcessorCapabilities {
|
||||
compute: ComputeThroughput {
|
||||
fp64_tflops: 67.0,
|
||||
fp32_tflops: 67.0,
|
||||
fp16_tflops: 1979.0, // With sparsity
|
||||
bf16_tflops: 1979.0,
|
||||
int8_tops: 3958.0,
|
||||
int4_tops: 7916.0,
|
||||
sparsity_speedup: 2.0,
|
||||
},
|
||||
memory: MemorySpecs {
|
||||
capacity_bytes: 80 * 1024 * 1024 * 1024,
|
||||
bandwidth_gbps: 3350,
|
||||
type_: MemoryType::Hbm3,
|
||||
},
|
||||
operations: ProcessorCapabilities::nvidia_gpu(16896, 528, 80, 3350, (9, 0))
|
||||
.operations,
|
||||
power: PowerCharacteristics {
|
||||
tdp_watts: 700,
|
||||
efficiency: 0.90,
|
||||
power_tier: PowerTier::High,
|
||||
},
|
||||
optimal_for: vec![
|
||||
WorkloadCharacteristic::HighlyParallel,
|
||||
WorkloadCharacteristic::LargeBatch,
|
||||
WorkloadCharacteristic::ComputeBound,
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/// NVIDIA A100 (80GB).
///
/// Hard-coded vendor-quoted peak figures; FP16/BF16/INT numbers include
/// the 2x structured-sparsity speedup.
pub fn nvidia_a100() -> ProcessorCapabilities {
    ProcessorCapabilities {
        compute: ComputeThroughput {
            fp64_tflops: 19.5,
            fp32_tflops: 19.5,
            fp16_tflops: 624.0, // With sparsity
            bf16_tflops: 624.0,
            int8_tops: 1248.0,
            int4_tops: 2496.0,
            sparsity_speedup: 2.0,
        },
        memory: MemorySpecs {
            capacity_bytes: 80 * 1024 * 1024 * 1024, // 80 GB HBM2e
            bandwidth_gbps: 2039,
            type_: MemoryType::Hbm2e,
        },
        // Operation set borrowed from the generic NVIDIA GPU constructor.
        operations: ProcessorCapabilities::nvidia_gpu(6912, 432, 80, 2039, (8, 0))
            .operations,
        power: PowerCharacteristics {
            tdp_watts: 400,
            efficiency: 0.88,
            power_tier: PowerTier::High,
        },
        optimal_for: vec![
            WorkloadCharacteristic::HighlyParallel,
            WorkloadCharacteristic::LargeBatch,
            WorkloadCharacteristic::ComputeBound,
        ],
    }
}
|
||||
|
||||
/// NVIDIA RTX 4090.
///
/// Consumer Ada Lovelace part (compute capability 8.9); FP64 throughput
/// is deliberately tiny relative to FP32, as on all GeForce parts.
pub fn nvidia_rtx_4090() -> ProcessorCapabilities {
    ProcessorCapabilities {
        compute: ComputeThroughput {
            fp64_tflops: 1.3,
            fp32_tflops: 82.6,
            fp16_tflops: 330.4, // With sparsity
            bf16_tflops: 330.4,
            int8_tops: 660.8,
            int4_tops: 1321.6,
            sparsity_speedup: 2.0,
        },
        memory: MemorySpecs {
            capacity_bytes: 24 * 1024 * 1024 * 1024, // 24 GB
            bandwidth_gbps: 1008,
            type_: MemoryType::Gddr6,
        },
        // Operation set borrowed from the generic NVIDIA GPU constructor.
        operations: ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1008, (8, 9))
            .operations,
        power: PowerCharacteristics {
            tdp_watts: 450,
            efficiency: 0.85,
            power_tier: PowerTier::High,
        },
        // Note: no ComputeBound tag here, unlike the data-center profiles.
        optimal_for: vec![
            WorkloadCharacteristic::HighlyParallel,
            WorkloadCharacteristic::LargeBatch,
        ],
    }
}
|
||||
|
||||
/// NVIDIA RTX 3090.
///
/// Consumer Ampere part (compute capability 8.6). Unlike the newer
/// profiles, `sparsity_speedup` is 1.0, so the quoted numbers are dense.
pub fn nvidia_rtx_3090() -> ProcessorCapabilities {
    ProcessorCapabilities {
        compute: ComputeThroughput {
            fp64_tflops: 0.6,
            fp32_tflops: 35.6,
            fp16_tflops: 71.2,
            bf16_tflops: 71.2,
            int8_tops: 142.4,
            int4_tops: 284.8,
            sparsity_speedup: 1.0,
        },
        memory: MemorySpecs {
            capacity_bytes: 24 * 1024 * 1024 * 1024, // 24 GB
            bandwidth_gbps: 936,
            type_: MemoryType::Gddr6,
        },
        // Operation set borrowed from the generic NVIDIA GPU constructor.
        operations: ProcessorCapabilities::nvidia_gpu(10496, 328, 24, 936, (8, 6))
            .operations,
        power: PowerCharacteristics {
            tdp_watts: 350,
            efficiency: 0.82,
            power_tier: PowerTier::High,
        },
        optimal_for: vec![
            WorkloadCharacteristic::HighlyParallel,
            WorkloadCharacteristic::LargeBatch,
        ],
    }
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
// AMD GPU PROFILES
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
|
||||
/// AMD MI300X.
///
/// Data-center CDNA3 accelerator. The operation set is cloned from the
/// NVIDIA constructor and then trimmed, since there is no AMD-specific
/// capability constructor yet.
pub fn amd_mi300x() -> ProcessorCapabilities {
    ProcessorCapabilities {
        compute: ComputeThroughput {
            fp64_tflops: 163.4,
            fp32_tflops: 163.4,
            fp16_tflops: 1307.0,
            bf16_tflops: 1307.0,
            int8_tops: 2614.0,
            int4_tops: 5228.0,
            sparsity_speedup: 2.0,
        },
        memory: MemorySpecs {
            capacity_bytes: 192 * 1024 * 1024 * 1024, // 192 GB HBM3
            bandwidth_gbps: 5300,
            type_: MemoryType::Hbm3,
        },
        operations: {
            // Start from the generic GPU op set, then drop FlashAttention:
            // AMD's attention kernels use a different implementation, so the
            // scheduler must not route FlashAttention ops here.
            let mut ops = ProcessorCapabilities::nvidia_gpu(16384, 512, 80, 5300, (9, 0))
                .operations;
            ops.remove(&OperationType::FlashAttention); // Different implementation
            ops
        },
        power: PowerCharacteristics {
            tdp_watts: 750,
            efficiency: 0.88,
            power_tier: PowerTier::High,
        },
        optimal_for: vec![
            WorkloadCharacteristic::HighlyParallel,
            WorkloadCharacteristic::LargeBatch,
            WorkloadCharacteristic::MemoryBound, // High memory bandwidth
        ],
    }
}
|
||||
|
||||
/// AMD RX 7900 XTX.
///
/// Consumer RDNA3 part; dense figures only (`sparsity_speedup` = 1.0).
pub fn amd_rx_7900_xtx() -> ProcessorCapabilities {
    ProcessorCapabilities {
        compute: ComputeThroughput {
            fp64_tflops: 1.9,
            fp32_tflops: 61.0,
            fp16_tflops: 122.0,
            bf16_tflops: 122.0,
            int8_tops: 244.0,
            int4_tops: 488.0,
            sparsity_speedup: 1.0,
        },
        memory: MemorySpecs {
            capacity_bytes: 24 * 1024 * 1024 * 1024, // 24 GB
            bandwidth_gbps: 960,
            type_: MemoryType::Gddr6,
        },
        operations: {
            // Generic GPU op set minus FlashAttention (no dedicated tensor
            // cores here — note the 0 in the second argument).
            let mut ops = ProcessorCapabilities::nvidia_gpu(6144, 0, 24, 960, (8, 0))
                .operations;
            ops.remove(&OperationType::FlashAttention);
            ops
        },
        power: PowerCharacteristics {
            tdp_watts: 355,
            efficiency: 0.80,
            power_tier: PowerTier::High,
        },
        optimal_for: vec![
            WorkloadCharacteristic::HighlyParallel,
        ],
    }
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
// GOOGLE TPU PROFILES
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
|
||||
/// Default TPU profile.
///
/// Falls back to TPU v4 when the concrete TPU generation is unknown.
pub fn google_tpu_default() -> ProcessorCapabilities {
    ProcessorCapabilities::tpu(TpuVersion::V4)
}
|
||||
|
||||
/// Google TPU v5p (performance variant).
pub fn google_tpu_v5p() -> ProcessorCapabilities {
    // All TPU numbers live in the version-keyed constructor.
    ProcessorCapabilities::tpu(TpuVersion::V5p)
}
|
||||
|
||||
/// Google TPU v4.
pub fn google_tpu_v4() -> ProcessorCapabilities {
    // All TPU numbers live in the version-keyed constructor.
    ProcessorCapabilities::tpu(TpuVersion::V4)
}
|
||||
|
||||
/// Google Edge TPU.
///
/// Integer-only inference accelerator: every floating-point throughput
/// field is 0.0, and only quantized INT8/INT4 paths are usable.
pub fn google_edge_tpu() -> ProcessorCapabilities {
    ProcessorCapabilities {
        compute: ComputeThroughput {
            fp64_tflops: 0.0,
            fp32_tflops: 0.0,
            fp16_tflops: 0.0,
            bf16_tflops: 0.0,
            int8_tops: 4.0,
            int4_tops: 8.0,
            sparsity_speedup: 1.0,
        },
        memory: MemorySpecs {
            capacity_bytes: 0, // Uses host memory
            bandwidth_gbps: 0,
            type_: MemoryType::Unified,
        },
        operations: {
            // Minimal hand-built op set: just the quantized CNN primitives
            // this device can actually compile.
            let mut ops = HashSet::new();
            ops.insert(OperationType::MatMul);
            ops.insert(OperationType::Conv2d);
            ops.insert(OperationType::DepthwiseConv);
            ops.insert(OperationType::Add);
            ops.insert(OperationType::Mul);
            ops.insert(OperationType::ReLU);
            ops.insert(OperationType::Softmax);
            ops
        },
        power: PowerCharacteristics {
            tdp_watts: 2,
            efficiency: 0.95,
            power_tier: PowerTier::UltraLow,
        },
        optimal_for: vec![
            WorkloadCharacteristic::LowPower,
            WorkloadCharacteristic::LowLatency,
            WorkloadCharacteristic::SmallBatch,
        ],
    }
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
// GROQ LPU PROFILE
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
|
||||
/// Groq LPU.
pub fn groq_lpu() -> ProcessorCapabilities {
    // The LPU capability constructor carries all the numbers.
    ProcessorCapabilities::lpu()
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
// APPLE NEURAL ENGINE PROFILES
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
|
||||
/// Apple Neural Engine (generic).
///
/// Thin wrapper so callers of `ProcessorProfiles` have a uniform entry
/// point; `cores` is forwarded unchanged.
pub fn apple_neural_engine(cores: u32) -> ProcessorCapabilities {
    ProcessorCapabilities::apple_neural_engine(cores)
}
|
||||
|
||||
/// Apple M3 Neural Engine (16 cores).
pub fn apple_m3_neural_engine() -> ProcessorCapabilities {
    ProcessorCapabilities::apple_neural_engine(16)
}
|
||||
|
||||
/// Apple M3 Max Neural Engine (16 cores).
///
/// The Max variant does not add ANE cores, so this is identical to the
/// base-M3 profile.
pub fn apple_m3_max_neural_engine() -> ProcessorCapabilities {
    ProcessorCapabilities::apple_neural_engine(16) // Same as M3
}
|
||||
|
||||
/// Apple A17 Pro Neural Engine (35 TOPS).
///
/// Phone-class NPU profile; capacity is 0 because the ANE draws from the
/// SoC's unified memory rather than a dedicated pool.
pub fn apple_a17_pro_neural_engine() -> ProcessorCapabilities {
    ProcessorCapabilities {
        compute: ComputeThroughput {
            fp64_tflops: 0.0, // No FP64 path on the ANE
            fp32_tflops: 4.4,
            fp16_tflops: 8.8,
            bf16_tflops: 8.8,
            int8_tops: 35.0,
            int4_tops: 70.0,
            sparsity_speedup: 1.0,
        },
        memory: MemorySpecs {
            capacity_bytes: 0, // Uses unified memory
            bandwidth_gbps: 200,
            type_: MemoryType::Unified,
        },
        // Same supported-operation set as the M-series ANE constructor.
        operations: ProcessorCapabilities::apple_neural_engine(16)
            .operations,
        power: PowerCharacteristics {
            tdp_watts: 8,
            efficiency: 0.98,
            power_tier: PowerTier::UltraLow,
        },
        optimal_for: vec![
            WorkloadCharacteristic::LowPower,
            WorkloadCharacteristic::LowLatency,
            WorkloadCharacteristic::SmallBatch,
        ],
    }
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
// QUALCOMM NPU PROFILES
|
||||
// ═══════════════════════════════════════════════════════════════
|
||||
|
||||
/// Qualcomm Hexagon NPU (Snapdragon 8 Gen 3).
pub fn qualcomm_hexagon_8g3() -> ProcessorCapabilities {
    ProcessorCapabilities {
        compute: ComputeThroughput {
            fp64_tflops: 0.0,
            fp32_tflops: 3.0,
            fp16_tflops: 6.0,
            bf16_tflops: 6.0,
            int8_tops: 73.0, // 73 TOPS
            int4_tops: 146.0,
            sparsity_speedup: 1.0,
        },
        memory: MemorySpecs {
            capacity_bytes: 0, // Uses system memory
            bandwidth_gbps: 77,
            type_: MemoryType::Lpddr,
        },
        // NOTE(review): reuses the Apple Neural Engine operation set as a
        // proxy for "generic mobile NPU ops" — confirm Hexagon actually
        // supports the same set.
        operations: ProcessorCapabilities::apple_neural_engine(16)
            .operations,
        power: PowerCharacteristics {
            tdp_watts: 10,
            efficiency: 0.95,
            power_tier: PowerTier::UltraLow,
        },
        optimal_for: vec![
            WorkloadCharacteristic::LowPower,
            WorkloadCharacteristic::LowLatency,
            WorkloadCharacteristic::SmallBatch,
        ],
    }
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// H100 must report >1 PFLOP/s FP16 (with sparsity) and exactly 80 GB.
    #[test]
    fn test_h100_profile() {
        let h100 = ProcessorProfiles::nvidia_h100();
        assert!(h100.compute.fp16_tflops > 1000.0);
        assert_eq!(h100.memory.capacity_bytes, 80 * 1024 * 1024 * 1024);
    }

    /// TPU v5p must deliver close to a petaflop of BF16 throughput.
    #[test]
    fn test_tpu_v5p_profile() {
        let tpu = ProcessorProfiles::google_tpu_v5p();
        assert!(tpu.compute.bf16_tflops > 900.0);
    }

    /// The LPU's defining trait is its on-chip SRAM bandwidth.
    #[test]
    fn test_groq_lpu_profile() {
        let lpu = ProcessorProfiles::groq_lpu();
        assert!(lpu.memory.bandwidth_gbps > 50000); // Very high internal bandwidth
    }

    /// The ANE profile must stay in the mobile power envelope and be
    /// flagged as a low-power target for the scheduler.
    #[test]
    fn test_apple_ane_profile() {
        let ane = ProcessorProfiles::apple_m3_neural_engine();
        assert!(ane.power.tdp_watts < 20);
        assert!(ane.optimal_for.contains(&WorkloadCharacteristic::LowPower));
    }
}
|
||||
367
crates/synor-compute/src/processor/types.rs
Normal file
367
crates/synor-compute/src/processor/types.rs
Normal file
|
|
@ -0,0 +1,367 @@
|
|||
//! Processor type definitions.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// All supported processor types.
///
/// Variants carry just enough vendor/architecture detail for scheduling
/// decisions; full throughput numbers live in `ProcessorCapabilities`.
/// Serde derives allow the type to travel inside device registrations.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ProcessorType {
    /// Central Processing Unit.
    Cpu(CpuVariant),
    /// Graphics Processing Unit.
    Gpu(GpuVariant),
    /// Tensor Processing Unit (Google).
    Tpu(TpuVersion),
    /// Neural Processing Unit (various vendors).
    Npu(NpuVariant),
    /// Language Processing Unit (Groq).
    Lpu,
    /// Field Programmable Gate Array.
    Fpga(FpgaVendor),
    /// Digital Signal Processor.
    Dsp(DspVariant),
    /// WebGPU (browser).
    WebGpu,
    /// WebAssembly runtime.
    Wasm,
    /// Custom/Unknown accelerator, identified only by free-form strings.
    Custom {
        vendor: String,
        model: String,
    },
}
|
||||
|
||||
impl Default for ProcessorType {
    /// Defaults to a generic x86-64 CPU — the one processor every host has.
    fn default() -> Self {
        ProcessorType::Cpu(CpuVariant::default())
    }
}
|
||||
|
||||
/// CPU architecture variants.
///
/// Each variant records the SIMD capability relevant for kernel selection.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum CpuVariant {
    /// x86-64 architecture, with its AVX tier.
    X86_64 { avx: AvxSupport },
    /// ARM 64-bit architecture; `sve` flags Scalable Vector Extension support.
    Arm64 { sve: bool },
    /// RISC-V architecture; `vector` flags the V vector extension.
    RiscV { vector: bool },
}
|
||||
|
||||
impl Default for CpuVariant {
    /// AVX2-capable x86-64: the safe baseline for modern server hardware.
    fn default() -> Self {
        CpuVariant::X86_64 {
            avx: AvxSupport::Avx2,
        }
    }
}
|
||||
|
||||
/// AVX instruction set support levels.
///
/// Ordered (`PartialOrd`/`Ord`) so capability checks can use comparisons
/// like `avx >= AvxSupport::Avx2`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AvxSupport {
    /// No AVX.
    None,
    /// AVX (Sandy Bridge+).
    Avx,
    /// AVX2 (Haswell+).
    Avx2,
    /// AVX-512 (Skylake-X+).
    Avx512,
    /// AVX10 (future).
    Avx10,
}
|
||||
|
||||
/// GPU vendor variants.
///
/// Vendor-specific payloads (compute capability, GFX version, model
/// numbers) drive kernel/runtime selection downstream.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum GpuVariant {
    /// NVIDIA CUDA GPU.
    NvidiaCuda {
        /// Compute capability (major, minor), e.g. (9, 0) for Hopper.
        compute_capability: (u8, u8),
    },
    /// AMD ROCm GPU.
    AmdRocm {
        /// GFX version (e.g., 1100 for RDNA3).
        gfx_version: u32,
    },
    /// Intel OneAPI GPU.
    IntelOneApi,
    /// Apple Metal GPU.
    AppleMetal,
    /// Qualcomm Adreno GPU.
    QualcommAdreno {
        /// Adreno model number.
        model: u32,
    },
    /// ARM Mali GPU.
    ArmMali {
        /// Mali generation (e.g., G710).
        model: u32,
    },
    /// IMG PowerVR GPU.
    ImgPowerVr,
}
|
||||
|
||||
/// Google TPU versions.
///
/// `Edge` is the only non-data-center variant and is treated specially
/// by the power-tier logic.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum TpuVersion {
    /// TPU v2.
    V2,
    /// TPU v3.
    V3,
    /// TPU v4.
    V4,
    /// TPU v4i (inference).
    V4i,
    /// TPU v5e (efficiency).
    V5e,
    /// TPU v5p (performance).
    V5p,
    /// Edge TPU.
    Edge,
}
|
||||
|
||||
/// NPU (Neural Processing Unit) variants.
///
/// Covers the main mobile/embedded inference accelerators; unknown parts
/// use `Custom` with a self-reported TOPS figure.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum NpuVariant {
    /// Apple Neural Engine.
    AppleNeuralEngine {
        /// Number of cores.
        cores: u32,
    },
    /// Qualcomm Hexagon DSP/NPU.
    QualcommHexagon {
        /// Version number.
        version: u32,
    },
    /// Intel VPU (Movidius).
    IntelVpu,
    /// Huawei Ascend.
    HuaweiAscend {
        /// Model (310, 910, etc.).
        model: u32,
    },
    /// Google Edge TPU.
    GoogleEdgeTpu,
    /// Samsung NPU.
    SamsungNpu,
    /// MediaTek APU.
    MediaTekApu {
        /// Version.
        version: u32,
    },
    /// Custom NPU.
    Custom {
        /// TOPS (Tera Operations Per Second).
        tops: u32,
    },
}
|
||||
|
||||
/// FPGA vendors.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FpgaVendor {
    /// Xilinx (AMD).
    Xilinx,
    /// Intel (Altera).
    Intel,
    /// Lattice.
    Lattice,
    /// Microchip.
    Microchip,
}
|
||||
|
||||
/// DSP (Digital Signal Processor) variants.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DspVariant {
    /// Texas Instruments DSP.
    TexasInstruments,
    /// Analog Devices DSP.
    AnalogDevices,
    /// Qualcomm Hexagon DSP.
    QualcommHexagon,
    /// Custom DSP.
    Custom,
}
|
||||
|
||||
impl ProcessorType {
|
||||
/// Returns whether this processor type supports CUDA.
|
||||
pub fn supports_cuda(&self) -> bool {
|
||||
matches!(self, ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. }))
|
||||
}
|
||||
|
||||
/// Returns whether this processor type supports ROCm.
|
||||
pub fn supports_rocm(&self) -> bool {
|
||||
matches!(self, ProcessorType::Gpu(GpuVariant::AmdRocm { .. }))
|
||||
}
|
||||
|
||||
/// Returns whether this processor type supports Metal.
|
||||
pub fn supports_metal(&self) -> bool {
|
||||
matches!(self, ProcessorType::Gpu(GpuVariant::AppleMetal))
|
||||
}
|
||||
|
||||
/// Returns whether this processor type is a GPU.
|
||||
pub fn is_gpu(&self) -> bool {
|
||||
matches!(self, ProcessorType::Gpu(_))
|
||||
}
|
||||
|
||||
/// Returns whether this processor type is a CPU.
|
||||
pub fn is_cpu(&self) -> bool {
|
||||
matches!(self, ProcessorType::Cpu(_))
|
||||
}
|
||||
|
||||
/// Returns whether this processor type is suitable for parallel workloads.
|
||||
pub fn is_parallel(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
ProcessorType::Gpu(_) | ProcessorType::Tpu(_) | ProcessorType::Fpga(_)
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns whether this processor type is suitable for sequential workloads.
|
||||
pub fn is_sequential(&self) -> bool {
|
||||
matches!(self, ProcessorType::Cpu(_) | ProcessorType::Lpu)
|
||||
}
|
||||
|
||||
/// Returns whether this processor type is power-efficient.
|
||||
pub fn is_low_power(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
ProcessorType::Npu(_) | ProcessorType::Tpu(TpuVersion::Edge) | ProcessorType::Wasm
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns the typical power consumption tier.
|
||||
pub fn power_tier(&self) -> PowerTier {
|
||||
match self {
|
||||
ProcessorType::Npu(_) | ProcessorType::Wasm => PowerTier::UltraLow,
|
||||
ProcessorType::Cpu(CpuVariant::Arm64 { .. }) => PowerTier::Low,
|
||||
ProcessorType::Cpu(_) => PowerTier::Medium,
|
||||
ProcessorType::Gpu(GpuVariant::AppleMetal) => PowerTier::Medium,
|
||||
ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability })
|
||||
if compute_capability.0 >= 8 =>
|
||||
{
|
||||
PowerTier::High
|
||||
}
|
||||
ProcessorType::Gpu(_) => PowerTier::Medium,
|
||||
ProcessorType::Tpu(TpuVersion::Edge) => PowerTier::UltraLow,
|
||||
ProcessorType::Tpu(_) => PowerTier::High,
|
||||
ProcessorType::Lpu => PowerTier::Medium,
|
||||
ProcessorType::Fpga(_) => PowerTier::Medium,
|
||||
ProcessorType::Dsp(_) => PowerTier::Low,
|
||||
ProcessorType::WebGpu => PowerTier::Low,
|
||||
ProcessorType::Custom { .. } => PowerTier::Medium,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Power consumption tiers.
///
/// Ordered low-to-high (`PartialOrd`/`Ord`) so schedulers can compare
/// tiers directly, e.g. `tier <= PowerTier::Low`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PowerTier {
    /// < 5W (mobile, IoT).
    UltraLow,
    /// 5-30W (laptop, tablet).
    Low,
    /// 30-150W (desktop, workstation).
    Medium,
    /// > 150W (server, data center).
    High,
}
|
||||
|
||||
/// Device class for routing decisions.
///
/// Classifies the *host*, not the processor: the scheduler uses this for
/// availability and reliability heuristics (see the impl below).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DeviceClass {
    /// Data center equipment.
    DataCenter,
    /// Desktop/workstation.
    Desktop,
    /// Laptop.
    Laptop,
    /// Mobile phone.
    Mobile,
    /// Tablet.
    Tablet,
    /// IoT device.
    IoT,
    /// Browser (WebGPU/WASM).
    Browser,
    /// Edge server.
    Edge,
}
|
||||
|
||||
impl DeviceClass {
|
||||
/// Returns typical available compute hours per day.
|
||||
pub fn typical_availability_hours(&self) -> f32 {
|
||||
match self {
|
||||
DeviceClass::DataCenter => 24.0,
|
||||
DeviceClass::Desktop => 8.0,
|
||||
DeviceClass::Laptop => 6.0,
|
||||
DeviceClass::Mobile => 4.0,
|
||||
DeviceClass::Tablet => 4.0,
|
||||
DeviceClass::IoT => 24.0,
|
||||
DeviceClass::Browser => 2.0,
|
||||
DeviceClass::Edge => 24.0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns reliability score (0-100).
|
||||
pub fn reliability_score(&self) -> u32 {
|
||||
match self {
|
||||
DeviceClass::DataCenter => 99,
|
||||
DeviceClass::Edge => 95,
|
||||
DeviceClass::Desktop => 80,
|
||||
DeviceClass::Laptop => 60,
|
||||
DeviceClass::Mobile => 40,
|
||||
DeviceClass::Tablet => 50,
|
||||
DeviceClass::IoT => 70,
|
||||
DeviceClass::Browser => 30,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Sanity-check the boolean classification helpers across a GPU, a
    /// CPU, the LPU, and an NPU.
    #[test]
    fn test_processor_type_properties() {
        let nvidia = ProcessorType::Gpu(GpuVariant::NvidiaCuda {
            compute_capability: (9, 0),
        });
        assert!(nvidia.supports_cuda());
        assert!(nvidia.is_gpu());
        assert!(nvidia.is_parallel());

        let cpu = ProcessorType::Cpu(CpuVariant::X86_64 {
            avx: AvxSupport::Avx512,
        });
        assert!(cpu.is_cpu());
        assert!(cpu.is_sequential());

        // The LPU is a sequential-streaming design despite being an accelerator.
        let lpu = ProcessorType::Lpu;
        assert!(lpu.is_sequential());

        let npu = ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 });
        assert!(npu.is_low_power());
    }

    /// Pin the power-tier mapping for the three interesting cases:
    /// modern NVIDIA (High), NPU (UltraLow), ARM CPU (Low).
    #[test]
    fn test_power_tiers() {
        let h100 = ProcessorType::Gpu(GpuVariant::NvidiaCuda {
            compute_capability: (9, 0),
        });
        assert_eq!(h100.power_tier(), PowerTier::High);

        let npu = ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 });
        assert_eq!(npu.power_tier(), PowerTier::UltraLow);

        let arm = ProcessorType::Cpu(CpuVariant::Arm64 { sve: false });
        assert_eq!(arm.power_tier(), PowerTier::Low);
    }

    /// Pin the extreme values of the availability/reliability heuristics.
    #[test]
    fn test_device_class() {
        assert_eq!(DeviceClass::DataCenter.typical_availability_hours(), 24.0);
        assert_eq!(DeviceClass::Mobile.typical_availability_hours(), 4.0);
        assert_eq!(DeviceClass::DataCenter.reliability_score(), 99);
        assert_eq!(DeviceClass::Browser.reliability_score(), 30);
    }
}
|
||||
810
crates/synor-compute/src/scheduler/load_balancer.rs
Normal file
810
crates/synor-compute/src/scheduler/load_balancer.rs
Normal file
|
|
@ -0,0 +1,810 @@
|
|||
//! Load balancer with work stealing for heterogeneous compute.
|
||||
//!
|
||||
//! Supports:
|
||||
//! - Cross-processor-type work migration
|
||||
//! - Energy-aware balancing
|
||||
//! - Latency-aware scheduling
|
||||
//! - Real-time utilization metrics
|
||||
|
||||
use crate::device::{DeviceInfo, DeviceRegistry};
|
||||
use crate::processor::{Operation, OperationType, ProcessorId, ProcessorType};
|
||||
use crate::task::{Task, TaskId, TaskPriority};
|
||||
use super::TaskAssignment;
|
||||
use parking_lot::RwLock;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
/// Balancing strategy for the load balancer.
///
/// Selects which objective the scorer optimizes; switchable at runtime
/// via `LoadBalancer::set_strategy`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BalancingStrategy {
    /// Optimize for speed (minimize execution time).
    Speed,
    /// Optimize for energy efficiency.
    Energy,
    /// Balance speed and energy.
    Balanced,
    /// Optimize for cost (spot pricing).
    Cost,
    /// Optimize for latency (inference workloads).
    Latency,
}
|
||||
|
||||
impl Default for BalancingStrategy {
    /// `Balanced` is the default: reasonable for mixed workloads when the
    /// caller expresses no preference.
    fn default() -> Self {
        BalancingStrategy::Balanced
    }
}
|
||||
|
||||
/// Real-time processor metrics.
///
/// A plain snapshot struct: producers fill it and hand it to
/// `LoadBalancer::update_metrics`. `Default` gives an all-zero snapshot
/// with `last_updated: None` (never reported).
#[derive(Clone, Debug, Default)]
pub struct ProcessorMetrics {
    /// Current utilization (0.0 - 1.0).
    pub utilization: f64,
    /// Queue depth (pending tasks).
    pub queue_depth: u64,
    /// Average task completion time (ms).
    pub avg_completion_ms: f64,
    /// Tasks completed in last minute.
    pub throughput_per_min: u64,
    /// Current power draw (watts).
    pub power_watts: f64,
    /// Temperature (celsius).
    pub temperature: f64,
    /// When this snapshot was recorded; `None` until the first update.
    pub last_updated: Option<Instant>,
}
|
||||
|
||||
/// Load balancer for heterogeneous compute environments.
///
/// Interior mutability throughout (`RwLock` + atomics), so a single
/// instance can be shared behind `Arc` without external locking.
pub struct LoadBalancer {
    /// Device registry for processor info; `None` when constructed via `new()`.
    device_registry: Option<Arc<DeviceRegistry>>,
    /// Current load per processor (task count). The per-processor counter is
    /// an `AtomicU64` so increments/decrements only need the map's read lock.
    loads: RwLock<HashMap<ProcessorId, AtomicU64>>,
    /// Real-time metrics per processor.
    metrics: RwLock<HashMap<ProcessorId, ProcessorMetrics>>,
    /// Processor type mapping.
    processor_types: RwLock<HashMap<ProcessorId, ProcessorType>>,
    /// Work stealing threshold (0.0 - 1.0).
    steal_threshold: f64,
    /// Rebalance threshold (0.0 - 1.0).
    rebalance_threshold: f64,
    /// Current balancing strategy.
    strategy: RwLock<BalancingStrategy>,
    /// Migration history (to prevent thrashing).
    migration_history: RwLock<Vec<MigrationRecord>>,
}
|
||||
|
||||
/// Record of a task migration.
///
/// Appended to `migration_history` so the balancer can detect a task
/// bouncing between processors (thrashing).
#[derive(Clone, Debug)]
struct MigrationRecord {
    task_id: TaskId,
    from: ProcessorId,
    to: ProcessorId,
    timestamp: Instant,
}
|
||||
|
||||
impl LoadBalancer {
|
||||
/// Creates a new load balancer with no device registry attached.
pub fn new() -> Self {
    Self {
        device_registry: None,
        loads: RwLock::new(HashMap::new()),
        metrics: RwLock::new(HashMap::new()),
        processor_types: RwLock::new(HashMap::new()),
        // Steal work from a peer once the load gap exceeds 30%.
        steal_threshold: 0.3,
        // Trigger a global rebalance at a 20% imbalance.
        rebalance_threshold: 0.2,
        strategy: RwLock::new(BalancingStrategy::default()),
        migration_history: RwLock::new(Vec::new()),
    }
}
|
||||
|
||||
/// Creates a load balancer with device registry.
|
||||
pub fn with_registry(device_registry: Arc<DeviceRegistry>) -> Self {
|
||||
Self {
|
||||
device_registry: Some(device_registry),
|
||||
loads: RwLock::new(HashMap::new()),
|
||||
metrics: RwLock::new(HashMap::new()),
|
||||
processor_types: RwLock::new(HashMap::new()),
|
||||
steal_threshold: 0.3,
|
||||
rebalance_threshold: 0.2,
|
||||
strategy: RwLock::new(BalancingStrategy::default()),
|
||||
migration_history: RwLock::new(Vec::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the balancing strategy, taking effect for subsequent decisions.
pub fn set_strategy(&self, strategy: BalancingStrategy) {
    *self.strategy.write() = strategy;
}
|
||||
|
||||
/// Gets the current strategy (copied out; the lock is held only briefly).
pub fn strategy(&self) -> BalancingStrategy {
    *self.strategy.read()
}
|
||||
|
||||
/// Register a processor with its type.
///
/// Initializes its load counter to 0 and its metrics to defaults.
/// Re-registering an existing id resets all three entries. The three maps
/// are updated under separate locks, so a concurrent reader may briefly
/// observe a partially registered processor.
pub fn register_processor(&self, processor_id: ProcessorId, processor_type: ProcessorType) {
    self.loads.write().insert(processor_id, AtomicU64::new(0));
    self.metrics.write().insert(processor_id, ProcessorMetrics::default());
    self.processor_types.write().insert(processor_id, processor_type);
}
|
||||
|
||||
/// Unregister a processor, dropping its load counter, metrics, and type.
/// A no-op for unknown ids.
pub fn unregister_processor(&self, processor_id: ProcessorId) {
    self.loads.write().remove(&processor_id);
    self.metrics.write().remove(&processor_id);
    self.processor_types.write().remove(&processor_id);
}
|
||||
|
||||
/// Update real-time metrics for a processor.
///
/// Overwrites the stored snapshot, stamping `last_updated` with "now"
/// (the caller-supplied `last_updated` is ignored). Updates for
/// unregistered processors are silently dropped.
pub fn update_metrics(&self, processor_id: ProcessorId, metrics: ProcessorMetrics) {
    if let Some(existing) = self.metrics.write().get_mut(&processor_id) {
        *existing = ProcessorMetrics {
            last_updated: Some(Instant::now()),
            ..metrics
        };
    }
}
|
||||
|
||||
/// Get current load for a processor.
|
||||
pub fn get_load(&self, processor_id: ProcessorId) -> u64 {
|
||||
self.loads.read()
|
||||
.get(&processor_id)
|
||||
.map(|l| l.load(Ordering::Relaxed))
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Increment load for a processor. No-op for unknown ids.
pub fn increment_load(&self, processor_id: ProcessorId) {
    // Read lock on the map is enough: the counter itself is atomic.
    if let Some(load) = self.loads.read().get(&processor_id) {
        load.fetch_add(1, Ordering::Relaxed);
    }
}
|
||||
|
||||
/// Decrement load for a processor.
|
||||
pub fn decrement_load(&self, processor_id: ProcessorId) {
|
||||
if let Some(load) = self.loads.read().get(&processor_id) {
|
||||
load.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if an operation can run on a processor type.
///
/// This is a static capability table keyed on the operation kind; it does
/// not consult per-device capability profiles, so it answers "could this
/// class of hardware ever run this op", not "can this device run it now".
pub fn can_execute(&self, op: &Operation, processor_type: &ProcessorType) -> bool {
    let op_type = op.op_type();

    match processor_type {
        // CPUs can handle most sequential operations, including the
        // host-side pipeline stages (tokenization, data loading).
        ProcessorType::Cpu(_) => matches!(
            op_type,
            OperationType::MatMul
                | OperationType::Conv2d
                | OperationType::Conv3d
                | OperationType::DepthwiseConv
                | OperationType::BatchNorm
                | OperationType::LayerNorm
                | OperationType::Add
                | OperationType::Mul
                | OperationType::ReLU
                | OperationType::GeLU
                | OperationType::SiLU
                | OperationType::Softmax
                | OperationType::Sum
                | OperationType::Mean
                | OperationType::Max
                | OperationType::ArgMax
                | OperationType::Embedding
                | OperationType::TopK
                | OperationType::Sampling
                | OperationType::Tokenization
                | OperationType::Detokenization
                | OperationType::DataLoad
                | OperationType::DataPreprocess
                | OperationType::Transpose
                | OperationType::Reshape
                | OperationType::Concat
                | OperationType::Split
        ),

        // GPUs excel at parallel operations; the only group that also
        // covers attention kernels, collectives, and training steps.
        ProcessorType::Gpu(_) => matches!(
            op_type,
            OperationType::MatMul
                | OperationType::Conv2d
                | OperationType::Conv3d
                | OperationType::DepthwiseConv
                | OperationType::BatchNorm
                | OperationType::LayerNorm
                | OperationType::SelfAttention
                | OperationType::CrossAttention
                | OperationType::FlashAttention
                | OperationType::Add
                | OperationType::Mul
                | OperationType::ReLU
                | OperationType::GeLU
                | OperationType::SiLU
                | OperationType::Softmax
                | OperationType::Sum
                | OperationType::Mean
                | OperationType::Max
                | OperationType::ArgMax
                | OperationType::Embedding
                | OperationType::RoPE
                | OperationType::KVCache
                | OperationType::TopK
                | OperationType::Sampling
                | OperationType::Transpose
                | OperationType::Reshape
                | OperationType::Concat
                | OperationType::Split
                | OperationType::Gather
                | OperationType::Scatter
                | OperationType::AllReduce
                | OperationType::AllGather
                | OperationType::ReduceScatter
                | OperationType::Backward
                | OperationType::OptimizerStep
                | OperationType::GradientClip
        ),

        // TPUs: ML training/inference ops plus collectives, but no
        // generic data-movement ops (Gather/Scatter/Transpose etc.).
        ProcessorType::Tpu(_) => matches!(
            op_type,
            OperationType::MatMul
                | OperationType::Conv2d
                | OperationType::BatchNorm
                | OperationType::LayerNorm
                | OperationType::SelfAttention
                | OperationType::CrossAttention
                | OperationType::FlashAttention
                | OperationType::Add
                | OperationType::Mul
                | OperationType::ReLU
                | OperationType::GeLU
                | OperationType::SiLU
                | OperationType::Softmax
                | OperationType::Sum
                | OperationType::Mean
                | OperationType::Embedding
                | OperationType::RoPE
                | OperationType::KVCache
                | OperationType::AllReduce
                | OperationType::AllGather
                | OperationType::ReduceScatter
                | OperationType::Backward
                | OperationType::OptimizerStep
        ),

        // NPUs: inference-only neural-network primitives.
        ProcessorType::Npu(_) => matches!(
            op_type,
            OperationType::MatMul
                | OperationType::Conv2d
                | OperationType::DepthwiseConv
                | OperationType::BatchNorm
                | OperationType::LayerNorm
                | OperationType::SelfAttention
                | OperationType::Add
                | OperationType::Mul
                | OperationType::ReLU
                | OperationType::GeLU
                | OperationType::SiLU
                | OperationType::Softmax
                | OperationType::Sum
                | OperationType::Mean
        ),

        // LPUs: the transformer-decoder inference pipeline (attention,
        // KV cache, sampling) — no convolutions or training ops.
        ProcessorType::Lpu => matches!(
            op_type,
            OperationType::MatMul
                | OperationType::LayerNorm
                | OperationType::SelfAttention
                | OperationType::FlashAttention
                | OperationType::Add
                | OperationType::Mul
                | OperationType::ReLU
                | OperationType::GeLU
                | OperationType::SiLU
                | OperationType::Softmax
                | OperationType::Embedding
                | OperationType::RoPE
                | OperationType::KVCache
                | OperationType::TopK
                | OperationType::Sampling
        ),

        // FPGAs can be programmed for anything, so accept every op.
        ProcessorType::Fpga(_) => true,

        // DSPs: signal-processing primitives only.
        ProcessorType::Dsp(_) => matches!(
            op_type,
            OperationType::Conv2d
                | OperationType::DepthwiseConv
                | OperationType::Add
                | OperationType::Mul
                | OperationType::Sum
                | OperationType::Mean
                | OperationType::Max
        ),

        // WebGPU has a limited operation set (browser shader runtime).
        ProcessorType::WebGpu => matches!(
            op_type,
            OperationType::MatMul
                | OperationType::Conv2d
                | OperationType::Add
                | OperationType::Mul
                | OperationType::ReLU
                | OperationType::Softmax
                | OperationType::Sum
                | OperationType::Transpose
                | OperationType::Reshape
        ),

        // WASM: portable scalar compute plus host-side text processing.
        ProcessorType::Wasm => matches!(
            op_type,
            OperationType::MatMul
                | OperationType::Add
                | OperationType::Mul
                | OperationType::ReLU
                | OperationType::Softmax
                | OperationType::Sum
                | OperationType::Mean
                | OperationType::Tokenization
                | OperationType::Detokenization
        ),

        // Custom processors - optimistically assume they can handle anything.
        ProcessorType::Custom { .. } => true,
    }
}
|
||||
|
||||
/// Calculate a score for assigning a task to a processor.
|
||||
fn calculate_score(
|
||||
&self,
|
||||
task: &Task,
|
||||
processor_id: ProcessorId,
|
||||
processor_type: &ProcessorType,
|
||||
) -> f64 {
|
||||
let strategy = *self.strategy.read();
|
||||
let load = self.get_load(processor_id);
|
||||
let metrics = self.metrics.read();
|
||||
let proc_metrics = metrics.get(&processor_id);
|
||||
|
||||
// Base score from compatibility
|
||||
if !self.can_execute(&task.operation, processor_type) {
|
||||
return f64::NEG_INFINITY;
|
||||
}
|
||||
|
||||
// Get utilization and metrics
|
||||
let utilization = proc_metrics.map(|m| m.utilization).unwrap_or(load as f64 / 100.0);
|
||||
let power = proc_metrics.map(|m| m.power_watts).unwrap_or(100.0);
|
||||
let avg_completion = proc_metrics.map(|m| m.avg_completion_ms).unwrap_or(100.0);
|
||||
|
||||
// Calculate score based on strategy
|
||||
match strategy {
|
||||
BalancingStrategy::Speed => {
|
||||
// Prioritize low utilization and fast completion
|
||||
let speed_score = 1.0 / (avg_completion.max(1.0) * (1.0 + utilization));
|
||||
|
||||
// Bonus for powerful processor types
|
||||
let type_bonus = match processor_type {
|
||||
ProcessorType::Gpu(_) => 2.0,
|
||||
ProcessorType::Tpu(_) => 2.5,
|
||||
ProcessorType::Lpu => 3.0, // Fastest for inference
|
||||
ProcessorType::Npu(_) => 1.5,
|
||||
_ => 1.0,
|
||||
};
|
||||
|
||||
speed_score * type_bonus
|
||||
}
|
||||
|
||||
BalancingStrategy::Energy => {
|
||||
// Prioritize low power consumption
|
||||
let energy_score = 1.0 / power.max(1.0);
|
||||
|
||||
// Bonus for efficient processor types
|
||||
let efficiency_bonus = match processor_type {
|
||||
ProcessorType::Npu(_) => 3.0, // Most efficient
|
||||
ProcessorType::Lpu => 2.0,
|
||||
ProcessorType::Cpu(_) => 1.5,
|
||||
ProcessorType::Wasm => 2.0, // Low overhead
|
||||
_ => 1.0,
|
||||
};
|
||||
|
||||
energy_score * efficiency_bonus * (1.0 - utilization * 0.5)
|
||||
}
|
||||
|
||||
BalancingStrategy::Balanced => {
|
||||
// Balance speed and energy
|
||||
let speed = 1.0 / avg_completion.max(1.0);
|
||||
let efficiency = 1.0 / power.max(1.0);
|
||||
let load_factor = 1.0 - utilization;
|
||||
|
||||
(speed * 0.4 + efficiency * 0.3 + load_factor * 0.3)
|
||||
}
|
||||
|
||||
BalancingStrategy::Cost => {
|
||||
// Prioritize cheaper resources (consumer devices)
|
||||
let cost_factor = match processor_type {
|
||||
ProcessorType::Wasm => 0.1, // Cheapest (browser)
|
||||
ProcessorType::WebGpu => 0.15,
|
||||
ProcessorType::Cpu(_) => 0.2,
|
||||
ProcessorType::Npu(_) => 0.3, // Mobile NPUs
|
||||
ProcessorType::Gpu(_) => 0.5,
|
||||
ProcessorType::Lpu => 0.8,
|
||||
ProcessorType::Tpu(_) => 1.0, // Most expensive
|
||||
_ => 0.5,
|
||||
};
|
||||
|
||||
(1.0 - cost_factor) * (1.0 - utilization)
|
||||
}
|
||||
|
||||
BalancingStrategy::Latency => {
|
||||
// Prioritize low latency for inference
|
||||
let latency_score = 1.0 / avg_completion.max(0.1);
|
||||
|
||||
// Bonus for low-latency processors
|
||||
let latency_bonus = match processor_type {
|
||||
ProcessorType::Lpu => 5.0, // Designed for low latency
|
||||
ProcessorType::Npu(_) => 3.0,
|
||||
ProcessorType::Gpu(_) => 2.0,
|
||||
ProcessorType::Tpu(_) => 1.5,
|
||||
_ => 1.0,
|
||||
};
|
||||
|
||||
// Priority boost for critical tasks
|
||||
let priority_boost = match task.priority {
|
||||
TaskPriority::Critical => 2.0,
|
||||
TaskPriority::High => 1.5,
|
||||
TaskPriority::Normal => 1.0,
|
||||
TaskPriority::Background => 0.5,
|
||||
};
|
||||
|
||||
latency_score * latency_bonus * priority_boost * (1.0 - utilization)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Maybe rebalance a task to a different processor.
|
||||
pub fn maybe_rebalance(
|
||||
&self,
|
||||
task: &Task,
|
||||
suggested_processor: ProcessorId,
|
||||
current_assignment: &TaskAssignment,
|
||||
) -> ProcessorId {
|
||||
// Get all registered processors
|
||||
let processor_types = self.processor_types.read();
|
||||
|
||||
// If we don't have processor info, use suggested
|
||||
let suggested_type = match processor_types.get(&suggested_processor) {
|
||||
Some(t) => t.clone(),
|
||||
None => return suggested_processor,
|
||||
};
|
||||
|
||||
// Calculate score for suggested processor
|
||||
let suggested_score = self.calculate_score(task, suggested_processor, &suggested_type);
|
||||
|
||||
// Find best alternative
|
||||
let mut best_processor = suggested_processor;
|
||||
let mut best_score = suggested_score;
|
||||
|
||||
for (proc_id, proc_type) in processor_types.iter() {
|
||||
if *proc_id == suggested_processor {
|
||||
continue;
|
||||
}
|
||||
|
||||
let score = self.calculate_score(task, *proc_id, proc_type);
|
||||
|
||||
// Only switch if significantly better (prevents thrashing)
|
||||
if score > best_score * (1.0 + self.rebalance_threshold) {
|
||||
best_score = score;
|
||||
best_processor = *proc_id;
|
||||
}
|
||||
}
|
||||
|
||||
// Record migration if different
|
||||
if best_processor != suggested_processor {
|
||||
self.migration_history.write().push(MigrationRecord {
|
||||
task_id: task.id,
|
||||
from: suggested_processor,
|
||||
to: best_processor,
|
||||
timestamp: Instant::now(),
|
||||
});
|
||||
}
|
||||
|
||||
best_processor
|
||||
}
|
||||
|
||||
/// Check if work stealing should happen between two processors.
|
||||
pub fn should_steal(&self, from: ProcessorId, to: ProcessorId) -> bool {
|
||||
let from_load = self.get_load(from) as f64;
|
||||
let to_load = self.get_load(to) as f64;
|
||||
|
||||
if from_load == 0.0 {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if processor types are compatible for the queued work
|
||||
let processor_types = self.processor_types.read();
|
||||
let from_type = processor_types.get(&from);
|
||||
let to_type = processor_types.get(&to);
|
||||
|
||||
// Only steal between same processor types by default
|
||||
// (cross-type stealing requires operation compatibility check)
|
||||
match (from_type, to_type) {
|
||||
(Some(ft), Some(tt)) if std::mem::discriminant(ft) == std::mem::discriminant(tt) => {
|
||||
let diff = (from_load - to_load) / from_load;
|
||||
diff > self.steal_threshold
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get rebalancing suggestions based on current load.
|
||||
pub fn get_rebalance_suggestions(&self) -> Vec<(ProcessorId, ProcessorId)> {
|
||||
let mut suggestions = Vec::new();
|
||||
let loads = self.loads.read();
|
||||
|
||||
let load_values: Vec<_> = loads.iter()
|
||||
.map(|(id, load)| (*id, load.load(Ordering::Relaxed)))
|
||||
.collect();
|
||||
|
||||
if load_values.is_empty() {
|
||||
return suggestions;
|
||||
}
|
||||
|
||||
let avg_load: f64 = load_values.iter().map(|(_, l)| *l as f64).sum::<f64>()
|
||||
/ load_values.len() as f64;
|
||||
|
||||
let processor_types = self.processor_types.read();
|
||||
|
||||
let overloaded: Vec<_> = load_values.iter()
|
||||
.filter(|(_, l)| *l as f64 > avg_load * (1.0 + self.rebalance_threshold))
|
||||
.collect();
|
||||
|
||||
let underloaded: Vec<_> = load_values.iter()
|
||||
.filter(|(_, l)| (*l as f64) < avg_load * (1.0 - self.rebalance_threshold))
|
||||
.collect();
|
||||
|
||||
// Only suggest migrations between compatible processor types
|
||||
for (over_id, _) in overloaded {
|
||||
let over_type = processor_types.get(over_id);
|
||||
|
||||
for (under_id, _) in &underloaded {
|
||||
let under_type = processor_types.get(under_id);
|
||||
|
||||
// Check type compatibility
|
||||
if let (Some(ot), Some(ut)) = (over_type, under_type) {
|
||||
if std::mem::discriminant(ot) == std::mem::discriminant(ut) {
|
||||
suggestions.push((*over_id, *under_id));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
suggestions
|
||||
}
|
||||
|
||||
/// Get load statistics.
|
||||
pub fn get_stats(&self) -> LoadBalancerStats {
|
||||
let loads = self.loads.read();
|
||||
let metrics = self.metrics.read();
|
||||
|
||||
let total_load: u64 = loads.values().map(|l| l.load(Ordering::Relaxed)).sum();
|
||||
let processor_count = loads.len();
|
||||
let avg_load = if processor_count > 0 {
|
||||
total_load as f64 / processor_count as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let total_utilization: f64 = metrics.values().map(|m| m.utilization).sum();
|
||||
let avg_utilization = if processor_count > 0 {
|
||||
total_utilization / processor_count as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let total_power: f64 = metrics.values().map(|m| m.power_watts).sum();
|
||||
let migrations = self.migration_history.read().len();
|
||||
|
||||
LoadBalancerStats {
|
||||
total_load,
|
||||
avg_load,
|
||||
processor_count,
|
||||
avg_utilization,
|
||||
total_power_watts: total_power,
|
||||
total_migrations: migrations,
|
||||
strategy: *self.strategy.read(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Clean up old migration history.
|
||||
pub fn cleanup_history(&self, max_age: Duration) {
|
||||
let cutoff = Instant::now() - max_age;
|
||||
self.migration_history.write().retain(|r| r.timestamp > cutoff);
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LoadBalancer {
    /// Equivalent to [`LoadBalancer::new`].
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
/// Load balancer statistics.
///
/// A point-in-time snapshot produced by [`LoadBalancer::get_stats`]; values
/// are not updated after the snapshot is taken.
#[derive(Clone, Debug)]
pub struct LoadBalancerStats {
    /// Total tasks across all processors.
    pub total_load: u64,
    /// Average load per processor (0.0 when no processors are registered).
    pub avg_load: f64,
    /// Number of registered processors.
    pub processor_count: usize,
    /// Average utilization (0.0 - 1.0).
    pub avg_utilization: f64,
    /// Total power consumption (watts), summed over reported metrics.
    pub total_power_watts: f64,
    /// Total migrations performed (size of the retained migration history).
    pub total_migrations: usize,
    /// Current balancing strategy.
    pub strategy: BalancingStrategy,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, GpuVariant, Operation, Precision};
    use crate::task::TaskStatus;

    /// Builds a standalone 1024³ fp32 MatMul task with the given priority
    /// and no dependencies.
    fn create_test_task(priority: TaskPriority) -> Task {
        Task {
            id: TaskId::new(),
            operation: Operation::MatMul {
                m: 1024,
                n: 1024,
                k: 1024,
                precision: Precision::Fp32,
            },
            priority,
            dependencies: vec![],
            status: TaskStatus::Pending,
            deadline: None,
        }
    }

    // Per-processor load counters increment/decrement independently.
    #[test]
    fn test_load_tracking() {
        let balancer = LoadBalancer::new();

        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));

        assert_eq!(balancer.get_load(ProcessorId(0)), 0);

        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(1));

        assert_eq!(balancer.get_load(ProcessorId(0)), 2);
        assert_eq!(balancer.get_load(ProcessorId(1)), 1);

        balancer.decrement_load(ProcessorId(0));
        assert_eq!(balancer.get_load(ProcessorId(0)), 1);
    }

    // Stealing is allowed between same-type processors when the load gap
    // exceeds the threshold, and only in the overloaded -> underloaded
    // direction.
    #[test]
    fn test_should_steal_same_type() {
        let balancer = LoadBalancer::new();

        // Register two CPUs
        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));

        // Give processor 0 high load
        for _ in 0..10 {
            balancer.increment_load(ProcessorId(0));
        }
        balancer.increment_load(ProcessorId(1));

        // Should steal between same types
        assert!(balancer.should_steal(ProcessorId(0), ProcessorId(1)));
        assert!(!balancer.should_steal(ProcessorId(1), ProcessorId(0)));
    }

    // Cross-type stealing (CPU -> GPU) is refused regardless of load gap.
    #[test]
    fn test_should_not_steal_different_types() {
        let balancer = LoadBalancer::new();

        // Register CPU and GPU
        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(
            ProcessorId(1),
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) }),
        );

        // Give CPU high load
        for _ in 0..10 {
            balancer.increment_load(ProcessorId(0));
        }

        // Should NOT steal between different types
        assert!(!balancer.should_steal(ProcessorId(0), ProcessorId(1)));
    }

    // Operation/processor compatibility matrix sanity checks.
    #[test]
    fn test_can_execute() {
        let balancer = LoadBalancer::new();

        let matmul = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };

        let flash_attention = Operation::FlashAttention {
            batch: 32,
            seq_len: 2048,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };

        let cpu = ProcessorType::Cpu(CpuVariant::default());
        let gpu = ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) });
        let lpu = ProcessorType::Lpu;

        // MatMul can run on all
        assert!(balancer.can_execute(&matmul, &cpu));
        assert!(balancer.can_execute(&matmul, &gpu));
        assert!(balancer.can_execute(&matmul, &lpu));

        // FlashAttention only on GPU/TPU/LPU
        assert!(!balancer.can_execute(&flash_attention, &cpu));
        assert!(balancer.can_execute(&flash_attention, &gpu));
    }

    // Under the Energy strategy, rebalancing should migrate work from a CPU
    // to the more efficient NPU.
    #[test]
    fn test_strategy_affects_scoring() {
        let balancer = LoadBalancer::new();

        let cpu_id = ProcessorId(0);
        let npu_id = ProcessorId(1);

        balancer.register_processor(cpu_id, ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(npu_id, ProcessorType::Npu(crate::processor::NpuVariant::AppleNeuralEngine { cores: 16 }));

        let task = create_test_task(TaskPriority::Normal);

        // Energy strategy should prefer NPU
        balancer.set_strategy(BalancingStrategy::Energy);
        let assignment = TaskAssignment::new();
        let result = balancer.maybe_rebalance(&task, cpu_id, &assignment);

        // NPU should be preferred for energy efficiency
        assert_eq!(result, npu_id);
    }

    // Aggregate statistics reflect the registered processors and their loads.
    #[test]
    fn test_stats() {
        let balancer = LoadBalancer::new();

        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(ProcessorId(1), ProcessorType::Cpu(CpuVariant::default()));

        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(1));

        let stats = balancer.get_stats();
        assert_eq!(stats.total_load, 3);
        assert_eq!(stats.processor_count, 2);
        assert!((stats.avg_load - 1.5).abs() < 0.01);
    }
}
|
||||
559
crates/synor-compute/src/scheduler/mod.rs
Normal file
559
crates/synor-compute/src/scheduler/mod.rs
Normal file
|
|
@ -0,0 +1,559 @@
|
|||
//! Heterogeneous scheduler for multi-processor task assignment.
|
||||
//!
|
||||
//! Features:
|
||||
//! - Optimal task-to-processor assignment
|
||||
//! - Work stealing for load balancing
|
||||
//! - Pipeline parallelism across processor types
|
||||
//! - Dynamic rebalancing based on actual throughput
|
||||
|
||||
mod load_balancer;
|
||||
mod work_queue;
|
||||
|
||||
pub use load_balancer::LoadBalancer;
|
||||
pub use work_queue::WorkQueue;
|
||||
|
||||
use crate::device::DeviceRegistry;
|
||||
use crate::error::ComputeError;
|
||||
use crate::processor::{Operation, Processor, ProcessorId, ProcessorType};
|
||||
use crate::task::{Task, TaskId, TaskPriority};
|
||||
use parking_lot::RwLock;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Heterogeneous scheduler that manages tasks across all processor types.
///
/// Scheduling proceeds in phases: dependency-graph construction, scored
/// task-to-processor assignment, stage grouping, then stage-by-stage
/// execution (see the `schedule` / `execute` methods).
pub struct HeterogeneousScheduler {
    /// Device registry.
    device_registry: Arc<DeviceRegistry>,
    /// Per-processor-type task queues.
    // NOTE(review): not read or written by any method in this file yet —
    // presumably reserved for queue-based dispatch; confirm before relying
    // on it being populated.
    queues: RwLock<HashMap<ProcessorType, WorkQueue>>,
    /// Load balancer.
    load_balancer: LoadBalancer,
    /// Active schedules.
    active_schedules: RwLock<HashMap<ScheduleId, Schedule>>,
}
|
||||
|
||||
impl HeterogeneousScheduler {
|
||||
/// Creates a new heterogeneous scheduler.
|
||||
pub fn new(device_registry: Arc<DeviceRegistry>) -> Self {
|
||||
Self {
|
||||
device_registry,
|
||||
queues: RwLock::new(HashMap::new()),
|
||||
load_balancer: LoadBalancer::new(),
|
||||
active_schedules: RwLock::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Schedule a set of tasks for execution.
|
||||
pub async fn schedule(&self, tasks: Vec<Task>) -> Result<ScheduleResult, ComputeError> {
|
||||
if tasks.is_empty() {
|
||||
return Ok(ScheduleResult {
|
||||
schedule: Schedule::empty(),
|
||||
estimated_makespan: Duration::ZERO,
|
||||
processor_utilization: HashMap::new(),
|
||||
});
|
||||
}
|
||||
|
||||
// 1. Build dependency graph
|
||||
let deps = self.build_dependency_graph(&tasks);
|
||||
|
||||
// 2. Assign tasks to optimal processors
|
||||
let assignment = self.assign_tasks(&tasks, &deps).await?;
|
||||
|
||||
// 3. Create execution schedule with stages
|
||||
let schedule = self.create_schedule(&tasks, &assignment, &deps)?;
|
||||
|
||||
// 4. Estimate metrics
|
||||
let makespan = self.estimate_makespan(&schedule);
|
||||
let utilization = self.estimate_utilization(&schedule);
|
||||
|
||||
// 5. Store active schedule
|
||||
self.active_schedules.write().insert(schedule.id, schedule.clone());
|
||||
|
||||
Ok(ScheduleResult {
|
||||
schedule,
|
||||
estimated_makespan: makespan,
|
||||
processor_utilization: utilization,
|
||||
})
|
||||
}
|
||||
|
||||
/// Execute a schedule.
|
||||
pub async fn execute(&self, schedule: &Schedule) -> Result<ExecutionResult, ComputeError> {
|
||||
let mut results = HashMap::new();
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
// Execute stages in order
|
||||
for stage in &schedule.stages {
|
||||
// Execute all tasks in this stage in parallel
|
||||
let mut handles = Vec::new();
|
||||
|
||||
for task_id in &stage.tasks {
|
||||
let task = schedule.tasks.get(task_id)
|
||||
.ok_or_else(|| ComputeError::Internal(format!("Task not found: {:?}", task_id)))?;
|
||||
let processor_id = schedule.assignment.get(task_id)
|
||||
.ok_or_else(|| ComputeError::Internal(format!("No assignment for task: {:?}", task_id)))?;
|
||||
|
||||
let processor = self.device_registry.get_processor(processor_id)?;
|
||||
let task_clone = task.clone();
|
||||
|
||||
handles.push(tokio::spawn(async move {
|
||||
processor.execute(task_clone.operation).await
|
||||
}));
|
||||
}
|
||||
|
||||
// Wait for all tasks in stage
|
||||
for (i, handle) in handles.into_iter().enumerate() {
|
||||
let task_id = stage.tasks[i];
|
||||
match handle.await {
|
||||
Ok(Ok(result)) => {
|
||||
results.insert(task_id, TaskExecutionResult::Success(result));
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
results.insert(task_id, TaskExecutionResult::Failed(e.to_string()));
|
||||
}
|
||||
Err(e) => {
|
||||
results.insert(task_id, TaskExecutionResult::Failed(e.to_string()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let total_time = start.elapsed();
|
||||
|
||||
Ok(ExecutionResult {
|
||||
results,
|
||||
total_time,
|
||||
actual_utilization: self.measure_utilization(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Assign tasks to optimal processors.
|
||||
async fn assign_tasks(
|
||||
&self,
|
||||
tasks: &[Task],
|
||||
deps: &DependencyGraph,
|
||||
) -> Result<TaskAssignment, ComputeError> {
|
||||
let mut assignment = TaskAssignment::new();
|
||||
|
||||
// Sort tasks by priority and dependencies (topological sort)
|
||||
let sorted_tasks = self.topological_sort(tasks, deps);
|
||||
|
||||
for task in sorted_tasks {
|
||||
// Find best processor for this task
|
||||
let best_processor = self.find_best_processor(&task).await?;
|
||||
|
||||
// Check if we should rebalance
|
||||
let final_processor = self.load_balancer
|
||||
.maybe_rebalance(&task, best_processor, &assignment);
|
||||
|
||||
assignment.assign(task.id, final_processor);
|
||||
}
|
||||
|
||||
Ok(assignment)
|
||||
}
|
||||
|
||||
/// Find the best processor for a task.
|
||||
async fn find_best_processor(&self, task: &Task) -> Result<ProcessorId, ComputeError> {
|
||||
let mut best_score = f64::NEG_INFINITY;
|
||||
let mut best_processor = None;
|
||||
|
||||
// Get all available processors
|
||||
let processors = self.device_registry.all_processors();
|
||||
|
||||
for processor in processors {
|
||||
if !processor.can_execute(&task.operation) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Calculate score based on multiple factors
|
||||
let exec_time = processor.estimate_time(&task.operation);
|
||||
let energy = processor.estimate_energy(&task.operation);
|
||||
let load = processor.utilization();
|
||||
|
||||
// Score = 1 / (time * (1 + load) * energy_factor)
|
||||
let time_factor = exec_time.as_secs_f64().max(0.001);
|
||||
let load_factor = 1.0 + load;
|
||||
let energy_factor = 1.0 + (energy / 1000.0); // Normalize energy
|
||||
|
||||
let score = 1.0 / (time_factor * load_factor * energy_factor);
|
||||
|
||||
if score > best_score {
|
||||
best_score = score;
|
||||
best_processor = Some(processor.id());
|
||||
}
|
||||
}
|
||||
|
||||
best_processor.ok_or_else(|| {
|
||||
ComputeError::NoSuitableProcessor(format!("{:?}", task.operation.op_type()))
|
||||
})
|
||||
}
|
||||
|
||||
/// Build dependency graph from tasks.
|
||||
fn build_dependency_graph(&self, tasks: &[Task]) -> DependencyGraph {
|
||||
let mut graph = DependencyGraph::new();
|
||||
|
||||
for task in tasks {
|
||||
graph.add_node(task.id);
|
||||
for dep in &task.dependencies {
|
||||
graph.add_edge(*dep, task.id);
|
||||
}
|
||||
}
|
||||
|
||||
graph
|
||||
}
|
||||
|
||||
/// Topological sort of tasks respecting dependencies.
|
||||
fn topological_sort(&self, tasks: &[Task], deps: &DependencyGraph) -> Vec<Task> {
|
||||
let mut sorted = Vec::new();
|
||||
let mut visited = std::collections::HashSet::new();
|
||||
let task_map: HashMap<TaskId, Task> = tasks.iter()
|
||||
.map(|t| (t.id, t.clone()))
|
||||
.collect();
|
||||
|
||||
fn visit(
|
||||
task_id: TaskId,
|
||||
task_map: &HashMap<TaskId, Task>,
|
||||
deps: &DependencyGraph,
|
||||
visited: &mut std::collections::HashSet<TaskId>,
|
||||
sorted: &mut Vec<Task>,
|
||||
) {
|
||||
if visited.contains(&task_id) {
|
||||
return;
|
||||
}
|
||||
visited.insert(task_id);
|
||||
|
||||
// Visit dependencies first
|
||||
if let Some(task_deps) = deps.dependencies.get(&task_id) {
|
||||
for dep in task_deps {
|
||||
visit(*dep, task_map, deps, visited, sorted);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(task) = task_map.get(&task_id) {
|
||||
sorted.push(task.clone());
|
||||
}
|
||||
}
|
||||
|
||||
for task in tasks {
|
||||
visit(task.id, &task_map, deps, &mut visited, &mut sorted);
|
||||
}
|
||||
|
||||
// Sort by priority within dependency constraints
|
||||
sorted.sort_by(|a, b| b.priority.cmp(&a.priority));
|
||||
|
||||
sorted
|
||||
}
|
||||
|
||||
/// Create execution schedule with parallel stages.
|
||||
fn create_schedule(
|
||||
&self,
|
||||
tasks: &[Task],
|
||||
assignment: &TaskAssignment,
|
||||
deps: &DependencyGraph,
|
||||
) -> Result<Schedule, ComputeError> {
|
||||
let mut stages = Vec::new();
|
||||
let mut scheduled = std::collections::HashSet::new();
|
||||
let task_map: HashMap<TaskId, Task> = tasks.iter()
|
||||
.map(|t| (t.id, t.clone()))
|
||||
.collect();
|
||||
|
||||
while scheduled.len() < tasks.len() {
|
||||
let mut stage_tasks = Vec::new();
|
||||
|
||||
for task in tasks {
|
||||
if scheduled.contains(&task.id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if all dependencies are satisfied
|
||||
let deps_satisfied = task.dependencies.iter()
|
||||
.all(|dep| scheduled.contains(dep));
|
||||
|
||||
if deps_satisfied {
|
||||
stage_tasks.push(task.id);
|
||||
}
|
||||
}
|
||||
|
||||
if stage_tasks.is_empty() {
|
||||
return Err(ComputeError::SchedulingFailed(
|
||||
"Circular dependency detected".to_string()
|
||||
));
|
||||
}
|
||||
|
||||
for task_id in &stage_tasks {
|
||||
scheduled.insert(*task_id);
|
||||
}
|
||||
|
||||
stages.push(ScheduleStage {
|
||||
stage_id: stages.len(),
|
||||
tasks: stage_tasks,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(Schedule {
|
||||
id: ScheduleId::new(),
|
||||
tasks: task_map,
|
||||
assignment: assignment.clone(),
|
||||
stages,
|
||||
})
|
||||
}
|
||||
|
||||
/// Estimate makespan (total execution time).
|
||||
fn estimate_makespan(&self, schedule: &Schedule) -> Duration {
|
||||
let mut total = Duration::ZERO;
|
||||
|
||||
for stage in &schedule.stages {
|
||||
let mut max_stage_time = Duration::ZERO;
|
||||
|
||||
for task_id in &stage.tasks {
|
||||
if let (Some(task), Some(proc_id)) = (
|
||||
schedule.tasks.get(task_id),
|
||||
schedule.assignment.get(task_id),
|
||||
) {
|
||||
if let Ok(processor) = self.device_registry.get_processor(proc_id) {
|
||||
let time = processor.estimate_time(&task.operation);
|
||||
max_stage_time = max_stage_time.max(time);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
total += max_stage_time;
|
||||
}
|
||||
|
||||
total
|
||||
}
|
||||
|
||||
/// Estimate processor utilization.
|
||||
fn estimate_utilization(&self, schedule: &Schedule) -> HashMap<ProcessorType, f64> {
|
||||
let mut work_time: HashMap<ProcessorType, Duration> = HashMap::new();
|
||||
let makespan = self.estimate_makespan(schedule);
|
||||
|
||||
for task_id in schedule.assignment.assignments.keys() {
|
||||
if let (Some(task), Some(proc_id)) = (
|
||||
schedule.tasks.get(task_id),
|
||||
schedule.assignment.get(task_id),
|
||||
) {
|
||||
if let Ok(processor) = self.device_registry.get_processor(proc_id) {
|
||||
let proc_type = processor.processor_type();
|
||||
let time = processor.estimate_time(&task.operation);
|
||||
*work_time.entry(proc_type).or_default() += time;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
work_time
|
||||
.into_iter()
|
||||
.map(|(proc_type, time)| {
|
||||
let utilization = if makespan.as_secs_f64() > 0.0 {
|
||||
time.as_secs_f64() / makespan.as_secs_f64()
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
(proc_type, utilization.min(1.0))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Measure actual current utilization.
|
||||
fn measure_utilization(&self) -> HashMap<ProcessorType, f64> {
|
||||
let mut utilization = HashMap::new();
|
||||
|
||||
for processor in self.device_registry.all_processors() {
|
||||
let proc_type = processor.processor_type();
|
||||
let util = processor.utilization();
|
||||
utilization
|
||||
.entry(proc_type)
|
||||
.and_modify(|u| *u = (*u + util) / 2.0)
|
||||
.or_insert(util);
|
||||
}
|
||||
|
||||
utilization
|
||||
}
|
||||
}
|
||||
|
||||
/// Schedule identifier.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct ScheduleId(pub u64);
|
||||
|
||||
impl ScheduleId {
|
||||
/// Creates a new schedule ID.
|
||||
pub fn new() -> Self {
|
||||
use rand::Rng;
|
||||
ScheduleId(rand::thread_rng().gen())
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ScheduleId {
    /// Equivalent to [`ScheduleId::new`]: produces a fresh random ID, not a
    /// fixed zero value.
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
/// Task-to-processor assignment.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct TaskAssignment {
|
||||
/// Map from task ID to processor ID.
|
||||
pub assignments: HashMap<TaskId, ProcessorId>,
|
||||
}
|
||||
|
||||
impl TaskAssignment {
|
||||
/// Creates a new empty assignment.
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
assignments: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Assigns a task to a processor.
|
||||
pub fn assign(&mut self, task_id: TaskId, processor_id: ProcessorId) {
|
||||
self.assignments.insert(task_id, processor_id);
|
||||
}
|
||||
|
||||
/// Gets the assigned processor for a task.
|
||||
pub fn get(&self, task_id: &TaskId) -> Option<ProcessorId> {
|
||||
self.assignments.get(task_id).copied()
|
||||
}
|
||||
}
|
||||
|
||||
/// Dependency graph for tasks, stored as both forward and reverse adjacency
/// maps for O(1) lookup in either direction.
#[derive(Clone, Debug, Default)]
pub struct DependencyGraph {
    /// Dependencies: task -> list of tasks it depends on.
    pub dependencies: HashMap<TaskId, Vec<TaskId>>,
    /// Dependents: task -> list of tasks that depend on it.
    pub dependents: HashMap<TaskId, Vec<TaskId>>,
}

impl DependencyGraph {
    /// Creates a new empty dependency graph.
    pub fn new() -> Self {
        Self {
            dependencies: HashMap::new(),
            dependents: HashMap::new(),
        }
    }

    /// Adds a node (task) to the graph, creating empty adjacency entries.
    pub fn add_node(&mut self, task_id: TaskId) {
        self.dependencies.entry(task_id).or_default();
        self.dependents.entry(task_id).or_default();
    }

    /// Adds a dependency edge: `to` depends on `from`, i.e. `from` must
    /// complete before `to` may run. (The edge is stored under
    /// `dependencies[to]` and `dependents[from]` — note the direction, which
    /// the previous comment had reversed.)
    pub fn add_edge(&mut self, from: TaskId, to: TaskId) {
        self.dependencies.entry(to).or_default().push(from);
        self.dependents.entry(from).or_default().push(to);
    }
}
|
||||
|
||||
/// Execution schedule.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Schedule {
|
||||
/// Schedule ID.
|
||||
pub id: ScheduleId,
|
||||
/// All tasks.
|
||||
pub tasks: HashMap<TaskId, Task>,
|
||||
/// Task assignments.
|
||||
pub assignment: TaskAssignment,
|
||||
/// Execution stages (tasks within a stage can run in parallel).
|
||||
pub stages: Vec<ScheduleStage>,
|
||||
}
|
||||
|
||||
impl Schedule {
|
||||
/// Creates an empty schedule.
|
||||
pub fn empty() -> Self {
|
||||
Self {
|
||||
id: ScheduleId::new(),
|
||||
tasks: HashMap::new(),
|
||||
assignment: TaskAssignment::new(),
|
||||
stages: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A stage of parallel tasks. Stages execute sequentially; all tasks within
/// one stage are mutually independent and may run concurrently.
#[derive(Clone, Debug)]
pub struct ScheduleStage {
    /// Stage index (position in the schedule's stage list).
    pub stage_id: usize,
    /// Tasks in this stage (can run in parallel).
    pub tasks: Vec<TaskId>,
}
|
||||
|
||||
/// Result of scheduling.
///
/// Bundles the produced schedule with the scheduler's own estimates;
/// the estimates are predictions, not measurements (compare with
/// `ExecutionResult` for actuals).
#[derive(Clone, Debug)]
pub struct ScheduleResult {
    /// The schedule.
    pub schedule: Schedule,
    /// Estimated total execution time (end-to-end makespan).
    pub estimated_makespan: Duration,
    /// Estimated processor utilization by type, in [0.0, 1.0].
    pub processor_utilization: HashMap<ProcessorType, f64>,
}
|
||||
|
||||
/// Result of execution.
///
/// The measured counterpart of `ScheduleResult`: per-task outcomes plus
/// observed timing and utilization.
#[derive(Clone, Debug)]
pub struct ExecutionResult {
    /// Results per task.
    pub results: HashMap<TaskId, TaskExecutionResult>,
    /// Total execution time (wall clock).
    pub total_time: Duration,
    /// Actual processor utilization by type, in [0.0, 1.0].
    pub actual_utilization: HashMap<ProcessorType, f64>,
}
|
||||
|
||||
/// Result of a single task execution.
///
/// `Failed` carries a human-readable error description rather than a
/// typed error, since results may cross device/process boundaries.
#[derive(Clone, Debug)]
pub enum TaskExecutionResult {
    /// Task completed successfully with the operation's output.
    Success(crate::processor::OperationResult),
    /// Task failed; the string describes the failure.
    Failed(String),
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE: the previous version defined an unused `create_test_task`
    // helper (and imported `Precision`/`TaskStatus` only for it), which
    // produced dead-code warnings; both have been removed.

    #[test]
    fn test_dependency_graph() {
        let mut graph = DependencyGraph::new();

        graph.add_node(TaskId(1));
        graph.add_node(TaskId(2));
        graph.add_node(TaskId(3));

        graph.add_edge(TaskId(1), TaskId(2)); // 2 depends on 1
        graph.add_edge(TaskId(1), TaskId(3)); // 3 depends on 1
        graph.add_edge(TaskId(2), TaskId(3)); // 3 depends on 2

        assert_eq!(graph.dependencies[&TaskId(2)], vec![TaskId(1)]);
        assert_eq!(graph.dependencies[&TaskId(3)], vec![TaskId(1), TaskId(2)]);
    }

    #[test]
    fn test_task_assignment() {
        let mut assignment = TaskAssignment::new();

        assignment.assign(TaskId(1), ProcessorId(0));
        assignment.assign(TaskId(2), ProcessorId(1));

        assert_eq!(assignment.get(&TaskId(1)), Some(ProcessorId(0)));
        assert_eq!(assignment.get(&TaskId(2)), Some(ProcessorId(1)));
        assert_eq!(assignment.get(&TaskId(3)), None);
    }
}
|
||||
271
crates/synor-compute/src/scheduler/work_queue.rs
Normal file
271
crates/synor-compute/src/scheduler/work_queue.rs
Normal file
|
|
@ -0,0 +1,271 @@
|
|||
//! Work queue with thread-safe task management.
|
||||
|
||||
use crate::processor::ProcessorType;
|
||||
use crate::task::{Task, TaskId, TaskPriority};
|
||||
use crossbeam_channel::{bounded, Receiver, Sender, TryRecvError};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
/// Work queue for a specific processor type.
///
/// Backed by a bounded MPMC channel; `size`/`processed` are maintained
/// separately with relaxed atomics, so they are statistics, not exact
/// synchronized counts.
pub struct WorkQueue {
    /// Task sender (for producers).
    sender: Sender<Task>,
    /// Task receiver (for consumers).
    receiver: Receiver<Task>,
    /// Processor type this queue is for.
    processor_type: ProcessorType,
    /// Current queue size (approximate; see `len`).
    size: AtomicU64,
    /// Total tasks popped for processing so far.
    processed: AtomicU64,
}
|
||||
|
||||
impl WorkQueue {
|
||||
/// Creates a new work queue for a processor type.
|
||||
pub fn new(processor_type: ProcessorType, capacity: usize) -> Self {
|
||||
let (sender, receiver) = bounded(capacity.max(1024));
|
||||
|
||||
Self {
|
||||
sender,
|
||||
receiver,
|
||||
processor_type,
|
||||
size: AtomicU64::new(0),
|
||||
processed: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Push a task to the queue.
|
||||
pub fn push(&self, task: Task) {
|
||||
if self.sender.try_send(task).is_ok() {
|
||||
self.size.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Pop a task from the queue (ignores worker_id for compatibility).
|
||||
pub fn pop(&self, _worker_id: usize) -> Option<Task> {
|
||||
self.pop_any()
|
||||
}
|
||||
|
||||
/// Pop any task from the queue.
|
||||
pub fn pop_any(&self) -> Option<Task> {
|
||||
match self.receiver.try_recv() {
|
||||
Ok(task) => {
|
||||
self.size.fetch_sub(1, Ordering::Relaxed);
|
||||
self.processed.fetch_add(1, Ordering::Relaxed);
|
||||
Some(task)
|
||||
}
|
||||
Err(TryRecvError::Empty) | Err(TryRecvError::Disconnected) => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Pop from global queue (alias for pop_any).
|
||||
pub fn pop_global(&self) -> Option<Task> {
|
||||
self.pop_any()
|
||||
}
|
||||
|
||||
/// Steal a batch of tasks from another queue.
|
||||
pub fn steal_batch_from(&self, other: &WorkQueue, max_tasks: usize) -> Vec<Task> {
|
||||
let mut stolen = Vec::new();
|
||||
|
||||
while stolen.len() < max_tasks {
|
||||
if let Some(task) = other.pop_any() {
|
||||
stolen.push(task);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Push stolen tasks to this queue
|
||||
for task in &stolen {
|
||||
// Tasks are already accounted for in `other`, just push to self
|
||||
if self.sender.try_send(task.clone()).is_ok() {
|
||||
self.size.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
stolen
|
||||
}
|
||||
|
||||
/// Get current queue size.
|
||||
pub fn len(&self) -> usize {
|
||||
self.size.load(Ordering::Relaxed) as usize
|
||||
}
|
||||
|
||||
/// Check if queue is empty.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
/// Get number of tasks processed.
|
||||
pub fn processed_count(&self) -> u64 {
|
||||
self.processed.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
/// Get processor type for this queue.
|
||||
pub fn processor_type(&self) -> ProcessorType {
|
||||
self.processor_type.clone()
|
||||
}
|
||||
|
||||
/// Get utilization estimate (0.0 - 1.0).
|
||||
pub fn utilization(&self) -> f64 {
|
||||
let size = self.size.load(Ordering::Relaxed) as f64;
|
||||
let capacity = self.sender.capacity().unwrap_or(1024) as f64;
|
||||
(size / capacity).min(1.0)
|
||||
}
|
||||
|
||||
/// Get a stealer for cross-queue work stealing.
|
||||
pub fn get_stealer(&self) -> QueueStealer {
|
||||
QueueStealer {
|
||||
receiver: self.receiver.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Stealer handle for cross-queue work stealing.
///
/// A cheap clone of the source queue's receiver end.
/// NOTE(review): stealing through this handle does not adjust the owning
/// `WorkQueue`'s `size`/`processed` counters, so its `len()` will
/// over-report after steals — confirm whether that is acceptable.
#[derive(Clone)]
pub struct QueueStealer {
    receiver: Receiver<Task>,
}
|
||||
|
||||
impl QueueStealer {
|
||||
/// Try to steal a task.
|
||||
pub fn steal(&self) -> Option<Task> {
|
||||
self.receiver.try_recv().ok()
|
||||
}
|
||||
}
|
||||
|
||||
/// Priority queue wrapper for tasks.
///
/// One independent bounded `WorkQueue` per priority level; `pop` drains
/// higher-priority queues before lower ones.
pub struct PriorityWorkQueue {
    /// Queues by priority level.
    queues: HashMap<TaskPriority, WorkQueue>,
    /// Processor type all the sub-queues serve.
    processor_type: ProcessorType,
}
|
||||
|
||||
impl PriorityWorkQueue {
|
||||
/// Creates a new priority work queue.
|
||||
pub fn new(processor_type: ProcessorType, capacity_per_priority: usize) -> Self {
|
||||
let mut queues = HashMap::new();
|
||||
|
||||
for priority in [
|
||||
TaskPriority::Critical,
|
||||
TaskPriority::High,
|
||||
TaskPriority::Normal,
|
||||
TaskPriority::Background,
|
||||
] {
|
||||
queues.insert(priority, WorkQueue::new(processor_type.clone(), capacity_per_priority));
|
||||
}
|
||||
|
||||
Self {
|
||||
queues,
|
||||
processor_type,
|
||||
}
|
||||
}
|
||||
|
||||
/// Push a task with its priority.
|
||||
pub fn push(&self, task: Task) {
|
||||
let priority = task.priority;
|
||||
if let Some(queue) = self.queues.get(&priority) {
|
||||
queue.push(task);
|
||||
}
|
||||
}
|
||||
|
||||
/// Pop highest priority task available.
|
||||
pub fn pop(&self, worker_id: usize) -> Option<Task> {
|
||||
// Try priorities in order: Critical > High > Normal > Background
|
||||
for priority in [
|
||||
TaskPriority::Critical,
|
||||
TaskPriority::High,
|
||||
TaskPriority::Normal,
|
||||
TaskPriority::Background,
|
||||
] {
|
||||
if let Some(queue) = self.queues.get(&priority) {
|
||||
if let Some(task) = queue.pop(worker_id) {
|
||||
return Some(task);
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Get total queue size.
|
||||
pub fn len(&self) -> usize {
|
||||
self.queues.values().map(|q| q.len()).sum()
|
||||
}
|
||||
|
||||
/// Check if all queues are empty.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.queues.values().all(|q| q.is_empty())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, Operation, Precision};
    use crate::task::TaskStatus;

    /// Builds a minimal pending MatMul task with the given id/priority.
    fn create_test_task(id: u64, priority: TaskPriority) -> Task {
        Task {
            id: TaskId(id),
            operation: Operation::MatMul {
                m: 1024,
                n: 1024,
                k: 1024,
                precision: Precision::Fp32,
            },
            priority,
            dependencies: vec![],
            status: TaskStatus::Pending,
            deadline: None,
        }
    }

    #[test]
    fn test_work_queue_basic() {
        let queue = WorkQueue::new(ProcessorType::Cpu(CpuVariant::default()), 100);

        // Fresh queue starts empty.
        assert!(queue.is_empty());

        for id in [1, 2] {
            queue.push(create_test_task(id, TaskPriority::Normal));
        }
        assert_eq!(queue.len(), 2);

        // Each pop removes exactly one task.
        assert!(queue.pop(0).is_some());
        assert_eq!(queue.len(), 1);
        assert!(queue.pop(0).is_some());
        assert!(queue.is_empty());
    }

    #[test]
    fn test_priority_queue() {
        let queue = PriorityWorkQueue::new(ProcessorType::Cpu(CpuVariant::default()), 100);

        // Push in deliberately scrambled priority order.
        queue.push(create_test_task(1, TaskPriority::Background));
        queue.push(create_test_task(2, TaskPriority::Critical));
        queue.push(create_test_task(3, TaskPriority::Normal));

        // Drain order must be Critical, then Normal, then Background.
        let first = queue.pop(0).unwrap();
        assert_eq!(first.id, TaskId(2));
        assert_eq!(first.priority, TaskPriority::Critical);

        assert_eq!(queue.pop(0).unwrap().id, TaskId(3));
        assert_eq!(queue.pop(0).unwrap().id, TaskId(1));
    }
}
|
||||
543
crates/synor-compute/src/task/mod.rs
Normal file
543
crates/synor-compute/src/task/mod.rs
Normal file
|
|
@ -0,0 +1,543 @@
|
|||
//! Task definitions and decomposition.
|
||||
|
||||
use crate::error::ComputeError;
|
||||
use crate::processor::{Operation, OperationType, Precision, ProcessorType};
|
||||
use crate::{ComputeJob, JobType};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::time::Duration;
|
||||
|
||||
/// Unique task identifier.
///
/// A thin newtype over `u64`. Values are drawn randomly (see
/// `TaskId::new`), so uniqueness is probabilistic, not guaranteed.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TaskId(pub u64);
|
||||
|
||||
impl TaskId {
|
||||
/// Creates a new task ID.
|
||||
pub fn new() -> Self {
|
||||
use rand::Rng;
|
||||
TaskId(rand::thread_rng().gen())
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TaskId {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TaskId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "task_{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Task priority levels.
///
/// Explicit discriminants give a total order (`Background < Normal <
/// High < Critical`), which the derived `Ord` relies on for
/// priority-based scheduling.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum TaskPriority {
    /// Background, can be preempted.
    Background = 0,
    /// Normal priority.
    Normal = 1,
    /// High priority.
    High = 2,
    /// Critical, must complete.
    Critical = 3,
}
|
||||
|
||||
impl Default for TaskPriority {
|
||||
fn default() -> Self {
|
||||
TaskPriority::Normal
|
||||
}
|
||||
}
|
||||
|
||||
/// Task execution status.
///
/// Typical lifecycle: `Pending` -> `Queued` -> `Running` -> one of
/// `Completed` / `Failed` / `Cancelled`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum TaskStatus {
    /// Waiting to be scheduled.
    Pending,
    /// Queued for execution.
    Queued,
    /// Currently executing.
    Running,
    /// Completed successfully.
    Completed,
    /// Failed.
    Failed,
    /// Cancelled.
    Cancelled,
}
|
||||
|
||||
/// A schedulable task.
///
/// Built via `Task::new` plus the `with_*` builder methods; starts in
/// `TaskStatus::Pending` with `Normal` priority.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Task {
    /// Task ID.
    pub id: TaskId,
    /// Operation to execute.
    pub operation: Operation,
    /// Priority level.
    pub priority: TaskPriority,
    /// Dependencies (tasks that must complete first).
    pub dependencies: Vec<TaskId>,
    /// Current status.
    pub status: TaskStatus,
    /// Deadline (optional).
    /// NOTE(review): the unit of this timestamp (epoch seconds? ms?) is
    /// not established by the visible code — confirm against callers.
    pub deadline: Option<u64>,
}
|
||||
|
||||
impl Task {
|
||||
/// Creates a new task.
|
||||
pub fn new(operation: Operation) -> Self {
|
||||
Self {
|
||||
id: TaskId::new(),
|
||||
operation,
|
||||
priority: TaskPriority::Normal,
|
||||
dependencies: Vec::new(),
|
||||
status: TaskStatus::Pending,
|
||||
deadline: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the priority.
|
||||
pub fn with_priority(mut self, priority: TaskPriority) -> Self {
|
||||
self.priority = priority;
|
||||
self
|
||||
}
|
||||
|
||||
/// Adds dependencies.
|
||||
pub fn with_dependencies(mut self, deps: Vec<TaskId>) -> Self {
|
||||
self.dependencies = deps;
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets deadline.
|
||||
pub fn with_deadline(mut self, deadline: u64) -> Self {
|
||||
self.deadline = Some(deadline);
|
||||
self
|
||||
}
|
||||
|
||||
/// Checks if task is compatible with a processor type.
|
||||
pub fn is_compatible_with(&self, proc_type: ProcessorType) -> bool {
|
||||
// Check based on operation type
|
||||
let op_type = self.operation.op_type();
|
||||
|
||||
match proc_type {
|
||||
ProcessorType::Cpu(_) => {
|
||||
// CPUs can do most things, but slowly
|
||||
true
|
||||
}
|
||||
ProcessorType::Gpu(_) => {
|
||||
// GPUs are good for parallel operations
|
||||
matches!(
|
||||
op_type,
|
||||
OperationType::MatMul
|
||||
| OperationType::Conv2d
|
||||
| OperationType::SelfAttention
|
||||
| OperationType::FlashAttention
|
||||
| OperationType::Embedding
|
||||
| OperationType::Add
|
||||
| OperationType::Mul
|
||||
| OperationType::Softmax
|
||||
)
|
||||
}
|
||||
ProcessorType::Tpu(_) => {
|
||||
// TPUs are good for large matrix ops
|
||||
matches!(
|
||||
op_type,
|
||||
OperationType::MatMul
|
||||
| OperationType::Conv2d
|
||||
| OperationType::SelfAttention
|
||||
| OperationType::FlashAttention
|
||||
)
|
||||
}
|
||||
ProcessorType::Lpu => {
|
||||
// LPUs are good for sequential inference
|
||||
matches!(
|
||||
op_type,
|
||||
OperationType::MatMul
|
||||
| OperationType::SelfAttention
|
||||
| OperationType::KVCache
|
||||
| OperationType::Sampling
|
||||
)
|
||||
}
|
||||
ProcessorType::Npu(_) => {
|
||||
// NPUs are good for inference
|
||||
matches!(
|
||||
op_type,
|
||||
OperationType::MatMul
|
||||
| OperationType::Conv2d
|
||||
| OperationType::Add
|
||||
| OperationType::Softmax
|
||||
)
|
||||
}
|
||||
_ => true, // Default to compatible
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of task execution.
///
/// Captures the raw output bytes plus the measured cost of running the
/// task (wall-clock duration and energy).
#[derive(Clone, Debug)]
pub struct TaskResult {
    /// Task ID.
    pub task_id: TaskId,
    /// Output data (raw bytes; interpretation depends on the operation).
    pub output: Vec<u8>,
    /// Execution duration.
    pub duration: Duration,
    /// Energy consumed (Joules).
    pub energy: f64,
}
|
||||
|
||||
/// Compute task for job execution.
///
/// Wraps a `Task` with placement hints: the scheduler should try the
/// preferred processor first, then the fallback.
#[derive(Clone, Debug)]
pub struct ComputeTask {
    /// The underlying schedulable task.
    pub task: Task,
    /// Resource requirements.
    pub requirements: TaskRequirements,
    /// Preferred processor type.
    pub preferred_processor: Option<ProcessorType>,
    /// Fallback processor type.
    pub fallback_processor: Option<ProcessorType>,
}
|
||||
|
||||
/// Task resource requirements.
///
/// `Default` yields the least restrictive requirements (zero minimums,
/// no latency bound, no precision constraint).
#[derive(Clone, Debug, Default)]
pub struct TaskRequirements {
    /// Minimum memory (bytes).
    pub min_memory: u64,
    /// Minimum TFLOPS.
    pub min_tflops: f64,
    /// Maximum latency (ms); `None` means unbounded.
    pub max_latency_ms: Option<u32>,
    /// Requires specific precision; `None` means any.
    pub precision: Option<Precision>,
}
|
||||
|
||||
/// Decomposed workload.
///
/// The flat task list produced by decomposition, with aggregate cost
/// estimates for the whole workload.
#[derive(Clone, Debug)]
pub struct DecomposedWorkload {
    /// All tasks.
    pub tasks: Vec<Task>,
    /// Total estimated FLOPS.
    pub estimated_flops: f64,
    /// Total estimated memory (bytes).
    pub estimated_memory: u64,
}
|
||||
|
||||
/// Task decomposer that breaks jobs into schedulable tasks.
///
/// NOTE(review): `inference_batch_size` is never read by the visible
/// `impl` (inference decomposition uses the job's own batch size) —
/// confirm whether it is still needed.
pub struct TaskDecomposer {
    /// Default batch size for inference.
    inference_batch_size: usize,
    /// Default precision used for generated tensor operations.
    default_precision: Precision,
}
|
||||
|
||||
impl TaskDecomposer {
|
||||
/// Creates a new task decomposer.
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
inference_batch_size: 32,
|
||||
default_precision: Precision::Fp16,
|
||||
}
|
||||
}
|
||||
|
||||
/// Decomposes a job into tasks.
|
||||
pub fn decompose(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||
match &job.job_type {
|
||||
JobType::Training { .. } => self.decompose_training(job),
|
||||
JobType::Inference { .. } => self.decompose_inference(job),
|
||||
JobType::Container { .. } => self.decompose_container(job),
|
||||
JobType::Serverless { .. } => self.decompose_serverless(job),
|
||||
JobType::Wasm { .. } => self.decompose_wasm(job),
|
||||
}
|
||||
}
|
||||
|
||||
/// Decompose training job.
|
||||
fn decompose_training(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||
let mut tasks = Vec::new();
|
||||
|
||||
if let JobType::Training {
|
||||
epochs,
|
||||
batch_size,
|
||||
..
|
||||
} = &job.job_type
|
||||
{
|
||||
// Data loading task
|
||||
tasks.push(
|
||||
Task::new(Operation::DataLoad {
|
||||
bytes: 1024 * 1024 * 100, // 100MB
|
||||
async_: true,
|
||||
})
|
||||
.with_priority(TaskPriority::High),
|
||||
);
|
||||
|
||||
let data_load_id = tasks[0].id;
|
||||
|
||||
// Preprocessing task
|
||||
tasks.push(
|
||||
Task::new(Operation::DataPreprocess {
|
||||
batch: *batch_size as usize,
|
||||
transforms: vec!["normalize".to_string(), "augment".to_string()],
|
||||
})
|
||||
.with_dependencies(vec![data_load_id])
|
||||
.with_priority(TaskPriority::High),
|
||||
);
|
||||
|
||||
let preprocess_id = tasks[1].id;
|
||||
|
||||
// Forward pass (simplified as MatMul)
|
||||
tasks.push(
|
||||
Task::new(Operation::MatMul {
|
||||
m: *batch_size as usize,
|
||||
n: 4096,
|
||||
k: 4096,
|
||||
precision: self.default_precision,
|
||||
})
|
||||
.with_dependencies(vec![preprocess_id])
|
||||
.with_priority(TaskPriority::Critical),
|
||||
);
|
||||
|
||||
let forward_id = tasks[2].id;
|
||||
|
||||
// Backward pass
|
||||
tasks.push(
|
||||
Task::new(Operation::Backward {
|
||||
forward_op: Box::new(Operation::MatMul {
|
||||
m: *batch_size as usize,
|
||||
n: 4096,
|
||||
k: 4096,
|
||||
precision: self.default_precision,
|
||||
}),
|
||||
})
|
||||
.with_dependencies(vec![forward_id])
|
||||
.with_priority(TaskPriority::Critical),
|
||||
);
|
||||
|
||||
let backward_id = tasks[3].id;
|
||||
|
||||
// Optimizer step
|
||||
tasks.push(
|
||||
Task::new(Operation::OptimizerStep {
|
||||
parameters: 1_000_000,
|
||||
optimizer: "adamw".to_string(),
|
||||
precision: self.default_precision,
|
||||
})
|
||||
.with_dependencies(vec![backward_id])
|
||||
.with_priority(TaskPriority::High),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(tasks)
|
||||
}
|
||||
|
||||
/// Decompose inference job.
|
||||
fn decompose_inference(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||
let mut tasks = Vec::new();
|
||||
|
||||
if let JobType::Inference { batch_size, .. } = &job.job_type {
|
||||
// Tokenization (CPU optimal)
|
||||
tasks.push(
|
||||
Task::new(Operation::Tokenization {
|
||||
text_bytes: 4096,
|
||||
vocab_size: 32000,
|
||||
})
|
||||
.with_priority(TaskPriority::High),
|
||||
);
|
||||
|
||||
let token_id = tasks[0].id;
|
||||
|
||||
// Embedding (GPU optimal)
|
||||
tasks.push(
|
||||
Task::new(Operation::Embedding {
|
||||
batch: *batch_size as usize,
|
||||
seq_len: 512,
|
||||
vocab_size: 32000,
|
||||
embed_dim: 4096,
|
||||
precision: self.default_precision,
|
||||
})
|
||||
.with_dependencies(vec![token_id])
|
||||
.with_priority(TaskPriority::Critical),
|
||||
);
|
||||
|
||||
let embed_id = tasks[1].id;
|
||||
|
||||
// Self-attention (TPU/GPU optimal)
|
||||
tasks.push(
|
||||
Task::new(Operation::SelfAttention {
|
||||
batch: *batch_size as usize,
|
||||
seq_len: 512,
|
||||
num_heads: 32,
|
||||
head_dim: 128,
|
||||
precision: self.default_precision,
|
||||
})
|
||||
.with_dependencies(vec![embed_id])
|
||||
.with_priority(TaskPriority::Critical),
|
||||
);
|
||||
|
||||
let attention_id = tasks[2].id;
|
||||
|
||||
// Sampling (LPU optimal)
|
||||
tasks.push(
|
||||
Task::new(Operation::Sampling {
|
||||
batch: *batch_size as usize,
|
||||
vocab_size: 32000,
|
||||
temperature: 0.7,
|
||||
})
|
||||
.with_dependencies(vec![attention_id])
|
||||
.with_priority(TaskPriority::High),
|
||||
);
|
||||
|
||||
let sample_id = tasks[3].id;
|
||||
|
||||
// Detokenization (CPU optimal)
|
||||
tasks.push(
|
||||
Task::new(Operation::Detokenization {
|
||||
tokens: 256,
|
||||
vocab_size: 32000,
|
||||
})
|
||||
.with_dependencies(vec![sample_id])
|
||||
.with_priority(TaskPriority::Normal),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(tasks)
|
||||
}
|
||||
|
||||
/// Decompose container job.
|
||||
fn decompose_container(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||
// Container jobs are typically a single task
|
||||
Ok(vec![Task::new(Operation::Generic {
|
||||
op_type: OperationType::DataLoad,
|
||||
flops: 1e9,
|
||||
memory: 1024 * 1024 * 1024,
|
||||
})
|
||||
.with_priority(TaskPriority::Normal)])
|
||||
}
|
||||
|
||||
/// Decompose serverless function.
|
||||
fn decompose_serverless(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||
// Serverless is typically a single task
|
||||
Ok(vec![Task::new(Operation::Generic {
|
||||
op_type: OperationType::DataPreprocess,
|
||||
flops: 1e6,
|
||||
memory: 256 * 1024 * 1024,
|
||||
})
|
||||
.with_priority(TaskPriority::High)])
|
||||
}
|
||||
|
||||
/// Decompose WASM job.
|
||||
fn decompose_wasm(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||
// WASM is typically a single task
|
||||
Ok(vec![Task::new(Operation::Generic {
|
||||
op_type: OperationType::DataPreprocess,
|
||||
flops: 1e6,
|
||||
memory: 16 * 1024 * 1024,
|
||||
})
|
||||
.with_priority(TaskPriority::Normal)])
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TaskDecomposer {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_task_creation() {
        let task = Task::new(Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        })
        .with_priority(TaskPriority::High);

        // Builder sets priority; everything else stays at defaults.
        assert_eq!(task.priority, TaskPriority::High);
        assert!(task.dependencies.is_empty());
        assert_eq!(task.status, TaskStatus::Pending);
    }

    #[test]
    fn test_task_dependencies() {
        let loader = Task::new(Operation::DataLoad {
            bytes: 1000,
            async_: true,
        });
        let loader_id = loader.id;

        let consumer = Task::new(Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        })
        .with_dependencies(vec![loader_id]);

        assert_eq!(consumer.dependencies, vec![loader_id]);
    }

    #[test]
    fn test_task_compatibility() {
        let matmul = Task::new(Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        });

        // Dense matrix work should run on GPU and TPU.
        let gpu = ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda {
            compute_capability: (8, 0),
        });
        assert!(matmul.is_compatible_with(gpu));
        assert!(
            matmul.is_compatible_with(ProcessorType::Tpu(crate::processor::TpuVersion::V5p))
        );

        // CPUs accept every operation, including plain data loading.
        let loader = Task::new(Operation::DataLoad {
            bytes: 1000,
            async_: true,
        });
        assert!(loader.is_compatible_with(ProcessorType::Cpu(
            crate::processor::CpuVariant::default()
        )));
    }

    #[test]
    fn test_task_decomposer() {
        let decomposer = TaskDecomposer::new();

        let job = ComputeJob {
            id: crate::JobId::new(),
            owner: [0u8; 32],
            job_type: JobType::Inference {
                model_cid: "model".to_string(),
                input_format: "json".to_string(),
                batch_size: 1,
            },
            resources: crate::ResourceRequirements::default(),
            input_cid: None,
            max_budget: 1_000_000,
            priority: crate::JobPriority::Normal,
            created_at: 0,
            deadline: None,
        };

        let tasks = decomposer.decompose(&job).unwrap();
        assert!(!tasks.is_empty());

        // Every task after the first must be chained to a predecessor.
        for task in tasks.iter().skip(1) {
            assert!(!task.dependencies.is_empty());
        }
    }
}
|
||||
1584
docs/PLAN/PHASE11-Synor-Compute-L2-Part2-HyperEfficiency.md
Normal file
1584
docs/PLAN/PHASE11-Synor-Compute-L2-Part2-HyperEfficiency.md
Normal file
File diff suppressed because it is too large
Load diff
1564
docs/PLAN/PHASE11-Synor-Compute-L2-Part3-HeterogeneousCompute.md
Normal file
1564
docs/PLAN/PHASE11-Synor-Compute-L2-Part3-HeterogeneousCompute.md
Normal file
File diff suppressed because it is too large
Load diff
906
docs/PLAN/PHASE11-Synor-Compute-L2.md
Normal file
906
docs/PLAN/PHASE11-Synor-Compute-L2.md
Normal file
|
|
@ -0,0 +1,906 @@
|
|||
# Phase 11: Synor Compute L2 - Full-Stack Compute Platform
|
||||
|
||||
> **Mission**: Build a decentralized compute platform capable of AI/ML training, inference, OS hosting, and general-purpose high-performance computing.
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
Synor Compute L2 extends beyond the current WASM-only Synor VM to provide:
|
||||
- **GPU Compute**: AI/ML training and inference with CUDA/ROCm support
|
||||
- **Container Orchestration**: Docker-compatible workloads with Kubernetes-style scheduling
|
||||
- **Persistent VMs**: Long-running virtual machines for OS hosting
|
||||
- **Serverless Functions**: Short-lived compute for API backends and event processing
|
||||
- **Edge Compute**: Low-latency compute at network edge nodes
|
||||
|
||||
---
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ SYNOR COMPUTE L2 │
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ APPLICATION LAYER │ │
|
||||
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
|
||||
│ │ AI/ML │ Serverless │ Containers │ Persistent │ Edge │ │
|
||||
│ │ Training │ Functions │ (Docker) │ VMs (Linux) │ Compute │ │
|
||||
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ORCHESTRATION LAYER │ │
|
||||
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
|
||||
│ │ Job │ Resource │ Network │ Storage │ Health │ │
|
||||
│ │ Scheduler │ Manager │ Fabric │ Orchestrator│ Monitor │ │
|
||||
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ COMPUTE RUNTIME LAYER │ │
|
||||
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
|
||||
│ │ GPU │ Container │ MicroVM │ WASM │ Native │ │
|
||||
│ │ Runtime │ Runtime │ Runtime │ Runtime │ Runtime │ │
|
||||
│ │ (CUDA/ROCm)│ (containerd)│ (Firecracker)│ (Wasmtime) │ (gVisor) │ │
|
||||
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ INFRASTRUCTURE LAYER │ │
|
||||
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
|
||||
│ │ Node │ Network │ Distributed │ Consensus │ Billing │ │
|
||||
│ │ Registry │ Overlay │ Storage │ (PoS+PoW) │ Metering │ │
|
||||
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ SYNOR L1 BLOCKCHAIN (GHOSTDAG + DAG-RIDER) │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Milestone 1: GPU Compute Foundation (AI/ML Training & Inference)
|
||||
|
||||
### 1.1 GPU Node Registration
|
||||
|
||||
```rust
|
||||
// synor-compute/src/gpu/node.rs
|
||||
|
||||
/// GPU node capabilities
|
||||
pub struct GpuNode {
|
||||
/// Unique node ID
|
||||
pub node_id: NodeId,
|
||||
/// GPU specifications
|
||||
pub gpus: Vec<GpuSpec>,
|
||||
/// Total VRAM available (bytes)
|
||||
pub total_vram: u64,
|
||||
/// Available VRAM (bytes)
|
||||
pub available_vram: u64,
|
||||
/// CUDA compute capability (e.g., 8.6 for RTX 3090)
|
||||
pub cuda_capability: Option<(u8, u8)>,
|
||||
/// ROCm version (for AMD)
|
||||
pub rocm_version: Option<String>,
|
||||
/// Network bandwidth (Gbps)
|
||||
pub bandwidth_gbps: u32,
|
||||
/// Geographic region
|
||||
pub region: Region,
|
||||
/// Stake amount (for PoS validation)
|
||||
pub stake: u64,
|
||||
}
|
||||
|
||||
pub struct GpuSpec {
|
||||
pub model: String, // "NVIDIA RTX 4090"
|
||||
pub vram_gb: u32, // 24
|
||||
pub tensor_cores: u32, // 512
|
||||
pub cuda_cores: u32, // 16384
|
||||
pub memory_bandwidth: u32, // 1008 GB/s
|
||||
pub fp32_tflops: f32, // 82.6
|
||||
pub fp16_tflops: f32, // 165.2
|
||||
pub int8_tops: f32, // 330.4
|
||||
}
|
||||
```
|
||||
|
||||
### 1.2 AI/ML Job Specification

```rust
// synor-compute/src/ai/job.rs

/// AI/ML training job specification
pub struct TrainingJob {
    /// Job ID
    pub job_id: JobId,
    /// Owner address
    pub owner: Address,
    /// Framework (PyTorch, TensorFlow, JAX)
    pub framework: MlFramework,
    /// Model specification
    pub model: ModelSpec,
    /// Dataset reference (Synor Storage CID)
    pub dataset_cid: Cid,
    /// Training configuration
    pub config: TrainingConfig,
    /// Resource requirements
    pub resources: GpuResources,
    /// Maximum budget (SYNOR tokens)
    pub max_budget: u64,
    /// Checkpoint interval (steps)
    pub checkpoint_interval: u64,
}

pub struct GpuResources {
    pub min_gpus: u32,
    pub max_gpus: u32,
    pub min_vram_per_gpu: u64,
    pub cuda_capability_min: Option<(u8, u8)>,
    pub distributed: bool, // Multi-node training
    pub priority: JobPriority,
}

pub enum MlFramework {
    PyTorch { version: String },
    TensorFlow { version: String },
    JAX { version: String },
    ONNX,
    Custom { image: String },
}

pub struct TrainingConfig {
    pub epochs: u32,
    pub batch_size: u32,
    pub learning_rate: f32,
    pub optimizer: String,
    pub mixed_precision: bool,
    pub gradient_accumulation: u32,
    pub distributed_strategy: DistributedStrategy,
}

pub enum DistributedStrategy {
    DataParallel,
    ModelParallel,
    PipelineParallel,
    ZeRO { stage: u8 }, // DeepSpeed ZeRO stages 1-3
    FSDP,               // Fully Sharded Data Parallel
}
```
### 1.3 Inference Service

```rust
// synor-compute/src/ai/inference.rs

/// Inference endpoint specification
pub struct InferenceEndpoint {
    /// Endpoint ID
    pub endpoint_id: EndpointId,
    /// Model reference (Synor Storage CID)
    pub model_cid: Cid,
    /// Model format
    pub format: ModelFormat,
    /// Scaling configuration
    pub scaling: AutoscaleConfig,
    /// GPU requirements per replica
    pub gpu_per_replica: GpuResources,
    /// Request timeout
    pub timeout_ms: u32,
    /// Max batch size for batching inference
    pub max_batch_size: u32,
    /// Batching timeout
    pub batch_timeout_ms: u32,
}

pub enum ModelFormat {
    PyTorch,
    ONNX,
    TensorRT,
    Triton,
    vLLM, // For LLM serving
    TGI,  // Text Generation Inference
    Custom,
}

pub struct AutoscaleConfig {
    pub min_replicas: u32,
    pub max_replicas: u32,
    pub target_gpu_utilization: f32,
    pub scale_up_threshold: f32,
    pub scale_down_threshold: f32,
    pub cooldown_seconds: u32,
}
```
### 1.4 Pricing Model for GPU Compute

| Resource | Unit | Price (SYNOR/unit) |
|----------|------|-------------------|
| GPU (RTX 4090 equivalent) | hour | 0.50 |
| GPU (A100 80GB equivalent) | hour | 2.00 |
| GPU (H100 equivalent) | hour | 4.00 |
| VRAM | GB/hour | 0.01 |
| Network egress | GB | 0.05 |
| Storage (hot, NVMe) | GB/month | 0.10 |
| Inference requests | 1M tokens | 0.10 |

---
## Milestone 2: Container Orchestration (Docker/Kubernetes-Compatible)

### 2.1 Container Runtime

```rust
// synor-compute/src/container/runtime.rs

/// Container specification (OCI-compatible)
pub struct ContainerSpec {
    /// Image reference
    pub image: ImageRef,
    /// Resource limits
    pub resources: ContainerResources,
    /// Environment variables
    pub env: HashMap<String, String>,
    /// Volume mounts
    pub volumes: Vec<VolumeMount>,
    /// Network configuration
    pub network: NetworkConfig,
    /// Security context
    pub security: SecurityContext,
    /// Health check
    pub health_check: Option<HealthCheck>,
}

pub struct ContainerResources {
    pub cpu_cores: f32, // 0.5, 1.0, 2.0, etc.
    pub memory_mb: u64,
    pub gpu: Option<GpuAllocation>,
    pub ephemeral_storage_gb: u32,
    pub network_bandwidth_mbps: u32,
}

pub struct GpuAllocation {
    pub count: u32,
    pub vram_mb: u64,
    pub shared: bool, // Allow GPU sharing via MPS/MIG
}
```

### 2.2 Service Mesh & Networking

```rust
// synor-compute/src/network/mesh.rs

/// Service definition for container orchestration
pub struct Service {
    pub service_id: ServiceId,
    pub name: String,
    pub containers: Vec<ContainerSpec>,
    pub replicas: ReplicaConfig,
    pub load_balancer: LoadBalancerConfig,
    pub service_mesh: ServiceMeshConfig,
}

pub struct ServiceMeshConfig {
    pub mtls_enabled: bool,
    pub traffic_policy: TrafficPolicy,
    pub circuit_breaker: CircuitBreakerConfig,
    pub retry_policy: RetryPolicy,
    pub rate_limit: Option<RateLimitConfig>,
}

pub struct LoadBalancerConfig {
    pub algorithm: LoadBalancerAlgorithm,
    pub health_check: HealthCheck,
    pub sticky_sessions: bool,
    pub ssl_termination: SslTermination,
}

pub enum LoadBalancerAlgorithm {
    RoundRobin,
    LeastConnections,
    WeightedRoundRobin { weights: Vec<u32> },
    IPHash,
    Random,
}
```

### 2.3 Container Pricing

| Resource | Unit | Price (SYNOR/unit) |
|----------|------|-------------------|
| CPU | core/hour | 0.02 |
| Memory | GB/hour | 0.005 |
| Ephemeral storage | GB/hour | 0.001 |
| Network ingress | GB | FREE |
| Network egress | GB | 0.05 |
| Load balancer | hour | 0.01 |
| Static IP | month | 2.00 |

---
## Milestone 3: Persistent Virtual Machines (OS Hosting)

### 3.1 MicroVM Architecture (Firecracker-based)

```rust
// synor-compute/src/vm/microvm.rs

/// Virtual machine specification
pub struct VmSpec {
    /// VM ID
    pub vm_id: VmId,
    /// Owner address
    pub owner: Address,
    /// VM size
    pub size: VmSize,
    /// Boot image
    pub image: VmImage,
    /// Persistent volumes
    pub volumes: Vec<PersistentVolume>,
    /// Network configuration
    pub network: VmNetworkConfig,
    /// SSH keys for access
    pub ssh_keys: Vec<SshPublicKey>,
    /// Cloud-init user data
    pub user_data: Option<String>,
}

pub struct VmSize {
    pub vcpus: u32,
    pub memory_gb: u32,
    pub gpu: Option<GpuPassthrough>,
    pub network_bandwidth_gbps: u32,
}

pub struct GpuPassthrough {
    pub count: u32,
    pub model: GpuModel,
    pub vram_gb: u32,
}

pub enum VmImage {
    /// Pre-built images
    Marketplace { image_id: String, version: String },
    /// Custom image from Synor Storage
    Custom { cid: Cid, format: ImageFormat },
    /// Standard OS images
    Ubuntu { version: String },
    Debian { version: String },
    AlmaLinux { version: String },
    Windows { version: String, license: WindowsLicense },
}

pub struct PersistentVolume {
    pub volume_id: VolumeId,
    pub size_gb: u32,
    pub volume_type: VolumeType,
    pub mount_path: String,
    pub encrypted: bool,
}

pub enum VolumeType {
    /// High-performance NVMe SSD
    NvmeSsd { iops: u32, throughput_mbps: u32 },
    /// Standard SSD
    Ssd,
    /// HDD for archival
    Hdd,
    /// Distributed storage (Synor Storage L2)
    Distributed { replication: u8 },
}
```

### 3.2 VM Lifecycle Management

```rust
// synor-compute/src/vm/lifecycle.rs

pub enum VmState {
    Pending,
    Provisioning,
    Running,
    Stopping,
    Stopped,
    Hibernating,
    Hibernated,
    Migrating,
    Failed,
    Terminated,
}

pub struct VmManager {
    /// Active VMs
    vms: HashMap<VmId, VmInstance>,
    /// Node assignments
    node_assignments: HashMap<VmId, NodeId>,
    /// Live migration coordinator
    migration_coordinator: MigrationCoordinator,
}

impl VmManager {
    /// Start a new VM
    pub async fn create(&self, spec: VmSpec) -> Result<VmId, VmError>;

    /// Stop a VM (preserves state)
    pub async fn stop(&self, vm_id: &VmId) -> Result<(), VmError>;

    /// Start a stopped VM
    pub async fn start(&self, vm_id: &VmId) -> Result<(), VmError>;

    /// Hibernate VM to storage (saves memory state)
    pub async fn hibernate(&self, vm_id: &VmId) -> Result<(), VmError>;

    /// Live migrate VM to another node
    pub async fn migrate(&self, vm_id: &VmId, target_node: NodeId) -> Result<(), VmError>;

    /// Resize VM (requires restart)
    pub async fn resize(&self, vm_id: &VmId, new_size: VmSize) -> Result<(), VmError>;

    /// Snapshot VM state
    pub async fn snapshot(&self, vm_id: &VmId) -> Result<SnapshotId, VmError>;

    /// Terminate and delete VM
    pub async fn terminate(&self, vm_id: &VmId) -> Result<(), VmError>;
}
```

### 3.3 VM Pricing

| VM Type | vCPUs | Memory | Storage | GPU | Price (SYNOR/month) |
|---------|-------|--------|---------|-----|---------------------|
| micro | 1 | 1 GB | 20 GB SSD | - | 5 |
| small | 2 | 4 GB | 50 GB SSD | - | 15 |
| medium | 4 | 8 GB | 100 GB SSD | - | 30 |
| large | 8 | 32 GB | 200 GB SSD | - | 80 |
| xlarge | 16 | 64 GB | 500 GB NVMe | - | 200 |
| gpu-small | 8 | 32 GB | 200 GB NVMe | 1x RTX 4090 | 400 |
| gpu-medium | 16 | 64 GB | 500 GB NVMe | 2x RTX 4090 | 750 |
| gpu-large | 32 | 128 GB | 1 TB NVMe | 4x A100 80GB | 2500 |
| gpu-xlarge | 64 | 256 GB | 2 TB NVMe | 8x H100 | 8000 |

---
## Milestone 4: Serverless Functions (FaaS)

### 4.1 Function Specification

```rust
// synor-compute/src/serverless/function.rs

/// Serverless function definition
pub struct Function {
    pub function_id: FunctionId,
    pub owner: Address,
    pub name: String,
    pub runtime: FunctionRuntime,
    pub handler: String,
    pub code: FunctionCode,
    pub resources: FunctionResources,
    pub triggers: Vec<FunctionTrigger>,
    pub environment: HashMap<String, String>,
    pub timeout_ms: u32,
    pub concurrency: ConcurrencyConfig,
}

pub enum FunctionRuntime {
    Node20,
    Node22,
    Python311,
    Python312,
    Rust,
    Go122,
    Java21,
    Dotnet8,
    Ruby33,
    Custom { image: String },
}

pub struct FunctionCode {
    /// Source code CID in Synor Storage
    pub cid: Cid,
    /// Entry point file
    pub entry_point: String,
    /// Dependencies (package.json, requirements.txt, etc.)
    pub dependencies: Option<Cid>,
}

pub struct FunctionResources {
    pub memory_mb: u32,      // 128, 256, 512, 1024, 2048, 4096, 8192
    pub cpu_allocation: f32, // Proportional to memory
    pub ephemeral_storage_mb: u32,
    pub gpu: Option<GpuAllocation>,
}

pub enum FunctionTrigger {
    /// HTTP endpoint
    Http { path: String, methods: Vec<HttpMethod> },
    /// Scheduled execution (cron)
    Schedule { cron: String },
    /// Event from message queue
    Queue { queue_name: String },
    /// Storage events
    Storage { bucket: String, events: Vec<StorageEvent> },
    /// Blockchain events
    Blockchain { contract: Address, events: Vec<String> },
    /// Webhook
    Webhook { url: String },
}
```

### 4.2 Cold Start Optimization

```rust
// synor-compute/src/serverless/warmup.rs

/// Function warmup strategies
pub struct WarmupConfig {
    /// Minimum warm instances
    pub min_instances: u32,
    /// Provisioned concurrency
    pub provisioned_concurrency: u32,
    /// Warmup schedule
    pub warmup_schedule: Option<String>,
    /// Snapshot-based cold start (SnapStart)
    pub snapstart_enabled: bool,
}

pub struct ColdStartOptimizer {
    /// Pre-warmed function pools
    pools: HashMap<FunctionRuntime, WarmPool>,
    /// Snapshot cache
    snapshots: LruCache<FunctionId, FunctionSnapshot>,
    /// Prediction model for scaling
    predictor: ScalingPredictor,
}

impl ColdStartOptimizer {
    /// Get a warm instance or create one
    pub async fn get_instance(&self, function: &Function) -> Result<FunctionInstance, Error> {
        // Try snapshot restore first (< 100ms)
        if let Some(snapshot) = self.snapshots.get(&function.function_id) {
            return self.restore_from_snapshot(snapshot).await;
        }

        // Try warm pool (< 50ms). `and_then` is used because `?` cannot be
        // applied to the `Option` returned by `HashMap::get` inside a
        // `Result`-returning function.
        if let Some(instance) = self.pools.get(&function.runtime).and_then(|pool| pool.get_warm()) {
            return Ok(instance);
        }

        // Cold start (1-5s depending on runtime)
        self.cold_start(function).await
    }
}
```

### 4.3 Serverless Pricing

| Resource | Unit | Price (SYNOR) |
|----------|------|---------------|
| Invocations | 1M requests | 0.20 |
| Duration | GB-second | 0.00001 |
| Provisioned concurrency | GB-hour | 0.01 |
| HTTP Gateway | 1M requests | 0.10 |
| Event bridge | 1M events | 0.50 |

---
## Milestone 5: Edge Compute

### 5.1 Edge Node Architecture

```rust
// synor-compute/src/edge/node.rs

/// Edge compute node
pub struct EdgeNode {
    pub node_id: NodeId,
    pub location: GeoLocation,
    pub capabilities: EdgeCapabilities,
    pub latency_zones: Vec<LatencyZone>,
    pub resources: EdgeResources,
}

pub struct EdgeCapabilities {
    pub wasm_runtime: bool,
    pub container_runtime: bool,
    pub gpu_inference: bool,
    pub video_transcoding: bool,
    pub cdn_cache: bool,
}

pub struct EdgeResources {
    pub cpu_cores: u32,
    pub memory_gb: u32,
    pub storage_gb: u32,
    pub gpu: Option<EdgeGpu>,
    pub bandwidth_gbps: u32,
}

/// Edge function for low-latency compute
pub struct EdgeFunction {
    pub function_id: FunctionId,
    pub code: WasmModule,
    pub memory_limit: u32,
    pub timeout_ms: u32,
    pub allowed_regions: Vec<Region>,
}
```

### 5.2 Edge Use Cases

```rust
// synor-compute/src/edge/usecases.rs

/// CDN with compute at edge
pub struct EdgeCdn {
    /// Origin servers
    origins: Vec<Origin>,
    /// Cache rules
    cache_rules: Vec<CacheRule>,
    /// Edge workers for request/response transformation
    workers: Vec<EdgeWorker>,
}

/// Real-time inference at edge
pub struct EdgeInference {
    /// Model optimized for edge (quantized, pruned)
    model_id: ModelId,
    /// Inference runtime (TensorRT, ONNX Runtime)
    runtime: EdgeInferenceRuntime,
    /// Max batch size
    max_batch: u32,
    /// Target latency
    target_latency_ms: u32,
}

/// Video processing at edge
pub struct EdgeVideoProcessor {
    /// Transcoding profiles
    profiles: Vec<TranscodingProfile>,
    /// Real-time streaming
    live_streaming: bool,
    /// Adaptive bitrate
    abr_enabled: bool,
}
```

### 5.3 Edge Pricing

| Resource | Unit | Price (SYNOR) |
|----------|------|---------------|
| Edge function invocations | 1M | 0.50 |
| Edge function duration | GB-second | 0.00002 |
| Edge bandwidth | GB | 0.08 |
| Edge cache storage | GB/month | 0.02 |
| Video transcoding | minute | 0.02 |

---
## Milestone 6: Node Provider Economics

### 6.1 Provider Registration

```rust
// synor-compute/src/provider/registration.rs

/// Compute provider registration
pub struct ProviderRegistration {
    pub provider_id: ProviderId,
    pub owner: Address,
    /// Stake required to become provider
    pub stake: u64,
    /// Hardware specifications
    pub hardware: HardwareManifest,
    /// Network connectivity
    pub network: NetworkManifest,
    /// Geographic location
    pub location: GeoLocation,
    /// Availability SLA commitment
    pub sla: SlaCommitment,
}

pub struct HardwareManifest {
    pub cpus: Vec<CpuSpec>,
    pub memory_total_gb: u64,
    pub gpus: Vec<GpuSpec>,
    pub storage: Vec<StorageSpec>,
    pub verified: bool, // Hardware attestation passed
}

pub struct SlaCommitment {
    pub uptime_percent: f32, // 99.9, 99.99, etc.
    pub response_time_ms: u32,
    pub data_durability: f32,
    pub penalty_rate: f32, // Penalty for SLA violation
}
```

### 6.2 Provider Revenue Model

| Revenue Source | Provider Share | Protocol Share |
|----------------|----------------|----------------|
| Compute fees | 85% | 15% |
| Storage fees | 80% | 20% |
| Network fees | 75% | 25% |
| SLA bonuses | 100% | 0% |
| Staking rewards | 100% | 0% |

### 6.3 Slashing Conditions

| Violation | Penalty |
|-----------|---------|
| Downtime > committed SLA | 1% stake per hour |
| Data loss | 10% stake + compensation |
| Malicious behavior | 100% stake |
| False hardware attestation | 50% stake |

---
## Implementation Timeline

### Phase 11.1: Foundation (Weeks 1-4)
- [ ] Node registration and hardware attestation
- [ ] Basic job scheduler
- [ ] WASM runtime integration (existing)
- [ ] Container runtime (containerd)
- [ ] Network overlay (WireGuard mesh)

### Phase 11.2: GPU Compute (Weeks 5-8)
- [ ] GPU node registration
- [ ] NVIDIA driver integration
- [ ] CUDA runtime support
- [ ] Basic ML job execution
- [ ] Model storage integration

### Phase 11.3: Container Orchestration (Weeks 9-12)
- [ ] OCI image support
- [ ] Service deployment
- [ ] Load balancing
- [ ] Auto-scaling
- [ ] Service mesh (mTLS)

### Phase 11.4: Persistent VMs (Weeks 13-16)
- [ ] MicroVM runtime (Firecracker)
- [ ] VM lifecycle management
- [ ] Persistent storage
- [ ] Live migration
- [ ] Snapshot/restore

### Phase 11.5: Serverless (Weeks 17-20)
- [ ] Function deployment
- [ ] Cold start optimization
- [ ] Event triggers
- [ ] API gateway
- [ ] Monitoring/logging

### Phase 11.6: Edge Compute (Weeks 21-24)
- [ ] Edge node registration
- [ ] Edge function runtime
- [ ] CDN integration
- [ ] Edge inference
- [ ] Global anycast

---

## Security Considerations

### Isolation Levels

| Workload Type | Isolation Technology | Security Level |
|---------------|---------------------|----------------|
| WASM | Wasmtime sandbox | High |
| Serverless | gVisor + seccomp | High |
| Containers | gVisor or Kata | Medium-High |
| VMs | Firecracker MicroVM | High |
| GPU | NVIDIA MIG/MPS | Medium |

### Network Security

- All inter-node traffic encrypted (WireGuard)
- mTLS for service-to-service communication
- Network policies for workload isolation
- DDoS protection at edge

### Data Security

- Encryption at rest (AES-256)
- Encryption in transit (TLS 1.3)
- Confidential computing support (AMD SEV, Intel SGX)
- Secure key management (HSM integration)

---
## API Examples

### Deploy AI Training Job

```bash
synor compute train create \
  --framework pytorch \
  --model-config ./model.yaml \
  --dataset synor://datasets/imagenet \
  --gpus 8 \
  --gpu-type h100 \
  --distributed ddp \
  --epochs 100 \
  --checkpoint-interval 1000 \
  --max-budget 1000
```

### Deploy Inference Endpoint

```bash
synor compute inference deploy \
  --model synor://models/llama-70b \
  --format vllm \
  --min-replicas 2 \
  --max-replicas 10 \
  --gpu-per-replica 2 \
  --target-utilization 0.7
```

### Create Persistent VM

```bash
synor compute vm create \
  --name my-dev-server \
  --image ubuntu:22.04 \
  --size gpu-small \
  --volume 100gb:nvme:/data \
  --ssh-key ~/.ssh/id_ed25519.pub \
  --region us-east
```

### Deploy Container Service

```bash
synor compute service deploy \
  --name my-api \
  --image my-registry/my-api:latest \
  --replicas 3 \
  --cpu 2 \
  --memory 4gb \
  --port 8080 \
  --health-check /health \
  --autoscale 2-10
```

### Deploy Serverless Function

```bash
synor compute function deploy \
  --name process-image \
  --runtime python312 \
  --handler main.handler \
  --code ./function \
  --memory 1024 \
  --timeout 30000 \
  --trigger http:/api/process
```

---
## Comparison with Existing Synor VM

| Feature | Current Synor VM | Synor Compute L2 |
|---------|------------------|------------------|
| Runtime | WASM only | WASM, Container, MicroVM |
| Timeout | 30 seconds | Unlimited (VMs) |
| Memory | 16 MB max | Up to 256 GB |
| GPU | ❌ | ✅ Full CUDA/ROCm |
| Networking | ❌ | ✅ Full TCP/UDP |
| File I/O | ❌ | ✅ Persistent volumes |
| Threading | ❌ | ✅ Multi-threaded |
| AI/ML | ❌ | ✅ Training + Inference |
| OS Hosting | ❌ | ✅ Full Linux/Windows |

---

## Next Steps

1. **Milestone 1**: Implement GPU node registration and attestation
2. **Milestone 2**: Build basic job scheduler with resource allocation
3. **Milestone 3**: Integrate containerd for container workloads
4. **Milestone 4**: Add Firecracker for MicroVM support
5. **Milestone 5**: Implement serverless function runtime
6. **Milestone 6**: Deploy edge nodes and CDN integration

This plan transforms Synor from a smart contract platform into a full-stack decentralized cloud provider capable of competing with AWS/GCP/Azure while maintaining decentralization and censorship resistance.
Loading…
Add table
Reference in a new issue