feat(compute): add Phase 11 Synor Compute L2 heterogeneous compute layer
- Add synor-compute crate for heterogeneous compute orchestration - Implement processor abstraction for CPU/GPU/TPU/NPU/LPU/FPGA/DSP - Add device registry with cross-vendor capability tracking - Implement task scheduler with work stealing and load balancing - Add energy-aware and latency-aware balancing strategies - Create spot market for compute resources with order matching - Add memory manager with tensor handles and cross-device transfers - Support processor capability profiles (H100, TPU v5p, Groq LPU, etc.) - Implement priority work queues with task decomposition Processor types supported: - CPU (x86-64 AVX512, ARM64 SVE, RISC-V Vector) - GPU (NVIDIA CUDA, AMD ROCm, Intel OneAPI, Apple Metal) - TPU (v2-v5p, Edge TPU) - NPU (Apple Neural Engine, Qualcomm Hexagon, Intel VPU) - LPU (Groq Language Processing Unit) - FPGA (Xilinx, Intel Altera) - DSP (TI, Analog Devices) - WebGPU and WASM runtimes
This commit is contained in:
parent
8da34bc73d
commit
4c36ddbdc2
19 changed files with 11219 additions and 0 deletions
|
|
@ -9,6 +9,7 @@ members = [
|
||||||
"crates/synor-storage",
|
"crates/synor-storage",
|
||||||
"crates/synor-hosting",
|
"crates/synor-hosting",
|
||||||
"crates/synor-database",
|
"crates/synor-database",
|
||||||
|
"crates/synor-compute",
|
||||||
"crates/synor-governance",
|
"crates/synor-governance",
|
||||||
"crates/synor-rpc",
|
"crates/synor-rpc",
|
||||||
"crates/synor-vm",
|
"crates/synor-vm",
|
||||||
|
|
|
||||||
51
crates/synor-compute/Cargo.toml
Normal file
51
crates/synor-compute/Cargo.toml
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
[package]
name = "synor-compute"
version.workspace = true
edition.workspace = true
description = "Heterogeneous multi-processor compute platform for Synor blockchain"
license.workspace = true

[dependencies]
# Internal crates
synor-types = { path = "../synor-types" }
synor-crypto = { path = "../synor-crypto" }
synor-storage = { path = "../synor-storage" }

# Serialization
serde.workspace = true
serde_json.workspace = true
borsh.workspace = true
# bincode is pinned here (not workspace-managed) — used by ComputeError's From impl.
bincode = "1.3"

# Async runtime
tokio = { workspace = true, features = ["sync", "rt-multi-thread", "time", "macros"] }
async-trait = "0.1"
futures = "0.3"

# Concurrency (work-stealing deques, channels, concurrent maps)
parking_lot.workspace = true
crossbeam-deque = "0.8"
crossbeam-channel = "0.5"
dashmap = "5.5"

# Utilities
thiserror.workspace = true
tracing.workspace = true
hex.workspace = true

# Hashing
blake3.workspace = true

# Data structures
indexmap = "2.2"
priority-queue = "2.0"

# Time
chrono = { version = "0.4", features = ["serde"] }

# Random (used for DeviceId / JobId generation)
rand = "0.8"

[dev-dependencies]
tempfile.workspace = true
tokio-test = "0.4"
|
||||||
377
crates/synor-compute/src/device/mod.rs
Normal file
377
crates/synor-compute/src/device/mod.rs
Normal file
|
|
@ -0,0 +1,377 @@
|
||||||
|
//! Device registry and management.
|
||||||
|
//!
|
||||||
|
//! Supports all device types:
|
||||||
|
//! - Data center servers
|
||||||
|
//! - Desktop workstations
|
||||||
|
//! - Laptops
|
||||||
|
//! - Mobile devices (iOS, Android)
|
||||||
|
//! - Browsers (WebGPU, WASM)
|
||||||
|
//! - IoT devices
|
||||||
|
|
||||||
|
use crate::error::ComputeError;
|
||||||
|
use crate::processor::{GenericProcessor, Processor, ProcessorCapabilities, ProcessorId, ProcessorType};
|
||||||
|
use crate::{NodeId, ProcessorInfo};
|
||||||
|
use parking_lot::RwLock;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
/// Unique device identifier.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub struct DeviceId(pub [u8; 32]);
|
||||||
|
|
||||||
|
impl DeviceId {
|
||||||
|
/// Creates a new random device ID.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
use rand::Rng;
|
||||||
|
let mut bytes = [0u8; 32];
|
||||||
|
rand::thread_rng().fill(&mut bytes);
|
||||||
|
DeviceId(bytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates from bytes.
|
||||||
|
pub fn from_bytes(bytes: [u8; 32]) -> Self {
|
||||||
|
DeviceId(bytes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for DeviceId {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for DeviceId {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "dev_{}", hex::encode(&self.0[..8]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Device type classification.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub enum DeviceType {
|
||||||
|
/// Data center server.
|
||||||
|
DataCenter,
|
||||||
|
/// Desktop workstation.
|
||||||
|
Desktop,
|
||||||
|
/// Laptop.
|
||||||
|
Laptop,
|
||||||
|
/// Mobile phone.
|
||||||
|
Mobile,
|
||||||
|
/// Tablet.
|
||||||
|
Tablet,
|
||||||
|
/// IoT device.
|
||||||
|
IoT,
|
||||||
|
/// Browser (WebGPU/WASM).
|
||||||
|
Browser,
|
||||||
|
/// Edge server.
|
||||||
|
Edge,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DeviceType {
|
||||||
|
/// Returns typical reliability score (0-100).
|
||||||
|
pub fn reliability(&self) -> u32 {
|
||||||
|
match self {
|
||||||
|
DeviceType::DataCenter => 99,
|
||||||
|
DeviceType::Edge => 95,
|
||||||
|
DeviceType::Desktop => 80,
|
||||||
|
DeviceType::Laptop => 60,
|
||||||
|
DeviceType::Mobile => 40,
|
||||||
|
DeviceType::Tablet => 50,
|
||||||
|
DeviceType::IoT => 70,
|
||||||
|
DeviceType::Browser => 30,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns typical availability hours per day.
|
||||||
|
pub fn availability_hours(&self) -> f32 {
|
||||||
|
match self {
|
||||||
|
DeviceType::DataCenter => 24.0,
|
||||||
|
DeviceType::Edge => 24.0,
|
||||||
|
DeviceType::Desktop => 8.0,
|
||||||
|
DeviceType::Laptop => 6.0,
|
||||||
|
DeviceType::Mobile => 4.0,
|
||||||
|
DeviceType::Tablet => 4.0,
|
||||||
|
DeviceType::IoT => 24.0,
|
||||||
|
DeviceType::Browser => 2.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Device capabilities.
///
/// Static description of what a device can offer; dynamic state lives in
/// [`DeviceInfo::status`] and the processor-level utilization fields.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DeviceCapabilities {
    /// Device type.
    pub device_type: DeviceType,
    /// Available processors.
    pub processors: Vec<ProcessorType>,
    /// Total memory (GB).
    pub memory_gb: f32,
    /// Network bandwidth (Mbps).
    pub bandwidth_mbps: f32,
    /// Storage available (GB).
    pub storage_gb: f32,
    /// Battery powered (true for laptops/mobiles; affects scheduling).
    pub battery_powered: bool,
    /// Supports background execution (e.g. can run tasks while unfocused).
    pub background_execution: bool,
}

/// Device information.
///
/// The full registry record for one device: identity, ownership,
/// capabilities, and current status/accounting.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DeviceInfo {
    /// Device ID.
    pub id: DeviceId,
    /// Device type.
    pub device_type: DeviceType,
    /// Owner address.
    pub owner: [u8; 32],
    /// Capabilities.
    pub capabilities: DeviceCapabilities,
    /// Current status.
    pub status: DeviceStatus,
    /// Reputation score (0-100).
    pub reputation: u32,
    /// Total earnings (atomic SYNOR).
    pub earnings: u64,
    /// Geographic region (free-form string, e.g. "us-east").
    pub region: String,
}

/// Device status.
///
/// NOTE(review): `online_devices()` treats only `Online` and `Idle` as
/// available; `Busy` and `OnBattery` are excluded — confirm that is intended.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum DeviceStatus {
    /// Online and available.
    Online,
    /// Online but busy.
    Busy,
    /// Idle but available.
    Idle,
    /// On battery (reduced capacity).
    OnBattery,
    /// Offline.
    Offline,
    /// Maintenance.
    Maintenance,
}
|
||||||
|
|
||||||
|
/// Device registry managing all devices and processors.
///
/// All maps are guarded by independent `parking_lot::RwLock`s; methods take
/// each lock only for the duration of a single operation.
pub struct DeviceRegistry {
    /// Registered devices.
    devices: RwLock<HashMap<DeviceId, DeviceInfo>>,
    /// Node to device mapping.
    /// NOTE(review): this map is never populated by any method visible in
    /// this file — confirm whether it is written elsewhere or is dead state.
    node_devices: RwLock<HashMap<NodeId, Vec<DeviceId>>>,
    /// All processors (across all nodes).
    processors: RwLock<HashMap<ProcessorId, Arc<dyn Processor>>>,
    /// Processor to node mapping (used by `unregister_node` for cleanup).
    processor_nodes: RwLock<HashMap<ProcessorId, NodeId>>,
    /// Next processor ID (monotonic counter; see `next_processor_id`).
    next_processor_id: std::sync::atomic::AtomicU64,
}
|
||||||
|
|
||||||
|
impl DeviceRegistry {
|
||||||
|
/// Creates a new device registry.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
devices: RwLock::new(HashMap::new()),
|
||||||
|
node_devices: RwLock::new(HashMap::new()),
|
||||||
|
processors: RwLock::new(HashMap::new()),
|
||||||
|
processor_nodes: RwLock::new(HashMap::new()),
|
||||||
|
next_processor_id: std::sync::atomic::AtomicU64::new(0),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Registers a device.
|
||||||
|
pub fn register_device(&self, device: DeviceInfo) -> Result<DeviceId, ComputeError> {
|
||||||
|
let id = device.id;
|
||||||
|
self.devices.write().insert(id, device);
|
||||||
|
Ok(id)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unregisters a device.
|
||||||
|
pub fn unregister_device(&self, device_id: DeviceId) -> Result<(), ComputeError> {
|
||||||
|
self.devices.write().remove(&device_id);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets a device by ID.
|
||||||
|
pub fn get_device(&self, device_id: DeviceId) -> Option<DeviceInfo> {
|
||||||
|
self.devices.read().get(&device_id).cloned()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Registers a processor for a node.
|
||||||
|
pub fn register_processor(
|
||||||
|
&self,
|
||||||
|
node_id: NodeId,
|
||||||
|
info: ProcessorInfo,
|
||||||
|
) -> Result<(), ComputeError> {
|
||||||
|
let processor_id = info.id;
|
||||||
|
|
||||||
|
// Create a generic processor from the info
|
||||||
|
let processor: Arc<dyn Processor> = Arc::new(GenericProcessor::new(
|
||||||
|
processor_id,
|
||||||
|
info.processor_type,
|
||||||
|
info.capabilities,
|
||||||
|
));
|
||||||
|
|
||||||
|
self.processors.write().insert(processor_id, processor);
|
||||||
|
self.processor_nodes.write().insert(processor_id, node_id);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unregisters all processors for a node.
|
||||||
|
pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
|
||||||
|
let mut processors = self.processors.write();
|
||||||
|
let mut processor_nodes = self.processor_nodes.write();
|
||||||
|
|
||||||
|
// Find and remove all processors for this node
|
||||||
|
let to_remove: Vec<_> = processor_nodes
|
||||||
|
.iter()
|
||||||
|
.filter(|(_, n)| **n == node_id)
|
||||||
|
.map(|(p, _)| *p)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
for proc_id in to_remove {
|
||||||
|
processors.remove(&proc_id);
|
||||||
|
processor_nodes.remove(&proc_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets a processor by ID.
|
||||||
|
pub fn get_processor(&self, processor_id: ProcessorId) -> Result<Arc<dyn Processor>, ComputeError> {
|
||||||
|
self.processors
|
||||||
|
.read()
|
||||||
|
.get(&processor_id)
|
||||||
|
.cloned()
|
||||||
|
.ok_or(ComputeError::ProcessorNotFound(processor_id))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets all processors.
|
||||||
|
pub fn all_processors(&self) -> Vec<Arc<dyn Processor>> {
|
||||||
|
self.processors.read().values().cloned().collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets processors of a specific type.
|
||||||
|
pub fn processors_by_type(&self, proc_type: ProcessorType) -> Vec<Arc<dyn Processor>> {
|
||||||
|
self.processors
|
||||||
|
.read()
|
||||||
|
.values()
|
||||||
|
.filter(|p| p.processor_type() == proc_type)
|
||||||
|
.cloned()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the next processor ID.
|
||||||
|
pub fn next_processor_id(&self) -> ProcessorId {
|
||||||
|
ProcessorId(self.next_processor_id.fetch_add(1, std::sync::atomic::Ordering::SeqCst))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets total number of devices.
|
||||||
|
pub fn device_count(&self) -> usize {
|
||||||
|
self.devices.read().len()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets total number of processors.
|
||||||
|
pub fn processor_count(&self) -> usize {
|
||||||
|
self.processors.read().len()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets devices by type.
|
||||||
|
pub fn devices_by_type(&self, device_type: DeviceType) -> Vec<DeviceInfo> {
|
||||||
|
self.devices
|
||||||
|
.read()
|
||||||
|
.values()
|
||||||
|
.filter(|d| d.device_type == device_type)
|
||||||
|
.cloned()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets online devices.
|
||||||
|
pub fn online_devices(&self) -> Vec<DeviceInfo> {
|
||||||
|
self.devices
|
||||||
|
.read()
|
||||||
|
.values()
|
||||||
|
.filter(|d| d.status == DeviceStatus::Online || d.status == DeviceStatus::Idle)
|
||||||
|
.cloned()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Updates device status.
|
||||||
|
pub fn update_device_status(
|
||||||
|
&self,
|
||||||
|
device_id: DeviceId,
|
||||||
|
status: DeviceStatus,
|
||||||
|
) -> Result<(), ComputeError> {
|
||||||
|
if let Some(device) = self.devices.write().get_mut(&device_id) {
|
||||||
|
device.status = status;
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(ComputeError::Internal(format!("Device not found: {}", device_id)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for DeviceRegistry {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, AvxSupport};

    /// Two freshly generated ids should differ (collision probability for
    /// 32 random bytes is negligible).
    #[test]
    fn test_device_id() {
        let id1 = DeviceId::new();
        let id2 = DeviceId::new();
        assert_ne!(id1.0, id2.0);
    }

    /// Register → lookup → unregister round-trip through the registry.
    #[test]
    fn test_device_registry() {
        let registry = DeviceRegistry::new();

        let device = DeviceInfo {
            id: DeviceId::new(),
            device_type: DeviceType::Desktop,
            owner: [1u8; 32],
            capabilities: DeviceCapabilities {
                device_type: DeviceType::Desktop,
                processors: vec![ProcessorType::Cpu(CpuVariant::X86_64 {
                    avx: AvxSupport::Avx512,
                })],
                memory_gb: 64.0,
                bandwidth_mbps: 1000.0,
                storage_gb: 1000.0,
                battery_powered: false,
                background_execution: true,
            },
            status: DeviceStatus::Online,
            reputation: 100,
            earnings: 0,
            region: "us-east".to_string(),
        };

        let device_id = device.id;
        registry.register_device(device).unwrap();

        assert_eq!(registry.device_count(), 1);
        assert!(registry.get_device(device_id).is_some());

        registry.unregister_device(device_id).unwrap();
        assert_eq!(registry.device_count(), 0);
    }

    /// Spot-checks the heuristic tables on DeviceType.
    #[test]
    fn test_device_type_properties() {
        assert_eq!(DeviceType::DataCenter.reliability(), 99);
        assert_eq!(DeviceType::Mobile.reliability(), 40);
        assert_eq!(DeviceType::DataCenter.availability_hours(), 24.0);
        assert_eq!(DeviceType::Browser.availability_hours(), 2.0);
    }
}
|
||||||
92
crates/synor-compute/src/error.rs
Normal file
92
crates/synor-compute/src/error.rs
Normal file
|
|
@ -0,0 +1,92 @@
|
||||||
|
//! Error types for Synor Compute.
|
||||||
|
|
||||||
|
use crate::{JobId, NodeId, ProcessorId, ProcessorType};
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
/// Compute errors.
///
/// Most variants carry a human-readable `String` payload; id-bearing
/// variants (`JobNotFound`, `NodeNotFound`, `ProcessorNotFound`) carry the
/// typed identifier so callers can match on it.
#[derive(Debug, Error)]
pub enum ComputeError {
    /// Job not found.
    #[error("Job not found: {0}")]
    JobNotFound(JobId),

    /// Node not found.
    #[error("Node not found: {0}")]
    NodeNotFound(NodeId),

    /// Processor not found.
    #[error("Processor not found: {0}")]
    ProcessorNotFound(ProcessorId),

    /// No suitable processor for operation.
    #[error("No suitable processor for operation: {0}")]
    NoSuitableProcessor(String),

    /// Insufficient resources.
    #[error("Insufficient resources: {0}")]
    InsufficientResources(String),

    /// Task execution failed.
    #[error("Task execution failed: {0}")]
    TaskExecutionFailed(String),

    /// Scheduling failed.
    #[error("Scheduling failed: {0}")]
    SchedulingFailed(String),

    /// Memory allocation failed.
    #[error("Memory allocation failed: {0}")]
    MemoryAllocationFailed(String),

    /// Data transfer failed.
    #[error("Data transfer failed: {0}")]
    DataTransferFailed(String),

    /// Processor type not supported.
    #[error("Processor type not supported: {0:?}")]
    ProcessorTypeNotSupported(ProcessorType),

    /// Operation not supported on processor.
    #[error("Operation not supported on {0:?}: {1}")]
    OperationNotSupported(ProcessorType, String),

    /// Timeout, with the elapsed budget in milliseconds.
    #[error("Operation timed out after {0}ms")]
    Timeout(u64),

    /// Budget exceeded (amounts in atomic SYNOR).
    #[error("Budget exceeded: required {required}, available {available}")]
    BudgetExceeded { required: u64, available: u64 },

    /// Node already registered.
    #[error("Node already registered: {0}")]
    NodeAlreadyRegistered(NodeId),

    /// Invalid configuration.
    #[error("Invalid configuration: {0}")]
    InvalidConfiguration(String),

    /// Serialization error (bincode / serde_json, see the From impls below).
    #[error("Serialization error: {0}")]
    Serialization(String),

    /// Network error.
    #[error("Network error: {0}")]
    Network(String),

    /// Internal error — catch-all for invariant violations.
    #[error("Internal error: {0}")]
    Internal(String),
}
|
||||||
|
|
||||||
|
impl From<bincode::Error> for ComputeError {
|
||||||
|
fn from(err: bincode::Error) -> Self {
|
||||||
|
ComputeError::Serialization(err.to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<serde_json::Error> for ComputeError {
|
||||||
|
fn from(err: serde_json::Error) -> Self {
|
||||||
|
ComputeError::Serialization(err.to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
631
crates/synor-compute/src/lib.rs
Normal file
631
crates/synor-compute/src/lib.rs
Normal file
|
|
@ -0,0 +1,631 @@
|
||||||
|
//! Synor Compute L2 - Heterogeneous Multi-Processor Compute Platform
|
||||||
|
//!
|
||||||
|
//! Provides decentralized compute services with:
|
||||||
|
//!
|
||||||
|
//! - **Heterogeneous Scheduling**: CPU + GPU + TPU + NPU + LPU working simultaneously
|
||||||
|
//! - **Consumer Device Mesh**: Mobile, browser, desktop devices contributing compute
|
||||||
|
//! - **90% Cost Reduction**: Zero margins, spot markets, electricity arbitrage
|
||||||
|
//! - **10x Speed**: Caching, speculative execution, optimal processor assignment
|
||||||
|
//!
|
||||||
|
//! # Architecture
|
||||||
|
//!
|
||||||
|
//! ```text
|
||||||
|
//! ┌─────────────────────────────────────────────────────────────────────────────┐
|
||||||
|
//! │ SYNOR COMPUTE L2 │
|
||||||
|
//! ├─────────────────────────────────────────────────────────────────────────────┤
|
||||||
|
//! │ │
|
||||||
|
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
//! │ │ TASK DECOMPOSER │ │
|
||||||
|
//! │ │ Analyzes workload → Identifies subtasks → Maps to optimal processors │ │
|
||||||
|
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
|
||||||
|
//! │ │ │
|
||||||
|
//! │ ▼ │
|
||||||
|
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
//! │ │ HETEROGENEOUS SCHEDULER │ │
|
||||||
|
//! │ │ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ │
|
||||||
|
//! │ │ │ CPU │ │ GPU │ │ TPU │ │ NPU │ │ LPU │ │Custom│ │ │
|
||||||
|
//! │ │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │ │
|
||||||
|
//! │ │ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ │ │
|
||||||
|
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
|
||||||
|
//! │ │
|
||||||
|
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
//! │ │ UNIFIED MEMORY FABRIC │ │
|
||||||
|
//! │ │ Zero-copy data sharing │ Automatic placement │ Cache coherency │ │
|
||||||
|
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
|
||||||
|
//! │ │
|
||||||
|
//! └─────────────────────────────────────────────────────────────────────────────┘
|
||||||
|
//! ```
|
||||||
|
//!
|
||||||
|
//! # Pricing
|
||||||
|
//!
|
||||||
|
//! | Resource | Unit | Price (SYNOR) |
|
||||||
|
//! |----------|------|---------------|
|
||||||
|
//! | GPU (consumer) | hour | 0.10 |
|
||||||
|
//! | GPU (datacenter) | hour | 0.50-4.00 |
|
||||||
|
//! | CPU | core/hour | 0.02 |
|
||||||
|
//! | Memory | GB/hour | 0.005 |
|
||||||
|
//! | Inference | 1M tokens | 0.10 |
|
||||||
|
|
||||||
|
#![allow(dead_code)]
|
||||||
|
|
||||||
|
pub mod device;
|
||||||
|
pub mod error;
|
||||||
|
pub mod market;
|
||||||
|
pub mod memory;
|
||||||
|
pub mod processor;
|
||||||
|
pub mod scheduler;
|
||||||
|
pub mod task;
|
||||||
|
|
||||||
|
pub use device::{
|
||||||
|
DeviceCapabilities, DeviceId, DeviceInfo, DeviceRegistry, DeviceStatus, DeviceType,
|
||||||
|
};
|
||||||
|
pub use error::ComputeError;
|
||||||
|
pub use market::{
|
||||||
|
Auction, AuctionId, CloudComparison, CpuTier as MarketCpuTier, GpuTier as MarketGpuTier,
|
||||||
|
MarketStats, Order, OrderBook, OrderId, OrderSide, OrderType, PricingEngine, ProviderListing,
|
||||||
|
ResourceType, SpotMarket, Trade,
|
||||||
|
};
|
||||||
|
pub use memory::{MemoryManager, TensorHandle, TransferPath, UnifiedMemory};
|
||||||
|
pub use processor::{
|
||||||
|
ComputeThroughput, CpuVariant, GpuVariant, NpuVariant, Operation, OperationType, Processor,
|
||||||
|
ProcessorCapabilities, ProcessorId, ProcessorType, TpuVersion,
|
||||||
|
};
|
||||||
|
pub use scheduler::{
|
||||||
|
HeterogeneousScheduler, LoadBalancer, Schedule, ScheduleResult, TaskAssignment, WorkQueue,
|
||||||
|
};
|
||||||
|
pub use task::{
|
||||||
|
ComputeTask, DecomposedWorkload, Task, TaskDecomposer, TaskId, TaskPriority, TaskResult,
|
||||||
|
TaskStatus,
|
||||||
|
};
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use parking_lot::RwLock;
|
||||||
|
|
||||||
|
/// Compute node identifier.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub struct NodeId(pub u64);
|
||||||
|
|
||||||
|
impl std::fmt::Display for NodeId {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "node_{}", self.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Job identifier.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub struct JobId(pub [u8; 32]);
|
||||||
|
|
||||||
|
impl JobId {
|
||||||
|
/// Creates a new job ID.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
use rand::Rng;
|
||||||
|
let mut bytes = [0u8; 32];
|
||||||
|
rand::thread_rng().fill(&mut bytes);
|
||||||
|
JobId(bytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates from bytes.
|
||||||
|
pub fn from_bytes(bytes: [u8; 32]) -> Self {
|
||||||
|
JobId(bytes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for JobId {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for JobId {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "job_{}", hex::encode(&self.0[..8]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute job specification.
///
/// A user-submitted unit of work: what to run (`job_type`), what it needs
/// (`resources`), and the economic envelope (`max_budget`, `deadline`).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ComputeJob {
    /// Job ID.
    pub id: JobId,
    /// Owner address.
    pub owner: [u8; 32],
    /// Job type.
    pub job_type: JobType,
    /// Resource requirements.
    pub resources: ResourceRequirements,
    /// Input data reference (CID). `None` if the job carries no input.
    pub input_cid: Option<String>,
    /// Maximum budget (in atomic SYNOR).
    pub max_budget: u64,
    /// Priority level.
    pub priority: JobPriority,
    /// Created timestamp.
    /// NOTE(review): unit (seconds vs milliseconds) is not established in
    /// this file — confirm against the submitting caller.
    pub created_at: u64,
    /// Deadline (optional), same clock/unit as `created_at`.
    pub deadline: Option<u64>,
}
|
||||||
|
|
||||||
|
/// Job type classification.
///
/// Content-addressed artifacts (models, datasets, code) are referenced by
/// CID strings rather than embedded inline.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum JobType {
    /// AI/ML training job.
    Training {
        framework: MlFramework,
        model_cid: String,
        dataset_cid: String,
        epochs: u32,
        batch_size: u32,
    },
    /// AI/ML inference job.
    Inference {
        model_cid: String,
        input_format: String,
        batch_size: u32,
    },
    /// Container workload (OCI image + command + environment).
    Container {
        image: String,
        command: Vec<String>,
        env: HashMap<String, String>,
    },
    /// Serverless function.
    Serverless {
        runtime: FunctionRuntime,
        code_cid: String,
        handler: String,
    },
    /// General compute (WASM).
    Wasm {
        module_cid: String,
        entrypoint: String,
    },
}

/// ML framework specification. Version strings are free-form
/// (e.g. "2.1"); ONNX carries no version here.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum MlFramework {
    PyTorch { version: String },
    TensorFlow { version: String },
    JAX { version: String },
    ONNX,
}

/// Function runtime for serverless jobs; `Custom` points at an
/// arbitrary container image.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum FunctionRuntime {
    Node20,
    Python312,
    Rust,
    Go,
    Custom { image: String },
}
|
||||||
|
|
||||||
|
/// Job priority levels.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
|
||||||
|
pub enum JobPriority {
|
||||||
|
/// Background job, can be preempted.
|
||||||
|
Background = 0,
|
||||||
|
/// Normal priority.
|
||||||
|
Normal = 1,
|
||||||
|
/// High priority, faster scheduling.
|
||||||
|
High = 2,
|
||||||
|
/// Critical, guaranteed resources.
|
||||||
|
Critical = 3,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for JobPriority {
|
||||||
|
fn default() -> Self {
|
||||||
|
JobPriority::Normal
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Resource requirements for a job.
///
/// `Default` yields all-zero minimums, no GPU, no preferences — i.e. "run
/// anywhere".
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ResourceRequirements {
    /// Minimum CPU cores (fractional cores allowed).
    pub min_cpu_cores: f32,
    /// Minimum memory (GB).
    pub min_memory_gb: f32,
    /// GPU requirements; `None` means no GPU needed.
    pub gpu: Option<GpuRequirements>,
    /// Preferred processor types (in priority order).
    pub preferred_processors: Vec<ProcessorType>,
    /// Maximum latency (ms) - for inference.
    pub max_latency_ms: Option<u32>,
    /// Requires distributed execution.
    pub distributed: bool,
}

/// GPU resource requirements.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GpuRequirements {
    /// Minimum number of GPUs.
    pub min_count: u32,
    /// Maximum number of GPUs.
    pub max_count: u32,
    /// Minimum VRAM per GPU (GB).
    pub min_vram_gb: u32,
    /// Minimum compute capability as (major, minor).
    /// NOTE(review): terminology suggests NVIDIA CUDA compute capability —
    /// confirm how non-NVIDIA GPUs are matched against this.
    pub min_compute_capability: Option<(u8, u8)>,
    /// Allow GPU sharing (MPS/MIG).
    pub allow_sharing: bool,
}
|
||||||
|
|
||||||
|
/// Job execution status.
///
/// Lifecycle: `Queued` → `Starting` → `Running` → (`Completed` | `Failed`
/// | `Cancelled`).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum JobStatus {
    /// Queued, waiting for resources.
    Queued,
    /// Resources allocated, starting.
    Starting,
    /// Running.
    Running {
        // Fraction complete; presumably in [0.0, 1.0] — not enforced here.
        progress: f32,
        assigned_nodes: Vec<NodeId>,
    },
    /// Completed successfully.
    Completed {
        // CID of the result artifact.
        result_cid: String,
        duration_ms: u64,
        // Actual cost charged, in atomic SYNOR.
        cost: u64,
    },
    /// Failed, with a human-readable reason.
    Failed { error: String },
    /// Cancelled by user.
    Cancelled,
}
|
||||||
|
|
||||||
|
/// Compute node registration.
///
/// A node is an operator-run machine contributing one or more processors
/// to the cluster; `ComputeCluster::register_node` fans its processors out
/// into the `DeviceRegistry`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ComputeNode {
    /// Node ID.
    pub id: NodeId,
    /// Owner address.
    pub owner: [u8; 32],
    /// Available processors.
    pub processors: Vec<ProcessorInfo>,
    /// Total memory (GB).
    pub total_memory_gb: f32,
    /// Available memory (GB).
    pub available_memory_gb: f32,
    /// Network bandwidth (Gbps). Note: device-level bandwidth elsewhere in
    /// this crate is expressed in Mbps.
    pub bandwidth_gbps: f32,
    /// Geographic region.
    pub region: String,
    /// Stake amount (for PoS).
    pub stake: u64,
    /// Reputation score (0-100).
    pub reputation: u32,
    /// Current status.
    pub status: NodeStatus,
}

/// Processor information on a node.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorInfo {
    /// Processor ID (local to node).
    pub id: ProcessorId,
    /// Processor type.
    pub processor_type: ProcessorType,
    /// Capabilities.
    pub capabilities: ProcessorCapabilities,
    /// Current utilization (0.0 - 1.0).
    pub utilization: f32,
    /// Current temperature (Celsius), if the node reports one.
    pub temperature: Option<f32>,
}

/// Node status.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum NodeStatus {
    /// Online and accepting jobs.
    Online,
    /// Online but not accepting new jobs (finishing current work).
    Draining,
    /// Offline.
    Offline,
    /// Maintenance mode.
    Maintenance,
}
|
||||||
|
|
||||||
|
/// Compute cluster manager.
///
/// Top-level façade wiring together the device registry, scheduler, spot
/// market, and memory manager; shared subsystems are held behind `Arc`.
pub struct ComputeCluster {
    /// Registered nodes.
    nodes: RwLock<HashMap<NodeId, ComputeNode>>,
    /// Device registry (also shared with the scheduler, see `new`).
    device_registry: Arc<DeviceRegistry>,
    /// Heterogeneous scheduler.
    scheduler: Arc<HeterogeneousScheduler>,
    /// Spot market.
    spot_market: Arc<SpotMarket>,
    /// Memory manager.
    memory_manager: Arc<MemoryManager>,
    /// Active jobs.
    jobs: RwLock<HashMap<JobId, ComputeJob>>,
}
|
||||||
|
|
||||||
|
impl ComputeCluster {
|
||||||
|
/// Creates a new compute cluster.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
let device_registry = Arc::new(DeviceRegistry::new());
|
||||||
|
let scheduler = Arc::new(HeterogeneousScheduler::new(device_registry.clone()));
|
||||||
|
let spot_market = Arc::new(SpotMarket::new());
|
||||||
|
let memory_manager = Arc::new(MemoryManager::new());
|
||||||
|
|
||||||
|
Self {
|
||||||
|
nodes: RwLock::new(HashMap::new()),
|
||||||
|
device_registry,
|
||||||
|
scheduler,
|
||||||
|
spot_market,
|
||||||
|
memory_manager,
|
||||||
|
jobs: RwLock::new(HashMap::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Registers a compute node.
|
||||||
|
pub fn register_node(&self, node: ComputeNode) -> Result<(), ComputeError> {
|
||||||
|
let id = node.id;
|
||||||
|
|
||||||
|
// Register processors with device registry
|
||||||
|
for proc in &node.processors {
|
||||||
|
self.device_registry.register_processor(id, proc.clone())?;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.nodes.write().insert(id, node);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unregisters a compute node.
|
||||||
|
pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
|
||||||
|
self.device_registry.unregister_node(node_id)?;
|
||||||
|
self.nodes.write().remove(&node_id);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Submits a job for execution.
|
||||||
|
pub async fn submit_job(&self, job: ComputeJob) -> Result<JobId, ComputeError> {
|
||||||
|
let job_id = job.id;
|
||||||
|
|
||||||
|
// Decompose job into tasks
|
||||||
|
let tasks = self.decompose_job(&job)?;
|
||||||
|
|
||||||
|
// Schedule tasks
|
||||||
|
let schedule = self.scheduler.schedule(tasks).await?;
|
||||||
|
|
||||||
|
// Store job
|
||||||
|
self.jobs.write().insert(job_id, job);
|
||||||
|
|
||||||
|
// Execute schedule (async)
|
||||||
|
tokio::spawn({
|
||||||
|
let scheduler = self.scheduler.clone();
|
||||||
|
async move {
|
||||||
|
let _ = scheduler.execute(&schedule.schedule).await;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(job_id)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets job status.
|
||||||
|
pub fn get_job_status(&self, job_id: &JobId) -> Option<JobStatus> {
|
||||||
|
self.jobs.read().get(job_id).map(|_| JobStatus::Queued)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cancels a job.
|
||||||
|
pub fn cancel_job(&self, job_id: &JobId) -> Result<(), ComputeError> {
|
||||||
|
if self.jobs.write().remove(job_id).is_some() {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(ComputeError::JobNotFound(*job_id))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets cluster statistics.
|
||||||
|
pub fn stats(&self) -> ClusterStats {
|
||||||
|
let nodes = self.nodes.read();
|
||||||
|
let jobs = self.jobs.read();
|
||||||
|
|
||||||
|
let total_nodes = nodes.len();
|
||||||
|
let online_nodes = nodes.values().filter(|n| n.status == NodeStatus::Online).count();
|
||||||
|
|
||||||
|
let total_gpus: usize = nodes
|
||||||
|
.values()
|
||||||
|
.flat_map(|n| &n.processors)
|
||||||
|
.filter(|p| matches!(p.processor_type, ProcessorType::Gpu(_)))
|
||||||
|
.count();
|
||||||
|
|
||||||
|
let total_memory: f32 = nodes.values().map(|n| n.total_memory_gb).sum();
|
||||||
|
|
||||||
|
ClusterStats {
|
||||||
|
total_nodes,
|
||||||
|
online_nodes,
|
||||||
|
total_gpus,
|
||||||
|
total_memory_gb: total_memory,
|
||||||
|
active_jobs: jobs.len(),
|
||||||
|
queued_jobs: jobs.values().filter(|_| true).count(), // Simplified
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decomposes a job into schedulable tasks.
|
||||||
|
fn decompose_job(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
let decomposer = TaskDecomposer::new();
|
||||||
|
decomposer.decompose(job)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for ComputeCluster {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cluster statistics.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ClusterStats {
    /// Total registered nodes.
    pub total_nodes: usize,
    /// Nodes currently in `NodeStatus::Online`.
    pub online_nodes: usize,
    /// Total GPUs across cluster.
    pub total_gpus: usize,
    /// Total memory (GB), summed over all nodes.
    pub total_memory_gb: f32,
    /// Active jobs.
    pub active_jobs: usize,
    /// Queued jobs.
    /// NOTE(review): `ComputeCluster::stats` currently derives this from the
    /// same jobs map as `active_jobs` — per-job status is not yet tracked.
    pub queued_jobs: usize,
}
|
||||||
|
|
||||||
|
/// Pricing calculator for compute operations.
///
/// All amounts are in base token units; the `Default` price book's comments
/// imply 1 SYNOR = 1_000_000_000 units.
#[derive(Clone, Debug)]
pub struct ComputePricing {
    /// GPU cost per hour, by tier.
    pub gpu_hourly: HashMap<GpuTier, u64>,
    /// CPU cost per core-hour.
    pub cpu_core_hour: u64,
    /// Memory cost per GB-hour.
    pub memory_gb_hour: u64,
    /// Network egress cost per GB.
    pub network_egress_gb: u64,
    /// Inference cost per million tokens.
    pub inference_per_million_tokens: u64,
}
|
||||||
|
|
||||||
|
/// GPU pricing tiers, coarsest-grained pricing key for GPU hours.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum GpuTier {
    /// Consumer GPUs (RTX 30xx, 40xx).
    Consumer,
    /// Professional GPUs (RTX A series).
    Professional,
    /// Data center GPUs (A100).
    DataCenter,
    /// Latest generation (H100).
    Premium,
}
|
||||||
|
|
||||||
|
impl Default for ComputePricing {
|
||||||
|
fn default() -> Self {
|
||||||
|
let mut gpu_hourly = HashMap::new();
|
||||||
|
gpu_hourly.insert(GpuTier::Consumer, 100_000_000); // 0.10 SYNOR
|
||||||
|
gpu_hourly.insert(GpuTier::Professional, 300_000_000); // 0.30 SYNOR
|
||||||
|
gpu_hourly.insert(GpuTier::DataCenter, 2_000_000_000); // 2.00 SYNOR
|
||||||
|
gpu_hourly.insert(GpuTier::Premium, 4_000_000_000); // 4.00 SYNOR
|
||||||
|
|
||||||
|
Self {
|
||||||
|
gpu_hourly,
|
||||||
|
cpu_core_hour: 20_000_000, // 0.02 SYNOR
|
||||||
|
memory_gb_hour: 5_000_000, // 0.005 SYNOR
|
||||||
|
network_egress_gb: 50_000_000, // 0.05 SYNOR
|
||||||
|
inference_per_million_tokens: 100_000_000, // 0.10 SYNOR
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ComputePricing {
|
||||||
|
/// Estimates cost for a job.
|
||||||
|
pub fn estimate(&self, job: &ComputeJob, duration_hours: f32) -> u64 {
|
||||||
|
let mut cost = 0u64;
|
||||||
|
|
||||||
|
// CPU cost
|
||||||
|
cost += (self.cpu_core_hour as f32 * job.resources.min_cpu_cores * duration_hours) as u64;
|
||||||
|
|
||||||
|
// Memory cost
|
||||||
|
cost += (self.memory_gb_hour as f32 * job.resources.min_memory_gb * duration_hours) as u64;
|
||||||
|
|
||||||
|
// GPU cost
|
||||||
|
if let Some(gpu) = &job.resources.gpu {
|
||||||
|
let tier = GpuTier::Consumer; // Simplified
|
||||||
|
let gpu_cost = self.gpu_hourly.get(&tier).unwrap_or(&100_000_000);
|
||||||
|
cost += (*gpu_cost as f32 * gpu.min_count as f32 * duration_hours) as u64;
|
||||||
|
}
|
||||||
|
|
||||||
|
cost
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Two freshly generated job IDs should differ (they are random u64s,
    /// so a collision is astronomically unlikely).
    #[test]
    fn test_job_id() {
        let id1 = JobId::new();
        let id2 = JobId::new();
        assert_ne!(id1.0, id2.0);
    }

    /// A new cluster starts with no registered nodes.
    #[test]
    fn test_compute_cluster() {
        let cluster = ComputeCluster::new();
        let stats = cluster.stats();
        assert_eq!(stats.total_nodes, 0);
    }

    /// Estimating a CPU + memory + GPU inference job over one hour yields a
    /// non-zero cost under the default price book.
    #[test]
    fn test_pricing() {
        let pricing = ComputePricing::default();

        let job = ComputeJob {
            id: JobId::new(),
            owner: [0u8; 32],
            job_type: JobType::Inference {
                model_cid: "model123".to_string(),
                input_format: "json".to_string(),
                batch_size: 32,
            },
            resources: ResourceRequirements {
                min_cpu_cores: 4.0,
                min_memory_gb: 16.0,
                gpu: Some(GpuRequirements {
                    min_count: 1,
                    max_count: 1,
                    min_vram_gb: 16,
                    min_compute_capability: None,
                    allow_sharing: false,
                }),
                ..Default::default()
            },
            input_cid: None,
            max_budget: 1_000_000_000,
            priority: JobPriority::Normal,
            created_at: 0,
            deadline: None,
        };

        let cost = pricing.estimate(&job, 1.0);
        assert!(cost > 0);
    }

    /// Registering a single-CPU node makes it visible in cluster stats.
    #[test]
    fn test_node_registration() {
        let cluster = ComputeCluster::new();

        let node = ComputeNode {
            id: NodeId(1),
            owner: [1u8; 32],
            processors: vec![ProcessorInfo {
                id: ProcessorId(0),
                processor_type: ProcessorType::Cpu(CpuVariant::X86_64 {
                    avx: processor::AvxSupport::Avx512,
                }),
                capabilities: ProcessorCapabilities::default(),
                utilization: 0.0,
                temperature: Some(45.0),
            }],
            total_memory_gb: 64.0,
            available_memory_gb: 60.0,
            bandwidth_gbps: 10.0,
            region: "us-east".to_string(),
            stake: 1000,
            reputation: 100,
            status: NodeStatus::Online,
        };

        cluster.register_node(node).unwrap();
        assert_eq!(cluster.stats().total_nodes, 1);
    }
}
|
||||||
1151
crates/synor-compute/src/market/mod.rs
Normal file
1151
crates/synor-compute/src/market/mod.rs
Normal file
File diff suppressed because it is too large
Load diff
370
crates/synor-compute/src/memory/mod.rs
Normal file
370
crates/synor-compute/src/memory/mod.rs
Normal file
|
|
@ -0,0 +1,370 @@
|
||||||
|
//! Unified memory management for heterogeneous compute.
|
||||||
|
|
||||||
|
use crate::error::ComputeError;
|
||||||
|
use crate::processor::ProcessorType;
|
||||||
|
use parking_lot::RwLock;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
/// Tensor handle for memory management.
///
/// A lightweight descriptor for an allocation; it carries metadata only,
/// not the actual data buffer.
#[derive(Clone, Debug)]
pub struct TensorHandle {
    /// Unique ID.
    pub id: TensorId,
    /// Shape (one entry per dimension; empty means scalar).
    pub shape: Vec<usize>,
    /// Element data type.
    pub dtype: DataType,
    /// Total size in bytes (element count times element size).
    pub size_bytes: u64,
    /// Processor types the tensor is currently resident on.
    pub locations: Vec<ProcessorType>,
}
|
||||||
|
|
||||||
|
impl TensorHandle {
|
||||||
|
/// Creates a new tensor handle.
|
||||||
|
pub fn new(shape: Vec<usize>, dtype: DataType) -> Self {
|
||||||
|
let size_bytes = shape.iter().product::<usize>() as u64 * dtype.size_bytes() as u64;
|
||||||
|
Self {
|
||||||
|
id: TensorId::new(),
|
||||||
|
shape,
|
||||||
|
dtype,
|
||||||
|
size_bytes,
|
||||||
|
locations: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the number of elements.
|
||||||
|
pub fn numel(&self) -> usize {
|
||||||
|
self.shape.iter().product()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tensor identifier.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TensorId(pub u64);

impl TensorId {
    /// Creates a new random tensor ID.
    ///
    /// NOTE(review): IDs are random `u64`s with no collision check;
    /// collisions are astronomically unlikely but not impossible.
    pub fn new() -> Self {
        use rand::Rng;
        TensorId(rand::thread_rng().gen())
    }
}

impl Default for TensorId {
    // A "default" ID is a fresh random ID, not zero.
    fn default() -> Self {
        Self::new()
    }
}
|
||||||
|
|
||||||
|
/// Data types for tensors.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub enum DataType {
|
||||||
|
Float64,
|
||||||
|
Float32,
|
||||||
|
Float16,
|
||||||
|
BFloat16,
|
||||||
|
Int64,
|
||||||
|
Int32,
|
||||||
|
Int16,
|
||||||
|
Int8,
|
||||||
|
UInt8,
|
||||||
|
Bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DataType {
|
||||||
|
/// Returns size in bytes.
|
||||||
|
pub fn size_bytes(&self) -> usize {
|
||||||
|
match self {
|
||||||
|
DataType::Float64 | DataType::Int64 => 8,
|
||||||
|
DataType::Float32 | DataType::Int32 => 4,
|
||||||
|
DataType::Float16 | DataType::BFloat16 | DataType::Int16 => 2,
|
||||||
|
DataType::Int8 | DataType::UInt8 | DataType::Bool => 1,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Data transfer path between processors.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum TransferPath {
    /// Direct GPU-to-GPU via NVLink.
    NvLink,
    /// Direct GPU-to-GPU via PCIe P2P.
    PciePeerToPeer,
    /// Through CPU memory.
    CpuMediated,
    /// Unified memory (Apple Silicon).
    UnifiedMemory,
    /// Network transfer.
    Network,
    /// Same memory space (no transfer needed).
    SameMemory,
}

impl TransferPath {
    /// Returns the approximate sustained bandwidth of this path in GB/s.
    pub fn bandwidth_gbps(&self) -> f64 {
        // Rough figures for current-generation hardware, fastest first.
        match self {
            Self::SameMemory => f64::INFINITY, // no copy required
            Self::NvLink => 900.0,             // NVLink 4.0
            Self::UnifiedMemory => 400.0,      // Apple unified memory
            Self::PciePeerToPeer => 64.0,      // PCIe 5.0 x16
            Self::CpuMediated => 50.0,         // DDR5
            Self::Network => 10.0,             // ~100 Gbps link
        }
    }

    /// Estimates the wall-clock time needed to move `bytes` over this path.
    pub fn estimate_transfer_time(&self, bytes: u64) -> std::time::Duration {
        // Same-memory moves are free.
        if matches!(self, TransferPath::SameMemory) {
            return std::time::Duration::ZERO;
        }

        let bandwidth = self.bandwidth_gbps() * 1e9; // bytes per second
        std::time::Duration::from_secs_f64(bytes as f64 / bandwidth)
    }
}
|
||||||
|
|
||||||
|
/// Unified memory manager.
///
/// Tracks tensor allocations and per-processor-type memory usage. Tensor
/// and usage maps are interior-mutable; `limits` can only be changed via
/// `set_limit`, which takes `&mut self`.
pub struct MemoryManager {
    /// Allocated tensors, keyed by tensor ID.
    tensors: RwLock<HashMap<TensorId, TensorHandle>>,
    /// Memory usage in bytes, per processor type.
    usage: RwLock<HashMap<ProcessorType, u64>>,
    /// Memory limits in bytes, per processor type (absent = unlimited).
    limits: HashMap<ProcessorType, u64>,
}
|
||||||
|
|
||||||
|
impl MemoryManager {
|
||||||
|
/// Creates a new memory manager.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
tensors: RwLock::new(HashMap::new()),
|
||||||
|
usage: RwLock::new(HashMap::new()),
|
||||||
|
limits: HashMap::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sets memory limit for a processor type.
|
||||||
|
pub fn set_limit(&mut self, proc_type: ProcessorType, limit_bytes: u64) {
|
||||||
|
self.limits.insert(proc_type, limit_bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Allocates a tensor.
|
||||||
|
pub fn allocate(&self, shape: Vec<usize>, dtype: DataType) -> Result<TensorHandle, ComputeError> {
|
||||||
|
let handle = TensorHandle::new(shape, dtype);
|
||||||
|
self.tensors.write().insert(handle.id, handle.clone());
|
||||||
|
Ok(handle)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Frees a tensor.
|
||||||
|
pub fn free(&self, tensor_id: TensorId) -> Result<(), ComputeError> {
|
||||||
|
if let Some(handle) = self.tensors.write().remove(&tensor_id) {
|
||||||
|
// Update usage for all locations
|
||||||
|
let mut usage = self.usage.write();
|
||||||
|
for loc in &handle.locations {
|
||||||
|
if let Some(u) = usage.get_mut(loc) {
|
||||||
|
*u = u.saturating_sub(handle.size_bytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets a tensor handle.
|
||||||
|
pub fn get(&self, tensor_id: TensorId) -> Option<TensorHandle> {
|
||||||
|
self.tensors.read().get(&tensor_id).cloned()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Ensures tensor is on specified processor.
|
||||||
|
pub fn ensure_on(
|
||||||
|
&self,
|
||||||
|
tensor_id: TensorId,
|
||||||
|
target: ProcessorType,
|
||||||
|
) -> Result<TransferPath, ComputeError> {
|
||||||
|
let mut tensors = self.tensors.write();
|
||||||
|
|
||||||
|
if let Some(handle) = tensors.get_mut(&tensor_id) {
|
||||||
|
// Check if already on target
|
||||||
|
if handle.locations.contains(&target) {
|
||||||
|
return Ok(TransferPath::SameMemory);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine transfer path
|
||||||
|
let path = if handle.locations.is_empty() {
|
||||||
|
// New tensor, allocate on target
|
||||||
|
TransferPath::SameMemory
|
||||||
|
} else {
|
||||||
|
// Find best transfer path from existing location
|
||||||
|
self.find_best_path(&handle.locations[0], &target)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Record new location
|
||||||
|
handle.locations.push(target.clone());
|
||||||
|
|
||||||
|
// Update usage
|
||||||
|
let mut usage = self.usage.write();
|
||||||
|
*usage.entry(target).or_default() += handle.size_bytes;
|
||||||
|
|
||||||
|
Ok(path)
|
||||||
|
} else {
|
||||||
|
Err(ComputeError::Internal("Tensor not found".to_string()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Finds best transfer path between processors.
|
||||||
|
fn find_best_path(&self, from: &ProcessorType, to: &ProcessorType) -> TransferPath {
|
||||||
|
// Check for unified memory (Apple Silicon)
|
||||||
|
if self.shares_memory(from, to) {
|
||||||
|
return TransferPath::UnifiedMemory;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for NVLink between NVIDIA GPUs
|
||||||
|
if matches!(from, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. }))
|
||||||
|
&& matches!(to, ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda { .. }))
|
||||||
|
{
|
||||||
|
return TransferPath::NvLink;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for PCIe P2P between GPUs
|
||||||
|
if from.is_gpu() && to.is_gpu() {
|
||||||
|
return TransferPath::PciePeerToPeer;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Default to CPU-mediated transfer
|
||||||
|
TransferPath::CpuMediated
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Checks if two processor types share memory.
|
||||||
|
fn shares_memory(&self, a: &ProcessorType, b: &ProcessorType) -> bool {
|
||||||
|
use crate::processor::{CpuVariant, GpuVariant, NpuVariant};
|
||||||
|
|
||||||
|
match (a, b) {
|
||||||
|
// Apple Silicon unified memory
|
||||||
|
(ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal))
|
||||||
|
| (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
|
||||||
|
| (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
|
||||||
|
| (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true,
|
||||||
|
// Same type
|
||||||
|
_ if a == b => true,
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets current memory usage for a processor type.
|
||||||
|
pub fn usage(&self, proc_type: ProcessorType) -> u64 {
|
||||||
|
self.usage.read().get(&proc_type).copied().unwrap_or(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets available memory for a processor type.
|
||||||
|
pub fn available(&self, proc_type: ProcessorType) -> u64 {
|
||||||
|
let limit = self.limits.get(&proc_type).copied().unwrap_or(u64::MAX);
|
||||||
|
let used = self.usage(proc_type);
|
||||||
|
limit.saturating_sub(used)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets total allocated tensors.
|
||||||
|
pub fn tensor_count(&self) -> usize {
|
||||||
|
self.tensors.read().len()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for MemoryManager {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unified memory abstraction for zero-copy sharing.
pub struct UnifiedMemory {
    /// Base pointer (in unified address space).
    /// NOTE(review): `new` always sets this to 0 — placeholder, no real
    /// mapping is performed yet.
    pub base: u64,
    /// Size in bytes.
    pub size: u64,
    /// Processor types that can access this region.
    pub accessible_from: Vec<ProcessorType>,
}
|
||||||
|
|
||||||
|
impl UnifiedMemory {
|
||||||
|
/// Creates new unified memory region.
|
||||||
|
pub fn new(size: u64) -> Self {
|
||||||
|
Self {
|
||||||
|
base: 0, // Would be actual pointer in real implementation
|
||||||
|
size,
|
||||||
|
accessible_from: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Checks if accessible from processor type.
|
||||||
|
pub fn is_accessible_from(&self, proc_type: &ProcessorType) -> bool {
|
||||||
|
self.accessible_from.contains(proc_type)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Shape/dtype arithmetic: a 1024x1024 f32 tensor is 4 MiB.
    #[test]
    fn test_tensor_handle() {
        let handle = TensorHandle::new(vec![1024, 1024], DataType::Float32);
        assert_eq!(handle.numel(), 1024 * 1024);
        assert_eq!(handle.size_bytes, 1024 * 1024 * 4);
    }

    /// One representative dtype per element-width class.
    #[test]
    fn test_data_type_sizes() {
        assert_eq!(DataType::Float64.size_bytes(), 8);
        assert_eq!(DataType::Float32.size_bytes(), 4);
        assert_eq!(DataType::Float16.size_bytes(), 2);
        assert_eq!(DataType::Int8.size_bytes(), 1);
    }

    /// Relative ordering of transfer-path bandwidths.
    #[test]
    fn test_transfer_path_bandwidth() {
        assert!(TransferPath::NvLink.bandwidth_gbps() > TransferPath::PciePeerToPeer.bandwidth_gbps());
        assert!(TransferPath::SameMemory.bandwidth_gbps().is_infinite());
    }

    /// Allocate/free round-trip through the manager.
    #[test]
    fn test_memory_manager() {
        let manager = MemoryManager::new();

        let handle = manager.allocate(vec![1024, 1024], DataType::Float32).unwrap();
        assert_eq!(manager.tensor_count(), 1);

        manager.free(handle.id).unwrap();
        assert_eq!(manager.tensor_count(), 0);
    }

    /// First placement and re-placement on the same device both report
    /// `SameMemory` (no copy needed in either case).
    #[test]
    fn test_ensure_on() {
        let manager = MemoryManager::new();

        let handle = manager.allocate(vec![1024], DataType::Float32).unwrap();

        // First ensure should allocate
        let path = manager.ensure_on(
            handle.id,
            ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda {
                compute_capability: (8, 0),
            }),
        ).unwrap();

        assert_eq!(path, TransferPath::SameMemory);

        // Second ensure to same location should be same memory
        let path = manager.ensure_on(
            handle.id,
            ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda {
                compute_capability: (8, 0),
            }),
        ).unwrap();

        assert_eq!(path, TransferPath::SameMemory);
    }
}
|
||||||
547
crates/synor-compute/src/processor/capabilities.rs
Normal file
547
crates/synor-compute/src/processor/capabilities.rs
Normal file
|
|
@ -0,0 +1,547 @@
|
||||||
|
//! Processor capability definitions.
|
||||||
|
|
||||||
|
use super::operation::OperationType;
|
||||||
|
use super::types::PowerTier;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
|
/// Detailed processor capabilities.
///
/// A self-describing performance/feature profile used to match tasks to
/// devices.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorCapabilities {
    /// Compute throughput per precision.
    pub compute: ComputeThroughput,
    /// Memory specifications (capacity, bandwidth, technology).
    pub memory: MemorySpecs,
    /// Set of operations this processor can execute.
    pub operations: HashSet<OperationType>,
    /// Power characteristics.
    pub power: PowerCharacteristics,
    /// Workload shapes this processor is best suited for.
    pub optimal_for: Vec<WorkloadCharacteristic>,
}
|
||||||
|
|
||||||
|
impl Default for ProcessorCapabilities {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
compute: ComputeThroughput::default(),
|
||||||
|
memory: MemorySpecs::default(),
|
||||||
|
operations: Self::default_operations(),
|
||||||
|
power: PowerCharacteristics::default(),
|
||||||
|
optimal_for: vec![],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ProcessorCapabilities {
|
||||||
|
/// Default operations supported by most processors.
|
||||||
|
fn default_operations() -> HashSet<OperationType> {
|
||||||
|
[
|
||||||
|
OperationType::MatMul,
|
||||||
|
OperationType::Add,
|
||||||
|
OperationType::Mul,
|
||||||
|
OperationType::ReLU,
|
||||||
|
OperationType::Softmax,
|
||||||
|
OperationType::DataLoad,
|
||||||
|
OperationType::DataPreprocess,
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
    /// Creates CPU capabilities.
    ///
    /// Throughput is a rough analytical estimate: `cores * clock *
    /// FLOPs-per-cycle`, with 64 FP32 ops/cycle assumed for AVX-512 and 32
    /// otherwise. Memory figures are fixed typical values (64 GB DDR5),
    /// not measured from the actual host.
    pub fn cpu(cores: u32, clock_ghz: f32, avx512: bool) -> Self {
        let flops_per_cycle = if avx512 { 64.0 } else { 32.0 }; // FP32 ops per cycle with AVX
        let fp32_tflops = (cores as f64 * clock_ghz as f64 * flops_per_cycle) / 1000.0;

        Self {
            compute: ComputeThroughput {
                // NOTE(review): the fp16/int8/int4 scaling factors below are
                // heuristics, not per-microarchitecture measurements.
                fp64_tflops: fp32_tflops / 2.0,
                fp32_tflops,
                fp16_tflops: fp32_tflops * 2.0,
                bf16_tflops: fp32_tflops * 2.0,
                int8_tops: fp32_tflops * 4.0,
                int4_tops: fp32_tflops * 8.0,
                sparsity_speedup: 1.0, // no structured-sparsity speedup on CPUs
            },
            memory: MemorySpecs {
                capacity_bytes: 64 * 1024 * 1024 * 1024, // 64 GB typical
                bandwidth_gbps: 200, // DDR5
                type_: MemoryType::Ddr5,
            },
            operations: Self::cpu_operations(),
            power: PowerCharacteristics {
                tdp_watts: 125,
                efficiency: 0.8,
                power_tier: PowerTier::Medium,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::MemoryBound,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
|
||||||
|
|
||||||
|
    /// Operations typically supported by CPUs.
    ///
    /// CPUs can run everything here, but are only *optimal* for the data
    /// loading / tokenization / memory-layout operations.
    fn cpu_operations() -> HashSet<OperationType> {
        [
            // Matrix operations (slow but supported)
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            // Element-wise
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::Softmax,
            // Data operations (optimal)
            OperationType::DataLoad,
            OperationType::DataPreprocess,
            OperationType::Tokenization,
            OperationType::Detokenization,
            // Memory operations
            OperationType::Transpose,
            OperationType::Reshape,
            OperationType::Concat,
            OperationType::Split,
            // I/O
            OperationType::Checkpoint,
        ]
        .into_iter()
        .collect()
    }
|
||||||
|
|
||||||
|
    /// Creates NVIDIA GPU capabilities.
    ///
    /// Throughput is approximated from the CUDA core count at an assumed
    /// 1.5 GHz base clock (2 FLOPs/cycle FMA); tensor-core formats get an
    /// additional multiplier (4x for compute capability >= 8.0, else 2x).
    ///
    /// NOTE(review): the `tensor_cores` parameter is currently unused in
    /// the estimate — confirm whether it should feed into the multiplier.
    pub fn nvidia_gpu(
        cuda_cores: u32,
        tensor_cores: u32,
        vram_gb: u32,
        bandwidth_gbps: u32,
        compute_capability: (u8, u8),
    ) -> Self {
        // Approximate TFLOPS based on cores and typical clocks
        let base_clock_ghz = 1.5;
        let fp32_tflops = (cuda_cores as f64 * base_clock_ghz * 2.0) / 1000.0;
        let tensor_multiplier = if compute_capability.0 >= 8 { 4.0 } else { 2.0 };

        Self {
            compute: ComputeThroughput {
                fp64_tflops: fp32_tflops / 2.0,
                fp32_tflops,
                fp16_tflops: fp32_tflops * tensor_multiplier,
                bf16_tflops: fp32_tflops * tensor_multiplier,
                int8_tops: fp32_tflops * tensor_multiplier * 2.0,
                int4_tops: fp32_tflops * tensor_multiplier * 4.0,
                // 2:4 structured sparsity is available from Ampere (sm_80) on.
                sparsity_speedup: if compute_capability.0 >= 8 { 2.0 } else { 1.0 },
            },
            memory: MemorySpecs {
                capacity_bytes: vram_gb as u64 * 1024 * 1024 * 1024,
                bandwidth_gbps,
                // Hopper (sm_90) and newer ship HBM3; older data-center parts HBM2e.
                type_: if compute_capability.0 >= 9 {
                    MemoryType::Hbm3
                } else {
                    MemoryType::Hbm2e
                },
            },
            operations: Self::gpu_operations(compute_capability),
            power: PowerCharacteristics {
                tdp_watts: if compute_capability.0 >= 9 { 700 } else { 350 },
                efficiency: 0.9,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::ComputeBound,
            ],
        }
    }
|
||||||
|
|
||||||
|
    /// Operations supported by GPUs.
    ///
    /// `FlashAttention` is added only for compute capability >= 8.0.
    fn gpu_operations(compute_capability: (u8, u8)) -> HashSet<OperationType> {
        let mut ops: HashSet<OperationType> = [
            // Matrix operations (optimal)
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::Conv3d,
            OperationType::DepthwiseConv,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            // Attention
            OperationType::SelfAttention,
            OperationType::CrossAttention,
            // Element-wise
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::SiLU,
            OperationType::Softmax,
            // Reduction
            OperationType::Sum,
            OperationType::Mean,
            OperationType::Max,
            OperationType::ArgMax,
            // Memory operations
            OperationType::Transpose,
            OperationType::Reshape,
            OperationType::Concat,
            OperationType::Split,
            OperationType::Gather,
            OperationType::Scatter,
            // LLM specific
            OperationType::Embedding,
            OperationType::RoPE,
            OperationType::KVCache,
            OperationType::TopK,
            OperationType::Sampling,
        ]
        .into_iter()
        .collect();

        // FlashAttention for newer GPUs
        if compute_capability.0 >= 8 {
            ops.insert(OperationType::FlashAttention);
        }

        ops
    }
|
||||||
|
|
||||||
|
    /// Creates TPU capabilities.
    ///
    /// The tuple per version is (bf16 TFLOPS, HBM capacity GB, HBM bandwidth
    /// Gbps); the Edge TPU reports 0 memory/bandwidth because it uses host
    /// memory.
    pub fn tpu(version: super::TpuVersion) -> Self {
        let (bf16_tflops, memory_gb, bandwidth_gbps) = match version {
            super::TpuVersion::V5p => (918.0, 95, 4800),
            super::TpuVersion::V5e => (197.0, 16, 1600),
            super::TpuVersion::V4 => (275.0, 32, 2400),
            super::TpuVersion::V4i => (138.0, 32, 1200),
            super::TpuVersion::V3 => (123.0, 16, 900),
            super::TpuVersion::V2 => (46.0, 8, 600),
            super::TpuVersion::Edge => (4.0, 0, 0), // Uses host memory
        };

        Self {
            compute: ComputeThroughput {
                fp64_tflops: 0.0, // TPUs don't support FP64
                // NOTE(review): fp32/int8/int4 figures are heuristic
                // scalings of bf16, not published per-version specs.
                fp32_tflops: bf16_tflops / 2.0,
                fp16_tflops: bf16_tflops,
                bf16_tflops,
                int8_tops: bf16_tflops * 2.0,
                int4_tops: bf16_tflops * 4.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: memory_gb as u64 * 1024 * 1024 * 1024,
                bandwidth_gbps,
                type_: MemoryType::Hbm2e,
            },
            operations: Self::tpu_operations(),
            power: PowerCharacteristics {
                // Edge TPU is a ~2 W accelerator; datacenter parts ~400 W.
                tdp_watts: if matches!(version, super::TpuVersion::Edge) {
                    2
                } else {
                    400
                },
                efficiency: 0.95,
                power_tier: if matches!(version, super::TpuVersion::Edge) {
                    PowerTier::UltraLow
                } else {
                    PowerTier::High
                },
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::ComputeBound,
                WorkloadCharacteristic::FixedShape,
                WorkloadCharacteristic::LargeBatch,
            ],
        }
    }
|
||||||
|
|
||||||
|
/// Operations supported by TPUs.
|
||||||
|
fn tpu_operations() -> HashSet<OperationType> {
|
||||||
|
[
|
||||||
|
// Matrix operations (optimal)
|
||||||
|
OperationType::MatMul,
|
||||||
|
OperationType::Conv2d,
|
||||||
|
OperationType::BatchNorm,
|
||||||
|
OperationType::LayerNorm,
|
||||||
|
// Attention
|
||||||
|
OperationType::SelfAttention,
|
||||||
|
OperationType::CrossAttention,
|
||||||
|
OperationType::FlashAttention,
|
||||||
|
// Element-wise
|
||||||
|
OperationType::Add,
|
||||||
|
OperationType::Mul,
|
||||||
|
OperationType::ReLU,
|
||||||
|
OperationType::GeLU,
|
||||||
|
OperationType::SiLU,
|
||||||
|
OperationType::Softmax,
|
||||||
|
// Reduction
|
||||||
|
OperationType::Sum,
|
||||||
|
OperationType::Mean,
|
||||||
|
OperationType::Max,
|
||||||
|
// LLM specific
|
||||||
|
OperationType::Embedding,
|
||||||
|
OperationType::RoPE,
|
||||||
|
OperationType::KVCache,
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates LPU (Groq) capabilities.
|
||||||
|
pub fn lpu() -> Self {
|
||||||
|
Self {
|
||||||
|
compute: ComputeThroughput {
|
||||||
|
fp64_tflops: 0.0,
|
||||||
|
fp32_tflops: 0.0,
|
||||||
|
fp16_tflops: 188.0,
|
||||||
|
bf16_tflops: 188.0,
|
||||||
|
int8_tops: 750.0,
|
||||||
|
int4_tops: 1500.0,
|
||||||
|
sparsity_speedup: 1.0,
|
||||||
|
},
|
||||||
|
memory: MemorySpecs {
|
||||||
|
capacity_bytes: 230 * 1024 * 1024 * 1024, // 230 GB SRAM!
|
||||||
|
bandwidth_gbps: 80_000, // 80 TB/s internal
|
||||||
|
type_: MemoryType::Sram,
|
||||||
|
},
|
||||||
|
operations: Self::lpu_operations(),
|
||||||
|
power: PowerCharacteristics {
|
||||||
|
tdp_watts: 300,
|
||||||
|
efficiency: 0.98, // Very efficient for inference
|
||||||
|
power_tier: PowerTier::Medium,
|
||||||
|
},
|
||||||
|
optimal_for: vec![
|
||||||
|
WorkloadCharacteristic::Sequential,
|
||||||
|
WorkloadCharacteristic::SmallBatch,
|
||||||
|
WorkloadCharacteristic::VariableLength,
|
||||||
|
WorkloadCharacteristic::LowLatency,
|
||||||
|
],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Operations supported by Groq LPU.
|
||||||
|
fn lpu_operations() -> HashSet<OperationType> {
|
||||||
|
[
|
||||||
|
// Optimized for inference
|
||||||
|
OperationType::MatMul,
|
||||||
|
OperationType::LayerNorm,
|
||||||
|
OperationType::SelfAttention,
|
||||||
|
OperationType::Add,
|
||||||
|
OperationType::Mul,
|
||||||
|
OperationType::ReLU,
|
||||||
|
OperationType::GeLU,
|
||||||
|
OperationType::SiLU,
|
||||||
|
OperationType::Softmax,
|
||||||
|
OperationType::Embedding,
|
||||||
|
OperationType::RoPE,
|
||||||
|
OperationType::KVCache,
|
||||||
|
OperationType::TopK,
|
||||||
|
OperationType::Sampling,
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates Apple Neural Engine capabilities.
|
||||||
|
pub fn apple_neural_engine(cores: u32) -> Self {
|
||||||
|
let int8_tops = match cores {
|
||||||
|
16 => 18.0, // M3
|
||||||
|
32 => 35.0, // M3 Max
|
||||||
|
_ => cores as f64 * 1.1,
|
||||||
|
};
|
||||||
|
|
||||||
|
Self {
|
||||||
|
compute: ComputeThroughput {
|
||||||
|
fp64_tflops: 0.0,
|
||||||
|
fp32_tflops: int8_tops / 4.0,
|
||||||
|
fp16_tflops: int8_tops / 2.0,
|
||||||
|
bf16_tflops: int8_tops / 2.0,
|
||||||
|
int8_tops,
|
||||||
|
int4_tops: int8_tops * 2.0,
|
||||||
|
sparsity_speedup: 1.0,
|
||||||
|
},
|
||||||
|
memory: MemorySpecs {
|
||||||
|
capacity_bytes: 0, // Uses unified memory
|
||||||
|
bandwidth_gbps: 400,
|
||||||
|
type_: MemoryType::Unified,
|
||||||
|
},
|
||||||
|
operations: Self::npu_operations(),
|
||||||
|
power: PowerCharacteristics {
|
||||||
|
tdp_watts: 15,
|
||||||
|
efficiency: 0.95,
|
||||||
|
power_tier: PowerTier::UltraLow,
|
||||||
|
},
|
||||||
|
optimal_for: vec![
|
||||||
|
WorkloadCharacteristic::LowPower,
|
||||||
|
WorkloadCharacteristic::LowLatency,
|
||||||
|
WorkloadCharacteristic::SmallBatch,
|
||||||
|
],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Operations supported by NPUs.
|
||||||
|
fn npu_operations() -> HashSet<OperationType> {
|
||||||
|
[
|
||||||
|
// Inference optimized
|
||||||
|
OperationType::MatMul,
|
||||||
|
OperationType::Conv2d,
|
||||||
|
OperationType::DepthwiseConv,
|
||||||
|
OperationType::BatchNorm,
|
||||||
|
OperationType::LayerNorm,
|
||||||
|
OperationType::Add,
|
||||||
|
OperationType::Mul,
|
||||||
|
OperationType::ReLU,
|
||||||
|
OperationType::Softmax,
|
||||||
|
OperationType::Embedding,
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute throughput metrics.
///
/// Peak arithmetic throughput per numeric precision. A value of `0.0`
/// means the processor has no usable throughput at that precision
/// (e.g. TPUs and LPUs report zero FP64).
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ComputeThroughput {
    /// FP64 TFLOPS.
    pub fp64_tflops: f64,
    /// FP32 TFLOPS.
    pub fp32_tflops: f64,
    /// FP16 TFLOPS.
    pub fp16_tflops: f64,
    /// BF16 TFLOPS.
    pub bf16_tflops: f64,
    /// INT8 TOPS.
    pub int8_tops: f64,
    /// INT4 TOPS.
    pub int4_tops: f64,
    /// Speedup for sparse operations.
    /// Multiplier applied when sparsity is exploitable; 1.0 = no benefit.
    pub sparsity_speedup: f64,
}
|
||||||
|
|
||||||
|
/// Memory specifications.
///
/// Device-local memory description. `capacity_bytes` may be 0 for
/// processors that borrow host/unified memory (Edge TPU, Apple NPU).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MemorySpecs {
    /// Total capacity (bytes).
    pub capacity_bytes: u64,
    /// Bandwidth (GB/s).
    pub bandwidth_gbps: u32,
    /// Memory type.
    pub type_: MemoryType,
}
|
||||||
|
|
||||||
|
impl Default for MemorySpecs {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
capacity_bytes: 16 * 1024 * 1024 * 1024, // 16 GB
|
||||||
|
bandwidth_gbps: 500,
|
||||||
|
type_: MemoryType::Ddr5,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Memory types.
///
/// Distinguishes the physical memory technology backing a processor;
/// used alongside [`MemorySpecs`] for transfer/bandwidth reasoning.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum MemoryType {
    /// DDR4 RAM.
    Ddr4,
    /// DDR5 RAM.
    Ddr5,
    /// GDDR6/6X video memory.
    Gddr6,
    /// HBM2 (stacked high-bandwidth memory).
    Hbm2,
    /// HBM2e.
    Hbm2e,
    /// HBM3.
    Hbm3,
    /// SRAM (on-chip).
    Sram,
    /// Unified memory (Apple Silicon) shared between CPU/GPU/NPU.
    Unified,
    /// LPDDR (mobile).
    Lpddr,
}
|
||||||
|
|
||||||
|
/// Power characteristics.
///
/// Used by energy-aware scheduling to estimate Joules per operation.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PowerCharacteristics {
    /// TDP in watts.
    pub tdp_watts: u32,
    /// Efficiency factor (0.0 - 1.0).
    pub efficiency: f64,
    /// Power tier.
    pub power_tier: PowerTier,
}
|
||||||
|
|
||||||
|
impl Default for PowerCharacteristics {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
tdp_watts: 100,
|
||||||
|
efficiency: 0.8,
|
||||||
|
power_tier: PowerTier::Medium,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Workload characteristics for processor matching.
///
/// Each processor profile advertises the characteristics it is best
/// suited for (`optimal_for`); the scheduler matches these against a
/// task's observed traits. Parenthesized processor names are typical
/// good fits, not exclusives.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum WorkloadCharacteristic {
    /// High parallelism (GPU, TPU).
    HighlyParallel,
    /// Sequential dependencies (CPU, LPU).
    Sequential,
    /// Memory bandwidth bound (GPU).
    MemoryBound,
    /// Compute bound (TPU).
    ComputeBound,
    /// Low latency required (NPU, edge).
    LowLatency,
    /// Low power required (NPU, mobile).
    LowPower,
    /// Large batch sizes (GPU, TPU).
    LargeBatch,
    /// Small batch sizes (CPU, LPU).
    SmallBatch,
    /// Variable length sequences (LPU).
    VariableLength,
    /// Fixed tensor shapes (TPU).
    FixedShape,
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cpu_capabilities() {
        let cpu_caps = ProcessorCapabilities::cpu(32, 3.5, true);
        assert!(cpu_caps.compute.fp32_tflops > 0.0);
        // CPUs handle host-side I/O and text processing.
        for op in [OperationType::DataLoad, OperationType::Tokenization] {
            assert!(cpu_caps.operations.contains(&op));
        }
    }

    #[test]
    fn test_gpu_capabilities() {
        let gpu_caps = ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1000, (8, 9));
        // Half precision should outrun single precision on modern GPUs.
        assert!(gpu_caps.compute.fp16_tflops > gpu_caps.compute.fp32_tflops);
        // Compute capability 8.x advertises FlashAttention.
        assert!(gpu_caps.operations.contains(&OperationType::FlashAttention));
    }

    #[test]
    fn test_tpu_capabilities() {
        let tpu_caps = ProcessorCapabilities::tpu(super::super::TpuVersion::V5p);
        assert!(tpu_caps.compute.bf16_tflops > 900.0);
        // TPUs don't do I/O.
        assert!(!tpu_caps.operations.contains(&OperationType::DataLoad));
    }

    #[test]
    fn test_lpu_capabilities() {
        let lpu_caps = ProcessorCapabilities::lpu();
        // On-die SRAM yields very high internal bandwidth.
        assert!(lpu_caps.memory.bandwidth_gbps > 10000);
        assert!(lpu_caps
            .optimal_for
            .contains(&WorkloadCharacteristic::Sequential));
    }
}
|
||||||
339
crates/synor-compute/src/processor/mod.rs
Normal file
339
crates/synor-compute/src/processor/mod.rs
Normal file
|
|
@ -0,0 +1,339 @@
|
||||||
|
//! Processor abstractions for heterogeneous compute.
|
||||||
|
//!
|
||||||
|
//! Supports all processor types:
|
||||||
|
//! - CPU (x86_64, ARM64, RISC-V)
|
||||||
|
//! - GPU (NVIDIA CUDA, AMD ROCm, Intel OneAPI, Apple Metal)
|
||||||
|
//! - TPU (Google TPU v2-v5)
|
||||||
|
//! - NPU (Apple Neural Engine, Qualcomm Hexagon, Intel VPU)
|
||||||
|
//! - LPU (Groq Language Processing Unit)
|
||||||
|
//! - FPGA (Xilinx, Intel/Altera)
|
||||||
|
//! - DSP (Digital Signal Processors)
|
||||||
|
//! - Custom accelerators
|
||||||
|
|
||||||
|
mod capabilities;
|
||||||
|
mod operation;
|
||||||
|
mod profiles;
|
||||||
|
mod types;
|
||||||
|
|
||||||
|
pub use capabilities::{ComputeThroughput, MemorySpecs, MemoryType, ProcessorCapabilities};
|
||||||
|
pub use operation::{Operation, OperationType};
|
||||||
|
pub use profiles::ProcessorProfiles;
|
||||||
|
pub use types::*;
|
||||||
|
|
||||||
|
use crate::error::ComputeError;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
/// Unique processor identifier (within a node).
///
/// Newtype over `u64`; uniqueness is only node-local, so cross-node
/// identification must pair this with a node identifier.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ProcessorId(pub u64);
|
||||||
|
|
||||||
|
impl std::fmt::Display for ProcessorId {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "proc_{}", self.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unified abstraction for any processor type.
///
/// Implementors expose a static capability profile plus live state
/// (utilization, free memory) and an async [`Processor::execute`]
/// entry point. `Send + Sync` so processors can be shared across the
/// scheduler's worker threads.
#[async_trait]
pub trait Processor: Send + Sync {
    /// Get processor ID.
    fn id(&self) -> ProcessorId;

    /// Get processor type.
    fn processor_type(&self) -> ProcessorType;

    /// Get capabilities.
    fn capabilities(&self) -> &ProcessorCapabilities;

    /// Check if processor can execute operation.
    fn can_execute(&self, op: &Operation) -> bool;

    /// Estimate execution time for operation.
    fn estimate_time(&self, op: &Operation) -> Duration;

    /// Estimate energy consumption for operation (Joules).
    fn estimate_energy(&self, op: &Operation) -> f64;

    /// Execute operation.
    ///
    /// Consumes the operation; returns an error if it is unsupported
    /// or fails at runtime.
    async fn execute(&self, op: Operation) -> Result<OperationResult, ComputeError>;

    /// Current utilization (0.0 - 1.0).
    fn utilization(&self) -> f64;

    /// Available memory (bytes).
    fn available_memory(&self) -> u64;

    /// Check if this processor shares memory with another type.
    ///
    /// Default: only processors of the identical type share memory.
    fn shares_memory_with(&self, other: &ProcessorType) -> bool {
        // By default, processors don't share memory
        // Override for unified memory architectures (Apple Silicon, AMD APUs)
        self.processor_type() == *other
    }
}
|
||||||
|
|
||||||
|
/// Result of an operation execution.
///
/// Returned by [`Processor::execute`] on success; bundles the output
/// payload with measured/estimated execution metrics.
#[derive(Clone, Debug)]
pub struct OperationResult {
    /// Output data.
    /// May be empty for simulated executions.
    pub output: Vec<u8>,
    /// Execution time.
    pub duration: Duration,
    /// Energy consumed (Joules).
    pub energy: f64,
    /// Peak memory used (bytes).
    pub peak_memory: u64,
}
|
||||||
|
|
||||||
|
/// Generic processor implementation for simulation/testing.
///
/// Couples a static capability profile with lock-free live state so a
/// single instance can be shared across threads.
pub struct GenericProcessor {
    // Node-local identifier.
    id: ProcessorId,
    // Concrete hardware variant (CPU/GPU/TPU/NPU/LPU/...).
    processor_type: ProcessorType,
    // Static throughput/memory/power profile.
    capabilities: ProcessorCapabilities,
    // Utilization stored as an integer percentage (0-100);
    // `utilization()` converts it back to 0.0-1.0.
    utilization: std::sync::atomic::AtomicU64,
    // Remaining device memory in bytes, initialized to full capacity.
    available_memory: std::sync::atomic::AtomicU64,
}
|
||||||
|
|
||||||
|
impl GenericProcessor {
|
||||||
|
/// Creates a new generic processor.
|
||||||
|
pub fn new(
|
||||||
|
id: ProcessorId,
|
||||||
|
processor_type: ProcessorType,
|
||||||
|
capabilities: ProcessorCapabilities,
|
||||||
|
) -> Self {
|
||||||
|
let available_memory = capabilities.memory.capacity_bytes;
|
||||||
|
Self {
|
||||||
|
id,
|
||||||
|
processor_type,
|
||||||
|
capabilities,
|
||||||
|
utilization: std::sync::atomic::AtomicU64::new(0),
|
||||||
|
available_memory: std::sync::atomic::AtomicU64::new(available_memory),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a CPU processor.
|
||||||
|
pub fn cpu(id: ProcessorId, variant: CpuVariant) -> Self {
|
||||||
|
Self::new(
|
||||||
|
id,
|
||||||
|
ProcessorType::Cpu(variant),
|
||||||
|
ProcessorProfiles::cpu_default(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates an NVIDIA GPU processor.
|
||||||
|
pub fn nvidia_gpu(id: ProcessorId, compute_capability: (u8, u8)) -> Self {
|
||||||
|
let capabilities = match compute_capability {
|
||||||
|
(9, 0) => ProcessorProfiles::nvidia_h100(),
|
||||||
|
(8, 9) => ProcessorProfiles::nvidia_rtx_4090(),
|
||||||
|
(8, 6) => ProcessorProfiles::nvidia_rtx_3090(),
|
||||||
|
_ => ProcessorProfiles::nvidia_default(),
|
||||||
|
};
|
||||||
|
Self::new(
|
||||||
|
id,
|
||||||
|
ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability }),
|
||||||
|
capabilities,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a TPU processor.
|
||||||
|
pub fn tpu(id: ProcessorId, version: TpuVersion) -> Self {
|
||||||
|
let capabilities = match version {
|
||||||
|
TpuVersion::V5p => ProcessorProfiles::google_tpu_v5p(),
|
||||||
|
TpuVersion::V4 => ProcessorProfiles::google_tpu_v4(),
|
||||||
|
_ => ProcessorProfiles::google_tpu_default(),
|
||||||
|
};
|
||||||
|
Self::new(id, ProcessorType::Tpu(version), capabilities)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a Groq LPU processor.
|
||||||
|
pub fn lpu(id: ProcessorId) -> Self {
|
||||||
|
Self::new(id, ProcessorType::Lpu, ProcessorProfiles::groq_lpu())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates an Apple Neural Engine processor.
|
||||||
|
pub fn apple_neural_engine(id: ProcessorId, cores: u32) -> Self {
|
||||||
|
Self::new(
|
||||||
|
id,
|
||||||
|
ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores }),
|
||||||
|
ProcessorProfiles::apple_neural_engine(cores),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
impl Processor for GenericProcessor {
    fn id(&self) -> ProcessorId {
        self.id
    }

    fn processor_type(&self) -> ProcessorType {
        self.processor_type.clone()
    }

    fn capabilities(&self) -> &ProcessorCapabilities {
        &self.capabilities
    }

    // An operation is executable iff its type appears in the profile's
    // supported-operation set.
    fn can_execute(&self, op: &Operation) -> bool {
        self.capabilities.operations.contains(&op.op_type())
    }

    fn estimate_time(&self, op: &Operation) -> Duration {
        // Estimate based on FLOPS and operation complexity: pick the peak
        // throughput for the operation's precision, then divide work by rate.
        let flops_needed = op.estimated_flops();
        let throughput = match op.precision() {
            Precision::Fp32 => self.capabilities.compute.fp32_tflops,
            Precision::Fp16 => self.capabilities.compute.fp16_tflops,
            Precision::Bf16 => self.capabilities.compute.bf16_tflops,
            Precision::Int8 => self.capabilities.compute.int8_tops,
            Precision::Int4 => self.capabilities.compute.int4_tops,
            Precision::Fp64 => self.capabilities.compute.fp64_tflops,
        };

        if throughput > 0.0 {
            let tflops = throughput;
            // Throughput figures are in TFLOPS/TOPS (1e12 ops per second).
            let flops_per_second = tflops * 1e12;
            let seconds = flops_needed / flops_per_second;
            Duration::from_secs_f64(seconds)
        } else {
            // Zero throughput means the precision is unsupported in the
            // profile; return a pessimistic flat estimate instead of
            // dividing by zero.
            Duration::from_secs(1) // Fallback
        }
    }

    fn estimate_energy(&self, op: &Operation) -> f64 {
        // Estimate based on TDP and execution time.
        // energy (J) = TDP (W) * time (s), scaled by the profile's
        // efficiency factor.
        let duration = self.estimate_time(op);
        let watts = self.capabilities.power.tdp_watts as f64;
        let efficiency = self.capabilities.power.efficiency;
        watts * duration.as_secs_f64() * efficiency
    }

    // Simulated execution: validates support, computes estimates, and
    // pulses the utilization counter around a short async sleep. No real
    // computation is performed and `output` is always empty.
    async fn execute(&self, op: Operation) -> Result<OperationResult, ComputeError> {
        // Check if we can execute
        if !self.can_execute(&op) {
            return Err(ComputeError::OperationNotSupported(
                self.processor_type.clone(),
                format!("{:?}", op.op_type()),
            ));
        }

        // Simulate execution
        let duration = self.estimate_time(&op);
        let energy = self.estimate_energy(&op);

        // Update utilization (integer percent; 50 = half busy)
        self.utilization
            .store(50, std::sync::atomic::Ordering::Relaxed);

        // Simulate work (fixed 100us regardless of the estimate, to keep
        // tests fast)
        tokio::time::sleep(Duration::from_micros(100)).await;

        // Reset utilization
        self.utilization
            .store(0, std::sync::atomic::Ordering::Relaxed);

        Ok(OperationResult {
            output: vec![],
            duration,
            energy,
            peak_memory: op.estimated_memory(),
        })
    }

    fn utilization(&self) -> f64 {
        // Stored as integer percent; convert back to the 0.0-1.0 contract.
        self.utilization.load(std::sync::atomic::Ordering::Relaxed) as f64 / 100.0
    }

    fn available_memory(&self) -> u64 {
        self.available_memory
            .load(std::sync::atomic::Ordering::Relaxed)
    }

    // Overrides the default (same-type-only) rule to model Apple Silicon's
    // unified memory across CPU/GPU/NPU.
    fn shares_memory_with(&self, other: &ProcessorType) -> bool {
        match (&self.processor_type, other) {
            // Apple Silicon has unified memory
            (ProcessorType::Cpu(CpuVariant::Arm64 { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal))
            | (ProcessorType::Gpu(GpuVariant::AppleMetal), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
            | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Cpu(CpuVariant::Arm64 { .. }))
            | (ProcessorType::Npu(NpuVariant::AppleNeuralEngine { .. }), ProcessorType::Gpu(GpuVariant::AppleMetal)) => true,
            // Same type always shares
            (a, b) if a == b => true,
            _ => false,
        }
    }
}
|
||||||
|
|
||||||
|
/// Precision for operations.
///
/// Selects which throughput figure in [`ComputeThroughput`] applies
/// when estimating execution time.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Precision {
    /// 64-bit floating point.
    Fp64,
    /// 32-bit floating point.
    Fp32,
    /// 16-bit floating point (IEEE half).
    Fp16,
    /// 16-bit brain floating point.
    Bf16,
    /// 8-bit integer.
    Int8,
    /// 4-bit integer.
    Int4,
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_processor_creation() {
        let cpu = GenericProcessor::cpu(
            ProcessorId(0),
            CpuVariant::X86_64 {
                avx: AvxSupport::Avx512,
            },
        );

        assert_eq!(cpu.id(), ProcessorId(0));
        assert!(matches!(cpu.processor_type(), ProcessorType::Cpu(_)));
    }

    #[test]
    fn test_gpu_creation() {
        let gpu = GenericProcessor::nvidia_gpu(ProcessorId(1), (9, 0));

        assert_eq!(gpu.id(), ProcessorId(1));
        assert!(matches!(
            gpu.processor_type(),
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. })
        ));
    }

    #[test]
    fn test_unified_memory() {
        let apple_cpu = GenericProcessor::new(
            ProcessorId(0),
            ProcessorType::Cpu(CpuVariant::Arm64 { sve: false }),
            ProcessorCapabilities::default(),
        );

        // Apple Silicon CPU and Metal GPU share unified memory.
        assert!(apple_cpu.shares_memory_with(&ProcessorType::Gpu(GpuVariant::AppleMetal)));
    }

    #[tokio::test]
    async fn test_operation_execution() {
        let cpu = GenericProcessor::cpu(
            ProcessorId(0),
            CpuVariant::X86_64 {
                avx: AvxSupport::Avx512,
            },
        );

        let op = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };

        // Whether execution succeeds depends on the CPU profile's
        // capabilities, but the outcome must be CONSISTENT with
        // `can_execute`: supported ops succeed, unsupported ops error.
        // (The previous `is_ok() || is_err()` assertion was a tautology
        // that could never fail.)
        let supported = cpu.can_execute(&op);
        let result = cpu.execute(op).await;
        assert_eq!(result.is_ok(), supported);
    }
}
|
||||||
543
crates/synor-compute/src/processor/operation.rs
Normal file
543
crates/synor-compute/src/processor/operation.rs
Normal file
|
|
@ -0,0 +1,543 @@
|
||||||
|
//! Operation definitions for heterogeneous compute.
|
||||||
|
|
||||||
|
use super::Precision;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
/// Operation types for processor matching.
///
/// A coarse taxonomy used as the key in each profile's supported-set;
/// parameterless by design so it can be hashed and compared cheaply.
/// See [`Operation`] for the parameterized counterparts.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum OperationType {
    // Matrix operations
    MatMul,
    Conv2d,
    Conv3d,
    DepthwiseConv,
    BatchNorm,
    LayerNorm,

    // Attention operations
    SelfAttention,
    CrossAttention,
    // Fused, memory-efficient attention kernel (newer GPUs/TPUs only).
    FlashAttention,

    // Element-wise operations
    Add,
    Mul,
    ReLU,
    GeLU,
    SiLU,
    Softmax,

    // Reduction operations
    Sum,
    Mean,
    Max,
    ArgMax,

    // Data movement
    Transpose,
    Reshape,
    Concat,
    Split,
    Gather,
    Scatter,

    // LLM specific
    Embedding,
    RoPE, // Rotary Position Embedding
    KVCache,
    TopK,
    Sampling,

    // I/O operations
    DataLoad,
    DataPreprocess,
    Tokenization,
    Detokenization,
    Checkpoint,

    // Distributed operations
    AllReduce,
    AllGather,
    ReduceScatter,

    // Training specific
    Backward,
    OptimizerStep,
    GradientClip,
}
|
||||||
|
|
||||||
|
/// Concrete operation with parameters.
///
/// The parameterized counterpart of [`OperationType`]: each variant
/// carries the dimensions needed to estimate FLOPs, memory, and time.
/// Fields named `async_` use a trailing underscore to avoid the
/// `async` keyword.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum Operation {
    /// Matrix multiplication.
    /// Computes an (m x k) by (k x n) product.
    MatMul {
        m: usize,
        n: usize,
        k: usize,
        precision: Precision,
    },

    /// 2D Convolution.
    // NOTE(review): assumes a square kernel (kernel_size x kernel_size)
    // and no stride/dilation parameters.
    Conv2d {
        batch: usize,
        in_channels: usize,
        out_channels: usize,
        height: usize,
        width: usize,
        kernel_size: usize,
        precision: Precision,
    },

    /// Batch normalization.
    BatchNorm {
        batch: usize,
        channels: usize,
        // Presumably elements per channel (flattened H*W) — TODO confirm.
        spatial: usize,
        precision: Precision,
    },

    /// Layer normalization.
    LayerNorm {
        batch: usize,
        seq_len: usize,
        hidden: usize,
        precision: Precision,
    },

    /// Self-attention.
    SelfAttention {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },

    /// Flash attention (fused, memory efficient).
    FlashAttention {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },

    /// Element-wise addition.
    Add {
        elements: usize,
        precision: Precision,
    },

    /// Element-wise multiplication.
    Mul {
        elements: usize,
        precision: Precision,
    },

    /// ReLU activation.
    ReLU { elements: usize },

    /// GeLU activation.
    GeLU { elements: usize },

    /// SiLU (Swish) activation.
    SiLU { elements: usize },

    /// Softmax.
    Softmax {
        batch: usize,
        seq_len: usize,
        precision: Precision,
    },

    /// Embedding lookup.
    Embedding {
        batch: usize,
        seq_len: usize,
        vocab_size: usize,
        embed_dim: usize,
        precision: Precision,
    },

    /// Rotary Position Embedding.
    RoPE {
        batch: usize,
        seq_len: usize,
        head_dim: usize,
        precision: Precision,
    },

    /// KV Cache update.
    KVCache {
        batch: usize,
        seq_len: usize,
        num_heads: usize,
        head_dim: usize,
        precision: Precision,
    },

    /// Top-K sampling.
    TopK {
        batch: usize,
        vocab_size: usize,
        k: usize,
    },

    /// Token sampling.
    Sampling {
        batch: usize,
        vocab_size: usize,
        temperature: f32,
    },

    /// Data loading from storage.
    DataLoad {
        bytes: usize,
        // `async_`: whether the load overlaps with compute.
        async_: bool,
    },

    /// Data preprocessing.
    DataPreprocess {
        batch: usize,
        // Named transforms applied to each sample.
        transforms: Vec<String>,
    },

    /// Tokenization.
    Tokenization {
        text_bytes: usize,
        vocab_size: usize,
    },

    /// Detokenization.
    Detokenization {
        tokens: usize,
        vocab_size: usize,
    },

    /// Checkpoint save.
    Checkpoint {
        bytes: usize,
        async_: bool,
    },

    /// All-reduce across devices.
    AllReduce {
        elements: usize,
        precision: Precision,
        devices: usize,
    },

    /// Backward pass for a layer.
    /// Wraps the forward op it differentiates; boxed to keep the enum small.
    Backward {
        forward_op: Box<Operation>,
    },

    /// Optimizer step.
    OptimizerStep {
        parameters: usize,
        // Optimizer name as free text (e.g. "adam").
        optimizer: String,
        precision: Precision,
    },

    /// Transpose.
    Transpose {
        shape: Vec<usize>,
        axes: Vec<usize>,
    },

    /// Reshape.
    Reshape {
        from: Vec<usize>,
        to: Vec<usize>,
    },

    /// Concatenate tensors.
    Concat {
        shapes: Vec<Vec<usize>>,
        axis: usize,
    },

    /// Generic operation.
    /// Escape hatch for ops not modeled above; carries explicit cost figures.
    Generic {
        op_type: OperationType,
        flops: f64,
        memory: u64,
    },
}
|
||||||
|
|
||||||
|
impl Operation {
|
||||||
|
/// Returns the operation type.
|
||||||
|
pub fn op_type(&self) -> OperationType {
|
||||||
|
match self {
|
||||||
|
Operation::MatMul { .. } => OperationType::MatMul,
|
||||||
|
Operation::Conv2d { .. } => OperationType::Conv2d,
|
||||||
|
Operation::BatchNorm { .. } => OperationType::BatchNorm,
|
||||||
|
Operation::LayerNorm { .. } => OperationType::LayerNorm,
|
||||||
|
Operation::SelfAttention { .. } => OperationType::SelfAttention,
|
||||||
|
Operation::FlashAttention { .. } => OperationType::FlashAttention,
|
||||||
|
Operation::Add { .. } => OperationType::Add,
|
||||||
|
Operation::Mul { .. } => OperationType::Mul,
|
||||||
|
Operation::ReLU { .. } => OperationType::ReLU,
|
||||||
|
Operation::GeLU { .. } => OperationType::GeLU,
|
||||||
|
Operation::SiLU { .. } => OperationType::SiLU,
|
||||||
|
Operation::Softmax { .. } => OperationType::Softmax,
|
||||||
|
Operation::Embedding { .. } => OperationType::Embedding,
|
||||||
|
Operation::RoPE { .. } => OperationType::RoPE,
|
||||||
|
Operation::KVCache { .. } => OperationType::KVCache,
|
||||||
|
Operation::TopK { .. } => OperationType::TopK,
|
||||||
|
Operation::Sampling { .. } => OperationType::Sampling,
|
||||||
|
Operation::DataLoad { .. } => OperationType::DataLoad,
|
||||||
|
Operation::DataPreprocess { .. } => OperationType::DataPreprocess,
|
||||||
|
Operation::Tokenization { .. } => OperationType::Tokenization,
|
||||||
|
Operation::Detokenization { .. } => OperationType::Detokenization,
|
||||||
|
Operation::Checkpoint { .. } => OperationType::Checkpoint,
|
||||||
|
Operation::AllReduce { .. } => OperationType::AllReduce,
|
||||||
|
Operation::Backward { .. } => OperationType::Backward,
|
||||||
|
Operation::OptimizerStep { .. } => OperationType::OptimizerStep,
|
||||||
|
Operation::Transpose { .. } => OperationType::Transpose,
|
||||||
|
Operation::Reshape { .. } => OperationType::Reshape,
|
||||||
|
Operation::Concat { .. } => OperationType::Concat,
|
||||||
|
Operation::Generic { op_type, .. } => *op_type,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the numeric precision used by this operation.
///
/// Variants that carry an explicit `precision` field report it directly;
/// `Backward` reports the precision of its wrapped forward op. All other
/// variants (activations, data movement, sampling, etc.) have no precision
/// field and default to [`Precision::Fp32`].
pub fn precision(&self) -> Precision {
    match self {
        Operation::MatMul { precision, .. }
        | Operation::Conv2d { precision, .. }
        | Operation::BatchNorm { precision, .. }
        | Operation::LayerNorm { precision, .. }
        | Operation::SelfAttention { precision, .. }
        | Operation::FlashAttention { precision, .. }
        | Operation::Add { precision, .. }
        | Operation::Mul { precision, .. }
        | Operation::Softmax { precision, .. }
        | Operation::Embedding { precision, .. }
        | Operation::RoPE { precision, .. }
        | Operation::KVCache { precision, .. }
        | Operation::AllReduce { precision, .. }
        | Operation::OptimizerStep { precision, .. } => *precision,
        // Backward inherits the forward pass's precision.
        Operation::Backward { forward_op } => forward_op.precision(),
        _ => Precision::Fp32, // Default for variants without a precision field.
    }
}
|
||||||
|
|
||||||
|
/// Estimates the floating-point operation count for this operation.
///
/// These are scheduling heuristics (order-of-magnitude estimates), not
/// exact instruction counts. Variants without a dedicated model fall
/// through to a nominal 1000 FLOPs.
pub fn estimated_flops(&self) -> f64 {
    match self {
        // MatMul: 2 * M * N * K (one multiply-add per inner-product step).
        Operation::MatMul { m, n, k, .. } => 2.0 * (*m as f64) * (*n as f64) * (*k as f64),

        // Conv2d: 2 * batch * out_channels * H * W * in_channels * K * K.
        // NOTE(review): this assumes stride 1 / "same" output size — the
        // variant's visible fields carry no stride, so confirm if strided
        // convs need a tighter estimate.
        Operation::Conv2d {
            batch,
            in_channels,
            out_channels,
            height,
            width,
            kernel_size,
            ..
        } => {
            2.0 * (*batch as f64)
                * (*out_channels as f64)
                * (*height as f64)
                * (*width as f64)
                * (*in_channels as f64)
                * (*kernel_size as f64)
                * (*kernel_size as f64)
        }

        // Attention: 4 * batch * seq^2 * head_dim * heads (QK^T plus the
        // attention-weighted V, each ~2 FLOPs per MAC). FlashAttention does
        // the same math, just with better memory locality.
        Operation::SelfAttention {
            batch,
            seq_len,
            num_heads,
            head_dim,
            ..
        }
        | Operation::FlashAttention {
            batch,
            seq_len,
            num_heads,
            head_dim,
            ..
        } => {
            4.0 * (*batch as f64)
                * (*seq_len as f64)
                * (*seq_len as f64)
                * (*head_dim as f64)
                * (*num_heads as f64)
        }

        // Element-wise ops: modeled as 1 FLOP per element (GeLU/SiLU cost
        // more per element in practice; treated as 1 for simplicity).
        Operation::Add { elements, .. }
        | Operation::Mul { elements, .. }
        | Operation::ReLU { elements }
        | Operation::GeLU { elements }
        | Operation::SiLU { elements } => *elements as f64,

        // Softmax: ~5 ops per element (exp, sum, div).
        // NOTE(review): this counts batch * seq_len only — if the variant
        // has a hidden/vocab dimension behind the `..`, the estimate omits
        // it; confirm against the enum definition.
        Operation::Softmax {
            batch, seq_len, ..
        } => 5.0 * (*batch as f64) * (*seq_len as f64),

        // Embedding: table lookup, minimal arithmetic (0.1 FLOP/element).
        Operation::Embedding {
            batch,
            seq_len,
            embed_dim,
            ..
        } => (*batch as f64) * (*seq_len as f64) * (*embed_dim as f64) * 0.1,

        // Backward pass: conventional ~2x-the-forward rule of thumb.
        Operation::Backward { forward_op } => forward_op.estimated_flops() * 2.0,

        // Generic carries an explicit FLOP count.
        Operation::Generic { flops, .. } => *flops,

        // I/O and other unmodeled operations: nominal compute cost.
        _ => 1000.0,
    }
}
|
||||||
|
|
||||||
|
/// Estimates memory usage (bytes).
|
||||||
|
pub fn estimated_memory(&self) -> u64 {
|
||||||
|
let precision_bytes = match self.precision() {
|
||||||
|
Precision::Fp64 => 8,
|
||||||
|
Precision::Fp32 => 4,
|
||||||
|
Precision::Fp16 | Precision::Bf16 => 2,
|
||||||
|
Precision::Int8 => 1,
|
||||||
|
Precision::Int4 => 1, // Rounded up
|
||||||
|
};
|
||||||
|
|
||||||
|
match self {
|
||||||
|
Operation::MatMul { m, n, k, .. } => {
|
||||||
|
// Input A (m×k) + Input B (k×n) + Output (m×n)
|
||||||
|
((*m * *k) + (*k * *n) + (*m * *n)) as u64 * precision_bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
Operation::SelfAttention {
|
||||||
|
batch,
|
||||||
|
seq_len,
|
||||||
|
num_heads,
|
||||||
|
head_dim,
|
||||||
|
..
|
||||||
|
} => {
|
||||||
|
// Q, K, V, Output, intermediate attention
|
||||||
|
5 * (*batch as u64)
|
||||||
|
* (*seq_len as u64)
|
||||||
|
* (*num_heads as u64)
|
||||||
|
* (*head_dim as u64)
|
||||||
|
* precision_bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
Operation::FlashAttention {
|
||||||
|
batch,
|
||||||
|
seq_len,
|
||||||
|
num_heads,
|
||||||
|
head_dim,
|
||||||
|
..
|
||||||
|
} => {
|
||||||
|
// FlashAttention uses much less memory
|
||||||
|
2 * (*batch as u64)
|
||||||
|
* (*seq_len as u64)
|
||||||
|
* (*num_heads as u64)
|
||||||
|
* (*head_dim as u64)
|
||||||
|
* precision_bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
Operation::KVCache {
|
||||||
|
batch,
|
||||||
|
seq_len,
|
||||||
|
num_heads,
|
||||||
|
head_dim,
|
||||||
|
..
|
||||||
|
} => {
|
||||||
|
// K and V caches
|
||||||
|
2 * (*batch as u64)
|
||||||
|
* (*seq_len as u64)
|
||||||
|
* (*num_heads as u64)
|
||||||
|
* (*head_dim as u64)
|
||||||
|
* precision_bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
Operation::Generic { memory, .. } => *memory,
|
||||||
|
|
||||||
|
_ => 1024 * 1024, // 1 MB default
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates the backward operation for this operation.
|
||||||
|
pub fn backward(&self) -> Option<Operation> {
|
||||||
|
match self {
|
||||||
|
Operation::MatMul { .. }
|
||||||
|
| Operation::Conv2d { .. }
|
||||||
|
| Operation::SelfAttention { .. }
|
||||||
|
| Operation::FlashAttention { .. }
|
||||||
|
| Operation::LayerNorm { .. }
|
||||||
|
| Operation::BatchNorm { .. } => Some(Operation::Backward {
|
||||||
|
forward_op: Box::new(self.clone()),
|
||||||
|
}),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A 1024³ FP32 matmul should estimate 2 * 1024³ ≈ 2.147e9 FLOPs.
    #[test]
    fn test_matmul_flops() {
        let op = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };

        let flops = op.estimated_flops();
        // 2 * 1024^3 = ~2.1 billion FLOPS
        assert!(flops > 2e9 && flops < 2.2e9);
    }

    /// FlashAttention's estimate (2 buffers) must come in below regular
    /// self-attention's (5 buffers) for identical dimensions.
    #[test]
    fn test_attention_memory() {
        let regular = Operation::SelfAttention {
            batch: 1,
            seq_len: 4096,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };

        let flash = Operation::FlashAttention {
            batch: 1,
            seq_len: 4096,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };

        // FlashAttention should use less memory
        assert!(flash.estimated_memory() < regular.estimated_memory());
    }

    /// `backward()` on a differentiable op wraps a clone of the forward op.
    #[test]
    fn test_backward_creation() {
        let forward = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };

        let backward = forward.backward();
        assert!(backward.is_some());

        if let Some(Operation::Backward { forward_op }) = backward {
            assert!(matches!(*forward_op, Operation::MatMul { .. }));
        }
    }
}
|
||||||
513
crates/synor-compute/src/processor/profiles.rs
Normal file
513
crates/synor-compute/src/processor/profiles.rs
Normal file
|
|
@ -0,0 +1,513 @@
|
||||||
|
//! Pre-defined processor profiles for common hardware.
|
||||||
|
|
||||||
|
use super::capabilities::{
|
||||||
|
ComputeThroughput, MemorySpecs, MemoryType, PowerCharacteristics, ProcessorCapabilities,
|
||||||
|
WorkloadCharacteristic,
|
||||||
|
};
|
||||||
|
use super::operation::OperationType;
|
||||||
|
use super::types::PowerTier;
|
||||||
|
use super::TpuVersion;
|
||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
|
/// Pre-defined processor profiles.
///
/// A namespace-only unit struct: every profile is an associated function
/// returning a fully-populated [`ProcessorCapabilities`] for a specific
/// piece of hardware. Throughput/bandwidth/TDP figures are vendor
/// spec-sheet numbers hard-coded here — treat them as nominal, not
/// measured.
pub struct ProcessorProfiles;

impl ProcessorProfiles {
    // ═══════════════════════════════════════════════════════════════
    // CPU PROFILES
    // ═══════════════════════════════════════════════════════════════

    /// Default CPU profile (8 cores @ 3.5 GHz, no AVX-512).
    pub fn cpu_default() -> ProcessorCapabilities {
        ProcessorCapabilities::cpu(8, 3.5, false)
    }

    /// AMD EPYC 9654 (96 cores).
    pub fn amd_epyc_9654() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 2.7,
                fp32_tflops: 5.4,
                fp16_tflops: 10.8,
                bf16_tflops: 10.8,
                int8_tops: 21.6,
                int4_tops: 43.2,
                sparsity_speedup: 1.0, // CPUs get no structured-sparsity boost.
            },
            memory: MemorySpecs {
                capacity_bytes: 6 * 1024 * 1024 * 1024 * 1024, // 6 TB max
                bandwidth_gbps: 460,
                type_: MemoryType::Ddr5,
            },
            // Reuse the generic CPU constructor's operation set only; the
            // throughput/memory/power figures above override its defaults.
            operations: ProcessorCapabilities::cpu(96, 2.4, false)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 360,
                efficiency: 0.85,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::MemoryBound,
            ],
        }
    }

    /// Intel Xeon w9-3595X (56 cores).
    pub fn intel_xeon_w9_3595x() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 3.2,
                fp32_tflops: 6.4,
                fp16_tflops: 12.8,
                bf16_tflops: 12.8,
                int8_tops: 25.6,
                int4_tops: 51.2,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 4 * 1024 * 1024 * 1024 * 1024, // 4 TB max
                bandwidth_gbps: 307,
                type_: MemoryType::Ddr5,
            },
            // Operation set from the generic CPU constructor (AVX-512 = true).
            operations: ProcessorCapabilities::cpu(56, 2.9, true)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 350,
                efficiency: 0.80,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::MemoryBound,
            ],
        }
    }

    /// Apple M3 Max CPU cores.
    pub fn apple_m3_max_cpu() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.3,
                fp32_tflops: 0.6,
                fp16_tflops: 1.2,
                bf16_tflops: 1.2,
                int8_tops: 2.4,
                int4_tops: 4.8,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 128 * 1024 * 1024 * 1024, // 128 GB unified
                bandwidth_gbps: 400,
                type_: MemoryType::Unified,
            },
            operations: ProcessorCapabilities::cpu(16, 4.0, false)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 40,
                efficiency: 0.95,
                power_tier: PowerTier::Low,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::LowPower,
            ],
        }
    }

    // ═══════════════════════════════════════════════════════════════
    // NVIDIA GPU PROFILES
    // ═══════════════════════════════════════════════════════════════

    /// Default NVIDIA GPU profile (mid-range Ampere-class card).
    pub fn nvidia_default() -> ProcessorCapabilities {
        ProcessorCapabilities::nvidia_gpu(8192, 256, 12, 600, (8, 0))
    }

    /// NVIDIA H100 SXM (80GB).
    pub fn nvidia_h100() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 67.0,
                fp32_tflops: 67.0,
                fp16_tflops: 1979.0, // With sparsity
                bf16_tflops: 1979.0,
                int8_tops: 3958.0,
                int4_tops: 7916.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 80 * 1024 * 1024 * 1024,
                bandwidth_gbps: 3350,
                type_: MemoryType::Hbm3,
            },
            // 16896 CUDA cores, 528 tensor cores, compute capability 9.0.
            operations: ProcessorCapabilities::nvidia_gpu(16896, 528, 80, 3350, (9, 0))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 700,
                efficiency: 0.90,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::ComputeBound,
            ],
        }
    }

    /// NVIDIA A100 (80GB).
    pub fn nvidia_a100() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 19.5,
                fp32_tflops: 19.5,
                fp16_tflops: 624.0, // With sparsity
                bf16_tflops: 624.0,
                int8_tops: 1248.0,
                int4_tops: 2496.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 80 * 1024 * 1024 * 1024,
                bandwidth_gbps: 2039,
                type_: MemoryType::Hbm2e,
            },
            operations: ProcessorCapabilities::nvidia_gpu(6912, 432, 80, 2039, (8, 0))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 400,
                efficiency: 0.88,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::ComputeBound,
            ],
        }
    }

    /// NVIDIA RTX 4090.
    pub fn nvidia_rtx_4090() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 1.3, // Consumer card: FP64 at 1/64 rate.
                fp32_tflops: 82.6,
                fp16_tflops: 330.4, // With sparsity
                bf16_tflops: 330.4,
                int8_tops: 660.8,
                int4_tops: 1321.6,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 24 * 1024 * 1024 * 1024,
                bandwidth_gbps: 1008,
                type_: MemoryType::Gddr6,
            },
            operations: ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1008, (8, 9))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 450,
                efficiency: 0.85,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
            ],
        }
    }

    /// NVIDIA RTX 3090.
    pub fn nvidia_rtx_3090() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.6,
                fp32_tflops: 35.6,
                fp16_tflops: 71.2,
                bf16_tflops: 71.2,
                int8_tops: 142.4,
                int4_tops: 284.8,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 24 * 1024 * 1024 * 1024,
                bandwidth_gbps: 936,
                type_: MemoryType::Gddr6,
            },
            operations: ProcessorCapabilities::nvidia_gpu(10496, 328, 24, 936, (8, 6))
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 350,
                efficiency: 0.82,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
            ],
        }
    }

    // ═══════════════════════════════════════════════════════════════
    // AMD GPU PROFILES
    // ═══════════════════════════════════════════════════════════════

    /// AMD MI300X.
    pub fn amd_mi300x() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 163.4,
                fp32_tflops: 163.4,
                fp16_tflops: 1307.0,
                bf16_tflops: 1307.0,
                int8_tops: 2614.0,
                int4_tops: 5228.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 192 * 1024 * 1024 * 1024, // 192 GB HBM3
                bandwidth_gbps: 5300,
                type_: MemoryType::Hbm3,
            },
            // NOTE(review): no AMD-specific constructor exists yet, so this
            // borrows the NVIDIA constructor's operation set and strips
            // FlashAttention (different implementation on ROCm). Confirm an
            // `amd_gpu` constructor isn't warranted instead.
            operations: {
                let mut ops = ProcessorCapabilities::nvidia_gpu(16384, 512, 80, 5300, (9, 0))
                    .operations;
                ops.remove(&OperationType::FlashAttention); // Different implementation
                ops
            },
            power: PowerCharacteristics {
                tdp_watts: 750,
                efficiency: 0.88,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::MemoryBound, // High memory bandwidth
            ],
        }
    }

    /// AMD RX 7900 XTX.
    pub fn amd_rx_7900_xtx() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 1.9,
                fp32_tflops: 61.0,
                fp16_tflops: 122.0,
                bf16_tflops: 122.0,
                int8_tops: 244.0,
                int4_tops: 488.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 24 * 1024 * 1024 * 1024,
                bandwidth_gbps: 960,
                type_: MemoryType::Gddr6,
            },
            // Same borrow-from-NVIDIA hack as `amd_mi300x`; tensor-core
            // count is passed as 0 since RDNA3 has no tensor cores.
            operations: {
                let mut ops = ProcessorCapabilities::nvidia_gpu(6144, 0, 24, 960, (8, 0))
                    .operations;
                ops.remove(&OperationType::FlashAttention);
                ops
            },
            power: PowerCharacteristics {
                tdp_watts: 355,
                efficiency: 0.80,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
            ],
        }
    }

    // ═══════════════════════════════════════════════════════════════
    // GOOGLE TPU PROFILES
    // ═══════════════════════════════════════════════════════════════

    /// Default TPU profile (v4).
    pub fn google_tpu_default() -> ProcessorCapabilities {
        ProcessorCapabilities::tpu(TpuVersion::V4)
    }

    /// Google TPU v5p.
    pub fn google_tpu_v5p() -> ProcessorCapabilities {
        ProcessorCapabilities::tpu(TpuVersion::V5p)
    }

    /// Google TPU v4.
    pub fn google_tpu_v4() -> ProcessorCapabilities {
        ProcessorCapabilities::tpu(TpuVersion::V4)
    }

    /// Google Edge TPU (int8-only inference accelerator).
    pub fn google_edge_tpu() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                // Edge TPU has no floating-point datapath.
                fp64_tflops: 0.0,
                fp32_tflops: 0.0,
                fp16_tflops: 0.0,
                bf16_tflops: 0.0,
                int8_tops: 4.0,
                int4_tops: 8.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses host memory
                bandwidth_gbps: 0,
                type_: MemoryType::Unified,
            },
            // Hand-built minimal operation set: the Edge TPU only runs a
            // small fixed menu of quantized inference ops.
            operations: {
                let mut ops = HashSet::new();
                ops.insert(OperationType::MatMul);
                ops.insert(OperationType::Conv2d);
                ops.insert(OperationType::DepthwiseConv);
                ops.insert(OperationType::Add);
                ops.insert(OperationType::Mul);
                ops.insert(OperationType::ReLU);
                ops.insert(OperationType::Softmax);
                ops
            },
            power: PowerCharacteristics {
                tdp_watts: 2,
                efficiency: 0.95,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }

    // ═══════════════════════════════════════════════════════════════
    // GROQ LPU PROFILE
    // ═══════════════════════════════════════════════════════════════

    /// Groq LPU.
    pub fn groq_lpu() -> ProcessorCapabilities {
        ProcessorCapabilities::lpu()
    }

    // ═══════════════════════════════════════════════════════════════
    // APPLE NEURAL ENGINE PROFILES
    // ═══════════════════════════════════════════════════════════════

    /// Apple Neural Engine (generic, parameterized by core count).
    pub fn apple_neural_engine(cores: u32) -> ProcessorCapabilities {
        ProcessorCapabilities::apple_neural_engine(cores)
    }

    /// Apple M3 Neural Engine (16 cores).
    pub fn apple_m3_neural_engine() -> ProcessorCapabilities {
        ProcessorCapabilities::apple_neural_engine(16)
    }

    /// Apple M3 Max Neural Engine (16 cores).
    pub fn apple_m3_max_neural_engine() -> ProcessorCapabilities {
        ProcessorCapabilities::apple_neural_engine(16) // Same as M3
    }

    /// Apple A17 Pro Neural Engine (35 TOPS).
    pub fn apple_a17_pro_neural_engine() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: 4.4,
                fp16_tflops: 8.8,
                bf16_tflops: 8.8,
                int8_tops: 35.0,
                int4_tops: 70.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses unified memory
                bandwidth_gbps: 200,
                type_: MemoryType::Unified,
            },
            operations: ProcessorCapabilities::apple_neural_engine(16)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 8,
                efficiency: 0.98,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }

    // ═══════════════════════════════════════════════════════════════
    // QUALCOMM NPU PROFILES
    // ═══════════════════════════════════════════════════════════════

    /// Qualcomm Hexagon NPU (Snapdragon 8 Gen 3).
    pub fn qualcomm_hexagon_8g3() -> ProcessorCapabilities {
        ProcessorCapabilities {
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: 3.0,
                fp16_tflops: 6.0,
                bf16_tflops: 6.0,
                int8_tops: 73.0, // 73 TOPS
                int4_tops: 146.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                capacity_bytes: 0, // Uses system memory
                bandwidth_gbps: 77,
                type_: MemoryType::Lpddr,
            },
            // NOTE(review): borrows the Apple Neural Engine operation set as
            // a stand-in — presumably the supported mobile-NPU op menus are
            // similar; verify against Hexagon's actual capabilities.
            operations: ProcessorCapabilities::apple_neural_engine(16)
                .operations,
            power: PowerCharacteristics {
                tdp_watts: 10,
                efficiency: 0.95,
                power_tier: PowerTier::UltraLow,
            },
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// H100 sanity: >1 PFLOP/s FP16 and exactly 80 GB of HBM.
    #[test]
    fn test_h100_profile() {
        let h100 = ProcessorProfiles::nvidia_h100();
        assert!(h100.compute.fp16_tflops > 1000.0);
        assert_eq!(h100.memory.capacity_bytes, 80 * 1024 * 1024 * 1024);
    }

    /// TPU v5p sanity: >900 TFLOP/s BF16.
    #[test]
    fn test_tpu_v5p_profile() {
        let tpu = ProcessorProfiles::google_tpu_v5p();
        assert!(tpu.compute.bf16_tflops > 900.0);
    }

    /// Groq LPU sanity: SRAM-based design should report enormous bandwidth.
    #[test]
    fn test_groq_lpu_profile() {
        let lpu = ProcessorProfiles::groq_lpu();
        assert!(lpu.memory.bandwidth_gbps > 50000); // Very high internal bandwidth
    }

    /// ANE sanity: low TDP and flagged as a low-power target.
    #[test]
    fn test_apple_ane_profile() {
        let ane = ProcessorProfiles::apple_m3_neural_engine();
        assert!(ane.power.tdp_watts < 20);
        assert!(ane.optimal_for.contains(&WorkloadCharacteristic::LowPower))
    }
}
|
||||||
367
crates/synor-compute/src/processor/types.rs
Normal file
367
crates/synor-compute/src/processor/types.rs
Normal file
|
|
@ -0,0 +1,367 @@
|
||||||
|
//! Processor type definitions.
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
/// All supported processor types.
///
/// Variants carry just enough vendor/architecture detail to drive
/// capability lookups and scheduling decisions; fine-grained specs live in
/// `ProcessorCapabilities`.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ProcessorType {
    /// Central Processing Unit.
    Cpu(CpuVariant),
    /// Graphics Processing Unit.
    Gpu(GpuVariant),
    /// Tensor Processing Unit (Google).
    Tpu(TpuVersion),
    /// Neural Processing Unit (various vendors).
    Npu(NpuVariant),
    /// Language Processing Unit (Groq).
    Lpu,
    /// Field Programmable Gate Array.
    Fpga(FpgaVendor),
    /// Digital Signal Processor.
    Dsp(DspVariant),
    /// WebGPU (browser).
    WebGpu,
    /// WebAssembly runtime.
    Wasm,
    /// Custom/Unknown accelerator, identified only by free-form strings.
    Custom {
        vendor: String,
        model: String,
    },
}

impl Default for ProcessorType {
    /// Defaults to a CPU with the default variant (x86-64 + AVX2), the
    /// lowest-common-denominator target.
    fn default() -> Self {
        ProcessorType::Cpu(CpuVariant::default())
    }
}
|
||||||
|
|
||||||
|
/// CPU architecture variants, each carrying its SIMD capability flag.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum CpuVariant {
    /// x86-64 architecture, with its AVX support level.
    X86_64 { avx: AvxSupport },
    /// ARM 64-bit architecture; `sve` flags Scalable Vector Extension support.
    Arm64 { sve: bool },
    /// RISC-V architecture; `vector` flags the V vector extension.
    RiscV { vector: bool },
}

impl Default for CpuVariant {
    /// x86-64 with AVX2 — a safe baseline for virtually all server CPUs
    /// since Haswell (2013).
    fn default() -> Self {
        CpuVariant::X86_64 {
            avx: AvxSupport::Avx2,
        }
    }
}
|
||||||
|
|
||||||
|
/// AVX instruction set support levels.
///
/// Derives `PartialOrd`/`Ord` so levels can be compared directly
/// (declaration order is capability order: `None < Avx < … < Avx10`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AvxSupport {
    /// No AVX.
    None,
    /// AVX (Sandy Bridge+).
    Avx,
    /// AVX2 (Haswell+).
    Avx2,
    /// AVX-512 (Skylake-X+).
    Avx512,
    /// AVX10 (future).
    Avx10,
}
|
||||||
|
|
||||||
|
/// GPU vendor variants, each carrying the vendor's own generation
/// identifier where one is meaningful.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum GpuVariant {
    /// NVIDIA CUDA GPU.
    NvidiaCuda {
        /// Compute capability (major, minor), e.g. (9, 0) for Hopper.
        compute_capability: (u8, u8),
    },
    /// AMD ROCm GPU.
    AmdRocm {
        /// GFX version (e.g., 1100 for RDNA3).
        gfx_version: u32,
    },
    /// Intel OneAPI GPU.
    IntelOneApi,
    /// Apple Metal GPU.
    AppleMetal,
    /// Qualcomm Adreno GPU.
    QualcommAdreno {
        /// Adreno model number.
        model: u32,
    },
    /// ARM Mali GPU.
    ArmMali {
        /// Mali generation (e.g., G710).
        model: u32,
    },
    /// IMG PowerVR GPU.
    ImgPowerVr,
}
|
||||||
|
|
||||||
|
/// Google TPU versions.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum TpuVersion {
    /// TPU v2.
    V2,
    /// TPU v3.
    V3,
    /// TPU v4.
    V4,
    /// TPU v4i (inference-optimized).
    V4i,
    /// TPU v5e (efficiency).
    V5e,
    /// TPU v5p (performance).
    V5p,
    /// Edge TPU (embedded, int8 inference only).
    Edge,
}
|
||||||
|
|
||||||
|
/// NPU (Neural Processing Unit) variants.
///
/// Covers on-device inference accelerators; the `Custom` variant describes
/// unknown hardware by raw throughput alone.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum NpuVariant {
    /// Apple Neural Engine.
    AppleNeuralEngine {
        /// Number of cores.
        cores: u32,
    },
    /// Qualcomm Hexagon DSP/NPU.
    QualcommHexagon {
        /// Version number.
        version: u32,
    },
    /// Intel VPU (Movidius).
    IntelVpu,
    /// Huawei Ascend.
    HuaweiAscend {
        /// Model (310, 910, etc.).
        model: u32,
    },
    /// Google Edge TPU.
    GoogleEdgeTpu,
    /// Samsung NPU.
    SamsungNpu,
    /// MediaTek APU.
    MediaTekApu {
        /// Version.
        version: u32,
    },
    /// Custom NPU.
    Custom {
        /// TOPS (Tera Operations Per Second).
        tops: u32,
    },
}
|
||||||
|
|
||||||
|
/// FPGA vendors.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FpgaVendor {
    /// Xilinx (AMD).
    Xilinx,
    /// Intel (Altera).
    Intel,
    /// Lattice.
    Lattice,
    /// Microchip.
    Microchip,
}
|
||||||
|
|
||||||
|
/// DSP (Digital Signal Processor) variants.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DspVariant {
    /// Texas Instruments DSP.
    TexasInstruments,
    /// Analog Devices DSP.
    AnalogDevices,
    /// Qualcomm Hexagon DSP.
    QualcommHexagon,
    /// Custom DSP.
    Custom,
}
|
||||||
|
|
||||||
|
impl ProcessorType {
    /// Returns whether this processor type supports CUDA.
    pub fn supports_cuda(&self) -> bool {
        matches!(self, ProcessorType::Gpu(GpuVariant::NvidiaCuda { .. }))
    }

    /// Returns whether this processor type supports ROCm.
    pub fn supports_rocm(&self) -> bool {
        matches!(self, ProcessorType::Gpu(GpuVariant::AmdRocm { .. }))
    }

    /// Returns whether this processor type supports Metal.
    pub fn supports_metal(&self) -> bool {
        matches!(self, ProcessorType::Gpu(GpuVariant::AppleMetal))
    }

    /// Returns whether this processor type is a GPU (any vendor).
    pub fn is_gpu(&self) -> bool {
        matches!(self, ProcessorType::Gpu(_))
    }

    /// Returns whether this processor type is a CPU (any architecture).
    pub fn is_cpu(&self) -> bool {
        matches!(self, ProcessorType::Cpu(_))
    }

    /// Returns whether this processor type is suitable for parallel workloads.
    /// Covers GPUs, TPUs and FPGAs; NPUs and LPUs are deliberately excluded.
    pub fn is_parallel(&self) -> bool {
        matches!(
            self,
            ProcessorType::Gpu(_) | ProcessorType::Tpu(_) | ProcessorType::Fpga(_)
        )
    }

    /// Returns whether this processor type is suitable for sequential workloads.
    pub fn is_sequential(&self) -> bool {
        matches!(self, ProcessorType::Cpu(_) | ProcessorType::Lpu)
    }

    /// Returns whether this processor type is power-efficient.
    ///
    /// NOTE(review): this predicate does not line up exactly with
    /// `power_tier` — DSPs and WebGPU are tiered `Low` there but return
    /// `false` here. Confirm whether that divergence is intentional.
    pub fn is_low_power(&self) -> bool {
        matches!(
            self,
            ProcessorType::Npu(_) | ProcessorType::Tpu(TpuVersion::Edge) | ProcessorType::Wasm
        )
    }

    /// Returns the typical power consumption tier.
    ///
    /// Buckets are coarse heuristics keyed off vendor/generation only;
    /// per-device TDP lives in `PowerCharacteristics`.
    pub fn power_tier(&self) -> PowerTier {
        match self {
            ProcessorType::Npu(_) | ProcessorType::Wasm => PowerTier::UltraLow,
            ProcessorType::Cpu(CpuVariant::Arm64 { .. }) => PowerTier::Low,
            ProcessorType::Cpu(_) => PowerTier::Medium,
            ProcessorType::Gpu(GpuVariant::AppleMetal) => PowerTier::Medium,
            // Ampere (8.x) and newer datacenter/consumer NVIDIA cards pull
            // high power; older compute capabilities fall through to Medium.
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability })
                if compute_capability.0 >= 8 =>
            {
                PowerTier::High
            }
            ProcessorType::Gpu(_) => PowerTier::Medium,
            ProcessorType::Tpu(TpuVersion::Edge) => PowerTier::UltraLow,
            ProcessorType::Tpu(_) => PowerTier::High,
            ProcessorType::Lpu => PowerTier::Medium,
            ProcessorType::Fpga(_) => PowerTier::Medium,
            ProcessorType::Dsp(_) => PowerTier::Low,
            ProcessorType::WebGpu => PowerTier::Low,
            ProcessorType::Custom { .. } => PowerTier::Medium,
        }
    }
}
|
||||||
|
|
||||||
|
/// Power consumption tiers.
///
/// Variants are declared lowest-to-highest draw, so the derived
/// `PartialOrd`/`Ord` give `UltraLow < Low < Medium < High`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PowerTier {
    /// < 5W (mobile, IoT).
    UltraLow,
    /// 5-30W (laptop, tablet).
    Low,
    /// 30-150W (desktop, workstation).
    Medium,
    /// > 150W (server, data center).
    High,
}
|
||||||
|
|
||||||
|
/// Device class for routing decisions.
///
/// Coarse hardware category used to estimate availability and
/// reliability when scheduling work onto a device.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DeviceClass {
    /// Data center equipment.
    DataCenter,
    /// Desktop/workstation.
    Desktop,
    /// Laptop.
    Laptop,
    /// Mobile phone.
    Mobile,
    /// Tablet.
    Tablet,
    /// IoT device.
    IoT,
    /// Browser (WebGPU/WASM).
    Browser,
    /// Edge server.
    Edge,
}
|
||||||
|
|
||||||
|
impl DeviceClass {
|
||||||
|
/// Returns typical available compute hours per day.
|
||||||
|
pub fn typical_availability_hours(&self) -> f32 {
|
||||||
|
match self {
|
||||||
|
DeviceClass::DataCenter => 24.0,
|
||||||
|
DeviceClass::Desktop => 8.0,
|
||||||
|
DeviceClass::Laptop => 6.0,
|
||||||
|
DeviceClass::Mobile => 4.0,
|
||||||
|
DeviceClass::Tablet => 4.0,
|
||||||
|
DeviceClass::IoT => 24.0,
|
||||||
|
DeviceClass::Browser => 2.0,
|
||||||
|
DeviceClass::Edge => 24.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns reliability score (0-100).
|
||||||
|
pub fn reliability_score(&self) -> u32 {
|
||||||
|
match self {
|
||||||
|
DeviceClass::DataCenter => 99,
|
||||||
|
DeviceClass::Edge => 95,
|
||||||
|
DeviceClass::Desktop => 80,
|
||||||
|
DeviceClass::Laptop => 60,
|
||||||
|
DeviceClass::Mobile => 40,
|
||||||
|
DeviceClass::Tablet => 50,
|
||||||
|
DeviceClass::IoT => 70,
|
||||||
|
DeviceClass::Browser => 30,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // Capability predicates on representative hardware: CUDA GPU, AVX-512
    // x86 CPU, Groq-style LPU, and Apple NPU.
    #[test]
    fn test_processor_type_properties() {
        let nvidia = ProcessorType::Gpu(GpuVariant::NvidiaCuda {
            compute_capability: (9, 0),
        });
        assert!(nvidia.supports_cuda());
        assert!(nvidia.is_gpu());
        assert!(nvidia.is_parallel());

        let cpu = ProcessorType::Cpu(CpuVariant::X86_64 {
            avx: AvxSupport::Avx512,
        });
        assert!(cpu.is_cpu());
        assert!(cpu.is_sequential());

        let lpu = ProcessorType::Lpu;
        assert!(lpu.is_sequential());

        let npu = ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 });
        assert!(npu.is_low_power());
    }

    // Power-tier mapping: H100-class GPU (cc 9.0) is High, Apple NPU is
    // UltraLow, ARM64 CPU is Low.
    #[test]
    fn test_power_tiers() {
        let h100 = ProcessorType::Gpu(GpuVariant::NvidiaCuda {
            compute_capability: (9, 0),
        });
        assert_eq!(h100.power_tier(), PowerTier::High);

        let npu = ProcessorType::Npu(NpuVariant::AppleNeuralEngine { cores: 16 });
        assert_eq!(npu.power_tier(), PowerTier::UltraLow);

        let arm = ProcessorType::Cpu(CpuVariant::Arm64 { sve: false });
        assert_eq!(arm.power_tier(), PowerTier::Low);
    }

    // Spot-check availability hours and reliability scores at both extremes.
    #[test]
    fn test_device_class() {
        assert_eq!(DeviceClass::DataCenter.typical_availability_hours(), 24.0);
        assert_eq!(DeviceClass::Mobile.typical_availability_hours(), 4.0);
        assert_eq!(DeviceClass::DataCenter.reliability_score(), 99);
        assert_eq!(DeviceClass::Browser.reliability_score(), 30);
    }
}
|
||||||
810
crates/synor-compute/src/scheduler/load_balancer.rs
Normal file
810
crates/synor-compute/src/scheduler/load_balancer.rs
Normal file
|
|
@ -0,0 +1,810 @@
|
||||||
|
//! Load balancer with work stealing for heterogeneous compute.
|
||||||
|
//!
|
||||||
|
//! Supports:
|
||||||
|
//! - Cross-processor-type work migration
|
||||||
|
//! - Energy-aware balancing
|
||||||
|
//! - Latency-aware scheduling
|
||||||
|
//! - Real-time utilization metrics
|
||||||
|
|
||||||
|
use crate::device::{DeviceInfo, DeviceRegistry};
|
||||||
|
use crate::processor::{Operation, OperationType, ProcessorId, ProcessorType};
|
||||||
|
use crate::task::{Task, TaskId, TaskPriority};
|
||||||
|
use super::TaskAssignment;
|
||||||
|
use parking_lot::RwLock;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
|
/// Balancing strategy for the load balancer.
///
/// Selects which scoring formula `LoadBalancer::calculate_score` uses
/// when ranking candidate processors for a task.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BalancingStrategy {
    /// Optimize for speed (minimize execution time).
    Speed,
    /// Optimize for energy efficiency.
    Energy,
    /// Balance speed and energy. This is the default.
    Balanced,
    /// Optimize for cost (spot pricing).
    Cost,
    /// Optimize for latency (inference workloads).
    Latency,
}
|
||||||
|
|
||||||
|
impl Default for BalancingStrategy {
|
||||||
|
fn default() -> Self {
|
||||||
|
BalancingStrategy::Balanced
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Real-time processor metrics.
///
/// `Default` yields all-zero readings with `last_updated: None`,
/// meaning "no telemetry has been reported yet".
#[derive(Clone, Debug, Default)]
pub struct ProcessorMetrics {
    /// Current utilization (0.0 - 1.0).
    pub utilization: f64,
    /// Queue depth (pending tasks).
    pub queue_depth: u64,
    /// Average task completion time (ms).
    pub avg_completion_ms: f64,
    /// Tasks completed in last minute.
    pub throughput_per_min: u64,
    /// Current power draw (watts).
    pub power_watts: f64,
    /// Temperature (celsius).
    pub temperature: f64,
    /// Last updated timestamp; `None` until the first report via
    /// `LoadBalancer::update_metrics`.
    pub last_updated: Option<Instant>,
}
|
||||||
|
|
||||||
|
/// Load balancer for heterogeneous compute environments.
///
/// Tracks per-processor load counters and telemetry, scores candidate
/// processors according to the active [`BalancingStrategy`], and decides
/// when tasks should migrate or be stolen between processors.
pub struct LoadBalancer {
    /// Device registry for processor info.
    /// NOTE(review): stored by the constructors but not consulted in this
    /// file — confirm intended use before relying on it.
    device_registry: Option<Arc<DeviceRegistry>>,
    /// Current load per processor (task count).
    loads: RwLock<HashMap<ProcessorId, AtomicU64>>,
    /// Real-time metrics per processor.
    metrics: RwLock<HashMap<ProcessorId, ProcessorMetrics>>,
    /// Processor type mapping.
    processor_types: RwLock<HashMap<ProcessorId, ProcessorType>>,
    /// Work stealing threshold (0.0 - 1.0): relative load gap that must be
    /// exceeded before `should_steal` returns true. Fixed at construction.
    steal_threshold: f64,
    /// Rebalance threshold (0.0 - 1.0): hysteresis margin a competing
    /// processor must beat to win a migration. Fixed at construction.
    rebalance_threshold: f64,
    /// Current balancing strategy.
    strategy: RwLock<BalancingStrategy>,
    /// Migration history (to prevent thrashing). Grows unbounded until
    /// `cleanup_history` is called.
    migration_history: RwLock<Vec<MigrationRecord>>,
}
|
||||||
|
|
||||||
|
/// Record of a task migration.
#[derive(Clone, Debug)]
struct MigrationRecord {
    /// Task that was migrated.
    task_id: TaskId,
    /// Processor the scheduler originally suggested.
    from: ProcessorId,
    /// Processor the task was moved to.
    to: ProcessorId,
    /// When the migration decision was made.
    timestamp: Instant,
}
|
||||||
|
|
||||||
|
impl LoadBalancer {
|
||||||
|
/// Creates a new load balancer.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
device_registry: None,
|
||||||
|
loads: RwLock::new(HashMap::new()),
|
||||||
|
metrics: RwLock::new(HashMap::new()),
|
||||||
|
processor_types: RwLock::new(HashMap::new()),
|
||||||
|
steal_threshold: 0.3,
|
||||||
|
rebalance_threshold: 0.2,
|
||||||
|
strategy: RwLock::new(BalancingStrategy::default()),
|
||||||
|
migration_history: RwLock::new(Vec::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a load balancer with device registry.
|
||||||
|
pub fn with_registry(device_registry: Arc<DeviceRegistry>) -> Self {
|
||||||
|
Self {
|
||||||
|
device_registry: Some(device_registry),
|
||||||
|
loads: RwLock::new(HashMap::new()),
|
||||||
|
metrics: RwLock::new(HashMap::new()),
|
||||||
|
processor_types: RwLock::new(HashMap::new()),
|
||||||
|
steal_threshold: 0.3,
|
||||||
|
rebalance_threshold: 0.2,
|
||||||
|
strategy: RwLock::new(BalancingStrategy::default()),
|
||||||
|
migration_history: RwLock::new(Vec::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sets the balancing strategy.
|
||||||
|
pub fn set_strategy(&self, strategy: BalancingStrategy) {
|
||||||
|
*self.strategy.write() = strategy;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the current strategy.
|
||||||
|
pub fn strategy(&self) -> BalancingStrategy {
|
||||||
|
*self.strategy.read()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Register a processor with its type.
|
||||||
|
pub fn register_processor(&self, processor_id: ProcessorId, processor_type: ProcessorType) {
|
||||||
|
self.loads.write().insert(processor_id, AtomicU64::new(0));
|
||||||
|
self.metrics.write().insert(processor_id, ProcessorMetrics::default());
|
||||||
|
self.processor_types.write().insert(processor_id, processor_type);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unregister a processor.
|
||||||
|
pub fn unregister_processor(&self, processor_id: ProcessorId) {
|
||||||
|
self.loads.write().remove(&processor_id);
|
||||||
|
self.metrics.write().remove(&processor_id);
|
||||||
|
self.processor_types.write().remove(&processor_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Update real-time metrics for a processor.
|
||||||
|
pub fn update_metrics(&self, processor_id: ProcessorId, metrics: ProcessorMetrics) {
|
||||||
|
if let Some(existing) = self.metrics.write().get_mut(&processor_id) {
|
||||||
|
*existing = ProcessorMetrics {
|
||||||
|
last_updated: Some(Instant::now()),
|
||||||
|
..metrics
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get current load for a processor.
|
||||||
|
pub fn get_load(&self, processor_id: ProcessorId) -> u64 {
|
||||||
|
self.loads.read()
|
||||||
|
.get(&processor_id)
|
||||||
|
.map(|l| l.load(Ordering::Relaxed))
|
||||||
|
.unwrap_or(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Increment load for a processor.
|
||||||
|
pub fn increment_load(&self, processor_id: ProcessorId) {
|
||||||
|
if let Some(load) = self.loads.read().get(&processor_id) {
|
||||||
|
load.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decrement load for a processor.
|
||||||
|
pub fn decrement_load(&self, processor_id: ProcessorId) {
|
||||||
|
if let Some(load) = self.loads.read().get(&processor_id) {
|
||||||
|
load.fetch_sub(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
    /// Check if an operation can run on a processor type.
    ///
    /// Each arm is a conservative allowlist of [`OperationType`]s the
    /// processor family is assumed to handle; FPGAs (reprogrammable) and
    /// custom processors accept everything. Used by `calculate_score` to
    /// veto incompatible assignments.
    pub fn can_execute(&self, op: &Operation, processor_type: &ProcessorType) -> bool {
        let op_type = op.op_type();

        match processor_type {
            // CPUs can handle most sequential operations
            ProcessorType::Cpu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::Conv3d
                    | OperationType::DepthwiseConv
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Max
                    | OperationType::ArgMax
                    | OperationType::Embedding
                    | OperationType::TopK
                    | OperationType::Sampling
                    | OperationType::Tokenization
                    | OperationType::Detokenization
                    | OperationType::DataLoad
                    | OperationType::DataPreprocess
                    | OperationType::Transpose
                    | OperationType::Reshape
                    | OperationType::Concat
                    | OperationType::Split
            ),

            // GPUs excel at parallel operations (incl. attention, collectives,
            // and training steps)
            ProcessorType::Gpu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::Conv3d
                    | OperationType::DepthwiseConv
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::CrossAttention
                    | OperationType::FlashAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Max
                    | OperationType::ArgMax
                    | OperationType::Embedding
                    | OperationType::RoPE
                    | OperationType::KVCache
                    | OperationType::TopK
                    | OperationType::Sampling
                    | OperationType::Transpose
                    | OperationType::Reshape
                    | OperationType::Concat
                    | OperationType::Split
                    | OperationType::Gather
                    | OperationType::Scatter
                    | OperationType::AllReduce
                    | OperationType::AllGather
                    | OperationType::ReduceScatter
                    | OperationType::Backward
                    | OperationType::OptimizerStep
                    | OperationType::GradientClip
            ),

            // TPUs optimized for ML
            ProcessorType::Tpu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::CrossAttention
                    | OperationType::FlashAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Embedding
                    | OperationType::RoPE
                    | OperationType::KVCache
                    | OperationType::AllReduce
                    | OperationType::AllGather
                    | OperationType::ReduceScatter
                    | OperationType::Backward
                    | OperationType::OptimizerStep
            ),

            // NPUs for neural network inference
            ProcessorType::Npu(_) => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::DepthwiseConv
                    | OperationType::BatchNorm
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
            ),

            // LPUs for sequential inference (optimized for LLMs)
            ProcessorType::Lpu => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::LayerNorm
                    | OperationType::SelfAttention
                    | OperationType::FlashAttention
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::GeLU
                    | OperationType::SiLU
                    | OperationType::Softmax
                    | OperationType::Embedding
                    | OperationType::RoPE
                    | OperationType::KVCache
                    | OperationType::TopK
                    | OperationType::Sampling
            ),

            // FPGAs can be programmed for anything
            ProcessorType::Fpga(_) => true,

            // DSPs for signal processing
            ProcessorType::Dsp(_) => matches!(
                op_type,
                OperationType::Conv2d
                    | OperationType::DepthwiseConv
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Max
            ),

            // WebGPU has limited operations
            ProcessorType::WebGpu => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Conv2d
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Transpose
                    | OperationType::Reshape
            ),

            // WASM for portable compute
            ProcessorType::Wasm => matches!(
                op_type,
                OperationType::MatMul
                    | OperationType::Add
                    | OperationType::Mul
                    | OperationType::ReLU
                    | OperationType::Softmax
                    | OperationType::Sum
                    | OperationType::Mean
                    | OperationType::Tokenization
                    | OperationType::Detokenization
            ),

            // Custom processors - assume they can handle anything
            ProcessorType::Custom { .. } => true,
        }
    }
|
||||||
|
|
||||||
|
/// Calculate a score for assigning a task to a processor.
|
||||||
|
fn calculate_score(
|
||||||
|
&self,
|
||||||
|
task: &Task,
|
||||||
|
processor_id: ProcessorId,
|
||||||
|
processor_type: &ProcessorType,
|
||||||
|
) -> f64 {
|
||||||
|
let strategy = *self.strategy.read();
|
||||||
|
let load = self.get_load(processor_id);
|
||||||
|
let metrics = self.metrics.read();
|
||||||
|
let proc_metrics = metrics.get(&processor_id);
|
||||||
|
|
||||||
|
// Base score from compatibility
|
||||||
|
if !self.can_execute(&task.operation, processor_type) {
|
||||||
|
return f64::NEG_INFINITY;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get utilization and metrics
|
||||||
|
let utilization = proc_metrics.map(|m| m.utilization).unwrap_or(load as f64 / 100.0);
|
||||||
|
let power = proc_metrics.map(|m| m.power_watts).unwrap_or(100.0);
|
||||||
|
let avg_completion = proc_metrics.map(|m| m.avg_completion_ms).unwrap_or(100.0);
|
||||||
|
|
||||||
|
// Calculate score based on strategy
|
||||||
|
match strategy {
|
||||||
|
BalancingStrategy::Speed => {
|
||||||
|
// Prioritize low utilization and fast completion
|
||||||
|
let speed_score = 1.0 / (avg_completion.max(1.0) * (1.0 + utilization));
|
||||||
|
|
||||||
|
// Bonus for powerful processor types
|
||||||
|
let type_bonus = match processor_type {
|
||||||
|
ProcessorType::Gpu(_) => 2.0,
|
||||||
|
ProcessorType::Tpu(_) => 2.5,
|
||||||
|
ProcessorType::Lpu => 3.0, // Fastest for inference
|
||||||
|
ProcessorType::Npu(_) => 1.5,
|
||||||
|
_ => 1.0,
|
||||||
|
};
|
||||||
|
|
||||||
|
speed_score * type_bonus
|
||||||
|
}
|
||||||
|
|
||||||
|
BalancingStrategy::Energy => {
|
||||||
|
// Prioritize low power consumption
|
||||||
|
let energy_score = 1.0 / power.max(1.0);
|
||||||
|
|
||||||
|
// Bonus for efficient processor types
|
||||||
|
let efficiency_bonus = match processor_type {
|
||||||
|
ProcessorType::Npu(_) => 3.0, // Most efficient
|
||||||
|
ProcessorType::Lpu => 2.0,
|
||||||
|
ProcessorType::Cpu(_) => 1.5,
|
||||||
|
ProcessorType::Wasm => 2.0, // Low overhead
|
||||||
|
_ => 1.0,
|
||||||
|
};
|
||||||
|
|
||||||
|
energy_score * efficiency_bonus * (1.0 - utilization * 0.5)
|
||||||
|
}
|
||||||
|
|
||||||
|
BalancingStrategy::Balanced => {
|
||||||
|
// Balance speed and energy
|
||||||
|
let speed = 1.0 / avg_completion.max(1.0);
|
||||||
|
let efficiency = 1.0 / power.max(1.0);
|
||||||
|
let load_factor = 1.0 - utilization;
|
||||||
|
|
||||||
|
(speed * 0.4 + efficiency * 0.3 + load_factor * 0.3)
|
||||||
|
}
|
||||||
|
|
||||||
|
BalancingStrategy::Cost => {
|
||||||
|
// Prioritize cheaper resources (consumer devices)
|
||||||
|
let cost_factor = match processor_type {
|
||||||
|
ProcessorType::Wasm => 0.1, // Cheapest (browser)
|
||||||
|
ProcessorType::WebGpu => 0.15,
|
||||||
|
ProcessorType::Cpu(_) => 0.2,
|
||||||
|
ProcessorType::Npu(_) => 0.3, // Mobile NPUs
|
||||||
|
ProcessorType::Gpu(_) => 0.5,
|
||||||
|
ProcessorType::Lpu => 0.8,
|
||||||
|
ProcessorType::Tpu(_) => 1.0, // Most expensive
|
||||||
|
_ => 0.5,
|
||||||
|
};
|
||||||
|
|
||||||
|
(1.0 - cost_factor) * (1.0 - utilization)
|
||||||
|
}
|
||||||
|
|
||||||
|
BalancingStrategy::Latency => {
|
||||||
|
// Prioritize low latency for inference
|
||||||
|
let latency_score = 1.0 / avg_completion.max(0.1);
|
||||||
|
|
||||||
|
// Bonus for low-latency processors
|
||||||
|
let latency_bonus = match processor_type {
|
||||||
|
ProcessorType::Lpu => 5.0, // Designed for low latency
|
||||||
|
ProcessorType::Npu(_) => 3.0,
|
||||||
|
ProcessorType::Gpu(_) => 2.0,
|
||||||
|
ProcessorType::Tpu(_) => 1.5,
|
||||||
|
_ => 1.0,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Priority boost for critical tasks
|
||||||
|
let priority_boost = match task.priority {
|
||||||
|
TaskPriority::Critical => 2.0,
|
||||||
|
TaskPriority::High => 1.5,
|
||||||
|
TaskPriority::Normal => 1.0,
|
||||||
|
TaskPriority::Background => 0.5,
|
||||||
|
};
|
||||||
|
|
||||||
|
latency_score * latency_bonus * priority_boost * (1.0 - utilization)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Maybe rebalance a task to a different processor.
|
||||||
|
pub fn maybe_rebalance(
|
||||||
|
&self,
|
||||||
|
task: &Task,
|
||||||
|
suggested_processor: ProcessorId,
|
||||||
|
current_assignment: &TaskAssignment,
|
||||||
|
) -> ProcessorId {
|
||||||
|
// Get all registered processors
|
||||||
|
let processor_types = self.processor_types.read();
|
||||||
|
|
||||||
|
// If we don't have processor info, use suggested
|
||||||
|
let suggested_type = match processor_types.get(&suggested_processor) {
|
||||||
|
Some(t) => t.clone(),
|
||||||
|
None => return suggested_processor,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Calculate score for suggested processor
|
||||||
|
let suggested_score = self.calculate_score(task, suggested_processor, &suggested_type);
|
||||||
|
|
||||||
|
// Find best alternative
|
||||||
|
let mut best_processor = suggested_processor;
|
||||||
|
let mut best_score = suggested_score;
|
||||||
|
|
||||||
|
for (proc_id, proc_type) in processor_types.iter() {
|
||||||
|
if *proc_id == suggested_processor {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let score = self.calculate_score(task, *proc_id, proc_type);
|
||||||
|
|
||||||
|
// Only switch if significantly better (prevents thrashing)
|
||||||
|
if score > best_score * (1.0 + self.rebalance_threshold) {
|
||||||
|
best_score = score;
|
||||||
|
best_processor = *proc_id;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Record migration if different
|
||||||
|
if best_processor != suggested_processor {
|
||||||
|
self.migration_history.write().push(MigrationRecord {
|
||||||
|
task_id: task.id,
|
||||||
|
from: suggested_processor,
|
||||||
|
to: best_processor,
|
||||||
|
timestamp: Instant::now(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
best_processor
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if work stealing should happen between two processors.
|
||||||
|
pub fn should_steal(&self, from: ProcessorId, to: ProcessorId) -> bool {
|
||||||
|
let from_load = self.get_load(from) as f64;
|
||||||
|
let to_load = self.get_load(to) as f64;
|
||||||
|
|
||||||
|
if from_load == 0.0 {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if processor types are compatible for the queued work
|
||||||
|
let processor_types = self.processor_types.read();
|
||||||
|
let from_type = processor_types.get(&from);
|
||||||
|
let to_type = processor_types.get(&to);
|
||||||
|
|
||||||
|
// Only steal between same processor types by default
|
||||||
|
// (cross-type stealing requires operation compatibility check)
|
||||||
|
match (from_type, to_type) {
|
||||||
|
(Some(ft), Some(tt)) if std::mem::discriminant(ft) == std::mem::discriminant(tt) => {
|
||||||
|
let diff = (from_load - to_load) / from_load;
|
||||||
|
diff > self.steal_threshold
|
||||||
|
}
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get rebalancing suggestions based on current load.
|
||||||
|
pub fn get_rebalance_suggestions(&self) -> Vec<(ProcessorId, ProcessorId)> {
|
||||||
|
let mut suggestions = Vec::new();
|
||||||
|
let loads = self.loads.read();
|
||||||
|
|
||||||
|
let load_values: Vec<_> = loads.iter()
|
||||||
|
.map(|(id, load)| (*id, load.load(Ordering::Relaxed)))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if load_values.is_empty() {
|
||||||
|
return suggestions;
|
||||||
|
}
|
||||||
|
|
||||||
|
let avg_load: f64 = load_values.iter().map(|(_, l)| *l as f64).sum::<f64>()
|
||||||
|
/ load_values.len() as f64;
|
||||||
|
|
||||||
|
let processor_types = self.processor_types.read();
|
||||||
|
|
||||||
|
let overloaded: Vec<_> = load_values.iter()
|
||||||
|
.filter(|(_, l)| *l as f64 > avg_load * (1.0 + self.rebalance_threshold))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let underloaded: Vec<_> = load_values.iter()
|
||||||
|
.filter(|(_, l)| (*l as f64) < avg_load * (1.0 - self.rebalance_threshold))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Only suggest migrations between compatible processor types
|
||||||
|
for (over_id, _) in overloaded {
|
||||||
|
let over_type = processor_types.get(over_id);
|
||||||
|
|
||||||
|
for (under_id, _) in &underloaded {
|
||||||
|
let under_type = processor_types.get(under_id);
|
||||||
|
|
||||||
|
// Check type compatibility
|
||||||
|
if let (Some(ot), Some(ut)) = (over_type, under_type) {
|
||||||
|
if std::mem::discriminant(ot) == std::mem::discriminant(ut) {
|
||||||
|
suggestions.push((*over_id, *under_id));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
suggestions
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get load statistics.
|
||||||
|
pub fn get_stats(&self) -> LoadBalancerStats {
|
||||||
|
let loads = self.loads.read();
|
||||||
|
let metrics = self.metrics.read();
|
||||||
|
|
||||||
|
let total_load: u64 = loads.values().map(|l| l.load(Ordering::Relaxed)).sum();
|
||||||
|
let processor_count = loads.len();
|
||||||
|
let avg_load = if processor_count > 0 {
|
||||||
|
total_load as f64 / processor_count as f64
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
|
||||||
|
let total_utilization: f64 = metrics.values().map(|m| m.utilization).sum();
|
||||||
|
let avg_utilization = if processor_count > 0 {
|
||||||
|
total_utilization / processor_count as f64
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
|
||||||
|
let total_power: f64 = metrics.values().map(|m| m.power_watts).sum();
|
||||||
|
let migrations = self.migration_history.read().len();
|
||||||
|
|
||||||
|
LoadBalancerStats {
|
||||||
|
total_load,
|
||||||
|
avg_load,
|
||||||
|
processor_count,
|
||||||
|
avg_utilization,
|
||||||
|
total_power_watts: total_power,
|
||||||
|
total_migrations: migrations,
|
||||||
|
strategy: *self.strategy.read(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clean up old migration history.
|
||||||
|
pub fn cleanup_history(&self, max_age: Duration) {
|
||||||
|
let cutoff = Instant::now() - max_age;
|
||||||
|
self.migration_history.write().retain(|r| r.timestamp > cutoff);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for LoadBalancer {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load balancer statistics.
///
/// Point-in-time snapshot produced by `LoadBalancer::get_stats`; values
/// are not updated after the snapshot is taken.
#[derive(Clone, Debug)]
pub struct LoadBalancerStats {
    /// Total tasks across all processors.
    pub total_load: u64,
    /// Average load per processor (0.0 when none are registered).
    pub avg_load: f64,
    /// Number of registered processors.
    pub processor_count: usize,
    /// Average utilization (0.0 - 1.0).
    pub avg_utilization: f64,
    /// Total power consumption (watts), summed over reported metrics.
    pub total_power_watts: f64,
    /// Total migrations currently retained in the history (records removed
    /// by `cleanup_history` are no longer counted).
    pub total_migrations: usize,
    /// Balancing strategy active at snapshot time.
    pub strategy: BalancingStrategy,
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, GpuVariant, Operation, Precision};
    use crate::task::TaskStatus;

    /// Builds a 1024x1024x1024 fp32 matmul task at the given priority.
    fn create_test_task(priority: TaskPriority) -> Task {
        Task {
            id: TaskId::new(),
            operation: Operation::MatMul {
                m: 1024,
                n: 1024,
                k: 1024,
                precision: Precision::Fp32,
            },
            priority,
            dependencies: Vec::new(),
            status: TaskStatus::Pending,
            deadline: None,
        }
    }

    /// Registers two default CPUs with ids 0 and 1 on the balancer.
    fn register_two_cpus(balancer: &LoadBalancer) {
        for id in [ProcessorId(0), ProcessorId(1)] {
            balancer.register_processor(id, ProcessorType::Cpu(CpuVariant::default()));
        }
    }

    #[test]
    fn test_load_tracking() {
        let balancer = LoadBalancer::new();
        register_two_cpus(&balancer);

        // Fresh processors start with no load.
        assert_eq!(balancer.get_load(ProcessorId(0)), 0);

        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(1));

        assert_eq!(balancer.get_load(ProcessorId(0)), 2);
        assert_eq!(balancer.get_load(ProcessorId(1)), 1);

        balancer.decrement_load(ProcessorId(0));
        assert_eq!(balancer.get_load(ProcessorId(0)), 1);
    }

    #[test]
    fn test_should_steal_same_type() {
        let balancer = LoadBalancer::new();
        register_two_cpus(&balancer);

        // Load processor 0 heavily and processor 1 lightly.
        (0..10).for_each(|_| balancer.increment_load(ProcessorId(0)));
        balancer.increment_load(ProcessorId(1));

        // Stealing is allowed between processors of the same type,
        // but only from the busier one toward the idler one.
        assert!(balancer.should_steal(ProcessorId(0), ProcessorId(1)));
        assert!(!balancer.should_steal(ProcessorId(1), ProcessorId(0)));
    }

    #[test]
    fn test_should_not_steal_different_types() {
        let balancer = LoadBalancer::new();

        // One CPU, one GPU.
        balancer.register_processor(ProcessorId(0), ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(
            ProcessorId(1),
            ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) }),
        );

        // Overload the CPU.
        (0..10).for_each(|_| balancer.increment_load(ProcessorId(0)));

        // Work never migrates across processor types.
        assert!(!balancer.should_steal(ProcessorId(0), ProcessorId(1)));
    }

    #[test]
    fn test_can_execute() {
        let balancer = LoadBalancer::new();

        let matmul = Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        };

        let flash_attention = Operation::FlashAttention {
            batch: 32,
            seq_len: 2048,
            num_heads: 32,
            head_dim: 128,
            precision: Precision::Fp16,
        };

        let cpu = ProcessorType::Cpu(CpuVariant::default());
        let gpu = ProcessorType::Gpu(GpuVariant::NvidiaCuda { compute_capability: (8, 9) });
        let lpu = ProcessorType::Lpu;

        // MatMul runs on every processor class.
        for target in [&cpu, &gpu, &lpu] {
            assert!(balancer.can_execute(&matmul, target));
        }

        // FlashAttention is restricted to accelerator-class hardware.
        assert!(!balancer.can_execute(&flash_attention, &cpu));
        assert!(balancer.can_execute(&flash_attention, &gpu));
    }

    #[test]
    fn test_strategy_affects_scoring() {
        let balancer = LoadBalancer::new();

        let cpu_id = ProcessorId(0);
        let npu_id = ProcessorId(1);

        balancer.register_processor(cpu_id, ProcessorType::Cpu(CpuVariant::default()));
        balancer.register_processor(
            npu_id,
            ProcessorType::Npu(crate::processor::NpuVariant::AppleNeuralEngine { cores: 16 }),
        );

        let task = create_test_task(TaskPriority::Normal);

        // Under the energy strategy the NPU should win on efficiency.
        balancer.set_strategy(BalancingStrategy::Energy);
        let assignment = TaskAssignment::new();
        let result = balancer.maybe_rebalance(&task, cpu_id, &assignment);

        assert_eq!(result, npu_id);
    }

    #[test]
    fn test_stats() {
        let balancer = LoadBalancer::new();
        register_two_cpus(&balancer);

        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(0));
        balancer.increment_load(ProcessorId(1));

        let stats = balancer.get_stats();
        assert_eq!(stats.total_load, 3);
        assert_eq!(stats.processor_count, 2);
        assert!((stats.avg_load - 1.5).abs() < 0.01);
    }
}
|
||||||
559
crates/synor-compute/src/scheduler/mod.rs
Normal file
559
crates/synor-compute/src/scheduler/mod.rs
Normal file
|
|
@ -0,0 +1,559 @@
|
||||||
|
//! Heterogeneous scheduler for multi-processor task assignment.
|
||||||
|
//!
|
||||||
|
//! Features:
|
||||||
|
//! - Optimal task-to-processor assignment
|
||||||
|
//! - Work stealing for load balancing
|
||||||
|
//! - Pipeline parallelism across processor types
|
||||||
|
//! - Dynamic rebalancing based on actual throughput
|
||||||
|
|
||||||
|
mod load_balancer;
|
||||||
|
mod work_queue;
|
||||||
|
|
||||||
|
pub use load_balancer::LoadBalancer;
|
||||||
|
pub use work_queue::WorkQueue;
|
||||||
|
|
||||||
|
use crate::device::DeviceRegistry;
|
||||||
|
use crate::error::ComputeError;
|
||||||
|
use crate::processor::{Operation, Processor, ProcessorId, ProcessorType};
|
||||||
|
use crate::task::{Task, TaskId, TaskPriority};
|
||||||
|
use parking_lot::RwLock;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
/// Heterogeneous scheduler that manages tasks across all processor types.
///
/// Owns the device registry, per-type work queues, a load balancer, and
/// the set of schedules currently in flight.
pub struct HeterogeneousScheduler {
    /// Device registry used to enumerate processors and resolve processor ids.
    device_registry: Arc<DeviceRegistry>,
    /// Per-processor-type task queues.
    /// NOTE(review): not read by any method visible in this module —
    /// confirm it is used elsewhere or intended for future work.
    queues: RwLock<HashMap<ProcessorType, WorkQueue>>,
    /// Load balancer consulted after the initial processor choice.
    load_balancer: LoadBalancer,
    /// Schedules that have been created and not yet retired, keyed by id.
    active_schedules: RwLock<HashMap<ScheduleId, Schedule>>,
}
|
||||||
|
|
||||||
|
impl HeterogeneousScheduler {
|
||||||
|
/// Creates a new heterogeneous scheduler.
|
||||||
|
pub fn new(device_registry: Arc<DeviceRegistry>) -> Self {
|
||||||
|
Self {
|
||||||
|
device_registry,
|
||||||
|
queues: RwLock::new(HashMap::new()),
|
||||||
|
load_balancer: LoadBalancer::new(),
|
||||||
|
active_schedules: RwLock::new(HashMap::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Schedule a set of tasks for execution.
|
||||||
|
pub async fn schedule(&self, tasks: Vec<Task>) -> Result<ScheduleResult, ComputeError> {
|
||||||
|
if tasks.is_empty() {
|
||||||
|
return Ok(ScheduleResult {
|
||||||
|
schedule: Schedule::empty(),
|
||||||
|
estimated_makespan: Duration::ZERO,
|
||||||
|
processor_utilization: HashMap::new(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// 1. Build dependency graph
|
||||||
|
let deps = self.build_dependency_graph(&tasks);
|
||||||
|
|
||||||
|
// 2. Assign tasks to optimal processors
|
||||||
|
let assignment = self.assign_tasks(&tasks, &deps).await?;
|
||||||
|
|
||||||
|
// 3. Create execution schedule with stages
|
||||||
|
let schedule = self.create_schedule(&tasks, &assignment, &deps)?;
|
||||||
|
|
||||||
|
// 4. Estimate metrics
|
||||||
|
let makespan = self.estimate_makespan(&schedule);
|
||||||
|
let utilization = self.estimate_utilization(&schedule);
|
||||||
|
|
||||||
|
// 5. Store active schedule
|
||||||
|
self.active_schedules.write().insert(schedule.id, schedule.clone());
|
||||||
|
|
||||||
|
Ok(ScheduleResult {
|
||||||
|
schedule,
|
||||||
|
estimated_makespan: makespan,
|
||||||
|
processor_utilization: utilization,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
    /// Execute a schedule.
    ///
    /// Runs stages in order: every task in a stage is spawned on the
    /// tokio runtime concurrently, and the next stage starts only after
    /// all tasks of the current stage have finished (success or failure).
    /// Per-task outcomes are collected into the returned `ExecutionResult`.
    ///
    /// NOTE(review): a failed task does not stop later stages, so tasks
    /// whose dependencies failed are still executed — confirm intended.
    pub async fn execute(&self, schedule: &Schedule) -> Result<ExecutionResult, ComputeError> {
        let mut results = HashMap::new();
        let start = std::time::Instant::now();

        // Execute stages in order
        for stage in &schedule.stages {
            // Execute all tasks in this stage in parallel
            let mut handles = Vec::new();

            for task_id in &stage.tasks {
                // A task or assignment missing from the schedule is an
                // internal invariant violation, not a task failure.
                let task = schedule.tasks.get(task_id)
                    .ok_or_else(|| ComputeError::Internal(format!("Task not found: {:?}", task_id)))?;
                let processor_id = schedule.assignment.get(task_id)
                    .ok_or_else(|| ComputeError::Internal(format!("No assignment for task: {:?}", task_id)))?;

                let processor = self.device_registry.get_processor(processor_id)?;
                let task_clone = task.clone();

                // Spawn so all tasks of this stage run concurrently.
                handles.push(tokio::spawn(async move {
                    processor.execute(task_clone.operation).await
                }));
            }

            // Wait for all tasks in stage. Handles are in the same order
            // as `stage.tasks`, so index `i` maps back to the task id.
            for (i, handle) in handles.into_iter().enumerate() {
                let task_id = stage.tasks[i];
                match handle.await {
                    Ok(Ok(result)) => {
                        results.insert(task_id, TaskExecutionResult::Success(result));
                    }
                    Ok(Err(e)) => {
                        // Processor-level failure reported by the task itself.
                        results.insert(task_id, TaskExecutionResult::Failed(e.to_string()));
                    }
                    Err(e) => {
                        // Join error: the spawned task panicked or was cancelled.
                        results.insert(task_id, TaskExecutionResult::Failed(e.to_string()));
                    }
                }
            }
        }

        let total_time = start.elapsed();

        Ok(ExecutionResult {
            results,
            total_time,
            actual_utilization: self.measure_utilization(),
        })
    }
|
||||||
|
|
||||||
|
/// Assign tasks to optimal processors.
|
||||||
|
async fn assign_tasks(
|
||||||
|
&self,
|
||||||
|
tasks: &[Task],
|
||||||
|
deps: &DependencyGraph,
|
||||||
|
) -> Result<TaskAssignment, ComputeError> {
|
||||||
|
let mut assignment = TaskAssignment::new();
|
||||||
|
|
||||||
|
// Sort tasks by priority and dependencies (topological sort)
|
||||||
|
let sorted_tasks = self.topological_sort(tasks, deps);
|
||||||
|
|
||||||
|
for task in sorted_tasks {
|
||||||
|
// Find best processor for this task
|
||||||
|
let best_processor = self.find_best_processor(&task).await?;
|
||||||
|
|
||||||
|
// Check if we should rebalance
|
||||||
|
let final_processor = self.load_balancer
|
||||||
|
.maybe_rebalance(&task, best_processor, &assignment);
|
||||||
|
|
||||||
|
assignment.assign(task.id, final_processor);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(assignment)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Find the best processor for a task.
|
||||||
|
async fn find_best_processor(&self, task: &Task) -> Result<ProcessorId, ComputeError> {
|
||||||
|
let mut best_score = f64::NEG_INFINITY;
|
||||||
|
let mut best_processor = None;
|
||||||
|
|
||||||
|
// Get all available processors
|
||||||
|
let processors = self.device_registry.all_processors();
|
||||||
|
|
||||||
|
for processor in processors {
|
||||||
|
if !processor.can_execute(&task.operation) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate score based on multiple factors
|
||||||
|
let exec_time = processor.estimate_time(&task.operation);
|
||||||
|
let energy = processor.estimate_energy(&task.operation);
|
||||||
|
let load = processor.utilization();
|
||||||
|
|
||||||
|
// Score = 1 / (time * (1 + load) * energy_factor)
|
||||||
|
let time_factor = exec_time.as_secs_f64().max(0.001);
|
||||||
|
let load_factor = 1.0 + load;
|
||||||
|
let energy_factor = 1.0 + (energy / 1000.0); // Normalize energy
|
||||||
|
|
||||||
|
let score = 1.0 / (time_factor * load_factor * energy_factor);
|
||||||
|
|
||||||
|
if score > best_score {
|
||||||
|
best_score = score;
|
||||||
|
best_processor = Some(processor.id());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
best_processor.ok_or_else(|| {
|
||||||
|
ComputeError::NoSuitableProcessor(format!("{:?}", task.operation.op_type()))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build dependency graph from tasks.
|
||||||
|
fn build_dependency_graph(&self, tasks: &[Task]) -> DependencyGraph {
|
||||||
|
let mut graph = DependencyGraph::new();
|
||||||
|
|
||||||
|
for task in tasks {
|
||||||
|
graph.add_node(task.id);
|
||||||
|
for dep in &task.dependencies {
|
||||||
|
graph.add_edge(*dep, task.id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
graph
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Topological sort of tasks respecting dependencies.
|
||||||
|
fn topological_sort(&self, tasks: &[Task], deps: &DependencyGraph) -> Vec<Task> {
|
||||||
|
let mut sorted = Vec::new();
|
||||||
|
let mut visited = std::collections::HashSet::new();
|
||||||
|
let task_map: HashMap<TaskId, Task> = tasks.iter()
|
||||||
|
.map(|t| (t.id, t.clone()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
fn visit(
|
||||||
|
task_id: TaskId,
|
||||||
|
task_map: &HashMap<TaskId, Task>,
|
||||||
|
deps: &DependencyGraph,
|
||||||
|
visited: &mut std::collections::HashSet<TaskId>,
|
||||||
|
sorted: &mut Vec<Task>,
|
||||||
|
) {
|
||||||
|
if visited.contains(&task_id) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
visited.insert(task_id);
|
||||||
|
|
||||||
|
// Visit dependencies first
|
||||||
|
if let Some(task_deps) = deps.dependencies.get(&task_id) {
|
||||||
|
for dep in task_deps {
|
||||||
|
visit(*dep, task_map, deps, visited, sorted);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(task) = task_map.get(&task_id) {
|
||||||
|
sorted.push(task.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for task in tasks {
|
||||||
|
visit(task.id, &task_map, deps, &mut visited, &mut sorted);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort by priority within dependency constraints
|
||||||
|
sorted.sort_by(|a, b| b.priority.cmp(&a.priority));
|
||||||
|
|
||||||
|
sorted
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create execution schedule with parallel stages.
|
||||||
|
fn create_schedule(
|
||||||
|
&self,
|
||||||
|
tasks: &[Task],
|
||||||
|
assignment: &TaskAssignment,
|
||||||
|
deps: &DependencyGraph,
|
||||||
|
) -> Result<Schedule, ComputeError> {
|
||||||
|
let mut stages = Vec::new();
|
||||||
|
let mut scheduled = std::collections::HashSet::new();
|
||||||
|
let task_map: HashMap<TaskId, Task> = tasks.iter()
|
||||||
|
.map(|t| (t.id, t.clone()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
while scheduled.len() < tasks.len() {
|
||||||
|
let mut stage_tasks = Vec::new();
|
||||||
|
|
||||||
|
for task in tasks {
|
||||||
|
if scheduled.contains(&task.id) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if all dependencies are satisfied
|
||||||
|
let deps_satisfied = task.dependencies.iter()
|
||||||
|
.all(|dep| scheduled.contains(dep));
|
||||||
|
|
||||||
|
if deps_satisfied {
|
||||||
|
stage_tasks.push(task.id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if stage_tasks.is_empty() {
|
||||||
|
return Err(ComputeError::SchedulingFailed(
|
||||||
|
"Circular dependency detected".to_string()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
for task_id in &stage_tasks {
|
||||||
|
scheduled.insert(*task_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
stages.push(ScheduleStage {
|
||||||
|
stage_id: stages.len(),
|
||||||
|
tasks: stage_tasks,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Schedule {
|
||||||
|
id: ScheduleId::new(),
|
||||||
|
tasks: task_map,
|
||||||
|
assignment: assignment.clone(),
|
||||||
|
stages,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Estimate makespan (total execution time).
|
||||||
|
fn estimate_makespan(&self, schedule: &Schedule) -> Duration {
|
||||||
|
let mut total = Duration::ZERO;
|
||||||
|
|
||||||
|
for stage in &schedule.stages {
|
||||||
|
let mut max_stage_time = Duration::ZERO;
|
||||||
|
|
||||||
|
for task_id in &stage.tasks {
|
||||||
|
if let (Some(task), Some(proc_id)) = (
|
||||||
|
schedule.tasks.get(task_id),
|
||||||
|
schedule.assignment.get(task_id),
|
||||||
|
) {
|
||||||
|
if let Ok(processor) = self.device_registry.get_processor(proc_id) {
|
||||||
|
let time = processor.estimate_time(&task.operation);
|
||||||
|
max_stage_time = max_stage_time.max(time);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
total += max_stage_time;
|
||||||
|
}
|
||||||
|
|
||||||
|
total
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Estimate processor utilization.
|
||||||
|
fn estimate_utilization(&self, schedule: &Schedule) -> HashMap<ProcessorType, f64> {
|
||||||
|
let mut work_time: HashMap<ProcessorType, Duration> = HashMap::new();
|
||||||
|
let makespan = self.estimate_makespan(schedule);
|
||||||
|
|
||||||
|
for task_id in schedule.assignment.assignments.keys() {
|
||||||
|
if let (Some(task), Some(proc_id)) = (
|
||||||
|
schedule.tasks.get(task_id),
|
||||||
|
schedule.assignment.get(task_id),
|
||||||
|
) {
|
||||||
|
if let Ok(processor) = self.device_registry.get_processor(proc_id) {
|
||||||
|
let proc_type = processor.processor_type();
|
||||||
|
let time = processor.estimate_time(&task.operation);
|
||||||
|
*work_time.entry(proc_type).or_default() += time;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
work_time
|
||||||
|
.into_iter()
|
||||||
|
.map(|(proc_type, time)| {
|
||||||
|
let utilization = if makespan.as_secs_f64() > 0.0 {
|
||||||
|
time.as_secs_f64() / makespan.as_secs_f64()
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
(proc_type, utilization.min(1.0))
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Measure actual current utilization.
|
||||||
|
fn measure_utilization(&self) -> HashMap<ProcessorType, f64> {
|
||||||
|
let mut utilization = HashMap::new();
|
||||||
|
|
||||||
|
for processor in self.device_registry.all_processors() {
|
||||||
|
let proc_type = processor.processor_type();
|
||||||
|
let util = processor.utilization();
|
||||||
|
utilization
|
||||||
|
.entry(proc_type)
|
||||||
|
.and_modify(|u| *u = (*u + util) / 2.0)
|
||||||
|
.or_insert(util);
|
||||||
|
}
|
||||||
|
|
||||||
|
utilization
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Schedule identifier.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub struct ScheduleId(pub u64);
|
||||||
|
|
||||||
|
impl ScheduleId {
|
||||||
|
/// Creates a new schedule ID.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
use rand::Rng;
|
||||||
|
ScheduleId(rand::thread_rng().gen())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for ScheduleId {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Task-to-processor assignment.
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
pub struct TaskAssignment {
|
||||||
|
/// Map from task ID to processor ID.
|
||||||
|
pub assignments: HashMap<TaskId, ProcessorId>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TaskAssignment {
|
||||||
|
/// Creates a new empty assignment.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
assignments: HashMap::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Assigns a task to a processor.
|
||||||
|
pub fn assign(&mut self, task_id: TaskId, processor_id: ProcessorId) {
|
||||||
|
self.assignments.insert(task_id, processor_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the assigned processor for a task.
|
||||||
|
pub fn get(&self, task_id: &TaskId) -> Option<ProcessorId> {
|
||||||
|
self.assignments.get(task_id).copied()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dependency graph for tasks.
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
pub struct DependencyGraph {
|
||||||
|
/// Dependencies: task -> list of tasks it depends on.
|
||||||
|
pub dependencies: HashMap<TaskId, Vec<TaskId>>,
|
||||||
|
/// Dependents: task -> list of tasks that depend on it.
|
||||||
|
pub dependents: HashMap<TaskId, Vec<TaskId>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DependencyGraph {
|
||||||
|
/// Creates a new empty dependency graph.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
dependencies: HashMap::new(),
|
||||||
|
dependents: HashMap::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds a node (task) to the graph.
|
||||||
|
pub fn add_node(&mut self, task_id: TaskId) {
|
||||||
|
self.dependencies.entry(task_id).or_default();
|
||||||
|
self.dependents.entry(task_id).or_default();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds a dependency edge (from depends on to).
|
||||||
|
pub fn add_edge(&mut self, from: TaskId, to: TaskId) {
|
||||||
|
self.dependencies.entry(to).or_default().push(from);
|
||||||
|
self.dependents.entry(from).or_default().push(to);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Execution schedule.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct Schedule {
|
||||||
|
/// Schedule ID.
|
||||||
|
pub id: ScheduleId,
|
||||||
|
/// All tasks.
|
||||||
|
pub tasks: HashMap<TaskId, Task>,
|
||||||
|
/// Task assignments.
|
||||||
|
pub assignment: TaskAssignment,
|
||||||
|
/// Execution stages (tasks within a stage can run in parallel).
|
||||||
|
pub stages: Vec<ScheduleStage>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Schedule {
|
||||||
|
/// Creates an empty schedule.
|
||||||
|
pub fn empty() -> Self {
|
||||||
|
Self {
|
||||||
|
id: ScheduleId::new(),
|
||||||
|
tasks: HashMap::new(),
|
||||||
|
assignment: TaskAssignment::new(),
|
||||||
|
stages: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A stage of parallel tasks.
///
/// Stages execute sequentially; every task within one stage already has
/// its dependencies satisfied by earlier stages and may run concurrently.
#[derive(Clone, Debug)]
pub struct ScheduleStage {
    /// Stage index (position within `Schedule::stages`).
    pub stage_id: usize,
    /// Tasks in this stage (can run in parallel).
    pub tasks: Vec<TaskId>,
}
|
||||||
|
|
||||||
|
/// Result of scheduling.
#[derive(Clone, Debug)]
pub struct ScheduleResult {
    /// The schedule that was produced and registered as active.
    pub schedule: Schedule,
    /// Estimated total execution time (sum of per-stage maxima).
    pub estimated_makespan: Duration,
    /// Estimated processor utilization by type, each in [0.0, 1.0].
    pub processor_utilization: HashMap<ProcessorType, f64>,
}
|
||||||
|
|
||||||
|
/// Result of execution.
#[derive(Clone, Debug)]
pub struct ExecutionResult {
    /// Per-task outcome (success payload or failure description).
    pub results: HashMap<TaskId, TaskExecutionResult>,
    /// Total wall-clock time spent executing all stages.
    pub total_time: Duration,
    /// Actual processor utilization sampled after execution.
    pub actual_utilization: HashMap<ProcessorType, f64>,
}
|
||||||
|
|
||||||
|
/// Result of a single task execution.
#[derive(Clone, Debug)]
pub enum TaskExecutionResult {
    /// Task completed successfully with the operation's output.
    Success(crate::processor::OperationResult),
    /// Task failed; the string describes the error or panic.
    Failed(String),
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // The unused `create_test_task` helper and its `Precision`/`TaskStatus`
    // imports were removed: no test referenced them, so they only produced
    // dead-code and unused-import warnings.

    #[test]
    fn test_dependency_graph() {
        let mut graph = DependencyGraph::new();

        graph.add_node(TaskId(1));
        graph.add_node(TaskId(2));
        graph.add_node(TaskId(3));

        graph.add_edge(TaskId(1), TaskId(2)); // 2 depends on 1
        graph.add_edge(TaskId(1), TaskId(3)); // 3 depends on 1
        graph.add_edge(TaskId(2), TaskId(3)); // 3 depends on 2

        assert_eq!(graph.dependencies[&TaskId(2)], vec![TaskId(1)]);
        assert_eq!(graph.dependencies[&TaskId(3)], vec![TaskId(1), TaskId(2)]);
    }

    #[test]
    fn test_task_assignment() {
        let mut assignment = TaskAssignment::new();

        assignment.assign(TaskId(1), ProcessorId(0));
        assignment.assign(TaskId(2), ProcessorId(1));

        assert_eq!(assignment.get(&TaskId(1)), Some(ProcessorId(0)));
        assert_eq!(assignment.get(&TaskId(2)), Some(ProcessorId(1)));
        assert_eq!(assignment.get(&TaskId(3)), None);
    }
}
|
||||||
271
crates/synor-compute/src/scheduler/work_queue.rs
Normal file
271
crates/synor-compute/src/scheduler/work_queue.rs
Normal file
|
|
@ -0,0 +1,271 @@
|
||||||
|
//! Work queue with thread-safe task management.
|
||||||
|
|
||||||
|
use crate::processor::ProcessorType;
|
||||||
|
use crate::task::{Task, TaskId, TaskPriority};
|
||||||
|
use crossbeam_channel::{bounded, Receiver, Sender, TryRecvError};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
|
|
||||||
|
/// Work queue for a specific processor type.
///
/// Backed by a bounded crossbeam channel; `size` mirrors the channel's
/// occupancy and `processed` counts tasks popped for execution.
pub struct WorkQueue {
    /// Task sender (for producers).
    sender: Sender<Task>,
    /// Task receiver (for consumers).
    receiver: Receiver<Task>,
    /// Processor type this queue is for.
    processor_type: ProcessorType,
    /// Current queue size.
    /// NOTE(review): maintained manually alongside the channel;
    /// `QueueStealer` pulls from the channel without updating it, so it
    /// can overcount after steals — confirm.
    size: AtomicU64,
    /// Total tasks processed.
    processed: AtomicU64,
}
|
||||||
|
|
||||||
|
impl WorkQueue {
|
||||||
|
/// Creates a new work queue for a processor type.
|
||||||
|
pub fn new(processor_type: ProcessorType, capacity: usize) -> Self {
|
||||||
|
let (sender, receiver) = bounded(capacity.max(1024));
|
||||||
|
|
||||||
|
Self {
|
||||||
|
sender,
|
||||||
|
receiver,
|
||||||
|
processor_type,
|
||||||
|
size: AtomicU64::new(0),
|
||||||
|
processed: AtomicU64::new(0),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Push a task to the queue.
|
||||||
|
pub fn push(&self, task: Task) {
|
||||||
|
if self.sender.try_send(task).is_ok() {
|
||||||
|
self.size.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pop a task from the queue (ignores worker_id for compatibility).
|
||||||
|
pub fn pop(&self, _worker_id: usize) -> Option<Task> {
|
||||||
|
self.pop_any()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pop any task from the queue.
|
||||||
|
pub fn pop_any(&self) -> Option<Task> {
|
||||||
|
match self.receiver.try_recv() {
|
||||||
|
Ok(task) => {
|
||||||
|
self.size.fetch_sub(1, Ordering::Relaxed);
|
||||||
|
self.processed.fetch_add(1, Ordering::Relaxed);
|
||||||
|
Some(task)
|
||||||
|
}
|
||||||
|
Err(TryRecvError::Empty) | Err(TryRecvError::Disconnected) => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pop from global queue (alias for pop_any).
|
||||||
|
pub fn pop_global(&self) -> Option<Task> {
|
||||||
|
self.pop_any()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Steal a batch of tasks from another queue.
|
||||||
|
pub fn steal_batch_from(&self, other: &WorkQueue, max_tasks: usize) -> Vec<Task> {
|
||||||
|
let mut stolen = Vec::new();
|
||||||
|
|
||||||
|
while stolen.len() < max_tasks {
|
||||||
|
if let Some(task) = other.pop_any() {
|
||||||
|
stolen.push(task);
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Push stolen tasks to this queue
|
||||||
|
for task in &stolen {
|
||||||
|
// Tasks are already accounted for in `other`, just push to self
|
||||||
|
if self.sender.try_send(task.clone()).is_ok() {
|
||||||
|
self.size.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
stolen
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get current queue size.
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.size.load(Ordering::Relaxed) as usize
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if queue is empty.
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.len() == 0
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get number of tasks processed.
|
||||||
|
pub fn processed_count(&self) -> u64 {
|
||||||
|
self.processed.load(Ordering::Relaxed)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get processor type for this queue.
|
||||||
|
pub fn processor_type(&self) -> ProcessorType {
|
||||||
|
self.processor_type.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get utilization estimate (0.0 - 1.0).
|
||||||
|
pub fn utilization(&self) -> f64 {
|
||||||
|
let size = self.size.load(Ordering::Relaxed) as f64;
|
||||||
|
let capacity = self.sender.capacity().unwrap_or(1024) as f64;
|
||||||
|
(size / capacity).min(1.0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get a stealer for cross-queue work stealing.
|
||||||
|
pub fn get_stealer(&self) -> QueueStealer {
|
||||||
|
QueueStealer {
|
||||||
|
receiver: self.receiver.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stealer handle for cross-queue work stealing.
///
/// Holds a clone of the source queue's receiver, so steals contend
/// directly on the underlying channel.
#[derive(Clone)]
pub struct QueueStealer {
    // Receiver cloned from the source `WorkQueue`.
    receiver: Receiver<Task>,
}

impl QueueStealer {
    /// Try to steal a task.
    ///
    /// NOTE(review): this bypasses the source queue's `size` and
    /// `processed` counters, so its `len()` overcounts after a
    /// successful steal — confirm that is acceptable to callers.
    pub fn steal(&self) -> Option<Task> {
        self.receiver.try_recv().ok()
    }
}
|
||||||
|
|
||||||
|
/// Priority queue wrapper for tasks.
///
/// Maintains one bounded `WorkQueue` per `TaskPriority` level and
/// always serves the most urgent non-empty level first.
pub struct PriorityWorkQueue {
    /// Queues by priority level.
    queues: HashMap<TaskPriority, WorkQueue>,
    /// Processor type all sub-queues are bound to.
    /// NOTE(review): stored but not read by any method in this module —
    /// confirm external use.
    processor_type: ProcessorType,
}
|
||||||
|
|
||||||
|
impl PriorityWorkQueue {
|
||||||
|
/// Creates a new priority work queue.
|
||||||
|
pub fn new(processor_type: ProcessorType, capacity_per_priority: usize) -> Self {
|
||||||
|
let mut queues = HashMap::new();
|
||||||
|
|
||||||
|
for priority in [
|
||||||
|
TaskPriority::Critical,
|
||||||
|
TaskPriority::High,
|
||||||
|
TaskPriority::Normal,
|
||||||
|
TaskPriority::Background,
|
||||||
|
] {
|
||||||
|
queues.insert(priority, WorkQueue::new(processor_type.clone(), capacity_per_priority));
|
||||||
|
}
|
||||||
|
|
||||||
|
Self {
|
||||||
|
queues,
|
||||||
|
processor_type,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Push a task with its priority.
|
||||||
|
pub fn push(&self, task: Task) {
|
||||||
|
let priority = task.priority;
|
||||||
|
if let Some(queue) = self.queues.get(&priority) {
|
||||||
|
queue.push(task);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pop highest priority task available.
|
||||||
|
pub fn pop(&self, worker_id: usize) -> Option<Task> {
|
||||||
|
// Try priorities in order: Critical > High > Normal > Background
|
||||||
|
for priority in [
|
||||||
|
TaskPriority::Critical,
|
||||||
|
TaskPriority::High,
|
||||||
|
TaskPriority::Normal,
|
||||||
|
TaskPriority::Background,
|
||||||
|
] {
|
||||||
|
if let Some(queue) = self.queues.get(&priority) {
|
||||||
|
if let Some(task) = queue.pop(worker_id) {
|
||||||
|
return Some(task);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get total queue size.
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.queues.values().map(|q| q.len()).sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if all queues are empty.
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.queues.values().all(|q| q.is_empty())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::{CpuVariant, Operation, Precision};
    use crate::task::TaskStatus;

    /// Builds a matmul task with an explicit id and priority.
    fn create_test_task(id: u64, priority: TaskPriority) -> Task {
        Task {
            id: TaskId(id),
            operation: Operation::MatMul {
                m: 1024,
                n: 1024,
                k: 1024,
                precision: Precision::Fp32,
            },
            priority,
            dependencies: Vec::new(),
            status: TaskStatus::Pending,
            deadline: None,
        }
    }

    #[test]
    fn test_work_queue_basic() {
        let queue = WorkQueue::new(ProcessorType::Cpu(CpuVariant::default()), 100);

        assert!(queue.is_empty());

        queue.push(create_test_task(1, TaskPriority::Normal));
        queue.push(create_test_task(2, TaskPriority::Normal));
        assert_eq!(queue.len(), 2);

        // Each pop shrinks the queue until it is empty again.
        assert!(queue.pop(0).is_some());
        assert_eq!(queue.len(), 1);

        assert!(queue.pop(0).is_some());
        assert!(queue.is_empty());
    }

    #[test]
    fn test_priority_queue() {
        let queue = PriorityWorkQueue::new(ProcessorType::Cpu(CpuVariant::default()), 100);

        // Insert out of priority order on purpose.
        queue.push(create_test_task(1, TaskPriority::Background));
        queue.push(create_test_task(2, TaskPriority::Critical));
        queue.push(create_test_task(3, TaskPriority::Normal));

        // Tasks come out strictly by priority: Critical, Normal, Background.
        let expected = [
            (TaskId(2), TaskPriority::Critical),
            (TaskId(3), TaskPriority::Normal),
            (TaskId(1), TaskPriority::Background),
        ];
        for (id, priority) in expected {
            let task = queue.pop(0).unwrap();
            assert_eq!(task.id, id);
            assert_eq!(task.priority, priority);
        }
    }
}
|
||||||
543
crates/synor-compute/src/task/mod.rs
Normal file
543
crates/synor-compute/src/task/mod.rs
Normal file
|
|
@ -0,0 +1,543 @@
|
||||||
|
//! Task definitions and decomposition.
|
||||||
|
|
||||||
|
use crate::error::ComputeError;
|
||||||
|
use crate::processor::{Operation, OperationType, Precision, ProcessorType};
|
||||||
|
use crate::{ComputeJob, JobType};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
/// Unique task identifier.
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
pub struct TaskId(pub u64);
|
||||||
|
|
||||||
|
impl TaskId {
|
||||||
|
/// Creates a new task ID.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
use rand::Rng;
|
||||||
|
TaskId(rand::thread_rng().gen())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for TaskId {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for TaskId {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "task_{}", self.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Task priority levels.
///
/// The explicit discriminants make the derived `Ord` follow urgency:
/// `Background < Normal < High < Critical`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum TaskPriority {
    /// Background work; may be preempted by anything above it.
    Background = 0,
    /// Normal priority (the default).
    Normal = 1,
    /// High priority.
    High = 2,
    /// Critical; must complete.
    Critical = 3,
}

/// Tasks default to `Normal` priority.
impl Default for TaskPriority {
    fn default() -> Self {
        TaskPriority::Normal
    }
}
|
||||||
|
|
||||||
|
/// Task execution status.
///
/// NOTE(review): the intended lifecycle appears to be
/// `Pending -> Queued -> Running -> {Completed | Failed | Cancelled}`,
/// inferred from the variant docs — confirm against the scheduler's
/// transition logic.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum TaskStatus {
    /// Waiting to be scheduled.
    Pending,
    /// Queued for execution.
    Queued,
    /// Currently executing.
    Running,
    /// Completed successfully.
    Completed,
    /// Failed.
    Failed,
    /// Cancelled.
    Cancelled,
}
|
||||||
|
|
||||||
|
/// A schedulable task: one operation plus the metadata the scheduler
/// needs (priority, dependency edges, status, optional deadline).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Task {
    /// Unique task ID.
    pub id: TaskId,
    /// Operation to execute.
    pub operation: Operation,
    /// Priority level used for queue ordering.
    pub priority: TaskPriority,
    /// Dependencies (tasks that must complete before this one runs).
    pub dependencies: Vec<TaskId>,
    /// Current execution status.
    pub status: TaskStatus,
    /// Deadline (optional).
    /// NOTE(review): the unit (unix timestamp vs. relative ms) is not
    /// established in this module — confirm with the scheduler's usage.
    pub deadline: Option<u64>,
}
|
||||||
|
|
||||||
|
impl Task {
|
||||||
|
/// Creates a new task.
|
||||||
|
pub fn new(operation: Operation) -> Self {
|
||||||
|
Self {
|
||||||
|
id: TaskId::new(),
|
||||||
|
operation,
|
||||||
|
priority: TaskPriority::Normal,
|
||||||
|
dependencies: Vec::new(),
|
||||||
|
status: TaskStatus::Pending,
|
||||||
|
deadline: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sets the priority.
|
||||||
|
pub fn with_priority(mut self, priority: TaskPriority) -> Self {
|
||||||
|
self.priority = priority;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds dependencies.
|
||||||
|
pub fn with_dependencies(mut self, deps: Vec<TaskId>) -> Self {
|
||||||
|
self.dependencies = deps;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sets deadline.
|
||||||
|
pub fn with_deadline(mut self, deadline: u64) -> Self {
|
||||||
|
self.deadline = Some(deadline);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Checks if task is compatible with a processor type.
|
||||||
|
pub fn is_compatible_with(&self, proc_type: ProcessorType) -> bool {
|
||||||
|
// Check based on operation type
|
||||||
|
let op_type = self.operation.op_type();
|
||||||
|
|
||||||
|
match proc_type {
|
||||||
|
ProcessorType::Cpu(_) => {
|
||||||
|
// CPUs can do most things, but slowly
|
||||||
|
true
|
||||||
|
}
|
||||||
|
ProcessorType::Gpu(_) => {
|
||||||
|
// GPUs are good for parallel operations
|
||||||
|
matches!(
|
||||||
|
op_type,
|
||||||
|
OperationType::MatMul
|
||||||
|
| OperationType::Conv2d
|
||||||
|
| OperationType::SelfAttention
|
||||||
|
| OperationType::FlashAttention
|
||||||
|
| OperationType::Embedding
|
||||||
|
| OperationType::Add
|
||||||
|
| OperationType::Mul
|
||||||
|
| OperationType::Softmax
|
||||||
|
)
|
||||||
|
}
|
||||||
|
ProcessorType::Tpu(_) => {
|
||||||
|
// TPUs are good for large matrix ops
|
||||||
|
matches!(
|
||||||
|
op_type,
|
||||||
|
OperationType::MatMul
|
||||||
|
| OperationType::Conv2d
|
||||||
|
| OperationType::SelfAttention
|
||||||
|
| OperationType::FlashAttention
|
||||||
|
)
|
||||||
|
}
|
||||||
|
ProcessorType::Lpu => {
|
||||||
|
// LPUs are good for sequential inference
|
||||||
|
matches!(
|
||||||
|
op_type,
|
||||||
|
OperationType::MatMul
|
||||||
|
| OperationType::SelfAttention
|
||||||
|
| OperationType::KVCache
|
||||||
|
| OperationType::Sampling
|
||||||
|
)
|
||||||
|
}
|
||||||
|
ProcessorType::Npu(_) => {
|
||||||
|
// NPUs are good for inference
|
||||||
|
matches!(
|
||||||
|
op_type,
|
||||||
|
OperationType::MatMul
|
||||||
|
| OperationType::Conv2d
|
||||||
|
| OperationType::Add
|
||||||
|
| OperationType::Softmax
|
||||||
|
)
|
||||||
|
}
|
||||||
|
_ => true, // Default to compatible
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Result of task execution.
#[derive(Clone, Debug)]
pub struct TaskResult {
    /// Task this result belongs to.
    pub task_id: TaskId,
    /// Raw output data produced by the operation.
    pub output: Vec<u8>,
    /// Wall-clock execution duration.
    pub duration: Duration,
    /// Energy consumed (Joules).
    pub energy: f64,
}

/// Compute task for job execution: a `Task` bundled with placement
/// constraints for the scheduler.
#[derive(Clone, Debug)]
pub struct ComputeTask {
    /// The underlying schedulable task.
    pub task: Task,
    /// Resource requirements the target device must satisfy.
    pub requirements: TaskRequirements,
    /// Preferred processor type.
    pub preferred_processor: Option<ProcessorType>,
    /// Fallback processor type.
    /// NOTE(review): presumably consulted when the preferred type is
    /// unavailable — confirm with the scheduler implementation.
    pub fallback_processor: Option<ProcessorType>,
}

/// Task resource requirements.
#[derive(Clone, Debug, Default)]
pub struct TaskRequirements {
    /// Minimum memory (bytes).
    pub min_memory: u64,
    /// Minimum compute throughput (TFLOPS).
    pub min_tflops: f64,
    /// Maximum acceptable latency (ms), if latency-bound.
    pub max_latency_ms: Option<u32>,
    /// Requires a specific numeric precision, if set.
    pub precision: Option<Precision>,
}

/// Decomposed workload: the flat task list plus aggregate cost estimates.
#[derive(Clone, Debug)]
pub struct DecomposedWorkload {
    /// All tasks.
    pub tasks: Vec<Task>,
    /// Total estimated FLOPS.
    pub estimated_flops: f64,
    /// Total estimated memory (presumably bytes — confirm against callers).
    pub estimated_memory: u64,
}

/// Task decomposer that breaks jobs into schedulable tasks.
pub struct TaskDecomposer {
    /// Default batch size for inference.
    inference_batch_size: usize,
    /// Default numeric precision applied to generated tensor ops.
    default_precision: Precision,
}
|
||||||
|
|
||||||
|
impl TaskDecomposer {
|
||||||
|
/// Creates a new task decomposer.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
inference_batch_size: 32,
|
||||||
|
default_precision: Precision::Fp16,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decomposes a job into tasks.
|
||||||
|
pub fn decompose(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
match &job.job_type {
|
||||||
|
JobType::Training { .. } => self.decompose_training(job),
|
||||||
|
JobType::Inference { .. } => self.decompose_inference(job),
|
||||||
|
JobType::Container { .. } => self.decompose_container(job),
|
||||||
|
JobType::Serverless { .. } => self.decompose_serverless(job),
|
||||||
|
JobType::Wasm { .. } => self.decompose_wasm(job),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decompose training job.
|
||||||
|
fn decompose_training(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
let mut tasks = Vec::new();
|
||||||
|
|
||||||
|
if let JobType::Training {
|
||||||
|
epochs,
|
||||||
|
batch_size,
|
||||||
|
..
|
||||||
|
} = &job.job_type
|
||||||
|
{
|
||||||
|
// Data loading task
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::DataLoad {
|
||||||
|
bytes: 1024 * 1024 * 100, // 100MB
|
||||||
|
async_: true,
|
||||||
|
})
|
||||||
|
.with_priority(TaskPriority::High),
|
||||||
|
);
|
||||||
|
|
||||||
|
let data_load_id = tasks[0].id;
|
||||||
|
|
||||||
|
// Preprocessing task
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::DataPreprocess {
|
||||||
|
batch: *batch_size as usize,
|
||||||
|
transforms: vec!["normalize".to_string(), "augment".to_string()],
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![data_load_id])
|
||||||
|
.with_priority(TaskPriority::High),
|
||||||
|
);
|
||||||
|
|
||||||
|
let preprocess_id = tasks[1].id;
|
||||||
|
|
||||||
|
// Forward pass (simplified as MatMul)
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::MatMul {
|
||||||
|
m: *batch_size as usize,
|
||||||
|
n: 4096,
|
||||||
|
k: 4096,
|
||||||
|
precision: self.default_precision,
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![preprocess_id])
|
||||||
|
.with_priority(TaskPriority::Critical),
|
||||||
|
);
|
||||||
|
|
||||||
|
let forward_id = tasks[2].id;
|
||||||
|
|
||||||
|
// Backward pass
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::Backward {
|
||||||
|
forward_op: Box::new(Operation::MatMul {
|
||||||
|
m: *batch_size as usize,
|
||||||
|
n: 4096,
|
||||||
|
k: 4096,
|
||||||
|
precision: self.default_precision,
|
||||||
|
}),
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![forward_id])
|
||||||
|
.with_priority(TaskPriority::Critical),
|
||||||
|
);
|
||||||
|
|
||||||
|
let backward_id = tasks[3].id;
|
||||||
|
|
||||||
|
// Optimizer step
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::OptimizerStep {
|
||||||
|
parameters: 1_000_000,
|
||||||
|
optimizer: "adamw".to_string(),
|
||||||
|
precision: self.default_precision,
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![backward_id])
|
||||||
|
.with_priority(TaskPriority::High),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(tasks)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decompose inference job.
|
||||||
|
fn decompose_inference(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
let mut tasks = Vec::new();
|
||||||
|
|
||||||
|
if let JobType::Inference { batch_size, .. } = &job.job_type {
|
||||||
|
// Tokenization (CPU optimal)
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::Tokenization {
|
||||||
|
text_bytes: 4096,
|
||||||
|
vocab_size: 32000,
|
||||||
|
})
|
||||||
|
.with_priority(TaskPriority::High),
|
||||||
|
);
|
||||||
|
|
||||||
|
let token_id = tasks[0].id;
|
||||||
|
|
||||||
|
// Embedding (GPU optimal)
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::Embedding {
|
||||||
|
batch: *batch_size as usize,
|
||||||
|
seq_len: 512,
|
||||||
|
vocab_size: 32000,
|
||||||
|
embed_dim: 4096,
|
||||||
|
precision: self.default_precision,
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![token_id])
|
||||||
|
.with_priority(TaskPriority::Critical),
|
||||||
|
);
|
||||||
|
|
||||||
|
let embed_id = tasks[1].id;
|
||||||
|
|
||||||
|
// Self-attention (TPU/GPU optimal)
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::SelfAttention {
|
||||||
|
batch: *batch_size as usize,
|
||||||
|
seq_len: 512,
|
||||||
|
num_heads: 32,
|
||||||
|
head_dim: 128,
|
||||||
|
precision: self.default_precision,
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![embed_id])
|
||||||
|
.with_priority(TaskPriority::Critical),
|
||||||
|
);
|
||||||
|
|
||||||
|
let attention_id = tasks[2].id;
|
||||||
|
|
||||||
|
// Sampling (LPU optimal)
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::Sampling {
|
||||||
|
batch: *batch_size as usize,
|
||||||
|
vocab_size: 32000,
|
||||||
|
temperature: 0.7,
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![attention_id])
|
||||||
|
.with_priority(TaskPriority::High),
|
||||||
|
);
|
||||||
|
|
||||||
|
let sample_id = tasks[3].id;
|
||||||
|
|
||||||
|
// Detokenization (CPU optimal)
|
||||||
|
tasks.push(
|
||||||
|
Task::new(Operation::Detokenization {
|
||||||
|
tokens: 256,
|
||||||
|
vocab_size: 32000,
|
||||||
|
})
|
||||||
|
.with_dependencies(vec![sample_id])
|
||||||
|
.with_priority(TaskPriority::Normal),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(tasks)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decompose container job.
|
||||||
|
fn decompose_container(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
// Container jobs are typically a single task
|
||||||
|
Ok(vec![Task::new(Operation::Generic {
|
||||||
|
op_type: OperationType::DataLoad,
|
||||||
|
flops: 1e9,
|
||||||
|
memory: 1024 * 1024 * 1024,
|
||||||
|
})
|
||||||
|
.with_priority(TaskPriority::Normal)])
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decompose serverless function.
|
||||||
|
fn decompose_serverless(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
// Serverless is typically a single task
|
||||||
|
Ok(vec![Task::new(Operation::Generic {
|
||||||
|
op_type: OperationType::DataPreprocess,
|
||||||
|
flops: 1e6,
|
||||||
|
memory: 256 * 1024 * 1024,
|
||||||
|
})
|
||||||
|
.with_priority(TaskPriority::High)])
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decompose WASM job.
|
||||||
|
fn decompose_wasm(&self, _job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
|
||||||
|
// WASM is typically a single task
|
||||||
|
Ok(vec![Task::new(Operation::Generic {
|
||||||
|
op_type: OperationType::DataPreprocess,
|
||||||
|
flops: 1e6,
|
||||||
|
memory: 16 * 1024 * 1024,
|
||||||
|
})
|
||||||
|
.with_priority(TaskPriority::Normal)])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `TaskDecomposer::new` takes no configuration, so `Default` simply
/// delegates to it.
impl Default for TaskDecomposer {
    fn default() -> Self {
        Self::new()
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// 1024^3 FP32 matmul used as a generic test operation.
    fn matmul_op() -> Operation {
        Operation::MatMul {
            m: 1024,
            n: 1024,
            k: 1024,
            precision: Precision::Fp32,
        }
    }

    #[test]
    fn test_task_creation() {
        let task = Task::new(matmul_op()).with_priority(TaskPriority::High);

        assert_eq!(task.priority, TaskPriority::High);
        assert!(task.dependencies.is_empty());
        assert_eq!(task.status, TaskStatus::Pending);
    }

    #[test]
    fn test_task_dependencies() {
        let loader = Task::new(Operation::DataLoad {
            bytes: 1000,
            async_: true,
        });
        let loader_id = loader.id;

        let dependent = Task::new(matmul_op()).with_dependencies(vec![loader_id]);

        assert_eq!(dependent.dependencies, vec![loader_id]);
    }

    #[test]
    fn test_task_compatibility() {
        let matmul_task = Task::new(matmul_op());

        // MatMul should run on both GPU- and TPU-class hardware.
        let gpu = ProcessorType::Gpu(crate::processor::GpuVariant::NvidiaCuda {
            compute_capability: (8, 0),
        });
        let tpu = ProcessorType::Tpu(crate::processor::TpuVersion::V5p);
        assert!(matmul_task.is_compatible_with(gpu));
        assert!(matmul_task.is_compatible_with(tpu));

        // DataLoad should run on plain CPUs.
        let data_load_task = Task::new(Operation::DataLoad {
            bytes: 1000,
            async_: true,
        });
        let cpu = ProcessorType::Cpu(crate::processor::CpuVariant::default());
        assert!(data_load_task.is_compatible_with(cpu));
    }

    #[test]
    fn test_task_decomposer() {
        let decomposer = TaskDecomposer::new();

        let job = ComputeJob {
            id: crate::JobId::new(),
            owner: [0u8; 32],
            job_type: JobType::Inference {
                model_cid: "model".to_string(),
                input_format: "json".to_string(),
                batch_size: 1,
            },
            resources: crate::ResourceRequirements::default(),
            input_cid: None,
            max_budget: 1_000_000,
            priority: crate::JobPriority::Normal,
            created_at: 0,
            deadline: None,
        };

        let tasks = decomposer.decompose(&job).unwrap();
        assert!(!tasks.is_empty());

        // Every task after the first must depend on something, i.e. the
        // decomposition forms a chain.
        for task in tasks.iter().skip(1) {
            assert!(!task.dependencies.is_empty());
        }
    }
}
|
||||||
1584
docs/PLAN/PHASE11-Synor-Compute-L2-Part2-HyperEfficiency.md
Normal file
1584
docs/PLAN/PHASE11-Synor-Compute-L2-Part2-HyperEfficiency.md
Normal file
File diff suppressed because it is too large
Load diff
1564
docs/PLAN/PHASE11-Synor-Compute-L2-Part3-HeterogeneousCompute.md
Normal file
1564
docs/PLAN/PHASE11-Synor-Compute-L2-Part3-HeterogeneousCompute.md
Normal file
File diff suppressed because it is too large
Load diff
906
docs/PLAN/PHASE11-Synor-Compute-L2.md
Normal file
906
docs/PLAN/PHASE11-Synor-Compute-L2.md
Normal file
|
|
@ -0,0 +1,906 @@
|
||||||
|
# Phase 11: Synor Compute L2 - Full-Stack Compute Platform
|
||||||
|
|
||||||
|
> **Mission**: Build a decentralized compute platform capable of AI/ML training, inference, OS hosting, and general-purpose high-performance computing.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Executive Summary
|
||||||
|
|
||||||
|
Synor Compute L2 extends beyond the current WASM-only Synor VM to provide:
|
||||||
|
- **GPU Compute**: AI/ML training and inference with CUDA/ROCm support
|
||||||
|
- **Container Orchestration**: Docker-compatible workloads with Kubernetes-style scheduling
|
||||||
|
- **Persistent VMs**: Long-running virtual machines for OS hosting
|
||||||
|
- **Serverless Functions**: Short-lived compute for API backends and event processing
|
||||||
|
- **Edge Compute**: Low-latency compute at network edge nodes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture Overview
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ SYNOR COMPUTE L2 │
|
||||||
|
├─────────────────────────────────────────────────────────────────────────────┤
|
||||||
|
│ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ APPLICATION LAYER │ │
|
||||||
|
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
|
||||||
|
│ │ AI/ML │ Serverless │ Containers │ Persistent │ Edge │ │
|
||||||
|
│ │ Training │ Functions │ (Docker) │ VMs (Linux) │ Compute │ │
|
||||||
|
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ ORCHESTRATION LAYER │ │
|
||||||
|
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
|
||||||
|
│ │ Job │ Resource │ Network │ Storage │ Health │ │
|
||||||
|
│ │ Scheduler │ Manager │ Fabric │ Orchestrator│ Monitor │ │
|
||||||
|
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ COMPUTE RUNTIME LAYER │ │
|
||||||
|
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
|
||||||
|
│ │ GPU │ Container │ MicroVM │ WASM │ Native │ │
|
||||||
|
│ │ Runtime │ Runtime │ Runtime │ Runtime │ Runtime │ │
|
||||||
|
│ │ (CUDA/ROCm)│ (containerd)│ (Firecracker)│ (Wasmtime) │ (gVisor) │ │
|
||||||
|
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ INFRASTRUCTURE LAYER │ │
|
||||||
|
│ ├──────────────┬──────────────┬──────────────┬──────────────┬────────────┤ │
|
||||||
|
│ │ Node │ Network │ Distributed │ Consensus │ Billing │ │
|
||||||
|
│ │ Registry │ Overlay │ Storage │ (PoS+PoW) │ Metering │ │
|
||||||
|
│ └──────────────┴──────────────┴──────────────┴──────────────┴────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ SYNOR L1 BLOCKCHAIN (GHOSTDAG + DAG-RIDER) │ │
|
||||||
|
│ └─────────────────────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │
|
||||||
|
└─────────────────────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Milestone 1: GPU Compute Foundation (AI/ML Training & Inference)
|
||||||
|
|
||||||
|
### 1.1 GPU Node Registration
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/gpu/node.rs
|
||||||
|
|
||||||
|
/// GPU node capabilities
|
||||||
|
pub struct GpuNode {
|
||||||
|
/// Unique node ID
|
||||||
|
pub node_id: NodeId,
|
||||||
|
/// GPU specifications
|
||||||
|
pub gpus: Vec<GpuSpec>,
|
||||||
|
/// Total VRAM available (bytes)
|
||||||
|
pub total_vram: u64,
|
||||||
|
/// Available VRAM (bytes)
|
||||||
|
pub available_vram: u64,
|
||||||
|
/// CUDA compute capability (e.g., 8.6 for RTX 3090)
|
||||||
|
pub cuda_capability: Option<(u8, u8)>,
|
||||||
|
/// ROCm version (for AMD)
|
||||||
|
pub rocm_version: Option<String>,
|
||||||
|
/// Network bandwidth (Gbps)
|
||||||
|
pub bandwidth_gbps: u32,
|
||||||
|
/// Geographic region
|
||||||
|
pub region: Region,
|
||||||
|
/// Stake amount (for PoS validation)
|
||||||
|
pub stake: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct GpuSpec {
|
||||||
|
pub model: String, // "NVIDIA RTX 4090"
|
||||||
|
pub vram_gb: u32, // 24
|
||||||
|
pub tensor_cores: u32, // 512
|
||||||
|
pub cuda_cores: u32, // 16384
|
||||||
|
pub memory_bandwidth: u32, // 1008 GB/s
|
||||||
|
pub fp32_tflops: f32, // 82.6
|
||||||
|
pub fp16_tflops: f32, // 165.2
|
||||||
|
pub int8_tops: f32, // 330.4
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.2 AI/ML Job Specification
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/ai/job.rs
|
||||||
|
|
||||||
|
/// AI/ML training job specification
|
||||||
|
pub struct TrainingJob {
|
||||||
|
/// Job ID
|
||||||
|
pub job_id: JobId,
|
||||||
|
/// Owner address
|
||||||
|
pub owner: Address,
|
||||||
|
/// Framework (PyTorch, TensorFlow, JAX)
|
||||||
|
pub framework: MlFramework,
|
||||||
|
/// Model specification
|
||||||
|
pub model: ModelSpec,
|
||||||
|
/// Dataset reference (Synor Storage CID)
|
||||||
|
pub dataset_cid: Cid,
|
||||||
|
/// Training configuration
|
||||||
|
pub config: TrainingConfig,
|
||||||
|
/// Resource requirements
|
||||||
|
pub resources: GpuResources,
|
||||||
|
/// Maximum budget (SYNOR tokens)
|
||||||
|
pub max_budget: u64,
|
||||||
|
/// Checkpoint interval (steps)
|
||||||
|
pub checkpoint_interval: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct GpuResources {
|
||||||
|
pub min_gpus: u32,
|
||||||
|
pub max_gpus: u32,
|
||||||
|
pub min_vram_per_gpu: u64,
|
||||||
|
pub cuda_capability_min: Option<(u8, u8)>,
|
||||||
|
pub distributed: bool, // Multi-node training
|
||||||
|
pub priority: JobPriority,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum MlFramework {
|
||||||
|
PyTorch { version: String },
|
||||||
|
TensorFlow { version: String },
|
||||||
|
JAX { version: String },
|
||||||
|
ONNX,
|
||||||
|
Custom { image: String },
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct TrainingConfig {
|
||||||
|
pub epochs: u32,
|
||||||
|
pub batch_size: u32,
|
||||||
|
pub learning_rate: f32,
|
||||||
|
pub optimizer: String,
|
||||||
|
pub mixed_precision: bool,
|
||||||
|
pub gradient_accumulation: u32,
|
||||||
|
pub distributed_strategy: DistributedStrategy,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum DistributedStrategy {
|
||||||
|
DataParallel,
|
||||||
|
ModelParallel,
|
||||||
|
PipelineParallel,
|
||||||
|
ZeRO { stage: u8 }, // DeepSpeed ZeRO stages 1-3
|
||||||
|
FSDP, // Fully Sharded Data Parallel
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.3 Inference Service
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/ai/inference.rs
|
||||||
|
|
||||||
|
/// Inference endpoint specification
|
||||||
|
pub struct InferenceEndpoint {
|
||||||
|
/// Endpoint ID
|
||||||
|
pub endpoint_id: EndpointId,
|
||||||
|
/// Model reference (Synor Storage CID)
|
||||||
|
pub model_cid: Cid,
|
||||||
|
/// Model format
|
||||||
|
pub format: ModelFormat,
|
||||||
|
/// Scaling configuration
|
||||||
|
pub scaling: AutoscaleConfig,
|
||||||
|
/// GPU requirements per replica
|
||||||
|
pub gpu_per_replica: GpuResources,
|
||||||
|
/// Request timeout
|
||||||
|
pub timeout_ms: u32,
|
||||||
|
/// Max batch size for batching inference
|
||||||
|
pub max_batch_size: u32,
|
||||||
|
/// Batching timeout
|
||||||
|
pub batch_timeout_ms: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum ModelFormat {
|
||||||
|
PyTorch,
|
||||||
|
ONNX,
|
||||||
|
TensorRT,
|
||||||
|
Triton,
|
||||||
|
vLLM, // For LLM serving
|
||||||
|
TGI, // Text Generation Inference
|
||||||
|
Custom,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct AutoscaleConfig {
|
||||||
|
pub min_replicas: u32,
|
||||||
|
pub max_replicas: u32,
|
||||||
|
pub target_gpu_utilization: f32,
|
||||||
|
pub scale_up_threshold: f32,
|
||||||
|
pub scale_down_threshold: f32,
|
||||||
|
pub cooldown_seconds: u32,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.4 Pricing Model for GPU Compute
|
||||||
|
|
||||||
|
| Resource | Unit | Price (SYNOR/unit) |
|
||||||
|
|----------|------|-------------------|
|
||||||
|
| GPU (RTX 4090 equivalent) | hour | 0.50 |
|
||||||
|
| GPU (A100 80GB equivalent) | hour | 2.00 |
|
||||||
|
| GPU (H100 equivalent) | hour | 4.00 |
|
||||||
|
| VRAM | GB/hour | 0.01 |
|
||||||
|
| Network egress | GB | 0.05 |
|
||||||
|
| Storage (hot, NVMe) | GB/month | 0.10 |
|
||||||
|
| Inference requests | 1M tokens | 0.10 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Milestone 2: Container Orchestration (Docker/Kubernetes-Compatible)
|
||||||
|
|
||||||
|
### 2.1 Container Runtime
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/container/runtime.rs
|
||||||
|
|
||||||
|
/// Container specification (OCI-compatible)
|
||||||
|
pub struct ContainerSpec {
|
||||||
|
/// Image reference
|
||||||
|
pub image: ImageRef,
|
||||||
|
/// Resource limits
|
||||||
|
pub resources: ContainerResources,
|
||||||
|
/// Environment variables
|
||||||
|
pub env: HashMap<String, String>,
|
||||||
|
/// Volume mounts
|
||||||
|
pub volumes: Vec<VolumeMount>,
|
||||||
|
/// Network configuration
|
||||||
|
pub network: NetworkConfig,
|
||||||
|
/// Security context
|
||||||
|
pub security: SecurityContext,
|
||||||
|
/// Health check
|
||||||
|
pub health_check: Option<HealthCheck>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct ContainerResources {
|
||||||
|
pub cpu_cores: f32, // 0.5, 1.0, 2.0, etc.
|
||||||
|
pub memory_mb: u64,
|
||||||
|
pub gpu: Option<GpuAllocation>,
|
||||||
|
pub ephemeral_storage_gb: u32,
|
||||||
|
pub network_bandwidth_mbps: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct GpuAllocation {
|
||||||
|
pub count: u32,
|
||||||
|
pub vram_mb: u64,
|
||||||
|
pub shared: bool, // Allow GPU sharing via MPS/MIG
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.2 Service Mesh & Networking
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/network/mesh.rs
|
||||||
|
|
||||||
|
/// Service definition for container orchestration
|
||||||
|
pub struct Service {
|
||||||
|
pub service_id: ServiceId,
|
||||||
|
pub name: String,
|
||||||
|
pub containers: Vec<ContainerSpec>,
|
||||||
|
pub replicas: ReplicaConfig,
|
||||||
|
pub load_balancer: LoadBalancerConfig,
|
||||||
|
pub service_mesh: ServiceMeshConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct ServiceMeshConfig {
|
||||||
|
pub mtls_enabled: bool,
|
||||||
|
pub traffic_policy: TrafficPolicy,
|
||||||
|
pub circuit_breaker: CircuitBreakerConfig,
|
||||||
|
pub retry_policy: RetryPolicy,
|
||||||
|
pub rate_limit: Option<RateLimitConfig>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct LoadBalancerConfig {
|
||||||
|
pub algorithm: LoadBalancerAlgorithm,
|
||||||
|
pub health_check: HealthCheck,
|
||||||
|
pub sticky_sessions: bool,
|
||||||
|
pub ssl_termination: SslTermination,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum LoadBalancerAlgorithm {
|
||||||
|
RoundRobin,
|
||||||
|
LeastConnections,
|
||||||
|
WeightedRoundRobin { weights: Vec<u32> },
|
||||||
|
IPHash,
|
||||||
|
Random,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.3 Container Pricing
|
||||||
|
|
||||||
|
| Resource | Unit | Price (SYNOR/unit) |
|
||||||
|
|----------|------|-------------------|
|
||||||
|
| CPU | core/hour | 0.02 |
|
||||||
|
| Memory | GB/hour | 0.005 |
|
||||||
|
| Ephemeral storage | GB/hour | 0.001 |
|
||||||
|
| Network ingress | GB | FREE |
|
||||||
|
| Network egress | GB | 0.05 |
|
||||||
|
| Load balancer | hour | 0.01 |
|
||||||
|
| Static IP | month | 2.00 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Milestone 3: Persistent Virtual Machines (OS Hosting)
|
||||||
|
|
||||||
|
### 3.1 MicroVM Architecture (Firecracker-based)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/vm/microvm.rs
|
||||||
|
|
||||||
|
/// Virtual machine specification
|
||||||
|
pub struct VmSpec {
|
||||||
|
/// VM ID
|
||||||
|
pub vm_id: VmId,
|
||||||
|
/// Owner address
|
||||||
|
pub owner: Address,
|
||||||
|
/// VM size
|
||||||
|
pub size: VmSize,
|
||||||
|
/// Boot image
|
||||||
|
pub image: VmImage,
|
||||||
|
/// Persistent volumes
|
||||||
|
pub volumes: Vec<PersistentVolume>,
|
||||||
|
/// Network configuration
|
||||||
|
pub network: VmNetworkConfig,
|
||||||
|
/// SSH keys for access
|
||||||
|
pub ssh_keys: Vec<SshPublicKey>,
|
||||||
|
/// Cloud-init user data
|
||||||
|
pub user_data: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct VmSize {
|
||||||
|
pub vcpus: u32,
|
||||||
|
pub memory_gb: u32,
|
||||||
|
pub gpu: Option<GpuPassthrough>,
|
||||||
|
pub network_bandwidth_gbps: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct GpuPassthrough {
|
||||||
|
pub count: u32,
|
||||||
|
pub model: GpuModel,
|
||||||
|
pub vram_gb: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum VmImage {
|
||||||
|
/// Pre-built images
|
||||||
|
Marketplace { image_id: String, version: String },
|
||||||
|
/// Custom image from Synor Storage
|
||||||
|
Custom { cid: Cid, format: ImageFormat },
|
||||||
|
/// Standard OS images
|
||||||
|
Ubuntu { version: String },
|
||||||
|
Debian { version: String },
|
||||||
|
AlmaLinux { version: String },
|
||||||
|
Windows { version: String, license: WindowsLicense },
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct PersistentVolume {
|
||||||
|
pub volume_id: VolumeId,
|
||||||
|
pub size_gb: u32,
|
||||||
|
pub volume_type: VolumeType,
|
||||||
|
pub mount_path: String,
|
||||||
|
pub encrypted: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Storage classes available for persistent volumes.
pub enum VolumeType {
    /// High-performance NVMe SSD with provisioned IOPS/throughput.
    NvmeSsd { iops: u32, throughput_mbps: u32 },
    /// Standard SSD.
    Ssd,
    /// HDD, intended for archival workloads.
    Hdd,
    /// Distributed storage backed by the Synor Storage L2,
    /// replicated `replication` times.
    Distributed { replication: u8 },
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.2 VM Lifecycle Management
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/vm/lifecycle.rs
|
||||||
|
|
||||||
|
/// Lifecycle states of a VM.
///
/// Typical flow: `Pending` → `Provisioning` → `Running`, with
/// stop/hibernate/migrate excursions, ending in `Terminated`.
pub enum VmState {
    /// Accepted but not yet scheduled.
    Pending,
    /// Being provisioned on a node.
    Provisioning,
    /// Up and serving.
    Running,
    /// Shutdown in progress.
    Stopping,
    /// Halted; disk state preserved.
    Stopped,
    /// Memory snapshot being written out.
    Hibernating,
    /// Fully hibernated to storage.
    Hibernated,
    /// Live migration to another node in progress.
    Migrating,
    /// Unrecoverable error state.
    Failed,
    /// Deleted; resources released.
    Terminated,
}
|
||||||
|
|
||||||
|
pub struct VmManager {
|
||||||
|
/// Active VMs
|
||||||
|
vms: HashMap<VmId, VmInstance>,
|
||||||
|
/// Node assignments
|
||||||
|
node_assignments: HashMap<VmId, NodeId>,
|
||||||
|
/// Live migration coordinator
|
||||||
|
migration_coordinator: MigrationCoordinator,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VmManager {
|
||||||
|
/// Start a new VM
|
||||||
|
pub async fn create(&self, spec: VmSpec) -> Result<VmId, VmError>;
|
||||||
|
|
||||||
|
/// Stop a VM (preserves state)
|
||||||
|
pub async fn stop(&self, vm_id: &VmId) -> Result<(), VmError>;
|
||||||
|
|
||||||
|
/// Start a stopped VM
|
||||||
|
pub async fn start(&self, vm_id: &VmId) -> Result<(), VmError>;
|
||||||
|
|
||||||
|
/// Hibernate VM to storage (saves memory state)
|
||||||
|
pub async fn hibernate(&self, vm_id: &VmId) -> Result<(), VmError>;
|
||||||
|
|
||||||
|
/// Live migrate VM to another node
|
||||||
|
pub async fn migrate(&self, vm_id: &VmId, target_node: NodeId) -> Result<(), VmError>;
|
||||||
|
|
||||||
|
/// Resize VM (requires restart)
|
||||||
|
pub async fn resize(&self, vm_id: &VmId, new_size: VmSize) -> Result<(), VmError>;
|
||||||
|
|
||||||
|
/// Snapshot VM state
|
||||||
|
pub async fn snapshot(&self, vm_id: &VmId) -> Result<SnapshotId, VmError>;
|
||||||
|
|
||||||
|
/// Terminate and delete VM
|
||||||
|
pub async fn terminate(&self, vm_id: &VmId) -> Result<(), VmError>;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.3 VM Pricing

| VM Type | vCPUs | Memory | Storage | GPU | Price (SYNOR/month) |
|---------|-------|--------|---------|-----|---------------------|
| micro | 1 | 1 GB | 20 GB SSD | - | 5 |
| small | 2 | 4 GB | 50 GB SSD | - | 15 |
| medium | 4 | 8 GB | 100 GB SSD | - | 30 |
| large | 8 | 32 GB | 200 GB SSD | - | 80 |
| xlarge | 16 | 64 GB | 500 GB NVMe | - | 200 |
| gpu-small | 8 | 32 GB | 200 GB NVMe | 1x RTX 4090 | 400 |
| gpu-medium | 16 | 64 GB | 500 GB NVMe | 2x RTX 4090 | 750 |
| gpu-large | 32 | 128 GB | 1 TB NVMe | 4x A100 80GB | 2500 |
| gpu-xlarge | 64 | 256 GB | 2 TB NVMe | 8x H100 | 8000 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Milestone 4: Serverless Functions (FaaS)
|
||||||
|
|
||||||
|
### 4.1 Function Specification
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/serverless/function.rs
|
||||||
|
|
||||||
|
/// Serverless function definition
|
||||||
|
pub struct Function {
|
||||||
|
pub function_id: FunctionId,
|
||||||
|
pub owner: Address,
|
||||||
|
pub name: String,
|
||||||
|
pub runtime: FunctionRuntime,
|
||||||
|
pub handler: String,
|
||||||
|
pub code: FunctionCode,
|
||||||
|
pub resources: FunctionResources,
|
||||||
|
pub triggers: Vec<FunctionTrigger>,
|
||||||
|
pub environment: HashMap<String, String>,
|
||||||
|
pub timeout_ms: u32,
|
||||||
|
pub concurrency: ConcurrencyConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Supported serverless language runtimes.
pub enum FunctionRuntime {
    /// Node.js 20.
    Node20,
    /// Node.js 22.
    Node22,
    /// Python 3.11.
    Python311,
    /// Python 3.12.
    Python312,
    /// Native Rust.
    Rust,
    /// Go 1.22.
    Go122,
    /// Java 21.
    Java21,
    /// .NET 8.
    Dotnet8,
    /// Ruby 3.3.
    Ruby33,
    /// Bring-your-own runtime via container image.
    Custom { image: String },
}
|
||||||
|
|
||||||
|
pub struct FunctionCode {
|
||||||
|
/// Source code CID in Synor Storage
|
||||||
|
pub cid: Cid,
|
||||||
|
/// Entry point file
|
||||||
|
pub entry_point: String,
|
||||||
|
/// Dependencies (package.json, requirements.txt, etc.)
|
||||||
|
pub dependencies: Option<Cid>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct FunctionResources {
|
||||||
|
pub memory_mb: u32, // 128, 256, 512, 1024, 2048, 4096, 8192
|
||||||
|
pub cpu_allocation: f32, // Proportional to memory
|
||||||
|
pub ephemeral_storage_mb: u32,
|
||||||
|
pub gpu: Option<GpuAllocation>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum FunctionTrigger {
|
||||||
|
/// HTTP endpoint
|
||||||
|
Http { path: String, methods: Vec<HttpMethod> },
|
||||||
|
/// Scheduled execution (cron)
|
||||||
|
Schedule { cron: String },
|
||||||
|
/// Event from message queue
|
||||||
|
Queue { queue_name: String },
|
||||||
|
/// Storage events
|
||||||
|
Storage { bucket: String, events: Vec<StorageEvent> },
|
||||||
|
/// Blockchain events
|
||||||
|
Blockchain { contract: Address, events: Vec<String> },
|
||||||
|
/// Webhook
|
||||||
|
Webhook { url: String },
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.2 Cold Start Optimization
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/serverless/warmup.rs
|
||||||
|
|
||||||
|
/// Warmup strategy for mitigating serverless cold starts.
pub struct WarmupConfig {
    /// Floor of instances kept warm at all times.
    pub min_instances: u32,
    /// Number of pre-initialised instances held ready.
    pub provisioned_concurrency: u32,
    /// Optional cron expression for scheduled warmups.
    pub warmup_schedule: Option<String>,
    /// Enable snapshot-based cold start (SnapStart-style restore).
    pub snapstart_enabled: bool,
}
|
||||||
|
|
||||||
|
pub struct ColdStartOptimizer {
|
||||||
|
/// Pre-warmed function pools
|
||||||
|
pools: HashMap<FunctionRuntime, WarmPool>,
|
||||||
|
/// Snapshot cache
|
||||||
|
snapshots: LruCache<FunctionId, FunctionSnapshot>,
|
||||||
|
/// Prediction model for scaling
|
||||||
|
predictor: ScalingPredictor,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ColdStartOptimizer {
|
||||||
|
/// Get a warm instance or create one
|
||||||
|
pub async fn get_instance(&self, function: &Function) -> Result<FunctionInstance, Error> {
|
||||||
|
// Try snapshot restore first (< 100ms)
|
||||||
|
if let Some(snapshot) = self.snapshots.get(&function.function_id) {
|
||||||
|
return self.restore_from_snapshot(snapshot).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try warm pool (< 50ms)
|
||||||
|
if let Some(instance) = self.pools.get(&function.runtime)?.get_warm() {
|
||||||
|
return Ok(instance);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cold start (1-5s depending on runtime)
|
||||||
|
self.cold_start(function).await
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.3 Serverless Pricing

| Resource | Unit | Price (SYNOR) |
|----------|------|---------------|
| Invocations | 1M requests | 0.20 |
| Duration | GB-second | 0.00001 |
| Provisioned concurrency | GB-hour | 0.01 |
| HTTP Gateway | 1M requests | 0.10 |
| Event bridge | 1M events | 0.50 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Milestone 5: Edge Compute
|
||||||
|
|
||||||
|
### 5.1 Edge Node Architecture
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/edge/node.rs
|
||||||
|
|
||||||
|
/// Edge compute node
|
||||||
|
pub struct EdgeNode {
|
||||||
|
pub node_id: NodeId,
|
||||||
|
pub location: GeoLocation,
|
||||||
|
pub capabilities: EdgeCapabilities,
|
||||||
|
pub latency_zones: Vec<LatencyZone>,
|
||||||
|
pub resources: EdgeResources,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Feature flags advertising what an edge node can run.
pub struct EdgeCapabilities {
    /// Can execute WASM workloads.
    pub wasm_runtime: bool,
    /// Can run containers.
    pub container_runtime: bool,
    /// Has a GPU usable for inference.
    pub gpu_inference: bool,
    /// Can transcode video.
    pub video_transcoding: bool,
    /// Can act as a CDN cache.
    pub cdn_cache: bool,
}
|
||||||
|
|
||||||
|
pub struct EdgeResources {
|
||||||
|
pub cpu_cores: u32,
|
||||||
|
pub memory_gb: u32,
|
||||||
|
pub storage_gb: u32,
|
||||||
|
pub gpu: Option<EdgeGpu>,
|
||||||
|
pub bandwidth_gbps: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Edge function for low-latency compute
|
||||||
|
pub struct EdgeFunction {
|
||||||
|
pub function_id: FunctionId,
|
||||||
|
pub code: WasmModule,
|
||||||
|
pub memory_limit: u32,
|
||||||
|
pub timeout_ms: u32,
|
||||||
|
pub allowed_regions: Vec<Region>,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5.2 Edge Use Cases
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/edge/usecases.rs
|
||||||
|
|
||||||
|
/// CDN with compute at edge
|
||||||
|
pub struct EdgeCdn {
|
||||||
|
/// Origin servers
|
||||||
|
origins: Vec<Origin>,
|
||||||
|
/// Cache rules
|
||||||
|
cache_rules: Vec<CacheRule>,
|
||||||
|
/// Edge workers for request/response transformation
|
||||||
|
workers: Vec<EdgeWorker>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Real-time inference at edge
|
||||||
|
pub struct EdgeInference {
|
||||||
|
/// Model optimized for edge (quantized, pruned)
|
||||||
|
model_id: ModelId,
|
||||||
|
/// Inference runtime (TensorRT, ONNX Runtime)
|
||||||
|
runtime: EdgeInferenceRuntime,
|
||||||
|
/// Max batch size
|
||||||
|
max_batch: u32,
|
||||||
|
/// Target latency
|
||||||
|
target_latency_ms: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Video processing at edge
|
||||||
|
pub struct EdgeVideoProcessor {
|
||||||
|
/// Transcoding profiles
|
||||||
|
profiles: Vec<TranscodingProfile>,
|
||||||
|
/// Real-time streaming
|
||||||
|
live_streaming: bool,
|
||||||
|
/// Adaptive bitrate
|
||||||
|
abr_enabled: bool,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5.3 Edge Pricing

| Resource | Unit | Price (SYNOR) |
|----------|------|---------------|
| Edge function invocations | 1M | 0.50 |
| Edge function duration | GB-second | 0.00002 |
| Edge bandwidth | GB | 0.08 |
| Edge cache storage | GB/month | 0.02 |
| Video transcoding | minute | 0.02 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Milestone 6: Node Provider Economics
|
||||||
|
|
||||||
|
### 6.1 Provider Registration
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// synor-compute/src/provider/registration.rs
|
||||||
|
|
||||||
|
/// Compute provider registration
|
||||||
|
pub struct ProviderRegistration {
|
||||||
|
pub provider_id: ProviderId,
|
||||||
|
pub owner: Address,
|
||||||
|
/// Stake required to become provider
|
||||||
|
pub stake: u64,
|
||||||
|
/// Hardware specifications
|
||||||
|
pub hardware: HardwareManifest,
|
||||||
|
/// Network connectivity
|
||||||
|
pub network: NetworkManifest,
|
||||||
|
/// Geographic location
|
||||||
|
pub location: GeoLocation,
|
||||||
|
/// Availability SLA commitment
|
||||||
|
pub sla: SlaCommitment,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct HardwareManifest {
|
||||||
|
pub cpus: Vec<CpuSpec>,
|
||||||
|
pub memory_total_gb: u64,
|
||||||
|
pub gpus: Vec<GpuSpec>,
|
||||||
|
pub storage: Vec<StorageSpec>,
|
||||||
|
pub verified: bool, // Hardware attestation passed
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Service-level commitment a provider makes at registration.
pub struct SlaCommitment {
    /// Promised uptime, e.g. 99.9 or 99.99.
    pub uptime_percent: f32,
    /// Promised response time in milliseconds.
    pub response_time_ms: u32,
    /// Promised data durability (fraction, e.g. 0.999999).
    pub data_durability: f32,
    /// Stake penalty rate applied on SLA violation.
    pub penalty_rate: f32,
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.2 Provider Revenue Model

| Revenue Source | Provider Share | Protocol Share |
|----------------|----------------|----------------|
| Compute fees | 85% | 15% |
| Storage fees | 80% | 20% |
| Network fees | 75% | 25% |
| SLA bonuses | 100% | 0% |
| Staking rewards | 100% | 0% |

### 6.3 Slashing Conditions

| Violation | Penalty |
|-----------|---------|
| Downtime > committed SLA | 1% stake per hour |
| Data loss | 10% stake + compensation |
| Malicious behavior | 100% stake |
| False hardware attestation | 50% stake |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation Timeline
|
||||||
|
|
||||||
|
### Phase 11.1: Foundation (Weeks 1-4)
|
||||||
|
- [ ] Node registration and hardware attestation
|
||||||
|
- [ ] Basic job scheduler
|
||||||
|
- [ ] WASM runtime integration (existing)
|
||||||
|
- [ ] Container runtime (containerd)
|
||||||
|
- [ ] Network overlay (WireGuard mesh)
|
||||||
|
|
||||||
|
### Phase 11.2: GPU Compute (Weeks 5-8)
|
||||||
|
- [ ] GPU node registration
|
||||||
|
- [ ] NVIDIA driver integration
|
||||||
|
- [ ] CUDA runtime support
|
||||||
|
- [ ] Basic ML job execution
|
||||||
|
- [ ] Model storage integration
|
||||||
|
|
||||||
|
### Phase 11.3: Container Orchestration (Weeks 9-12)
|
||||||
|
- [ ] OCI image support
|
||||||
|
- [ ] Service deployment
|
||||||
|
- [ ] Load balancing
|
||||||
|
- [ ] Auto-scaling
|
||||||
|
- [ ] Service mesh (mTLS)
|
||||||
|
|
||||||
|
### Phase 11.4: Persistent VMs (Weeks 13-16)
|
||||||
|
- [ ] MicroVM runtime (Firecracker)
|
||||||
|
- [ ] VM lifecycle management
|
||||||
|
- [ ] Persistent storage
|
||||||
|
- [ ] Live migration
|
||||||
|
- [ ] Snapshot/restore
|
||||||
|
|
||||||
|
### Phase 11.5: Serverless (Weeks 17-20)
|
||||||
|
- [ ] Function deployment
|
||||||
|
- [ ] Cold start optimization
|
||||||
|
- [ ] Event triggers
|
||||||
|
- [ ] API gateway
|
||||||
|
- [ ] Monitoring/logging
|
||||||
|
|
||||||
|
### Phase 11.6: Edge Compute (Weeks 21-24)
|
||||||
|
- [ ] Edge node registration
|
||||||
|
- [ ] Edge function runtime
|
||||||
|
- [ ] CDN integration
|
||||||
|
- [ ] Edge inference
|
||||||
|
- [ ] Global anycast
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
### Isolation Levels
|
||||||
|
|
||||||
|
| Workload Type | Isolation Technology | Security Level |
|
||||||
|
|---------------|---------------------|----------------|
|
||||||
|
| WASM | Wasmtime sandbox | High |
|
||||||
|
| Serverless | gVisor + seccomp | High |
|
||||||
|
| Containers | gVisor or Kata | Medium-High |
|
||||||
|
| VMs | Firecracker MicroVM | High |
|
||||||
|
| GPU | NVIDIA MIG/MPS | Medium |
|
||||||
|
|
||||||
|
### Network Security
|
||||||
|
|
||||||
|
- All inter-node traffic encrypted (WireGuard)
|
||||||
|
- mTLS for service-to-service communication
|
||||||
|
- Network policies for workload isolation
|
||||||
|
- DDoS protection at edge
|
||||||
|
|
||||||
|
### Data Security
|
||||||
|
|
||||||
|
- Encryption at rest (AES-256)
|
||||||
|
- Encryption in transit (TLS 1.3)
|
||||||
|
- Confidential computing support (AMD SEV, Intel SGX)
|
||||||
|
- Secure key management (HSM integration)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## API Examples
|
||||||
|
|
||||||
|
### Deploy AI Training Job
|
||||||
|
|
||||||
|
```bash
|
||||||
|
synor compute train create \
|
||||||
|
--framework pytorch \
|
||||||
|
--model-config ./model.yaml \
|
||||||
|
--dataset synor://datasets/imagenet \
|
||||||
|
--gpus 8 \
|
||||||
|
--gpu-type h100 \
|
||||||
|
--distributed ddp \
|
||||||
|
--epochs 100 \
|
||||||
|
--checkpoint-interval 1000 \
|
||||||
|
--max-budget 1000
|
||||||
|
```
|
||||||
|
|
||||||
|
### Deploy Inference Endpoint
|
||||||
|
|
||||||
|
```bash
|
||||||
|
synor compute inference deploy \
|
||||||
|
--model synor://models/llama-70b \
|
||||||
|
--format vllm \
|
||||||
|
--min-replicas 2 \
|
||||||
|
--max-replicas 10 \
|
||||||
|
--gpu-per-replica 2 \
|
||||||
|
--target-utilization 0.7
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create Persistent VM
|
||||||
|
|
||||||
|
```bash
|
||||||
|
synor compute vm create \
|
||||||
|
--name my-dev-server \
|
||||||
|
--image ubuntu:22.04 \
|
||||||
|
--size gpu-small \
|
||||||
|
--volume 100gb:nvme:/data \
|
||||||
|
--ssh-key ~/.ssh/id_ed25519.pub \
|
||||||
|
--region us-east
|
||||||
|
```
|
||||||
|
|
||||||
|
### Deploy Container Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
synor compute service deploy \
|
||||||
|
--name my-api \
|
||||||
|
--image my-registry/my-api:latest \
|
||||||
|
--replicas 3 \
|
||||||
|
--cpu 2 \
|
||||||
|
--memory 4gb \
|
||||||
|
--port 8080 \
|
||||||
|
--health-check /health \
|
||||||
|
--autoscale 2-10
|
||||||
|
```
|
||||||
|
|
||||||
|
### Deploy Serverless Function
|
||||||
|
|
||||||
|
```bash
|
||||||
|
synor compute function deploy \
|
||||||
|
--name process-image \
|
||||||
|
--runtime python312 \
|
||||||
|
--handler main.handler \
|
||||||
|
--code ./function \
|
||||||
|
--memory 1024 \
|
||||||
|
--timeout 30000 \
|
||||||
|
--trigger http:/api/process
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Comparison with Existing Synor VM

| Feature | Current Synor VM | Synor Compute L2 |
|---------|------------------|------------------|
| Runtime | WASM only | WASM, Container, MicroVM |
| Timeout | 30 seconds | Unlimited (VMs) |
| Memory | 16 MB max | Up to 256 GB |
| GPU | ❌ | ✅ Full CUDA/ROCm |
| Networking | ❌ | ✅ Full TCP/UDP |
| File I/O | ❌ | ✅ Persistent volumes |
| Threading | ❌ | ✅ Multi-threaded |
| AI/ML | ❌ | ✅ Training + Inference |
| OS Hosting | ❌ | ✅ Full Linux/Windows |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
1. **Milestone 1**: Implement GPU node registration and attestation
|
||||||
|
2. **Milestone 2**: Build basic job scheduler with resource allocation
|
||||||
|
3. **Milestone 3**: Integrate containerd for container workloads
|
||||||
|
4. **Milestone 4**: Add Firecracker for MicroVM support
|
||||||
|
5. **Milestone 5**: Implement serverless function runtime
|
||||||
|
6. **Milestone 6**: Deploy edge nodes and CDN integration
|
||||||
|
|
||||||
|
This plan transforms Synor from a smart contract platform into a full-stack decentralized cloud provider capable of competing with AWS/GCP/Azure while maintaining decentralization and censorship resistance.
|
||||||
Loading…
Add table
Reference in a new issue