// File: synor/crates/synor-compute/src/lib.rs
// (export metadata: 2026-02-02 05:58:22 +05:30 · 639 lines · 20 KiB · Rust)

//! Synor Compute L2 - Heterogeneous Multi-Processor Compute Platform
//!
//! Provides decentralized compute services with:
//!
//! - **Heterogeneous Scheduling**: CPU + GPU + TPU + NPU + LPU working simultaneously
//! - **Consumer Device Mesh**: Mobile, browser, desktop devices contributing compute
//! - **90% Cost Reduction**: Zero margins, spot markets, electricity arbitrage
//! - **10x Speed**: Caching, speculative execution, optimal processor assignment
//!
//! # Architecture
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────────────────────┐
//! │ SYNOR COMPUTE L2 │
//! ├─────────────────────────────────────────────────────────────────────────────┤
//! │ │
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
//! │ │ TASK DECOMPOSER │ │
//! │ │ Analyzes workload → Identifies subtasks → Maps to optimal processors │ │
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
//! │ │ │
//! │ ▼ │
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
//! │ │ HETEROGENEOUS SCHEDULER │ │
//! │ │ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ │
//! │ │ │ CPU │ │ GPU │ │ TPU │ │ NPU │ │ LPU │ │Custom│ │ │
//! │ │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │Queue │ │ │
//! │ │ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ └──────┘ │ │
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
//! │ │
//! │ ┌─────────────────────────────────────────────────────────────────────────┐ │
//! │ │ UNIFIED MEMORY FABRIC │ │
//! │ │ Zero-copy data sharing │ Automatic placement │ Cache coherency │ │
//! │ └─────────────────────────────────────────────────────────────────────────┘ │
//! │ │
//! └─────────────────────────────────────────────────────────────────────────────┘
//! ```
//!
//! # Pricing
//!
//! | Resource | Unit | Price (SYNOR) |
//! |----------|------|---------------|
//! | GPU (consumer) | hour | 0.10 |
//! | GPU (datacenter) | hour | 0.50-4.00 |
//! | CPU | core/hour | 0.02 |
//! | Memory | GB/hour | 0.005 |
//! | Inference | 1M tokens | 0.10 |
#![allow(dead_code)]
pub mod device;
pub mod error;
pub mod market;
pub mod memory;
pub mod model;
pub mod processor;
pub mod scheduler;
pub mod task;
pub use device::{
DeviceCapabilities, DeviceId, DeviceInfo, DeviceRegistry, DeviceStatus, DeviceType,
};
pub use error::ComputeError;
pub use market::{
Auction, AuctionId, CloudComparison, CpuTier as MarketCpuTier, GpuTier as MarketGpuTier,
MarketStats, Order, OrderBook, OrderId, OrderSide, OrderType, PricingEngine, ProviderListing,
ResourceType, SpotMarket, Trade,
};
pub use memory::{MemoryManager, TensorHandle, TransferPath, UnifiedMemory};
pub use model::{
ModelCategory, ModelFormat, ModelId, ModelInfo, ModelRegistry, ModelUploadRequest,
ModelUploadResponse,
};
pub use processor::{
ComputeThroughput, CpuVariant, GpuVariant, NpuVariant, Operation, OperationType, Processor,
ProcessorCapabilities, ProcessorId, ProcessorType, TpuVersion,
};
pub use scheduler::{
HeterogeneousScheduler, LoadBalancer, Schedule, ScheduleResult, TaskAssignment, WorkQueue,
};
pub use task::{
ComputeTask, DecomposedWorkload, Task, TaskDecomposer, TaskId, TaskPriority, TaskResult,
TaskStatus,
};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use parking_lot::RwLock;
/// Compute node identifier.
///
/// Newtype over `u64` so node ids cannot be confused with other numeric
/// ids (e.g. processor ids) at compile time.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct NodeId(pub u64);
impl std::fmt::Display for NodeId {
    /// Formats the id as `node_<n>`, e.g. `node_42`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "node_{}", self.0)
    }
}
/// Job identifier.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct JobId(pub [u8; 32]);

impl JobId {
    /// Creates a new job ID from 32 random bytes.
    pub fn new() -> Self {
        use rand::Rng;
        JobId(rand::thread_rng().gen::<[u8; 32]>())
    }

    /// Creates a job ID from raw bytes.
    pub fn from_bytes(bytes: [u8; 32]) -> Self {
        JobId(bytes)
    }
}

impl Default for JobId {
    /// A random id, same as [`JobId::new`].
    fn default() -> Self {
        JobId::new()
    }
}

impl std::fmt::Display for JobId {
    /// Formats as `job_` followed by the first 8 bytes in lowercase hex
    /// (16 hex digits), matching `hex::encode(&self.0[..8])`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "job_")?;
        for byte in &self.0[..8] {
            write!(f, "{:02x}", byte)?;
        }
        Ok(())
    }
}
/// Compute job specification.
///
/// The immutable description a client submits; execution state is tracked
/// separately via [`JobStatus`].
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ComputeJob {
    /// Job ID.
    pub id: JobId,
    /// Owner address (32 raw bytes — presumably an on-chain account key;
    /// confirm with the chain layer).
    pub owner: [u8; 32],
    /// Job type.
    pub job_type: JobType,
    /// Resource requirements.
    pub resources: ResourceRequirements,
    /// Input data reference (CID).
    pub input_cid: Option<String>,
    /// Maximum budget (in atomic SYNOR).
    pub max_budget: u64,
    /// Priority level.
    pub priority: JobPriority,
    /// Created timestamp (unit not shown here — presumably Unix time;
    /// confirm with callers).
    pub created_at: u64,
    /// Deadline (optional; same unit as `created_at`).
    pub deadline: Option<u64>,
}
/// Job type classification.
///
/// `*_cid` fields reference content-addressed data stored off-node
/// (presumably IPFS-style CIDs — confirm with the storage layer).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum JobType {
    /// AI/ML training job.
    Training {
        /// Framework to run the training loop under.
        framework: MlFramework,
        /// CID of the model/initial weights.
        model_cid: String,
        /// CID of the training dataset.
        dataset_cid: String,
        /// Number of training epochs.
        epochs: u32,
        /// Per-step batch size.
        batch_size: u32,
    },
    /// AI/ML inference job.
    Inference {
        /// CID of the model to serve.
        model_cid: String,
        /// Input encoding, e.g. "json".
        input_format: String,
        /// Inference batch size.
        batch_size: u32,
    },
    /// Container workload.
    Container {
        /// Container image reference.
        image: String,
        /// Command and arguments to run.
        command: Vec<String>,
        /// Environment variables.
        env: HashMap<String, String>,
    },
    /// Serverless function.
    Serverless {
        /// Runtime to execute under.
        runtime: FunctionRuntime,
        /// CID of the function code bundle.
        code_cid: String,
        /// Entry-point handler name.
        handler: String,
    },
    /// General compute (WASM).
    Wasm {
        /// CID of the WASM module.
        module_cid: String,
        /// Exported function to invoke.
        entrypoint: String,
    },
}
/// ML framework specification.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum MlFramework {
    /// PyTorch, pinned to a specific release.
    PyTorch { version: String },
    /// TensorFlow, pinned to a specific release.
    TensorFlow { version: String },
    /// JAX, pinned to a specific release.
    JAX { version: String },
    /// ONNX model execution (no version pin).
    ONNX,
}
/// Function runtime.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum FunctionRuntime {
    /// Node.js 20.x.
    Node20,
    /// Python 3.12.
    Python312,
    /// Rust runtime.
    Rust,
    /// Go runtime.
    Go,
    /// Custom runtime backed by a container image.
    Custom { image: String },
}
/// Job priority levels.
///
/// Discriminants are ordered so higher-priority variants compare greater:
/// `Background < Normal < High < Critical`.
///
/// The default is [`JobPriority::Normal`], expressed with the derived
/// `Default` + `#[default]` (Rust 1.62+) instead of a hand-written impl.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum JobPriority {
    /// Background job, can be preempted.
    Background = 0,
    /// Normal priority.
    #[default]
    Normal = 1,
    /// High priority, faster scheduling.
    High = 2,
    /// Critical, guaranteed resources.
    Critical = 3,
}
/// Resource requirements for a job.
///
/// `Default` yields an unconstrained request: zero CPU/memory minimums,
/// no GPU, no processor preference, no latency bound, non-distributed.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ResourceRequirements {
    /// Minimum CPU cores (fractional cores allowed).
    pub min_cpu_cores: f32,
    /// Minimum memory (GB).
    pub min_memory_gb: f32,
    /// GPU requirements (`None` = no GPU needed).
    pub gpu: Option<GpuRequirements>,
    /// Preferred processor types (in priority order).
    pub preferred_processors: Vec<ProcessorType>,
    /// Maximum latency (ms) - for inference.
    pub max_latency_ms: Option<u32>,
    /// Requires distributed execution.
    pub distributed: bool,
}
/// GPU resource requirements.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GpuRequirements {
    /// Minimum number of GPUs.
    pub min_count: u32,
    /// Maximum number of GPUs.
    pub max_count: u32,
    /// Minimum VRAM per GPU (GB).
    pub min_vram_gb: u32,
    /// Minimum compute capability as (major, minor) — CUDA-style
    /// versioning; presumably ignored for non-NVIDIA GPUs. Confirm.
    pub min_compute_capability: Option<(u8, u8)>,
    /// Allow GPU sharing (MPS/MIG).
    pub allow_sharing: bool,
}
/// Job execution status.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum JobStatus {
    /// Queued, waiting for resources.
    Queued,
    /// Resources allocated, starting.
    Starting,
    /// Running.
    Running {
        /// Completion fraction — presumably in [0.0, 1.0]; confirm with
        /// whatever component reports progress.
        progress: f32,
        /// Nodes currently executing this job's tasks.
        assigned_nodes: Vec<NodeId>,
    },
    /// Completed successfully.
    Completed {
        /// CID of the output data.
        result_cid: String,
        /// Wall-clock duration in milliseconds.
        duration_ms: u64,
        /// Final cost in atomic SYNOR.
        cost: u64,
    },
    /// Failed.
    Failed {
        /// Human-readable failure description.
        error: String,
    },
    /// Cancelled by user.
    Cancelled,
}
/// Compute node registration.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ComputeNode {
    /// Node ID.
    pub id: NodeId,
    /// Owner address (32 raw bytes, same format as `ComputeJob::owner`).
    pub owner: [u8; 32],
    /// Available processors.
    pub processors: Vec<ProcessorInfo>,
    /// Total memory (GB).
    pub total_memory_gb: f32,
    /// Available (unallocated) memory (GB).
    pub available_memory_gb: f32,
    /// Network bandwidth (Gbps).
    pub bandwidth_gbps: f32,
    /// Geographic region (free-form string, e.g. "us-east").
    pub region: String,
    /// Stake amount (for PoS).
    pub stake: u64,
    /// Reputation score (0-100).
    pub reputation: u32,
    /// Current status.
    pub status: NodeStatus,
}
/// Processor information on a node.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorInfo {
    /// Processor ID (local to node).
    pub id: ProcessorId,
    /// Processor type.
    pub processor_type: ProcessorType,
    /// Capabilities.
    pub capabilities: ProcessorCapabilities,
    /// Current utilization (0.0 - 1.0).
    pub utilization: f32,
    /// Current temperature (Celsius), if the node reports one.
    pub temperature: Option<f32>,
}
/// Node status.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum NodeStatus {
    /// Online and accepting jobs.
    Online,
    /// Online but not accepting new jobs (existing work drains out).
    Draining,
    /// Offline.
    Offline,
    /// Maintenance mode.
    Maintenance,
}
/// Compute cluster manager.
///
/// Owns the node and job tables behind `parking_lot::RwLock`s, and shares
/// the registry/scheduler/market/memory subsystems via `Arc` so spawned
/// background tasks can hold references to them.
pub struct ComputeCluster {
    /// Registered nodes.
    nodes: RwLock<HashMap<NodeId, ComputeNode>>,
    /// Device registry.
    device_registry: Arc<DeviceRegistry>,
    /// Heterogeneous scheduler.
    scheduler: Arc<HeterogeneousScheduler>,
    /// Spot market.
    spot_market: Arc<SpotMarket>,
    /// Memory manager.
    memory_manager: Arc<MemoryManager>,
    /// Active jobs (specs only; execution status is not stored here).
    jobs: RwLock<HashMap<JobId, ComputeJob>>,
}
impl ComputeCluster {
    /// Creates a new, empty cluster with fresh device registry, scheduler,
    /// spot market, and memory manager instances.
    pub fn new() -> Self {
        let device_registry = Arc::new(DeviceRegistry::new());
        let scheduler = Arc::new(HeterogeneousScheduler::new(device_registry.clone()));
        let spot_market = Arc::new(SpotMarket::new());
        let memory_manager = Arc::new(MemoryManager::new());
        Self {
            nodes: RwLock::new(HashMap::new()),
            device_registry,
            scheduler,
            spot_market,
            memory_manager,
            jobs: RwLock::new(HashMap::new()),
        }
    }

    /// Registers a compute node and all of its processors.
    ///
    /// Registration is all-or-nothing: if any processor fails to register
    /// with the device registry, the processors registered so far for this
    /// node are rolled back and the node is not stored.
    ///
    /// # Errors
    ///
    /// Propagates the first error from `DeviceRegistry::register_processor`.
    pub fn register_node(&self, node: ComputeNode) -> Result<(), ComputeError> {
        let id = node.id;
        // Register processors with the device registry.
        for proc in &node.processors {
            if let Err(e) = self.device_registry.register_processor(id, proc.clone()) {
                // Roll back the partial registration so the registry never
                // holds processors for a node the cluster does not know about.
                // (Assumes unregister_node removes this node's processors —
                // TODO confirm against DeviceRegistry.)
                let _ = self.device_registry.unregister_node(id);
                return Err(e);
            }
        }
        self.nodes.write().insert(id, node);
        Ok(())
    }

    /// Unregisters a compute node and its processors.
    ///
    /// # Errors
    ///
    /// Propagates errors from `DeviceRegistry::unregister_node`; the node
    /// entry is left in place in that case.
    pub fn unregister_node(&self, node_id: NodeId) -> Result<(), ComputeError> {
        self.device_registry.unregister_node(node_id)?;
        self.nodes.write().remove(&node_id);
        Ok(())
    }

    /// Submits a job for execution.
    ///
    /// The job is decomposed into tasks and scheduled; the resulting
    /// schedule is then executed on a detached background task, so
    /// completion is not awaited here and execution errors are currently
    /// discarded.
    ///
    /// # Errors
    ///
    /// Returns an error if decomposition or scheduling fails; the job is
    /// not stored in that case.
    pub async fn submit_job(&self, job: ComputeJob) -> Result<JobId, ComputeError> {
        let job_id = job.id;
        // Decompose the job into schedulable tasks.
        let tasks = self.decompose_job(&job)?;
        // Schedule tasks across available processors.
        let schedule = self.scheduler.schedule(tasks).await?;
        // Store the job spec only after scheduling succeeded.
        self.jobs.write().insert(job_id, job);
        // Execute the schedule asynchronously (fire-and-forget).
        tokio::spawn({
            let scheduler = self.scheduler.clone();
            async move {
                let _ = scheduler.execute(&schedule.schedule).await;
            }
        });
        Ok(job_id)
    }

    /// Gets job status.
    ///
    /// NOTE(review): status transitions are not tracked yet — every stored
    /// job is reported as `Queued`. Returns `None` for unknown job ids.
    pub fn get_job_status(&self, job_id: &JobId) -> Option<JobStatus> {
        self.jobs.read().get(job_id).map(|_| JobStatus::Queued)
    }

    /// Cancels a job by removing its spec from the active set.
    ///
    /// NOTE(review): this does not signal the scheduler, so tasks already
    /// dispatched keep running — confirm whether preemption is required.
    ///
    /// # Errors
    ///
    /// Returns `ComputeError::JobNotFound` if the job id is unknown.
    pub fn cancel_job(&self, job_id: &JobId) -> Result<(), ComputeError> {
        if self.jobs.write().remove(job_id).is_some() {
            Ok(())
        } else {
            Err(ComputeError::JobNotFound(*job_id))
        }
    }

    /// Gets a snapshot of cluster-wide statistics.
    pub fn stats(&self) -> ClusterStats {
        let nodes = self.nodes.read();
        let jobs = self.jobs.read();
        let online_nodes = nodes
            .values()
            .filter(|n| n.status == NodeStatus::Online)
            .count();
        let total_gpus = nodes
            .values()
            .flat_map(|n| &n.processors)
            .filter(|p| matches!(p.processor_type, ProcessorType::Gpu(_)))
            .count();
        ClusterStats {
            total_nodes: nodes.len(),
            online_nodes,
            total_gpus,
            total_memory_gb: nodes.values().map(|n| n.total_memory_gb).sum(),
            active_jobs: jobs.len(),
            // Job status is not tracked yet, so every stored job counts as
            // queued (the previous `filter(|_| true).count()` was just
            // an obfuscated `len()`).
            queued_jobs: jobs.len(),
        }
    }

    /// Decomposes a job into schedulable tasks via `TaskDecomposer`.
    fn decompose_job(&self, job: &ComputeJob) -> Result<Vec<Task>, ComputeError> {
        let decomposer = TaskDecomposer::new();
        decomposer.decompose(job)
    }
}
impl Default for ComputeCluster {
    /// Equivalent to [`ComputeCluster::new`].
    fn default() -> Self {
        Self::new()
    }
}
/// Cluster statistics.
///
/// A point-in-time snapshot produced by `ComputeCluster::stats`.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ClusterStats {
    /// Total registered nodes.
    pub total_nodes: usize,
    /// Nodes currently in `NodeStatus::Online`.
    pub online_nodes: usize,
    /// Total GPUs across cluster.
    pub total_gpus: usize,
    /// Total memory (GB).
    pub total_memory_gb: f32,
    /// Active jobs.
    pub active_jobs: usize,
    /// Queued jobs (currently always equals `active_jobs`, since job
    /// status is not yet tracked).
    pub queued_jobs: usize,
}
/// Pricing calculator for compute operations.
///
/// All prices are in atomic SYNOR units (1 SYNOR = 1_000_000_000 atomic,
/// per the constants in the `Default` impl).
#[derive(Clone, Debug)]
pub struct ComputePricing {
    /// GPU cost per hour by type.
    pub gpu_hourly: HashMap<GpuTier, u64>,
    /// CPU cost per core-hour.
    pub cpu_core_hour: u64,
    /// Memory cost per GB-hour.
    pub memory_gb_hour: u64,
    /// Network egress per GB.
    pub network_egress_gb: u64,
    /// Inference per million tokens.
    pub inference_per_million_tokens: u64,
}
/// GPU pricing tiers.
///
/// Used as the key of `ComputePricing::gpu_hourly`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum GpuTier {
    /// Consumer GPUs (RTX 30xx, 40xx).
    Consumer,
    /// Professional GPUs (RTX A series).
    Professional,
    /// Data center GPUs (A100).
    DataCenter,
    /// Latest generation (H100).
    Premium,
}
impl Default for ComputePricing {
    /// Default price book matching the module-level pricing table
    /// (1 SYNOR = 1_000_000_000 atomic units).
    fn default() -> Self {
        let gpu_hourly = HashMap::from([
            (GpuTier::Consumer, 100_000_000),     // 0.10 SYNOR
            (GpuTier::Professional, 300_000_000), // 0.30 SYNOR
            (GpuTier::DataCenter, 2_000_000_000), // 2.00 SYNOR
            (GpuTier::Premium, 4_000_000_000),    // 4.00 SYNOR
        ]);
        Self {
            gpu_hourly,
            cpu_core_hour: 20_000_000,                 // 0.02 SYNOR
            memory_gb_hour: 5_000_000,                 // 0.005 SYNOR
            network_egress_gb: 50_000_000,             // 0.05 SYNOR
            inference_per_million_tokens: 100_000_000, // 0.10 SYNOR
        }
    }
}
impl ComputePricing {
    /// Estimates the cost (in atomic SYNOR) of running `job` for
    /// `duration_hours` hours.
    ///
    /// Covers CPU, memory, and GPU time. Network egress and inference
    /// token prices are not applied because the job spec carries no
    /// byte/token quantities.
    ///
    /// Arithmetic is done in `f64`: `f32` has only a 24-bit mantissa, so
    /// atomic prices such as 2_000_000_000 are not exactly representable
    /// and the previous `f32` math could misprice large jobs.
    pub fn estimate(&self, job: &ComputeJob, duration_hours: f32) -> u64 {
        let hours = f64::from(duration_hours);
        let mut cost = 0.0f64;
        // CPU: price per core-hour times requested cores.
        cost += self.cpu_core_hour as f64 * f64::from(job.resources.min_cpu_cores) * hours;
        // Memory: price per GB-hour times requested GB.
        cost += self.memory_gb_hour as f64 * f64::from(job.resources.min_memory_gb) * hours;
        // GPU: price per GPU-hour at the tier's rate.
        if let Some(gpu) = &job.resources.gpu {
            // TODO(review): the tier is hard-coded; it should be derived from
            // the GPU requirements (VRAM / compute capability) once a
            // mapping exists.
            let tier = GpuTier::Consumer;
            let rate = *self.gpu_hourly.get(&tier).unwrap_or(&100_000_000);
            cost += rate as f64 * f64::from(gpu.min_count) * hours;
        }
        // Saturating float-to-int cast: negative totals clamp to 0.
        cost as u64
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two freshly generated job ids should differ (each is 32 random
    /// bytes, so a collision is astronomically unlikely).
    #[test]
    fn test_job_id() {
        let id1 = JobId::new();
        let id2 = JobId::new();
        assert_ne!(id1.0, id2.0);
    }

    /// A new cluster starts with no registered nodes.
    #[test]
    fn test_compute_cluster() {
        let cluster = ComputeCluster::new();
        let stats = cluster.stats();
        assert_eq!(stats.total_nodes, 0);
    }

    /// Estimating a CPU + memory + GPU job for one hour yields a
    /// non-zero cost under the default price book.
    #[test]
    fn test_pricing() {
        let pricing = ComputePricing::default();
        let job = ComputeJob {
            id: JobId::new(),
            owner: [0u8; 32],
            job_type: JobType::Inference {
                model_cid: "model123".to_string(),
                input_format: "json".to_string(),
                batch_size: 32,
            },
            resources: ResourceRequirements {
                min_cpu_cores: 4.0,
                min_memory_gb: 16.0,
                gpu: Some(GpuRequirements {
                    min_count: 1,
                    max_count: 1,
                    min_vram_gb: 16,
                    min_compute_capability: None,
                    allow_sharing: false,
                }),
                ..Default::default()
            },
            input_cid: None,
            max_budget: 1_000_000_000,
            priority: JobPriority::Normal,
            created_at: 0,
            deadline: None,
        };
        let cost = pricing.estimate(&job, 1.0);
        assert!(cost > 0);
    }

    /// Registering a single CPU-only node makes it visible in the stats.
    #[test]
    fn test_node_registration() {
        let cluster = ComputeCluster::new();
        let node = ComputeNode {
            id: NodeId(1),
            owner: [1u8; 32],
            processors: vec![ProcessorInfo {
                id: ProcessorId(0),
                processor_type: ProcessorType::Cpu(CpuVariant::X86_64 {
                    avx: processor::AvxSupport::Avx512,
                }),
                capabilities: ProcessorCapabilities::default(),
                utilization: 0.0,
                temperature: Some(45.0),
            }],
            total_memory_gb: 64.0,
            available_memory_gb: 60.0,
            bandwidth_gbps: 10.0,
            region: "us-east".to_string(),
            stake: 1000,
            reputation: 100,
            status: NodeStatus::Online,
        };
        cluster.register_node(node).unwrap();
        assert_eq!(cluster.stats().total_nodes, 1);
    }
}