synor/crates/synor-compute/src/processor/capabilities.rs
2026-02-02 05:58:22 +05:30

549 lines
17 KiB
Rust

//! Processor capability definitions.
use super::operation::OperationType;
use super::types::PowerTier;
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
/// Detailed processor capabilities.
///
/// Bundles raw compute throughput, memory specs, the set of supported
/// operations, and power characteristics so a scheduler can match a
/// workload to the best-suited processor.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorCapabilities {
    /// Peak compute throughput per numeric precision (TFLOPS / TOPS).
    pub compute: ComputeThroughput,
    /// Memory capacity, bandwidth, and technology.
    pub memory: MemorySpecs,
    /// Operations this processor can execute.
    pub operations: HashSet<OperationType>,
    /// Power draw and efficiency characteristics.
    pub power: PowerCharacteristics,
    /// Workload shapes this processor is best suited for.
    pub optimal_for: Vec<WorkloadCharacteristic>,
}
impl Default for ProcessorCapabilities {
fn default() -> Self {
Self {
compute: ComputeThroughput::default(),
memory: MemorySpecs::default(),
operations: Self::default_operations(),
power: PowerCharacteristics::default(),
optimal_for: vec![],
}
}
}
impl ProcessorCapabilities {
/// Default operations supported by most processors.
///
/// Covers the basic compute, activation, and data-handling primitives
/// that virtually every backend can execute.
fn default_operations() -> HashSet<OperationType> {
    let mut ops = HashSet::with_capacity(7);
    for op in [
        OperationType::MatMul,
        OperationType::Add,
        OperationType::Mul,
        OperationType::ReLU,
        OperationType::Softmax,
        OperationType::DataLoad,
        OperationType::DataPreprocess,
    ] {
        ops.insert(op);
    }
    ops
}
/// Creates CPU capabilities.
///
/// Peak FP32 throughput is estimated as `cores * clock * ops-per-cycle`,
/// where AVX-512 doubles the per-cycle FP32 width versus AVX2. Throughput
/// at other precisions is scaled from the FP32 figure.
pub fn cpu(cores: u32, clock_ghz: f32, avx512: bool) -> Self {
    // FP32 operations per core per cycle for the available vector width.
    let ops_per_cycle: f64 = if avx512 { 64.0 } else { 32.0 };
    // cores * GHz * ops/cycle yields GFLOPS; /1000 converts to TFLOPS.
    let fp32_tflops = f64::from(cores) * f64::from(clock_ghz) * ops_per_cycle / 1000.0;

    let compute = ComputeThroughput {
        fp64_tflops: fp32_tflops / 2.0,
        fp32_tflops,
        fp16_tflops: fp32_tflops * 2.0,
        bf16_tflops: fp32_tflops * 2.0,
        int8_tops: fp32_tflops * 4.0,
        int4_tops: fp32_tflops * 8.0,
        sparsity_speedup: 1.0, // No structured-sparsity acceleration on CPUs.
    };
    let memory = MemorySpecs {
        capacity_bytes: 64 * 1024 * 1024 * 1024, // 64 GiB: typical server config.
        bandwidth_gbps: 200,                     // DDR5-class bandwidth.
        type_: MemoryType::Ddr5,
    };
    let power = PowerCharacteristics {
        tdp_watts: 125,
        efficiency: 0.8,
        power_tier: PowerTier::Medium,
    };

    Self {
        compute,
        memory,
        operations: Self::cpu_operations(),
        power,
        optimal_for: vec![
            WorkloadCharacteristic::Sequential,
            WorkloadCharacteristic::MemoryBound,
            WorkloadCharacteristic::SmallBatch,
        ],
    }
}
/// Operations typically supported by CPUs.
///
/// CPUs can run most kernels (if slowly) and are the natural home for
/// data loading, preprocessing, tokenization, and checkpoint I/O.
fn cpu_operations() -> HashSet<OperationType> {
    HashSet::from([
        // Matrix / normalization kernels (supported, not fast).
        OperationType::MatMul,
        OperationType::Conv2d,
        OperationType::BatchNorm,
        OperationType::LayerNorm,
        // Element-wise math and activations.
        OperationType::Add,
        OperationType::Mul,
        OperationType::ReLU,
        OperationType::GeLU,
        OperationType::Softmax,
        // Data-pipeline work (where CPUs shine).
        OperationType::DataLoad,
        OperationType::DataPreprocess,
        OperationType::Tokenization,
        OperationType::Detokenization,
        // Layout / memory movement.
        OperationType::Transpose,
        OperationType::Reshape,
        OperationType::Concat,
        OperationType::Split,
        // Persistence.
        OperationType::Checkpoint,
    ])
}
/// Creates NVIDIA GPU capabilities.
///
/// TFLOPS figures are rough estimates from the CUDA core count at a
/// typical boost clock. Tensor-precision multipliers and the sparsity
/// speedup improve on SM 8.x (Ampere-class) and newer; SM 9.x parts are
/// modeled with HBM3 and a higher TDP. `_tensor_cores` is currently
/// unused and kept only for interface stability.
pub fn nvidia_gpu(
    cuda_cores: u32,
    _tensor_cores: u32,
    vram_gb: u32,
    bandwidth_gbps: u32,
    compute_capability: (u8, u8),
) -> Self {
    const TYPICAL_CLOCK_GHZ: f64 = 1.5;
    let major = compute_capability.0;
    let ampere_or_newer = major >= 8;
    let hopper_or_newer = major >= 9;

    // cores * GHz * 2 (an FMA counts as two ops) -> GFLOPS; /1000 -> TFLOPS.
    let fp32_tflops = f64::from(cuda_cores) * TYPICAL_CLOCK_GHZ * 2.0 / 1000.0;
    let tensor_multiplier = if ampere_or_newer { 4.0 } else { 2.0 };

    Self {
        compute: ComputeThroughput {
            fp64_tflops: fp32_tflops / 2.0,
            fp32_tflops,
            fp16_tflops: fp32_tflops * tensor_multiplier,
            bf16_tflops: fp32_tflops * tensor_multiplier,
            int8_tops: fp32_tflops * tensor_multiplier * 2.0,
            int4_tops: fp32_tflops * tensor_multiplier * 4.0,
            sparsity_speedup: if ampere_or_newer { 2.0 } else { 1.0 },
        },
        memory: MemorySpecs {
            capacity_bytes: u64::from(vram_gb) * 1024 * 1024 * 1024,
            bandwidth_gbps,
            type_: if hopper_or_newer {
                MemoryType::Hbm3
            } else {
                MemoryType::Hbm2e
            },
        },
        operations: Self::gpu_operations(compute_capability),
        power: PowerCharacteristics {
            tdp_watts: if hopper_or_newer { 700 } else { 350 },
            efficiency: 0.9,
            power_tier: PowerTier::High,
        },
        optimal_for: vec![
            WorkloadCharacteristic::HighlyParallel,
            WorkloadCharacteristic::LargeBatch,
            WorkloadCharacteristic::ComputeBound,
        ],
    }
}
/// Operations supported by GPUs.
///
/// FlashAttention is added only for SM >= 8.0 (Ampere-class) and newer
/// devices; all other listed kernels are assumed available everywhere.
fn gpu_operations(compute_capability: (u8, u8)) -> HashSet<OperationType> {
    let mut ops = HashSet::from([
        // Matrix / convolution / normalization kernels (optimal here).
        OperationType::MatMul,
        OperationType::Conv2d,
        OperationType::Conv3d,
        OperationType::DepthwiseConv,
        OperationType::BatchNorm,
        OperationType::LayerNorm,
        // Attention.
        OperationType::SelfAttention,
        OperationType::CrossAttention,
        // Element-wise math and activations.
        OperationType::Add,
        OperationType::Mul,
        OperationType::ReLU,
        OperationType::GeLU,
        OperationType::SiLU,
        OperationType::Softmax,
        // Reductions.
        OperationType::Sum,
        OperationType::Mean,
        OperationType::Max,
        OperationType::ArgMax,
        // Layout / memory movement.
        OperationType::Transpose,
        OperationType::Reshape,
        OperationType::Concat,
        OperationType::Split,
        OperationType::Gather,
        OperationType::Scatter,
        // LLM-specific kernels.
        OperationType::Embedding,
        OperationType::RoPE,
        OperationType::KVCache,
        OperationType::TopK,
        OperationType::Sampling,
    ]);
    // FlashAttention kernels require Ampere-class hardware or newer.
    if compute_capability.0 >= 8 {
        ops.insert(OperationType::FlashAttention);
    }
    ops
}
/// Creates TPU capabilities.
///
/// Per-generation peak BF16 TFLOPS, HBM capacity, and bandwidth come from
/// the lookup table below; other precisions are scaled from the BF16
/// figure. The Edge TPU carries no local memory (it uses the host's),
/// hence the zero capacity/bandwidth entries.
pub fn tpu(version: super::TpuVersion) -> Self {
    use super::TpuVersion;

    // (peak BF16 TFLOPS, memory GiB, bandwidth GB/s) per generation.
    let (bf16_tflops, memory_gb, bandwidth_gbps): (f64, u64, u32) = match version {
        TpuVersion::V5p => (918.0, 95, 4800),
        TpuVersion::V5e => (197.0, 16, 1600),
        TpuVersion::V4 => (275.0, 32, 2400),
        TpuVersion::V4i => (138.0, 32, 1200),
        TpuVersion::V3 => (123.0, 16, 900),
        TpuVersion::V2 => (46.0, 8, 600),
        TpuVersion::Edge => (4.0, 0, 0), // No on-board memory; uses the host's.
    };
    let is_edge = matches!(version, TpuVersion::Edge);

    Self {
        compute: ComputeThroughput {
            fp64_tflops: 0.0, // TPUs have no FP64 path.
            fp32_tflops: bf16_tflops / 2.0,
            fp16_tflops: bf16_tflops,
            bf16_tflops,
            int8_tops: bf16_tflops * 2.0,
            int4_tops: bf16_tflops * 4.0,
            sparsity_speedup: 2.0,
        },
        memory: MemorySpecs {
            capacity_bytes: memory_gb * 1024 * 1024 * 1024,
            bandwidth_gbps,
            type_: MemoryType::Hbm2e,
        },
        operations: Self::tpu_operations(),
        power: PowerCharacteristics {
            tdp_watts: if is_edge { 2 } else { 400 },
            efficiency: 0.95,
            power_tier: if is_edge {
                PowerTier::UltraLow
            } else {
                PowerTier::High
            },
        },
        optimal_for: vec![
            WorkloadCharacteristic::HighlyParallel,
            WorkloadCharacteristic::ComputeBound,
            WorkloadCharacteristic::FixedShape,
            WorkloadCharacteristic::LargeBatch,
        ],
    }
}
/// Operations supported by TPUs.
///
/// Systolic-array-friendly kernels only — note the absence of data-pipeline
/// operations such as `DataLoad`, which stay on the host.
fn tpu_operations() -> HashSet<OperationType> {
    HashSet::from([
        // Matrix / normalization kernels (optimal here).
        OperationType::MatMul,
        OperationType::Conv2d,
        OperationType::BatchNorm,
        OperationType::LayerNorm,
        // Attention.
        OperationType::SelfAttention,
        OperationType::CrossAttention,
        OperationType::FlashAttention,
        // Element-wise math and activations.
        OperationType::Add,
        OperationType::Mul,
        OperationType::ReLU,
        OperationType::GeLU,
        OperationType::SiLU,
        OperationType::Softmax,
        // Reductions.
        OperationType::Sum,
        OperationType::Mean,
        OperationType::Max,
        // LLM-specific kernels.
        OperationType::Embedding,
        OperationType::RoPE,
        OperationType::KVCache,
    ])
}
/// Creates LPU (Groq) capabilities.
///
/// Models a GroqChip-class deterministic inference processor: no FP64 or
/// FP32 pipelines, high INT8 throughput, and a large on-chip SRAM with
/// roughly 80 TB/s of internal bandwidth.
pub fn lpu() -> Self {
    Self {
        compute: ComputeThroughput {
            fp64_tflops: 0.0, // No FP64 support.
            fp32_tflops: 0.0, // Inference-focused part; FP32 not modeled.
            fp16_tflops: 188.0,
            bf16_tflops: 188.0,
            int8_tops: 750.0,
            int4_tops: 1500.0,
            sparsity_speedup: 1.0,
        },
        memory: MemorySpecs {
            // GroqChip carries ~230 MiB of on-chip SRAM. The previous value
            // (230 GiB) was off by three orders of magnitude and would let a
            // scheduler place models that cannot possibly fit on-chip.
            capacity_bytes: 230 * 1024 * 1024, // 230 MiB SRAM
            bandwidth_gbps: 80_000, // ~80 TB/s internal SRAM bandwidth.
            type_: MemoryType::Sram,
        },
        operations: Self::lpu_operations(),
        power: PowerCharacteristics {
            tdp_watts: 300,
            efficiency: 0.98, // Deterministic dataflow is very efficient for inference.
            power_tier: PowerTier::Medium,
        },
        optimal_for: vec![
            WorkloadCharacteristic::Sequential,
            WorkloadCharacteristic::SmallBatch,
            WorkloadCharacteristic::VariableLength,
            WorkloadCharacteristic::LowLatency,
        ],
    }
}
/// Operations supported by Groq LPU.
///
/// Restricted to the transformer-inference kernel set: matmul, attention,
/// activations, and token-generation primitives.
fn lpu_operations() -> HashSet<OperationType> {
    HashSet::from([
        // Core transformer kernels.
        OperationType::MatMul,
        OperationType::LayerNorm,
        OperationType::SelfAttention,
        // Element-wise math and activations.
        OperationType::Add,
        OperationType::Mul,
        OperationType::ReLU,
        OperationType::GeLU,
        OperationType::SiLU,
        OperationType::Softmax,
        // Token generation.
        OperationType::Embedding,
        OperationType::RoPE,
        OperationType::KVCache,
        OperationType::TopK,
        OperationType::Sampling,
    ])
}
/// Creates Apple Neural Engine capabilities.
///
/// INT8 TOPS is looked up for known core counts and otherwise scaled at
/// ~1.1 TOPS per core; float throughput is derived from the INT8 figure.
/// The ANE has no dedicated memory — it shares the SoC's unified pool.
pub fn apple_neural_engine(cores: u32) -> Self {
    let int8_tops = match cores {
        16 => 18.0, // M3-class 16-core ANE.
        32 => 35.0, // NOTE(review): labeled "M3 Max" originally — confirm core count.
        n => f64::from(n) * 1.1, // Linear fallback for unknown configurations.
    };

    Self {
        compute: ComputeThroughput {
            fp64_tflops: 0.0,
            fp32_tflops: int8_tops / 4.0,
            fp16_tflops: int8_tops / 2.0,
            bf16_tflops: int8_tops / 2.0,
            int8_tops,
            int4_tops: int8_tops * 2.0,
            sparsity_speedup: 1.0,
        },
        memory: MemorySpecs {
            capacity_bytes: 0, // Shares the SoC's unified memory pool.
            bandwidth_gbps: 400,
            type_: MemoryType::Unified,
        },
        operations: Self::npu_operations(),
        power: PowerCharacteristics {
            tdp_watts: 15,
            efficiency: 0.95,
            power_tier: PowerTier::UltraLow,
        },
        optimal_for: vec![
            WorkloadCharacteristic::LowPower,
            WorkloadCharacteristic::LowLatency,
            WorkloadCharacteristic::SmallBatch,
        ],
    }
}
/// Operations supported by NPUs.
///
/// A compact, inference-oriented kernel set: convolutions, normalization,
/// basic element-wise math, and embeddings.
fn npu_operations() -> HashSet<OperationType> {
    HashSet::from([
        // Inference-optimized kernels.
        OperationType::MatMul,
        OperationType::Conv2d,
        OperationType::DepthwiseConv,
        OperationType::BatchNorm,
        OperationType::LayerNorm,
        OperationType::Add,
        OperationType::Mul,
        OperationType::ReLU,
        OperationType::Softmax,
        OperationType::Embedding,
    ])
}
}
/// Compute throughput metrics.
///
/// Peak rates per numeric precision: floating-point in TFLOPS (10^12
/// FLOP/s), integer in TOPS (10^12 ops/s). A zero means the precision is
/// unsupported on the processor (e.g. FP64 on TPUs).
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ComputeThroughput {
    /// Peak FP64 throughput (TFLOPS); 0.0 when unsupported.
    pub fp64_tflops: f64,
    /// Peak FP32 throughput (TFLOPS).
    pub fp32_tflops: f64,
    /// Peak FP16 throughput (TFLOPS).
    pub fp16_tflops: f64,
    /// Peak BF16 throughput (TFLOPS).
    pub bf16_tflops: f64,
    /// Peak INT8 throughput (TOPS).
    pub int8_tops: f64,
    /// Peak INT4 throughput (TOPS).
    pub int4_tops: f64,
    /// Multiplier for structured-sparse workloads (1.0 = no speedup).
    pub sparsity_speedup: f64,
}
/// Memory specifications.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MemorySpecs {
    /// Total capacity in bytes (0 when the device borrows host/unified memory,
    /// as with the Edge TPU and Apple Neural Engine profiles).
    pub capacity_bytes: u64,
    /// Peak bandwidth in GB/s.
    pub bandwidth_gbps: u32,
    /// Underlying memory technology.
    pub type_: MemoryType,
}
impl Default for MemorySpecs {
fn default() -> Self {
Self {
capacity_bytes: 16 * 1024 * 1024 * 1024, // 16 GB
bandwidth_gbps: 500,
type_: MemoryType::Ddr5,
}
}
}
/// Memory types.
///
/// The technology backing a processor's primary memory pool.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum MemoryType {
    /// DDR4 system RAM.
    Ddr4,
    /// DDR5 system RAM.
    Ddr5,
    /// GDDR6/6X graphics memory.
    Gddr6,
    /// High Bandwidth Memory, generation 2.
    Hbm2,
    /// High Bandwidth Memory, generation 2e.
    Hbm2e,
    /// High Bandwidth Memory, generation 3.
    Hbm3,
    /// On-chip SRAM (e.g. Groq LPU).
    Sram,
    /// Unified memory shared with the host SoC (Apple Silicon).
    Unified,
    /// LPDDR (mobile / embedded).
    Lpddr,
}
/// Power characteristics.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PowerCharacteristics {
    /// Thermal design power in watts.
    pub tdp_watts: u32,
    /// Fraction of peak throughput typically achieved (0.0 - 1.0);
    /// higher means less wasted power relative to useful work.
    pub efficiency: f64,
    /// Coarse power classification (see [`PowerTier`]).
    pub power_tier: PowerTier,
}
impl Default for PowerCharacteristics {
fn default() -> Self {
Self {
tdp_watts: 100,
efficiency: 0.8,
power_tier: PowerTier::Medium,
}
}
}
/// Workload characteristics for processor matching.
///
/// Each variant tags a workload shape; processors list the variants they
/// handle best in [`ProcessorCapabilities::optimal_for`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum WorkloadCharacteristic {
    /// High parallelism (GPU, TPU).
    HighlyParallel,
    /// Sequential dependencies (CPU, LPU).
    Sequential,
    /// Memory bandwidth bound. NOTE(review): originally tagged "(GPU)", but
    /// in this file only `cpu()` lists it — confirm the intended pairing.
    MemoryBound,
    /// Compute bound (TPU).
    ComputeBound,
    /// Low latency required (NPU, edge).
    LowLatency,
    /// Low power required (NPU, mobile).
    LowPower,
    /// Large batch sizes (GPU, TPU).
    LargeBatch,
    /// Small batch sizes (CPU, LPU).
    SmallBatch,
    /// Variable length sequences (LPU).
    VariableLength,
    /// Fixed tensor shapes (TPU).
    FixedShape,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// CPU profiles report nonzero compute and support data-pipeline ops.
    #[test]
    fn test_cpu_capabilities() {
        let cpu = ProcessorCapabilities::cpu(32, 3.5, true);
        assert!(cpu.compute.fp32_tflops > 0.0);
        for op in [OperationType::DataLoad, OperationType::Tokenization] {
            assert!(cpu.operations.contains(&op));
        }
    }

    /// Ampere-class GPUs get tensor-accelerated FP16 and FlashAttention.
    #[test]
    fn test_gpu_capabilities() {
        let gpu = ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1000, (8, 9));
        assert!(gpu.compute.fp16_tflops > gpu.compute.fp32_tflops);
        assert!(gpu.operations.contains(&OperationType::FlashAttention));
    }

    /// TPU v5p is near-peta-scale in BF16 but hosts no data-loading ops.
    #[test]
    fn test_tpu_capabilities() {
        let tpu = ProcessorCapabilities::tpu(super::super::TpuVersion::V5p);
        assert!(tpu.compute.bf16_tflops > 900.0);
        assert!(!tpu.operations.contains(&OperationType::DataLoad));
    }

    /// LPU profiles advertise extreme internal bandwidth and sequential affinity.
    #[test]
    fn test_lpu_capabilities() {
        let lpu = ProcessorCapabilities::lpu();
        assert!(lpu.memory.bandwidth_gbps > 10000);
        assert!(lpu.optimal_for.contains(&WorkloadCharacteristic::Sequential));
    }
}