//! Processor capability definitions. use super::operation::OperationType; use super::types::PowerTier; use serde::{Deserialize, Serialize}; use std::collections::HashSet; /// Detailed processor capabilities. #[derive(Clone, Debug, Serialize, Deserialize)] pub struct ProcessorCapabilities { /// Compute throughput. pub compute: ComputeThroughput, /// Memory specifications. pub memory: MemorySpecs, /// Supported operations. pub operations: HashSet, /// Power characteristics. pub power: PowerCharacteristics, /// Optimal workload characteristics. pub optimal_for: Vec, } impl Default for ProcessorCapabilities { fn default() -> Self { Self { compute: ComputeThroughput::default(), memory: MemorySpecs::default(), operations: Self::default_operations(), power: PowerCharacteristics::default(), optimal_for: vec![], } } } impl ProcessorCapabilities { /// Default operations supported by most processors. fn default_operations() -> HashSet { [ OperationType::MatMul, OperationType::Add, OperationType::Mul, OperationType::ReLU, OperationType::Softmax, OperationType::DataLoad, OperationType::DataPreprocess, ] .into_iter() .collect() } /// Creates CPU capabilities. pub fn cpu(cores: u32, clock_ghz: f32, avx512: bool) -> Self { let flops_per_cycle = if avx512 { 64.0 } else { 32.0 }; // FP32 ops per cycle with AVX let fp32_tflops = (cores as f64 * clock_ghz as f64 * flops_per_cycle) / 1000.0; Self { compute: ComputeThroughput { fp64_tflops: fp32_tflops / 2.0, fp32_tflops, fp16_tflops: fp32_tflops * 2.0, bf16_tflops: fp32_tflops * 2.0, int8_tops: fp32_tflops * 4.0, int4_tops: fp32_tflops * 8.0, sparsity_speedup: 1.0, }, memory: MemorySpecs { capacity_bytes: 64 * 1024 * 1024 * 1024, // 64 GB typical bandwidth_gbps: 200, // DDR5 type_: MemoryType::Ddr5, }, operations: Self::cpu_operations(), power: PowerCharacteristics { tdp_watts: 125, efficiency: 0.8, power_tier: PowerTier::Medium, }, optimal_for: vec![ WorkloadCharacteristic::Sequential, WorkloadCharacteristic::MemoryBound, WorkloadCharacteristic::SmallBatch, ], } } /// Operations typically supported by CPUs. fn cpu_operations() -> HashSet { [ // Matrix operations (slow but supported) OperationType::MatMul, OperationType::Conv2d, OperationType::BatchNorm, OperationType::LayerNorm, // Element-wise OperationType::Add, OperationType::Mul, OperationType::ReLU, OperationType::GeLU, OperationType::Softmax, // Data operations (optimal) OperationType::DataLoad, OperationType::DataPreprocess, OperationType::Tokenization, OperationType::Detokenization, // Memory operations OperationType::Transpose, OperationType::Reshape, OperationType::Concat, OperationType::Split, // I/O OperationType::Checkpoint, ] .into_iter() .collect() } /// Creates NVIDIA GPU capabilities. pub fn nvidia_gpu( cuda_cores: u32, _tensor_cores: u32, vram_gb: u32, bandwidth_gbps: u32, compute_capability: (u8, u8), ) -> Self { // Approximate TFLOPS based on cores and typical clocks let base_clock_ghz = 1.5; let fp32_tflops = (cuda_cores as f64 * base_clock_ghz * 2.0) / 1000.0; let tensor_multiplier = if compute_capability.0 >= 8 { 4.0 } else { 2.0 }; Self { compute: ComputeThroughput { fp64_tflops: fp32_tflops / 2.0, fp32_tflops, fp16_tflops: fp32_tflops * tensor_multiplier, bf16_tflops: fp32_tflops * tensor_multiplier, int8_tops: fp32_tflops * tensor_multiplier * 2.0, int4_tops: fp32_tflops * tensor_multiplier * 4.0, sparsity_speedup: if compute_capability.0 >= 8 { 2.0 } else { 1.0 }, }, memory: MemorySpecs { capacity_bytes: vram_gb as u64 * 1024 * 1024 * 1024, bandwidth_gbps, type_: if compute_capability.0 >= 9 { MemoryType::Hbm3 } else { MemoryType::Hbm2e }, }, operations: Self::gpu_operations(compute_capability), power: PowerCharacteristics { tdp_watts: if compute_capability.0 >= 9 { 700 } else { 350 }, efficiency: 0.9, power_tier: PowerTier::High, }, optimal_for: vec![ WorkloadCharacteristic::HighlyParallel, WorkloadCharacteristic::LargeBatch, WorkloadCharacteristic::ComputeBound, ], } } /// Operations supported by GPUs. fn gpu_operations(compute_capability: (u8, u8)) -> HashSet { let mut ops: HashSet = [ // Matrix operations (optimal) OperationType::MatMul, OperationType::Conv2d, OperationType::Conv3d, OperationType::DepthwiseConv, OperationType::BatchNorm, OperationType::LayerNorm, // Attention OperationType::SelfAttention, OperationType::CrossAttention, // Element-wise OperationType::Add, OperationType::Mul, OperationType::ReLU, OperationType::GeLU, OperationType::SiLU, OperationType::Softmax, // Reduction OperationType::Sum, OperationType::Mean, OperationType::Max, OperationType::ArgMax, // Memory operations OperationType::Transpose, OperationType::Reshape, OperationType::Concat, OperationType::Split, OperationType::Gather, OperationType::Scatter, // LLM specific OperationType::Embedding, OperationType::RoPE, OperationType::KVCache, OperationType::TopK, OperationType::Sampling, ] .into_iter() .collect(); // FlashAttention for newer GPUs if compute_capability.0 >= 8 { ops.insert(OperationType::FlashAttention); } ops } /// Creates TPU capabilities. pub fn tpu(version: super::TpuVersion) -> Self { let (bf16_tflops, memory_gb, bandwidth_gbps) = match version { super::TpuVersion::V5p => (918.0, 95, 4800), super::TpuVersion::V5e => (197.0, 16, 1600), super::TpuVersion::V4 => (275.0, 32, 2400), super::TpuVersion::V4i => (138.0, 32, 1200), super::TpuVersion::V3 => (123.0, 16, 900), super::TpuVersion::V2 => (46.0, 8, 600), super::TpuVersion::Edge => (4.0, 0, 0), // Uses host memory }; Self { compute: ComputeThroughput { fp64_tflops: 0.0, // TPUs don't support FP64 fp32_tflops: bf16_tflops / 2.0, fp16_tflops: bf16_tflops, bf16_tflops, int8_tops: bf16_tflops * 2.0, int4_tops: bf16_tflops * 4.0, sparsity_speedup: 2.0, }, memory: MemorySpecs { capacity_bytes: memory_gb as u64 * 1024 * 1024 * 1024, bandwidth_gbps, type_: MemoryType::Hbm2e, }, operations: Self::tpu_operations(), power: PowerCharacteristics { tdp_watts: if matches!(version, super::TpuVersion::Edge) { 2 } else { 400 }, efficiency: 0.95, power_tier: if matches!(version, super::TpuVersion::Edge) { PowerTier::UltraLow } else { PowerTier::High }, }, optimal_for: vec![ WorkloadCharacteristic::HighlyParallel, WorkloadCharacteristic::ComputeBound, WorkloadCharacteristic::FixedShape, WorkloadCharacteristic::LargeBatch, ], } } /// Operations supported by TPUs. fn tpu_operations() -> HashSet { [ // Matrix operations (optimal) OperationType::MatMul, OperationType::Conv2d, OperationType::BatchNorm, OperationType::LayerNorm, // Attention OperationType::SelfAttention, OperationType::CrossAttention, OperationType::FlashAttention, // Element-wise OperationType::Add, OperationType::Mul, OperationType::ReLU, OperationType::GeLU, OperationType::SiLU, OperationType::Softmax, // Reduction OperationType::Sum, OperationType::Mean, OperationType::Max, // LLM specific OperationType::Embedding, OperationType::RoPE, OperationType::KVCache, ] .into_iter() .collect() } /// Creates LPU (Groq) capabilities. pub fn lpu() -> Self { Self { compute: ComputeThroughput { fp64_tflops: 0.0, fp32_tflops: 0.0, fp16_tflops: 188.0, bf16_tflops: 188.0, int8_tops: 750.0, int4_tops: 1500.0, sparsity_speedup: 1.0, }, memory: MemorySpecs { capacity_bytes: 230 * 1024 * 1024 * 1024, // 230 GB SRAM! bandwidth_gbps: 80_000, // 80 TB/s internal type_: MemoryType::Sram, }, operations: Self::lpu_operations(), power: PowerCharacteristics { tdp_watts: 300, efficiency: 0.98, // Very efficient for inference power_tier: PowerTier::Medium, }, optimal_for: vec![ WorkloadCharacteristic::Sequential, WorkloadCharacteristic::SmallBatch, WorkloadCharacteristic::VariableLength, WorkloadCharacteristic::LowLatency, ], } } /// Operations supported by Groq LPU. fn lpu_operations() -> HashSet { [ // Optimized for inference OperationType::MatMul, OperationType::LayerNorm, OperationType::SelfAttention, OperationType::Add, OperationType::Mul, OperationType::ReLU, OperationType::GeLU, OperationType::SiLU, OperationType::Softmax, OperationType::Embedding, OperationType::RoPE, OperationType::KVCache, OperationType::TopK, OperationType::Sampling, ] .into_iter() .collect() } /// Creates Apple Neural Engine capabilities. pub fn apple_neural_engine(cores: u32) -> Self { let int8_tops = match cores { 16 => 18.0, // M3 32 => 35.0, // M3 Max _ => cores as f64 * 1.1, }; Self { compute: ComputeThroughput { fp64_tflops: 0.0, fp32_tflops: int8_tops / 4.0, fp16_tflops: int8_tops / 2.0, bf16_tflops: int8_tops / 2.0, int8_tops, int4_tops: int8_tops * 2.0, sparsity_speedup: 1.0, }, memory: MemorySpecs { capacity_bytes: 0, // Uses unified memory bandwidth_gbps: 400, type_: MemoryType::Unified, }, operations: Self::npu_operations(), power: PowerCharacteristics { tdp_watts: 15, efficiency: 0.95, power_tier: PowerTier::UltraLow, }, optimal_for: vec![ WorkloadCharacteristic::LowPower, WorkloadCharacteristic::LowLatency, WorkloadCharacteristic::SmallBatch, ], } } /// Operations supported by NPUs. fn npu_operations() -> HashSet { [ // Inference optimized OperationType::MatMul, OperationType::Conv2d, OperationType::DepthwiseConv, OperationType::BatchNorm, OperationType::LayerNorm, OperationType::Add, OperationType::Mul, OperationType::ReLU, OperationType::Softmax, OperationType::Embedding, ] .into_iter() .collect() } } /// Compute throughput metrics. #[derive(Clone, Debug, Default, Serialize, Deserialize)] pub struct ComputeThroughput { /// FP64 TFLOPS. pub fp64_tflops: f64, /// FP32 TFLOPS. pub fp32_tflops: f64, /// FP16 TFLOPS. pub fp16_tflops: f64, /// BF16 TFLOPS. pub bf16_tflops: f64, /// INT8 TOPS. pub int8_tops: f64, /// INT4 TOPS. pub int4_tops: f64, /// Speedup for sparse operations. pub sparsity_speedup: f64, } /// Memory specifications. #[derive(Clone, Debug, Serialize, Deserialize)] pub struct MemorySpecs { /// Total capacity (bytes). pub capacity_bytes: u64, /// Bandwidth (GB/s). pub bandwidth_gbps: u32, /// Memory type. pub type_: MemoryType, } impl Default for MemorySpecs { fn default() -> Self { Self { capacity_bytes: 16 * 1024 * 1024 * 1024, // 16 GB bandwidth_gbps: 500, type_: MemoryType::Ddr5, } } } /// Memory types. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum MemoryType { /// DDR4 RAM. Ddr4, /// DDR5 RAM. Ddr5, /// GDDR6/6X video memory. Gddr6, /// HBM2. Hbm2, /// HBM2e. Hbm2e, /// HBM3. Hbm3, /// SRAM (on-chip). Sram, /// Unified memory (Apple Silicon). Unified, /// LPDDR (mobile). Lpddr, } /// Power characteristics. #[derive(Clone, Debug, Serialize, Deserialize)] pub struct PowerCharacteristics { /// TDP in watts. pub tdp_watts: u32, /// Efficiency factor (0.0 - 1.0). pub efficiency: f64, /// Power tier. pub power_tier: PowerTier, } impl Default for PowerCharacteristics { fn default() -> Self { Self { tdp_watts: 100, efficiency: 0.8, power_tier: PowerTier::Medium, } } } /// Workload characteristics for processor matching. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum WorkloadCharacteristic { /// High parallelism (GPU, TPU). HighlyParallel, /// Sequential dependencies (CPU, LPU). Sequential, /// Memory bandwidth bound (GPU). MemoryBound, /// Compute bound (TPU). ComputeBound, /// Low latency required (NPU, edge). LowLatency, /// Low power required (NPU, mobile). LowPower, /// Large batch sizes (GPU, TPU). LargeBatch, /// Small batch sizes (CPU, LPU). SmallBatch, /// Variable length sequences (LPU). VariableLength, /// Fixed tensor shapes (TPU). FixedShape, } #[cfg(test)] mod tests { use super::*; #[test] fn test_cpu_capabilities() { let caps = ProcessorCapabilities::cpu(32, 3.5, true); assert!(caps.compute.fp32_tflops > 0.0); assert!(caps.operations.contains(&OperationType::DataLoad)); assert!(caps.operations.contains(&OperationType::Tokenization)); } #[test] fn test_gpu_capabilities() { let caps = ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1000, (8, 9)); assert!(caps.compute.fp16_tflops > caps.compute.fp32_tflops); assert!(caps.operations.contains(&OperationType::FlashAttention)); } #[test] fn test_tpu_capabilities() { let caps = ProcessorCapabilities::tpu(super::super::TpuVersion::V5p); assert!(caps.compute.bf16_tflops > 900.0); assert!(!caps.operations.contains(&OperationType::DataLoad)); // TPUs don't do I/O } #[test] fn test_lpu_capabilities() { let caps = ProcessorCapabilities::lpu(); assert!(caps.memory.bandwidth_gbps > 10000); // Very high internal bandwidth assert!(caps.optimal_for.contains(&WorkloadCharacteristic::Sequential)); } }