//! Processor capability definitions.
|
|
|
|
use super::operation::OperationType;
|
|
use super::types::PowerTier;
|
|
use serde::{Deserialize, Serialize};
|
|
use std::collections::HashSet;
|
|
|
|
/// Detailed processor capabilities.
///
/// Aggregates compute throughput, memory specs, power characteristics, and
/// the set of supported operations for a single processor, plus the workload
/// shapes it is best suited for (see [`WorkloadCharacteristic`]).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorCapabilities {
    /// Compute throughput per numeric format.
    pub compute: ComputeThroughput,
    /// Memory specifications (capacity, bandwidth, type).
    pub memory: MemorySpecs,
    /// Operations this processor can execute.
    pub operations: HashSet<OperationType>,
    /// Power characteristics (TDP, efficiency, tier).
    pub power: PowerCharacteristics,
    /// Workload characteristics this processor is optimal for.
    pub optimal_for: Vec<WorkloadCharacteristic>,
}
|
|
|
|
impl Default for ProcessorCapabilities {
|
|
fn default() -> Self {
|
|
Self {
|
|
compute: ComputeThroughput::default(),
|
|
memory: MemorySpecs::default(),
|
|
operations: Self::default_operations(),
|
|
power: PowerCharacteristics::default(),
|
|
optimal_for: vec![],
|
|
}
|
|
}
|
|
}
|
|
|
|
impl ProcessorCapabilities {
|
|
/// Default operations supported by most processors.
|
|
fn default_operations() -> HashSet<OperationType> {
|
|
[
|
|
OperationType::MatMul,
|
|
OperationType::Add,
|
|
OperationType::Mul,
|
|
OperationType::ReLU,
|
|
OperationType::Softmax,
|
|
OperationType::DataLoad,
|
|
OperationType::DataPreprocess,
|
|
]
|
|
.into_iter()
|
|
.collect()
|
|
}
|
|
|
|
/// Creates CPU capabilities.
|
|
pub fn cpu(cores: u32, clock_ghz: f32, avx512: bool) -> Self {
|
|
let flops_per_cycle = if avx512 { 64.0 } else { 32.0 }; // FP32 ops per cycle with AVX
|
|
let fp32_tflops = (cores as f64 * clock_ghz as f64 * flops_per_cycle) / 1000.0;
|
|
|
|
Self {
|
|
compute: ComputeThroughput {
|
|
fp64_tflops: fp32_tflops / 2.0,
|
|
fp32_tflops,
|
|
fp16_tflops: fp32_tflops * 2.0,
|
|
bf16_tflops: fp32_tflops * 2.0,
|
|
int8_tops: fp32_tflops * 4.0,
|
|
int4_tops: fp32_tflops * 8.0,
|
|
sparsity_speedup: 1.0,
|
|
},
|
|
memory: MemorySpecs {
|
|
capacity_bytes: 64 * 1024 * 1024 * 1024, // 64 GB typical
|
|
bandwidth_gbps: 200, // DDR5
|
|
type_: MemoryType::Ddr5,
|
|
},
|
|
operations: Self::cpu_operations(),
|
|
power: PowerCharacteristics {
|
|
tdp_watts: 125,
|
|
efficiency: 0.8,
|
|
power_tier: PowerTier::Medium,
|
|
},
|
|
optimal_for: vec![
|
|
WorkloadCharacteristic::Sequential,
|
|
WorkloadCharacteristic::MemoryBound,
|
|
WorkloadCharacteristic::SmallBatch,
|
|
],
|
|
}
|
|
}
|
|
|
|
/// Operations typically supported by CPUs.
|
|
fn cpu_operations() -> HashSet<OperationType> {
|
|
[
|
|
// Matrix operations (slow but supported)
|
|
OperationType::MatMul,
|
|
OperationType::Conv2d,
|
|
OperationType::BatchNorm,
|
|
OperationType::LayerNorm,
|
|
// Element-wise
|
|
OperationType::Add,
|
|
OperationType::Mul,
|
|
OperationType::ReLU,
|
|
OperationType::GeLU,
|
|
OperationType::Softmax,
|
|
// Data operations (optimal)
|
|
OperationType::DataLoad,
|
|
OperationType::DataPreprocess,
|
|
OperationType::Tokenization,
|
|
OperationType::Detokenization,
|
|
// Memory operations
|
|
OperationType::Transpose,
|
|
OperationType::Reshape,
|
|
OperationType::Concat,
|
|
OperationType::Split,
|
|
// I/O
|
|
OperationType::Checkpoint,
|
|
]
|
|
.into_iter()
|
|
.collect()
|
|
}
|
|
|
|
/// Creates NVIDIA GPU capabilities.
|
|
pub fn nvidia_gpu(
|
|
cuda_cores: u32,
|
|
_tensor_cores: u32,
|
|
vram_gb: u32,
|
|
bandwidth_gbps: u32,
|
|
compute_capability: (u8, u8),
|
|
) -> Self {
|
|
// Approximate TFLOPS based on cores and typical clocks
|
|
let base_clock_ghz = 1.5;
|
|
let fp32_tflops = (cuda_cores as f64 * base_clock_ghz * 2.0) / 1000.0;
|
|
let tensor_multiplier = if compute_capability.0 >= 8 { 4.0 } else { 2.0 };
|
|
|
|
Self {
|
|
compute: ComputeThroughput {
|
|
fp64_tflops: fp32_tflops / 2.0,
|
|
fp32_tflops,
|
|
fp16_tflops: fp32_tflops * tensor_multiplier,
|
|
bf16_tflops: fp32_tflops * tensor_multiplier,
|
|
int8_tops: fp32_tflops * tensor_multiplier * 2.0,
|
|
int4_tops: fp32_tflops * tensor_multiplier * 4.0,
|
|
sparsity_speedup: if compute_capability.0 >= 8 { 2.0 } else { 1.0 },
|
|
},
|
|
memory: MemorySpecs {
|
|
capacity_bytes: vram_gb as u64 * 1024 * 1024 * 1024,
|
|
bandwidth_gbps,
|
|
type_: if compute_capability.0 >= 9 {
|
|
MemoryType::Hbm3
|
|
} else {
|
|
MemoryType::Hbm2e
|
|
},
|
|
},
|
|
operations: Self::gpu_operations(compute_capability),
|
|
power: PowerCharacteristics {
|
|
tdp_watts: if compute_capability.0 >= 9 { 700 } else { 350 },
|
|
efficiency: 0.9,
|
|
power_tier: PowerTier::High,
|
|
},
|
|
optimal_for: vec![
|
|
WorkloadCharacteristic::HighlyParallel,
|
|
WorkloadCharacteristic::LargeBatch,
|
|
WorkloadCharacteristic::ComputeBound,
|
|
],
|
|
}
|
|
}
|
|
|
|
/// Operations supported by GPUs.
|
|
fn gpu_operations(compute_capability: (u8, u8)) -> HashSet<OperationType> {
|
|
let mut ops: HashSet<OperationType> = [
|
|
// Matrix operations (optimal)
|
|
OperationType::MatMul,
|
|
OperationType::Conv2d,
|
|
OperationType::Conv3d,
|
|
OperationType::DepthwiseConv,
|
|
OperationType::BatchNorm,
|
|
OperationType::LayerNorm,
|
|
// Attention
|
|
OperationType::SelfAttention,
|
|
OperationType::CrossAttention,
|
|
// Element-wise
|
|
OperationType::Add,
|
|
OperationType::Mul,
|
|
OperationType::ReLU,
|
|
OperationType::GeLU,
|
|
OperationType::SiLU,
|
|
OperationType::Softmax,
|
|
// Reduction
|
|
OperationType::Sum,
|
|
OperationType::Mean,
|
|
OperationType::Max,
|
|
OperationType::ArgMax,
|
|
// Memory operations
|
|
OperationType::Transpose,
|
|
OperationType::Reshape,
|
|
OperationType::Concat,
|
|
OperationType::Split,
|
|
OperationType::Gather,
|
|
OperationType::Scatter,
|
|
// LLM specific
|
|
OperationType::Embedding,
|
|
OperationType::RoPE,
|
|
OperationType::KVCache,
|
|
OperationType::TopK,
|
|
OperationType::Sampling,
|
|
]
|
|
.into_iter()
|
|
.collect();
|
|
|
|
// FlashAttention for newer GPUs
|
|
if compute_capability.0 >= 8 {
|
|
ops.insert(OperationType::FlashAttention);
|
|
}
|
|
|
|
ops
|
|
}
|
|
|
|
/// Creates TPU capabilities.
|
|
pub fn tpu(version: super::TpuVersion) -> Self {
|
|
let (bf16_tflops, memory_gb, bandwidth_gbps) = match version {
|
|
super::TpuVersion::V5p => (918.0, 95, 4800),
|
|
super::TpuVersion::V5e => (197.0, 16, 1600),
|
|
super::TpuVersion::V4 => (275.0, 32, 2400),
|
|
super::TpuVersion::V4i => (138.0, 32, 1200),
|
|
super::TpuVersion::V3 => (123.0, 16, 900),
|
|
super::TpuVersion::V2 => (46.0, 8, 600),
|
|
super::TpuVersion::Edge => (4.0, 0, 0), // Uses host memory
|
|
};
|
|
|
|
Self {
|
|
compute: ComputeThroughput {
|
|
fp64_tflops: 0.0, // TPUs don't support FP64
|
|
fp32_tflops: bf16_tflops / 2.0,
|
|
fp16_tflops: bf16_tflops,
|
|
bf16_tflops,
|
|
int8_tops: bf16_tflops * 2.0,
|
|
int4_tops: bf16_tflops * 4.0,
|
|
sparsity_speedup: 2.0,
|
|
},
|
|
memory: MemorySpecs {
|
|
capacity_bytes: memory_gb as u64 * 1024 * 1024 * 1024,
|
|
bandwidth_gbps,
|
|
type_: MemoryType::Hbm2e,
|
|
},
|
|
operations: Self::tpu_operations(),
|
|
power: PowerCharacteristics {
|
|
tdp_watts: if matches!(version, super::TpuVersion::Edge) {
|
|
2
|
|
} else {
|
|
400
|
|
},
|
|
efficiency: 0.95,
|
|
power_tier: if matches!(version, super::TpuVersion::Edge) {
|
|
PowerTier::UltraLow
|
|
} else {
|
|
PowerTier::High
|
|
},
|
|
},
|
|
optimal_for: vec![
|
|
WorkloadCharacteristic::HighlyParallel,
|
|
WorkloadCharacteristic::ComputeBound,
|
|
WorkloadCharacteristic::FixedShape,
|
|
WorkloadCharacteristic::LargeBatch,
|
|
],
|
|
}
|
|
}
|
|
|
|
/// Operations supported by TPUs.
|
|
fn tpu_operations() -> HashSet<OperationType> {
|
|
[
|
|
// Matrix operations (optimal)
|
|
OperationType::MatMul,
|
|
OperationType::Conv2d,
|
|
OperationType::BatchNorm,
|
|
OperationType::LayerNorm,
|
|
// Attention
|
|
OperationType::SelfAttention,
|
|
OperationType::CrossAttention,
|
|
OperationType::FlashAttention,
|
|
// Element-wise
|
|
OperationType::Add,
|
|
OperationType::Mul,
|
|
OperationType::ReLU,
|
|
OperationType::GeLU,
|
|
OperationType::SiLU,
|
|
OperationType::Softmax,
|
|
// Reduction
|
|
OperationType::Sum,
|
|
OperationType::Mean,
|
|
OperationType::Max,
|
|
// LLM specific
|
|
OperationType::Embedding,
|
|
OperationType::RoPE,
|
|
OperationType::KVCache,
|
|
]
|
|
.into_iter()
|
|
.collect()
|
|
}
|
|
|
|
/// Creates LPU (Groq) capabilities.
|
|
pub fn lpu() -> Self {
|
|
Self {
|
|
compute: ComputeThroughput {
|
|
fp64_tflops: 0.0,
|
|
fp32_tflops: 0.0,
|
|
fp16_tflops: 188.0,
|
|
bf16_tflops: 188.0,
|
|
int8_tops: 750.0,
|
|
int4_tops: 1500.0,
|
|
sparsity_speedup: 1.0,
|
|
},
|
|
memory: MemorySpecs {
|
|
capacity_bytes: 230 * 1024 * 1024 * 1024, // 230 GB SRAM!
|
|
bandwidth_gbps: 80_000, // 80 TB/s internal
|
|
type_: MemoryType::Sram,
|
|
},
|
|
operations: Self::lpu_operations(),
|
|
power: PowerCharacteristics {
|
|
tdp_watts: 300,
|
|
efficiency: 0.98, // Very efficient for inference
|
|
power_tier: PowerTier::Medium,
|
|
},
|
|
optimal_for: vec![
|
|
WorkloadCharacteristic::Sequential,
|
|
WorkloadCharacteristic::SmallBatch,
|
|
WorkloadCharacteristic::VariableLength,
|
|
WorkloadCharacteristic::LowLatency,
|
|
],
|
|
}
|
|
}
|
|
|
|
/// Operations supported by Groq LPU.
|
|
fn lpu_operations() -> HashSet<OperationType> {
|
|
[
|
|
// Optimized for inference
|
|
OperationType::MatMul,
|
|
OperationType::LayerNorm,
|
|
OperationType::SelfAttention,
|
|
OperationType::Add,
|
|
OperationType::Mul,
|
|
OperationType::ReLU,
|
|
OperationType::GeLU,
|
|
OperationType::SiLU,
|
|
OperationType::Softmax,
|
|
OperationType::Embedding,
|
|
OperationType::RoPE,
|
|
OperationType::KVCache,
|
|
OperationType::TopK,
|
|
OperationType::Sampling,
|
|
]
|
|
.into_iter()
|
|
.collect()
|
|
}
|
|
|
|
/// Creates Apple Neural Engine capabilities.
|
|
pub fn apple_neural_engine(cores: u32) -> Self {
|
|
let int8_tops = match cores {
|
|
16 => 18.0, // M3
|
|
32 => 35.0, // M3 Max
|
|
_ => cores as f64 * 1.1,
|
|
};
|
|
|
|
Self {
|
|
compute: ComputeThroughput {
|
|
fp64_tflops: 0.0,
|
|
fp32_tflops: int8_tops / 4.0,
|
|
fp16_tflops: int8_tops / 2.0,
|
|
bf16_tflops: int8_tops / 2.0,
|
|
int8_tops,
|
|
int4_tops: int8_tops * 2.0,
|
|
sparsity_speedup: 1.0,
|
|
},
|
|
memory: MemorySpecs {
|
|
capacity_bytes: 0, // Uses unified memory
|
|
bandwidth_gbps: 400,
|
|
type_: MemoryType::Unified,
|
|
},
|
|
operations: Self::npu_operations(),
|
|
power: PowerCharacteristics {
|
|
tdp_watts: 15,
|
|
efficiency: 0.95,
|
|
power_tier: PowerTier::UltraLow,
|
|
},
|
|
optimal_for: vec![
|
|
WorkloadCharacteristic::LowPower,
|
|
WorkloadCharacteristic::LowLatency,
|
|
WorkloadCharacteristic::SmallBatch,
|
|
],
|
|
}
|
|
}
|
|
|
|
/// Operations supported by NPUs.
|
|
fn npu_operations() -> HashSet<OperationType> {
|
|
[
|
|
// Inference optimized
|
|
OperationType::MatMul,
|
|
OperationType::Conv2d,
|
|
OperationType::DepthwiseConv,
|
|
OperationType::BatchNorm,
|
|
OperationType::LayerNorm,
|
|
OperationType::Add,
|
|
OperationType::Mul,
|
|
OperationType::ReLU,
|
|
OperationType::Softmax,
|
|
OperationType::Embedding,
|
|
]
|
|
.into_iter()
|
|
.collect()
|
|
}
|
|
}
|
|
|
|
/// Compute throughput metrics.
///
/// Peak rates per numeric format: `*_tflops` fields are tera-FLOPS,
/// `*_tops` fields are integer tera-ops. A value of `0.0` means the
/// format is unsupported (e.g. FP64 on TPUs).
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ComputeThroughput {
    /// FP64 TFLOPS.
    pub fp64_tflops: f64,
    /// FP32 TFLOPS.
    pub fp32_tflops: f64,
    /// FP16 TFLOPS.
    pub fp16_tflops: f64,
    /// BF16 TFLOPS.
    pub bf16_tflops: f64,
    /// INT8 TOPS.
    pub int8_tops: f64,
    /// INT4 TOPS.
    pub int4_tops: f64,
    /// Speedup multiplier for sparse operations (1.0 = no hardware support).
    pub sparsity_speedup: f64,
}
|
|
|
|
/// Memory specifications.
///
/// A `capacity_bytes` of 0 indicates the processor has no dedicated
/// memory and uses host/unified memory instead.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MemorySpecs {
    /// Total capacity (bytes).
    pub capacity_bytes: u64,
    /// Bandwidth (GB/s).
    pub bandwidth_gbps: u32,
    /// Memory technology.
    pub type_: MemoryType,
}
|
|
|
|
impl Default for MemorySpecs {
|
|
fn default() -> Self {
|
|
Self {
|
|
capacity_bytes: 16 * 1024 * 1024 * 1024, // 16 GB
|
|
bandwidth_gbps: 500,
|
|
type_: MemoryType::Ddr5,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Memory types.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum MemoryType {
    /// DDR4 system RAM.
    Ddr4,
    /// DDR5 system RAM.
    Ddr5,
    /// GDDR6/6X video memory.
    Gddr6,
    /// High Bandwidth Memory, generation 2.
    Hbm2,
    /// High Bandwidth Memory, generation 2e.
    Hbm2e,
    /// High Bandwidth Memory, generation 3.
    Hbm3,
    /// On-chip SRAM (e.g. Groq LPU).
    Sram,
    /// Unified memory shared with the host (Apple Silicon).
    Unified,
    /// LPDDR (mobile/low-power).
    Lpddr,
}
|
|
|
|
/// Power characteristics.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PowerCharacteristics {
    /// Thermal design power in watts.
    pub tdp_watts: u32,
    /// Efficiency factor in the range 0.0 - 1.0 (higher is better).
    pub efficiency: f64,
    /// Coarse power tier classification.
    pub power_tier: PowerTier,
}
|
|
|
|
impl Default for PowerCharacteristics {
|
|
fn default() -> Self {
|
|
Self {
|
|
tdp_watts: 100,
|
|
efficiency: 0.8,
|
|
power_tier: PowerTier::Medium,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Workload characteristics for processor matching.
///
/// Each variant names a workload shape; the parenthesized processors are
/// the families that typically excel at it.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum WorkloadCharacteristic {
    /// High parallelism (GPU, TPU).
    HighlyParallel,
    /// Sequential dependencies (CPU, LPU).
    Sequential,
    /// Memory bandwidth bound (GPU).
    MemoryBound,
    /// Compute bound (TPU).
    ComputeBound,
    /// Low latency required (NPU, edge).
    LowLatency,
    /// Low power required (NPU, mobile).
    LowPower,
    /// Large batch sizes (GPU, TPU).
    LargeBatch,
    /// Small batch sizes (CPU, LPU).
    SmallBatch,
    /// Variable length sequences (LPU).
    VariableLength,
    /// Fixed tensor shapes (TPU).
    FixedShape,
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cpu_capabilities() {
        let caps = ProcessorCapabilities::cpu(32, 3.5, true);
        assert!(caps.compute.fp32_tflops > 0.0);
        // CPUs must be able to handle the data pipeline.
        for op in [OperationType::DataLoad, OperationType::Tokenization] {
            assert!(caps.operations.contains(&op));
        }
    }

    #[test]
    fn test_gpu_capabilities() {
        let caps = ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1000, (8, 9));
        // Tensor cores should make reduced precision faster than FP32.
        assert!(caps.compute.fp16_tflops > caps.compute.fp32_tflops);
        // SM 8.x gets FlashAttention.
        assert!(caps.operations.contains(&OperationType::FlashAttention));
    }

    #[test]
    fn test_tpu_capabilities() {
        let caps = ProcessorCapabilities::tpu(super::super::TpuVersion::V5p);
        assert!(caps.compute.bf16_tflops > 900.0);
        // TPUs don't do host I/O.
        assert!(!caps.operations.contains(&OperationType::DataLoad));
    }

    #[test]
    fn test_lpu_capabilities() {
        let caps = ProcessorCapabilities::lpu();
        // On-chip SRAM bandwidth dwarfs external memory.
        assert!(caps.memory.bandwidth_gbps > 10_000);
        let prefers_sequential = caps
            .optimal_for
            .contains(&WorkloadCharacteristic::Sequential);
        assert!(prefers_sequential);
    }
}
|