//! Processor capability definitions.

use super::operation::OperationType;
use super::types::PowerTier;
use serde::{Deserialize, Serialize};
use std::collections::HashSet;

/// Detailed processor capabilities.
///
/// Bundles the throughput, memory, supported-operation, power, and
/// workload-affinity descriptions of a single processor. Values are built
/// through the constructors on this type (`cpu`, `nvidia_gpu`, `tpu`, `lpu`,
/// `apple_neural_engine`) or through `Default`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProcessorCapabilities {
    /// Compute throughput, broken down by numeric precision.
    pub compute: ComputeThroughput,
    /// Memory specifications (capacity, bandwidth, memory technology).
    pub memory: MemorySpecs,
    /// Set of operations this processor is declared to support.
    pub operations: HashSet<OperationType>,
    /// Power characteristics.
    pub power: PowerCharacteristics,
    /// Workload traits this processor is best suited for; empty in the
    /// `Default` value.
    pub optimal_for: Vec<WorkloadCharacteristic>,
}

impl Default for ProcessorCapabilities {
    fn default() -> Self {
        Self {
            compute: ComputeThroughput::default(),
            memory: MemorySpecs::default(),
            operations: Self::default_operations(),
            power: PowerCharacteristics::default(),
            optimal_for: vec![],
        }
    }
}

impl ProcessorCapabilities {
    /// Baseline operation set used by the `Default` profile: basic matrix and
    /// element-wise math, softmax, and data loading/preprocessing.
    fn default_operations() -> HashSet<OperationType> {
        HashSet::from([
            OperationType::MatMul,
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::Softmax,
            OperationType::DataLoad,
            OperationType::DataPreprocess,
        ])
    }

    /// Creates CPU capabilities.
    ///
    /// Peak FP32 throughput is estimated as
    /// `cores * clock_ghz * flops_per_cycle / 1000` TFLOPS, where each core is
    /// assumed to retire 64 FP32 operations per cycle when `avx512` is set and
    /// 32 otherwise. The other precisions are scaled from that figure.
    ///
    /// Memory and power values are fixed "typical" numbers (64 GiB of DDR5 at
    /// 200 GB/s, 125 W TDP) and do not depend on the arguments.
    pub fn cpu(cores: u32, clock_ghz: f32, avx512: bool) -> Self {
        const GIB: u64 = 1024 * 1024 * 1024;

        let ops_per_cycle: f64 = if avx512 { 64.0 } else { 32.0 };
        // cores * GHz * ops/cycle yields GFLOPS; dividing by 1000 gives TFLOPS.
        let fp32 = f64::from(cores) * f64::from(clock_ghz) * ops_per_cycle / 1000.0;

        let compute = ComputeThroughput {
            fp64_tflops: fp32 / 2.0,
            fp32_tflops: fp32,
            fp16_tflops: fp32 * 2.0,
            bf16_tflops: fp32 * 2.0,
            int8_tops: fp32 * 4.0,
            int4_tops: fp32 * 8.0,
            sparsity_speedup: 1.0,
        };

        let memory = MemorySpecs {
            capacity_bytes: 64 * GIB,
            bandwidth_gbps: 200,
            type_: MemoryType::Ddr5,
        };

        let power = PowerCharacteristics {
            tdp_watts: 125,
            efficiency: 0.8,
            power_tier: PowerTier::Medium,
        };

        Self {
            compute,
            memory,
            operations: Self::cpu_operations(),
            power,
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::MemoryBound,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }

    /// Operation set declared for CPUs, assembled group by group.
    fn cpu_operations() -> HashSet<OperationType> {
        let mut ops = HashSet::new();

        // Matrix and normalization kernels: supported, though not the CPU's strength.
        ops.extend([
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
        ]);

        // Element-wise math and activations.
        ops.extend([
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::Softmax,
        ]);

        // Data handling, where the CPU is the preferred target.
        ops.extend([
            OperationType::DataLoad,
            OperationType::DataPreprocess,
            OperationType::Tokenization,
            OperationType::Detokenization,
        ]);

        // Tensor layout manipulation.
        ops.extend([
            OperationType::Transpose,
            OperationType::Reshape,
            OperationType::Concat,
            OperationType::Split,
        ]);

        // I/O.
        ops.insert(OperationType::Checkpoint);

        ops
    }

    /// Creates NVIDIA GPU capabilities.
    ///
    /// FP32 throughput is approximated as `cuda_cores * 1.5 GHz * 2 / 1000`
    /// TFLOPS (a fixed assumed clock, two operations per core per cycle).
    /// Reduced-precision throughput is scaled by a multiplier of 4 for
    /// compute capability major version 8 and above, 2 otherwise; the same
    /// threshold enables a 2x sparsity speedup. Major version 9 and above
    /// selects HBM3 memory and a 700 W TDP instead of HBM2e and 350 W.
    ///
    /// `_tensor_cores` is accepted for interface completeness but is not used
    /// in any estimate. `vram_gb` is interpreted in GiB, and `bandwidth_gbps`
    /// is stored unchanged.
    pub fn nvidia_gpu(
        cuda_cores: u32,
        _tensor_cores: u32,
        vram_gb: u32,
        bandwidth_gbps: u32,
        compute_capability: (u8, u8),
    ) -> Self {
        const GIB: u64 = 1024 * 1024 * 1024;
        const ASSUMED_CLOCK_GHZ: f64 = 1.5;

        let (major, _minor) = compute_capability;
        let gen8_plus = major >= 8;
        let gen9_plus = major >= 9;

        let fp32 = f64::from(cuda_cores) * ASSUMED_CLOCK_GHZ * 2.0 / 1000.0;
        let (tensor_multiplier, sparsity_speedup) = if gen8_plus {
            (4.0, 2.0)
        } else {
            (2.0, 1.0)
        };

        let compute = ComputeThroughput {
            fp64_tflops: fp32 / 2.0,
            fp32_tflops: fp32,
            fp16_tflops: fp32 * tensor_multiplier,
            bf16_tflops: fp32 * tensor_multiplier,
            int8_tops: fp32 * tensor_multiplier * 2.0,
            int4_tops: fp32 * tensor_multiplier * 4.0,
            sparsity_speedup,
        };

        let (memory_type, tdp_watts) = if gen9_plus {
            (MemoryType::Hbm3, 700)
        } else {
            (MemoryType::Hbm2e, 350)
        };

        Self {
            compute,
            memory: MemorySpecs {
                capacity_bytes: u64::from(vram_gb) * GIB,
                bandwidth_gbps,
                type_: memory_type,
            },
            operations: Self::gpu_operations(compute_capability),
            power: PowerCharacteristics {
                tdp_watts,
                efficiency: 0.9,
                power_tier: PowerTier::High,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::LargeBatch,
                WorkloadCharacteristic::ComputeBound,
            ],
        }
    }

    /// Operation set declared for GPUs.
    ///
    /// Every GPU gets the common set below; `FlashAttention` is added only
    /// when the compute capability major version is 8 or higher.
    fn gpu_operations(compute_capability: (u8, u8)) -> HashSet<OperationType> {
        let mut supported = HashSet::from([
            // Matrix and normalization kernels (the GPU's strength).
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::Conv3d,
            OperationType::DepthwiseConv,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            // Attention variants.
            OperationType::SelfAttention,
            OperationType::CrossAttention,
            // Element-wise math and activations.
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::SiLU,
            OperationType::Softmax,
            // Reductions.
            OperationType::Sum,
            OperationType::Mean,
            OperationType::Max,
            OperationType::ArgMax,
            // Tensor layout manipulation.
            OperationType::Transpose,
            OperationType::Reshape,
            OperationType::Concat,
            OperationType::Split,
            OperationType::Gather,
            OperationType::Scatter,
            // LLM-specific kernels.
            OperationType::Embedding,
            OperationType::RoPE,
            OperationType::KVCache,
            OperationType::TopK,
            OperationType::Sampling,
        ]);

        let (major, _minor) = compute_capability;
        if major >= 8 {
            supported.insert(OperationType::FlashAttention);
        }

        supported
    }

    /// Creates TPU capabilities.
    ///
    /// BF16 throughput, memory size (GiB) and bandwidth come from a fixed
    /// per-version table; the other precisions are derived from the BF16
    /// figure and FP64 is reported as unsupported (0.0). The `Edge` variant
    /// reports no dedicated memory and a 2 W ultra-low power profile, while
    /// every other version uses 400 W and the high power tier.
    pub fn tpu(version: super::TpuVersion) -> Self {
        const GIB: u64 = 1024 * 1024 * 1024;

        let is_edge = matches!(version, super::TpuVersion::Edge);

        // (BF16 TFLOPS, memory in GiB, bandwidth in GB/s)
        let (bf16, memory_gb, bandwidth_gbps): (f64, u64, u32) = match version {
            super::TpuVersion::V5p => (918.0, 95, 4800),
            super::TpuVersion::V5e => (197.0, 16, 1600),
            super::TpuVersion::V4 => (275.0, 32, 2400),
            super::TpuVersion::V4i => (138.0, 32, 1200),
            super::TpuVersion::V3 => (123.0, 16, 900),
            super::TpuVersion::V2 => (46.0, 8, 600),
            // No dedicated memory: the Edge part relies on host memory.
            super::TpuVersion::Edge => (4.0, 0, 0),
        };

        let (tdp_watts, power_tier) = if is_edge {
            (2, PowerTier::UltraLow)
        } else {
            (400, PowerTier::High)
        };

        Self {
            compute: ComputeThroughput {
                fp64_tflops: 0.0, // FP64 is not supported.
                fp32_tflops: bf16 / 2.0,
                fp16_tflops: bf16,
                bf16_tflops: bf16,
                int8_tops: bf16 * 2.0,
                int4_tops: bf16 * 4.0,
                sparsity_speedup: 2.0,
            },
            memory: MemorySpecs {
                capacity_bytes: memory_gb * GIB,
                bandwidth_gbps,
                type_: MemoryType::Hbm2e,
            },
            operations: Self::tpu_operations(),
            power: PowerCharacteristics {
                tdp_watts,
                efficiency: 0.95,
                power_tier,
            },
            optimal_for: vec![
                WorkloadCharacteristic::HighlyParallel,
                WorkloadCharacteristic::ComputeBound,
                WorkloadCharacteristic::FixedShape,
                WorkloadCharacteristic::LargeBatch,
            ],
        }
    }

    /// Operation set declared for TPUs; no data-loading or I/O operations
    /// are included.
    fn tpu_operations() -> HashSet<OperationType> {
        let mut ops = HashSet::new();

        // Matrix and normalization kernels (the TPU's strength).
        ops.extend([
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
        ]);

        // Attention variants.
        ops.extend([
            OperationType::SelfAttention,
            OperationType::CrossAttention,
            OperationType::FlashAttention,
        ]);

        // Element-wise math and activations.
        ops.extend([
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::SiLU,
            OperationType::Softmax,
        ]);

        // Reductions.
        ops.extend([
            OperationType::Sum,
            OperationType::Mean,
            OperationType::Max,
        ]);

        // LLM-specific kernels.
        ops.extend([
            OperationType::Embedding,
            OperationType::RoPE,
            OperationType::KVCache,
        ]);

        ops
    }

    /// Creates LPU (Groq) capabilities.
    ///
    /// All figures are fixed per-chip values: no FP64/FP32 throughput,
    /// 188 TFLOPS at FP16/BF16, 750 TOPS at INT8 and 1500 TOPS at INT4, backed
    /// by on-chip SRAM with very high internal bandwidth.
    pub fn lpu() -> Self {
        Self {
            compute: ComputeThroughput {
                fp64_tflops: 0.0,
                fp32_tflops: 0.0,
                fp16_tflops: 188.0,
                bf16_tflops: 188.0,
                int8_tops: 750.0,
                int4_tops: 1500.0,
                sparsity_speedup: 1.0,
            },
            memory: MemorySpecs {
                // The LPU carries 230 MB (megabytes, not gigabytes) of on-chip
                // SRAM per chip; expressing it in GiB would overstate the
                // capacity by a factor of 1024.
                capacity_bytes: 230 * 1024 * 1024, // 230 MB SRAM
                bandwidth_gbps: 80_000,            // 80 TB/s internal
                type_: MemoryType::Sram,
            },
            operations: Self::lpu_operations(),
            power: PowerCharacteristics {
                tdp_watts: 300,
                efficiency: 0.98, // Very efficient for inference
                power_tier: PowerTier::Medium,
            },
            optimal_for: vec![
                WorkloadCharacteristic::Sequential,
                WorkloadCharacteristic::SmallBatch,
                WorkloadCharacteristic::VariableLength,
                WorkloadCharacteristic::LowLatency,
            ],
        }
    }

    /// Operation set declared for the Groq LPU: an inference-oriented subset
    /// covering matmul, normalization, self-attention, activations and
    /// token-generation kernels.
    fn lpu_operations() -> HashSet<OperationType> {
        HashSet::from([
            // Core math.
            OperationType::MatMul,
            OperationType::LayerNorm,
            OperationType::SelfAttention,
            // Element-wise math and activations.
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::GeLU,
            OperationType::SiLU,
            OperationType::Softmax,
            // LLM-specific kernels.
            OperationType::Embedding,
            OperationType::RoPE,
            OperationType::KVCache,
            OperationType::TopK,
            OperationType::Sampling,
        ])
    }

    /// Creates Apple Neural Engine capabilities.
    ///
    /// INT8 throughput uses fixed figures for the 16-core (18 TOPS) and
    /// 32-core (35 TOPS) configurations; any other core count is estimated at
    /// 1.1 TOPS per core. The remaining precisions are derived from the INT8
    /// figure, and FP64 is reported as 0.0. Capacity is reported as 0 because
    /// the engine shares unified memory rather than owning its own.
    pub fn apple_neural_engine(cores: u32) -> Self {
        let int8: f64 = if cores == 16 {
            18.0
        } else if cores == 32 {
            35.0
        } else {
            f64::from(cores) * 1.1
        };

        let compute = ComputeThroughput {
            fp64_tflops: 0.0,
            fp32_tflops: int8 / 4.0,
            fp16_tflops: int8 / 2.0,
            bf16_tflops: int8 / 2.0,
            int8_tops: int8,
            int4_tops: int8 * 2.0,
            sparsity_speedup: 1.0,
        };

        let memory = MemorySpecs {
            capacity_bytes: 0, // Shares unified memory.
            bandwidth_gbps: 400,
            type_: MemoryType::Unified,
        };

        let power = PowerCharacteristics {
            tdp_watts: 15,
            efficiency: 0.95,
            power_tier: PowerTier::UltraLow,
        };

        Self {
            compute,
            memory,
            operations: Self::npu_operations(),
            power,
            optimal_for: vec![
                WorkloadCharacteristic::LowPower,
                WorkloadCharacteristic::LowLatency,
                WorkloadCharacteristic::SmallBatch,
            ],
        }
    }

    /// Operation set declared for NPUs: a compact, inference-oriented subset.
    fn npu_operations() -> HashSet<OperationType> {
        HashSet::from([
            // Matrix, convolution and normalization kernels.
            OperationType::MatMul,
            OperationType::Conv2d,
            OperationType::DepthwiseConv,
            OperationType::BatchNorm,
            OperationType::LayerNorm,
            // Element-wise math and activations.
            OperationType::Add,
            OperationType::Mul,
            OperationType::ReLU,
            OperationType::Softmax,
            // Lookup.
            OperationType::Embedding,
        ])
    }
}

/// Compute throughput metrics.
///
/// Floating-point figures are expressed in TFLOPS and integer figures in
/// TOPS. A value of 0.0 is used by the constructors in this file to mark a
/// precision as unsupported. The derived `Default` sets every field to 0.0.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ComputeThroughput {
    /// FP64 TFLOPS.
    pub fp64_tflops: f64,
    /// FP32 TFLOPS.
    pub fp32_tflops: f64,
    /// FP16 TFLOPS.
    pub fp16_tflops: f64,
    /// BF16 TFLOPS.
    pub bf16_tflops: f64,
    /// INT8 TOPS.
    pub int8_tops: f64,
    /// INT4 TOPS.
    pub int4_tops: f64,
    /// Speedup for sparse operations.
    ///
    /// The constructors in this file use 1.0 for "no speedup". Note that the
    /// derived `Default` yields 0.0 here, not 1.0.
    pub sparsity_speedup: f64,
}

/// Memory specifications.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MemorySpecs {
    /// Total capacity (bytes).
    ///
    /// The constructors in this file store 0 for processors that rely on
    /// host or unified memory instead of dedicated memory.
    pub capacity_bytes: u64,
    /// Bandwidth (GB/s).
    ///
    /// Despite the `gbps` suffix, the values in this file are documented as
    /// gigabytes per second, not gigabits per second.
    pub bandwidth_gbps: u32,
    /// Memory type.
    pub type_: MemoryType,
}

/// Baseline memory profile: 16 GiB of DDR5 at 500 GB/s.
impl Default for MemorySpecs {
    fn default() -> Self {
        const GIB: u64 = 1024 * 1024 * 1024;

        let capacity_bytes = 16 * GIB;
        let bandwidth_gbps = 500;

        Self {
            capacity_bytes,
            bandwidth_gbps,
            type_: MemoryType::Ddr5,
        }
    }
}

/// Memory types.
///
/// Identifies the memory technology reported in [`MemorySpecs::type_`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum MemoryType {
    /// DDR4 RAM.
    Ddr4,
    /// DDR5 RAM (used by the CPU constructor and the default memory profile).
    Ddr5,
    /// GDDR6/6X video memory.
    Gddr6,
    /// HBM2.
    Hbm2,
    /// HBM2e (used for TPUs and for GPUs below compute capability 9.x).
    Hbm2e,
    /// HBM3 (used for GPUs with compute capability 9.x and above).
    Hbm3,
    /// SRAM (on-chip; used by the LPU constructor).
    Sram,
    /// Unified memory (Apple Silicon).
    Unified,
    /// LPDDR (mobile).
    Lpddr,
}

/// Power characteristics.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PowerCharacteristics {
    /// Thermal design power (TDP) in watts.
    pub tdp_watts: u32,
    /// Efficiency factor (0.0 - 1.0). The range is a documented convention;
    /// nothing in this type enforces it.
    pub efficiency: f64,
    /// Coarse power tier classification.
    pub power_tier: PowerTier,
}

/// Baseline power profile: 100 W TDP, 0.8 efficiency, medium power tier.
impl Default for PowerCharacteristics {
    fn default() -> Self {
        let tdp_watts = 100;
        let efficiency = 0.8;
        let power_tier = PowerTier::Medium;

        Self {
            tdp_watts,
            efficiency,
            power_tier,
        }
    }
}

/// Workload characteristics for processor matching.
///
/// Each constructor on `ProcessorCapabilities` lists the traits it is best
/// suited for in `optimal_for`; the processors named in parentheses below
/// reflect those constructor assignments.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum WorkloadCharacteristic {
    /// High parallelism (GPU, TPU).
    HighlyParallel,
    /// Sequential dependencies (CPU, LPU).
    Sequential,
    /// Memory bandwidth bound (CPU).
    MemoryBound,
    /// Compute bound (GPU, TPU).
    ComputeBound,
    /// Low latency required (LPU, Apple Neural Engine).
    LowLatency,
    /// Low power required (Apple Neural Engine).
    LowPower,
    /// Large batch sizes (GPU, TPU).
    LargeBatch,
    /// Small batch sizes (CPU, LPU, Apple Neural Engine).
    SmallBatch,
    /// Variable length sequences (LPU).
    VariableLength,
    /// Fixed tensor shapes (TPU).
    FixedShape,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The CPU profile reports non-zero FP32 throughput and includes the
    /// data-loading and tokenization operations.
    #[test]
    fn test_cpu_capabilities() {
        let cpu = ProcessorCapabilities::cpu(32, 3.5, true);
        let ops = &cpu.operations;

        assert!(cpu.compute.fp32_tflops > 0.0);
        assert!(ops.contains(&OperationType::DataLoad));
        assert!(ops.contains(&OperationType::Tokenization));
    }

    /// A compute-capability 8.x GPU has faster FP16 than FP32 and supports
    /// FlashAttention.
    #[test]
    fn test_gpu_capabilities() {
        let gpu = ProcessorCapabilities::nvidia_gpu(16384, 512, 24, 1000, (8, 9));
        let throughput = &gpu.compute;

        assert!(throughput.fp16_tflops > throughput.fp32_tflops);
        assert!(gpu.operations.contains(&OperationType::FlashAttention));
    }

    /// The V5p TPU exceeds 900 BF16 TFLOPS and does not list data loading.
    #[test]
    fn test_tpu_capabilities() {
        let tpu = ProcessorCapabilities::tpu(super::super::TpuVersion::V5p);

        assert!(tpu.compute.bf16_tflops > 900.0);
        // TPUs are not expected to perform I/O.
        assert!(!tpu.operations.contains(&OperationType::DataLoad));
    }

    /// The LPU reports very high internal bandwidth and is tagged as
    /// suitable for sequential workloads.
    #[test]
    fn test_lpu_capabilities() {
        let lpu = ProcessorCapabilities::lpu();

        assert!(lpu.memory.bandwidth_gbps > 10000);
        assert!(lpu.optimal_for.contains(&WorkloadCharacteristic::Sequential));
    }
}