//! File chunking for large file storage //! //! Files are split into fixed-size chunks for: //! - Parallel upload/download //! - Efficient deduplication //! - Erasure coding application use crate::cid::ContentId; use serde::{Deserialize, Serialize}; /// Default chunk size: 1 MB pub const DEFAULT_CHUNK_SIZE: usize = 1024 * 1024; /// A chunk of a file #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Chunk { /// Chunk index within the file pub index: u32, /// Content ID of this chunk pub cid: ContentId, /// Chunk data #[serde(with = "serde_bytes")] pub data: Vec, /// Offset in the original file pub offset: u64, } impl Chunk { /// Create a new chunk pub fn new(index: u32, data: Vec, offset: u64) -> Self { let cid = ContentId::from_content(&data); Self { index, cid, data, offset, } } /// Verify chunk integrity pub fn verify(&self) -> bool { self.cid.verify(&self.data) } /// Get chunk size pub fn size(&self) -> usize { self.data.len() } } /// Chunker configuration #[derive(Debug, Clone)] pub struct ChunkerConfig { /// Size of each chunk in bytes pub chunk_size: usize, } impl Default for ChunkerConfig { fn default() -> Self { Self { chunk_size: DEFAULT_CHUNK_SIZE, } } } /// File chunker - splits files into chunks pub struct Chunker { config: ChunkerConfig, } impl Chunker { /// Create a new chunker with default config pub fn new() -> Self { Self { config: ChunkerConfig::default(), } } /// Create a new chunker with custom config pub fn with_config(config: ChunkerConfig) -> Self { Self { config } } /// Split data into chunks pub fn chunk(&self, data: &[u8]) -> Vec { let mut chunks = Vec::new(); let mut offset = 0u64; let mut index = 0u32; for chunk_data in data.chunks(self.config.chunk_size) { chunks.push(Chunk::new( index, chunk_data.to_vec(), offset, )); offset += chunk_data.len() as u64; index += 1; } chunks } /// Reassemble chunks into original data pub fn reassemble(&self, chunks: &[Chunk]) -> Result, ReassembleError> { if chunks.is_empty() { return Ok(Vec::new()); } // Sort by index let mut sorted: Vec<_> = chunks.iter().collect(); sorted.sort_by_key(|c| c.index); // Verify indices are contiguous for (i, chunk) in sorted.iter().enumerate() { if chunk.index != i as u32 { return Err(ReassembleError::MissingChunk(i as u32)); } } // Verify each chunk for chunk in &sorted { if !chunk.verify() { return Err(ReassembleError::InvalidChunk(chunk.index)); } } // Combine data let total_size: usize = sorted.iter().map(|c| c.data.len()).sum(); let mut result = Vec::with_capacity(total_size); for chunk in sorted { result.extend_from_slice(&chunk.data); } Ok(result) } /// Get the number of chunks for a given file size pub fn chunk_count(&self, file_size: u64) -> u32 { let size = file_size as usize; let full_chunks = size / self.config.chunk_size; let has_remainder = size % self.config.chunk_size != 0; (full_chunks + if has_remainder { 1 } else { 0 }) as u32 } } impl Default for Chunker { fn default() -> Self { Self::new() } } /// Errors during chunk reassembly #[derive(Debug, Clone, PartialEq, Eq)] pub enum ReassembleError { /// A chunk is missing from the sequence MissingChunk(u32), /// A chunk failed integrity verification InvalidChunk(u32), } impl std::error::Error for ReassembleError {} impl std::fmt::Display for ReassembleError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::MissingChunk(i) => write!(f, "Missing chunk at index {}", i), Self::InvalidChunk(i) => write!(f, "Chunk {} failed verification", i), } } } /// Metadata about a chunked file #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ChunkedFile { /// CID of the complete file pub cid: ContentId, /// Total file size pub size: u64, /// Number of chunks pub chunk_count: u32, /// Size of each chunk (except possibly last) pub chunk_size: usize, /// CIDs of each chunk in order pub chunk_cids: Vec, } impl ChunkedFile { /// Create metadata from chunks pub fn from_chunks(chunks: &[Chunk], original_cid: ContentId) -> Self { Self { cid: original_cid, size: chunks.iter().map(|c| c.data.len() as u64).sum(), chunk_count: chunks.len() as u32, chunk_size: if chunks.is_empty() { 0 } else { chunks[0].data.len() }, chunk_cids: chunks.iter().map(|c| c.cid.clone()).collect(), } } } #[cfg(test)] mod tests { use super::*; #[test] fn test_chunk_small_file() { let chunker = Chunker::new(); let data = b"Small file that fits in one chunk"; let chunks = chunker.chunk(data); assert_eq!(chunks.len(), 1); assert_eq!(chunks[0].data, data); assert!(chunks[0].verify()); } #[test] fn test_chunk_large_file() { let config = ChunkerConfig { chunk_size: 10 }; let chunker = Chunker::with_config(config); let data = b"This is a longer file that will be split into chunks"; let chunks = chunker.chunk(data); assert!(chunks.len() > 1); // Verify all chunks for chunk in &chunks { assert!(chunk.verify()); } } #[test] fn test_reassemble() { let config = ChunkerConfig { chunk_size: 10 }; let chunker = Chunker::with_config(config); let original = b"This is a test file for chunking and reassembly"; let chunks = chunker.chunk(original); let reassembled = chunker.reassemble(&chunks).unwrap(); assert_eq!(reassembled, original); } #[test] fn test_reassemble_missing_chunk() { let config = ChunkerConfig { chunk_size: 10 }; let chunker = Chunker::with_config(config); let data = b"Test data for missing chunk test case here"; let mut chunks = chunker.chunk(data); // Remove middle chunk chunks.remove(1); let result = chunker.reassemble(&chunks); assert!(matches!(result, Err(ReassembleError::MissingChunk(_)))); } #[test] fn test_chunk_count() { let config = ChunkerConfig { chunk_size: 100 }; let chunker = Chunker::with_config(config); assert_eq!(chunker.chunk_count(0), 0); assert_eq!(chunker.chunk_count(50), 1); assert_eq!(chunker.chunk_count(100), 1); assert_eq!(chunker.chunk_count(101), 2); assert_eq!(chunker.chunk_count(250), 3); } #[test] fn test_empty_file() { let chunker = Chunker::new(); let chunks = chunker.chunk(&[]); assert!(chunks.is_empty()); let reassembled = chunker.reassemble(&chunks).unwrap(); assert!(reassembled.is_empty()); } }