- Applied clippy --fix to synor-storage (19 fixes) - Applied clippy --fix to synor-zk (2 fixes) - Simplified code patterns and removed redundant operations
282 lines
7.5 KiB
Rust
282 lines
7.5 KiB
Rust
//! File chunking for large file storage
|
|
//!
|
|
//! Files are split into fixed-size chunks for:
|
|
//! - Parallel upload/download
|
|
//! - Efficient deduplication
|
|
//! - Erasure coding application
|
|
|
|
use crate::cid::ContentId;
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
/// Default chunk size: 1 MiB (1024 * 1024 bytes).
///
/// Used by [`ChunkerConfig::default`]; chosen as a balance between
/// per-chunk overhead and parallelism granularity.
pub const DEFAULT_CHUNK_SIZE: usize = 1024 * 1024;
|
|
|
|
/// A chunk of a file.
///
/// Chunks carry their own content ID so each one can be verified
/// independently after transfer (see `Chunk::verify`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// Chunk index within the file (0-based, contiguous).
    pub index: u32,
    /// Content ID of this chunk, computed from `data` at construction.
    pub cid: ContentId,
    /// Chunk payload bytes.
    // serde_bytes serializes Vec<u8> as a byte string instead of a
    // sequence of integers, which is far more compact.
    #[serde(with = "serde_bytes")]
    pub data: Vec<u8>,
    /// Byte offset of this chunk in the original file.
    pub offset: u64,
}
|
|
|
|
impl Chunk {
|
|
/// Create a new chunk
|
|
pub fn new(index: u32, data: Vec<u8>, offset: u64) -> Self {
|
|
let cid = ContentId::from_content(&data);
|
|
Self {
|
|
index,
|
|
cid,
|
|
data,
|
|
offset,
|
|
}
|
|
}
|
|
|
|
/// Verify chunk integrity
|
|
pub fn verify(&self) -> bool {
|
|
self.cid.verify(&self.data)
|
|
}
|
|
|
|
/// Get chunk size
|
|
pub fn size(&self) -> usize {
|
|
self.data.len()
|
|
}
|
|
}
|
|
|
|
/// Chunker configuration.
#[derive(Debug, Clone)]
pub struct ChunkerConfig {
    /// Size of each chunk in bytes. Every chunk except possibly the
    /// last will be exactly this long.
    pub chunk_size: usize,
}
|
|
|
|
impl Default for ChunkerConfig {
|
|
fn default() -> Self {
|
|
Self {
|
|
chunk_size: DEFAULT_CHUNK_SIZE,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// File chunker - splits files into fixed-size chunks and reassembles them.
pub struct Chunker {
    // Holds the chunk size; immutable after construction.
    config: ChunkerConfig,
}
|
|
|
|
impl Chunker {
|
|
/// Create a new chunker with default config
|
|
pub fn new() -> Self {
|
|
Self {
|
|
config: ChunkerConfig::default(),
|
|
}
|
|
}
|
|
|
|
/// Create a new chunker with custom config
|
|
pub fn with_config(config: ChunkerConfig) -> Self {
|
|
Self { config }
|
|
}
|
|
|
|
/// Split data into chunks
|
|
pub fn chunk(&self, data: &[u8]) -> Vec<Chunk> {
|
|
let mut chunks = Vec::new();
|
|
let mut offset = 0u64;
|
|
let mut index = 0u32;
|
|
|
|
for chunk_data in data.chunks(self.config.chunk_size) {
|
|
chunks.push(Chunk::new(
|
|
index,
|
|
chunk_data.to_vec(),
|
|
offset,
|
|
));
|
|
offset += chunk_data.len() as u64;
|
|
index += 1;
|
|
}
|
|
|
|
chunks
|
|
}
|
|
|
|
/// Reassemble chunks into original data
|
|
pub fn reassemble(&self, chunks: &[Chunk]) -> Result<Vec<u8>, ReassembleError> {
|
|
if chunks.is_empty() {
|
|
return Ok(Vec::new());
|
|
}
|
|
|
|
// Sort by index
|
|
let mut sorted: Vec<_> = chunks.iter().collect();
|
|
sorted.sort_by_key(|c| c.index);
|
|
|
|
// Verify indices are contiguous
|
|
for (i, chunk) in sorted.iter().enumerate() {
|
|
if chunk.index != i as u32 {
|
|
return Err(ReassembleError::MissingChunk(i as u32));
|
|
}
|
|
}
|
|
|
|
// Verify each chunk
|
|
for chunk in &sorted {
|
|
if !chunk.verify() {
|
|
return Err(ReassembleError::InvalidChunk(chunk.index));
|
|
}
|
|
}
|
|
|
|
// Combine data
|
|
let total_size: usize = sorted.iter().map(|c| c.data.len()).sum();
|
|
let mut result = Vec::with_capacity(total_size);
|
|
|
|
for chunk in sorted {
|
|
result.extend_from_slice(&chunk.data);
|
|
}
|
|
|
|
Ok(result)
|
|
}
|
|
|
|
/// Get the number of chunks for a given file size
|
|
pub fn chunk_count(&self, file_size: u64) -> u32 {
|
|
let size = file_size as usize;
|
|
let full_chunks = size / self.config.chunk_size;
|
|
let has_remainder = !size.is_multiple_of(self.config.chunk_size);
|
|
|
|
(full_chunks + if has_remainder { 1 } else { 0 }) as u32
|
|
}
|
|
}
|
|
|
|
impl Default for Chunker {
|
|
fn default() -> Self {
|
|
Self::new()
|
|
}
|
|
}
|
|
|
|
/// Errors during chunk reassembly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ReassembleError {
    /// A chunk is missing from the sequence; carries the first index at
    /// which the contiguous 0-based run breaks.
    MissingChunk(u32),
    /// A chunk failed CID integrity verification; carries its index.
    InvalidChunk(u32),
}
|
|
|
|
// Marker impl: Display + Debug are provided below/above, so the default
// std::error::Error methods suffice.
impl std::error::Error for ReassembleError {}
|
|
|
|
impl std::fmt::Display for ReassembleError {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
match self {
|
|
Self::MissingChunk(i) => write!(f, "Missing chunk at index {}", i),
|
|
Self::InvalidChunk(i) => write!(f, "Chunk {} failed verification", i),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Metadata about a chunked file.
///
/// Records everything needed to fetch and reassemble the chunks without
/// holding the payload itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkedFile {
    /// CID of the complete (unchunked) file.
    pub cid: ContentId,
    /// Total file size in bytes.
    pub size: u64,
    /// Number of chunks.
    pub chunk_count: u32,
    /// Size of each chunk (except possibly the last).
    // NOTE(review): populated from the FIRST chunk's length; for a
    // single-chunk file this is that chunk's (possibly short) length.
    pub chunk_size: usize,
    /// CIDs of each chunk, in index order.
    pub chunk_cids: Vec<ContentId>,
}
|
|
|
|
impl ChunkedFile {
|
|
/// Create metadata from chunks
|
|
pub fn from_chunks(chunks: &[Chunk], original_cid: ContentId) -> Self {
|
|
Self {
|
|
cid: original_cid,
|
|
size: chunks.iter().map(|c| c.data.len() as u64).sum(),
|
|
chunk_count: chunks.len() as u32,
|
|
chunk_size: if chunks.is_empty() { 0 } else { chunks[0].data.len() },
|
|
chunk_cids: chunks.iter().map(|c| c.cid.clone()).collect(),
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    // A file smaller than the default chunk size yields exactly one
    // chunk containing the whole payload.
    #[test]
    fn test_chunk_small_file() {
        let chunker = Chunker::new();
        let data = b"Small file that fits in one chunk";

        let chunks = chunker.chunk(data);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].data, data);
        assert!(chunks[0].verify());
    }

    // A tiny chunk size forces a multi-chunk split; every chunk must
    // pass its own integrity check.
    #[test]
    fn test_chunk_large_file() {
        let config = ChunkerConfig { chunk_size: 10 };
        let chunker = Chunker::with_config(config);

        let data = b"This is a longer file that will be split into chunks";
        let chunks = chunker.chunk(data);

        assert!(chunks.len() > 1);

        // Verify all chunks
        for chunk in &chunks {
            assert!(chunk.verify());
        }
    }

    // Round trip: chunk then reassemble must reproduce the input bytes.
    #[test]
    fn test_reassemble() {
        let config = ChunkerConfig { chunk_size: 10 };
        let chunker = Chunker::with_config(config);

        let original = b"This is a test file for chunking and reassembly";
        let chunks = chunker.chunk(original);

        let reassembled = chunker.reassemble(&chunks).unwrap();
        assert_eq!(reassembled, original);
    }

    // Dropping a middle chunk breaks index contiguity and must surface
    // as MissingChunk, not a silent short result.
    #[test]
    fn test_reassemble_missing_chunk() {
        let config = ChunkerConfig { chunk_size: 10 };
        let chunker = Chunker::with_config(config);

        let data = b"Test data for missing chunk test case here";
        let mut chunks = chunker.chunk(data);

        // Remove middle chunk
        chunks.remove(1);

        let result = chunker.reassemble(&chunks);
        assert!(matches!(result, Err(ReassembleError::MissingChunk(_))));
    }

    // chunk_count is a ceiling division: exact multiples round down,
    // any remainder adds one chunk, and zero bytes means zero chunks.
    #[test]
    fn test_chunk_count() {
        let config = ChunkerConfig { chunk_size: 100 };
        let chunker = Chunker::with_config(config);

        assert_eq!(chunker.chunk_count(0), 0);
        assert_eq!(chunker.chunk_count(50), 1);
        assert_eq!(chunker.chunk_count(100), 1);
        assert_eq!(chunker.chunk_count(101), 2);
        assert_eq!(chunker.chunk_count(250), 3);
    }

    // Empty input produces no chunks, and reassembling no chunks
    // returns an empty Vec rather than an error.
    #[test]
    fn test_empty_file() {
        let chunker = Chunker::new();
        let chunks = chunker.chunk(&[]);

        assert!(chunks.is_empty());

        let reassembled = chunker.reassemble(&chunks).unwrap();
        assert!(reassembled.is_empty());
    }
}
|