synor/crates/synor-database/src/index.rs
2026-02-02 05:58:22 +05:30

533 lines
15 KiB
Rust

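//! Index management for efficient queries.
//!
//! Provides ordered (B-tree) and hash-based indexes over JSON field values.
//! The `FullText`, `Vector` and `Compound` index types are accepted, but in this
//! module they are stored in the same ordered map as B-tree indexes.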
use crate::document::DocumentId;
use crate::error::DatabaseError;
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue;
use std::collections::{BTreeMap, HashMap, HashSet};
/// Kind of index that an [`Index`] maintains.
///
/// `Index` keeps `Hash` and `Unique` indexes in its hash map and every other
/// variant in its ordered B-tree map; only `BTree` indexes answer range queries.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum IndexType {
/// B-tree index for range queries.
BTree,
/// Hash index for equality lookups.
Hash,
/// Full-text search index.
// NOTE(review): no full-text specific handling is visible in this file — confirm where it lives.
FullText,
/// Vector index (HNSW).
// NOTE(review): no HNSW structure is visible in this file — confirm where it lives.
Vector,
/// Compound index on multiple fields.
Compound,
/// Unique constraint index.
Unique,
}
/// Index configuration.
///
/// Describes one index: its name, the collection it belongs to, the document
/// fields it covers and how values are stored. Usually built with
/// `IndexConfig::new(..)` followed by the chained setter methods.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct IndexConfig {
/// Index name; `IndexManager` uses it as the lookup key for the index.
pub name: String,
/// Collection name.
pub collection: String,
/// Fields to index. Each listed field that is present in a document
/// contributes its value to the index.
pub fields: Vec<String>,
/// Index type.
pub index_type: IndexType,
/// Whether index enforces uniqueness.
/// When set, `Index::insert` rejects a value that already has a document.
pub unique: bool,
/// Sparse index (skip null values).
pub sparse: bool,
}
impl IndexConfig {
    /// Starts a configuration for the index `name` on `collection`.
    ///
    /// The result has no fields yet, uses the B-tree index type and is neither
    /// unique nor sparse; refine it with the chained setters below.
    pub fn new(name: impl Into<String>, collection: impl Into<String>) -> Self {
        let name = name.into();
        let collection = collection.into();
        Self {
            name,
            collection,
            fields: Vec::new(),
            index_type: IndexType::BTree,
            unique: false,
            sparse: false,
        }
    }

    /// Appends `field` to the list of indexed fields and returns the config.
    pub fn field(self, field: impl Into<String>) -> Self {
        let mut fields = self.fields;
        fields.push(field.into());
        Self { fields, ..self }
    }

    /// Replaces the index type with `t` and returns the config.
    pub fn index_type(self, t: IndexType) -> Self {
        Self {
            index_type: t,
            ..self
        }
    }

    /// Turns on the uniqueness flag and returns the config.
    pub fn unique(self) -> Self {
        Self {
            unique: true,
            ..self
        }
    }

    /// Turns on the sparse flag and returns the config.
    pub fn sparse(self) -> Self {
        Self {
            sparse: true,
            ..self
        }
    }
}
/// An index entry: one key together with the documents that carry it.
// NOTE(review): nothing in the visible part of this file constructs or reads
// `IndexEntry`; `Index` keeps its key -> document-id sets directly in maps.
// Confirm whether this type is still needed.
#[derive(Clone, Debug)]
struct IndexEntry {
/// Indexed value (serialized for comparison).
key: IndexKey,
/// Document IDs with this value.
doc_ids: HashSet<DocumentId>,
}
/// Index key for ordering.
///
/// The ordering is derived, which gives a proper total order:
///
/// * keys of *different* variants are ordered by variant declaration order
///   (`Null < Bool < Int < String < Bytes`), so `Null` sorts before everything;
/// * keys of the *same* variant are ordered by their payload.
///
/// Two keys compare `Equal` only when they are `==`. This matters because the
/// keys are used in a `BTreeMap`: an ordering that reports unrelated keys as
/// equal would make them collide in the map. Do not reorder the variants
/// without considering the effect on range queries.
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
enum IndexKey {
    /// JSON `null`; sorts before every other key.
    Null,
    /// JSON boolean (`false < true`).
    Bool(bool),
    /// Integer key.
    Int(i64),
    /// String key, compared byte-wise as `String` does.
    String(String),
    /// Opaque byte key, compared lexicographically.
    Bytes(Vec<u8>),
}
impl From<&JsonValue> for IndexKey {
    /// Converts a JSON value into the key under which it is indexed.
    ///
    /// * `null`, booleans and strings map to the matching key variant.
    /// * Numbers become `Int`; a number for which `as_i64()` yields `None`
    ///   is stored as `Int(0)`.
    /// * Any other value (arrays, objects) is keyed by its serialized JSON
    ///   bytes, or by an empty byte vector if serialization fails.
    fn from(value: &JsonValue) -> Self {
        match value {
            JsonValue::String(text) => IndexKey::String(text.to_owned()),
            JsonValue::Number(number) => {
                let as_int = match number.as_i64() {
                    Some(int) => int,
                    None => 0,
                };
                IndexKey::Int(as_int)
            }
            JsonValue::Bool(flag) => IndexKey::Bool(*flag),
            JsonValue::Null => IndexKey::Null,
            other => {
                let encoded = serde_json::to_vec(other).unwrap_or_default();
                IndexKey::Bytes(encoded)
            }
        }
    }
}
/// A single index instance.
///
/// Maps index keys to the set of documents carrying that key. Two maps exist,
/// but only one is populated for a given index: `Hash` and `Unique` index types
/// use `hash`, every other type uses `btree`. Each piece of state sits behind
/// its own lock so the index can be used through a shared reference.
pub struct Index {
/// Index configuration.
pub config: IndexConfig,
/// B-tree index data.
/// Ordered by `IndexKey`, which is what `range` iterates over.
btree: RwLock<BTreeMap<IndexKey, HashSet<DocumentId>>>,
/// Hash index data.
hash: RwLock<HashMap<IndexKey, HashSet<DocumentId>>>,
/// Statistics.
stats: RwLock<IndexStats>,
}
/// Index statistics.
///
/// A plain snapshot of counters; `Index::stats` returns a copy of the current
/// values.
#[derive(Clone, Debug, Default)]
pub struct IndexStats {
/// Total entries.
/// Maintained by `Index::insert` and reset to zero by `Index::clear`.
pub entries: u64,
/// Index lookups.
/// Counts `Index::lookup` calls and `Index::range` calls on B-tree indexes.
pub lookups: u64,
/// Index hits.
/// Counts the lookups that returned at least one document.
pub hits: u64,
}
impl Index {
/// Creates a new index.
pub fn new(config: IndexConfig) -> Self {
Self {
config,
btree: RwLock::new(BTreeMap::new()),
hash: RwLock::new(HashMap::new()),
stats: RwLock::new(IndexStats::default()),
}
}
/// Adds a document to the index.
pub fn insert(&self, doc_id: DocumentId, value: &JsonValue) -> Result<(), DatabaseError> {
let key = IndexKey::from(value);
// Check uniqueness if required
if self.config.unique {
let exists = match self.config.index_type {
IndexType::Hash | IndexType::Unique => self
.hash
.read()
.get(&key)
.map(|s| !s.is_empty())
.unwrap_or(false),
_ => self
.btree
.read()
.get(&key)
.map(|s| !s.is_empty())
.unwrap_or(false),
};
if exists {
return Err(DatabaseError::AlreadyExists(format!(
"Unique constraint violation on index '{}'",
self.config.name
)));
}
}
match self.config.index_type {
IndexType::Hash | IndexType::Unique => {
self.hash
.write()
.entry(key)
.or_insert_with(HashSet::new)
.insert(doc_id);
}
_ => {
self.btree
.write()
.entry(key)
.or_insert_with(HashSet::new)
.insert(doc_id);
}
}
self.stats.write().entries += 1;
Ok(())
}
/// Removes a document from the index.
pub fn remove(&self, doc_id: &DocumentId, value: &JsonValue) {
let key = IndexKey::from(value);
match self.config.index_type {
IndexType::Hash | IndexType::Unique => {
if let Some(set) = self.hash.write().get_mut(&key) {
set.remove(doc_id);
if set.is_empty() {
self.hash.write().remove(&key);
}
}
}
_ => {
if let Some(set) = self.btree.write().get_mut(&key) {
set.remove(doc_id);
if set.is_empty() {
self.btree.write().remove(&key);
}
}
}
}
}
/// Looks up documents by exact value.
pub fn lookup(&self, value: &JsonValue) -> Vec<DocumentId> {
let key = IndexKey::from(value);
self.stats.write().lookups += 1;
let result: Vec<DocumentId> = match self.config.index_type {
IndexType::Hash | IndexType::Unique => self
.hash
.read()
.get(&key)
.map(|s| s.iter().cloned().collect())
.unwrap_or_default(),
_ => self
.btree
.read()
.get(&key)
.map(|s| s.iter().cloned().collect())
.unwrap_or_default(),
};
if !result.is_empty() {
self.stats.write().hits += 1;
}
result
}
/// Range query (only for B-tree indexes).
pub fn range(&self, start: Option<&JsonValue>, end: Option<&JsonValue>) -> Vec<DocumentId> {
if self.config.index_type != IndexType::BTree {
return Vec::new();
}
self.stats.write().lookups += 1;
let btree = self.btree.read();
let start_key = start.map(IndexKey::from);
let end_key = end.map(IndexKey::from);
let mut result = Vec::new();
for (key, doc_ids) in btree.iter() {
let in_range = match (&start_key, &end_key) {
(Some(s), Some(e)) => key >= s && key <= e,
(Some(s), None) => key >= s,
(None, Some(e)) => key <= e,
(None, None) => true,
};
if in_range {
result.extend(doc_ids.iter().cloned());
}
}
if !result.is_empty() {
self.stats.write().hits += 1;
}
result
}
/// Returns index statistics.
pub fn stats(&self) -> IndexStats {
self.stats.read().clone()
}
/// Clears the index.
pub fn clear(&self) {
self.btree.write().clear();
self.hash.write().clear();
self.stats.write().entries = 0;
}
}
/// Manages indexes for a database.
///
/// Owns every index by name and remembers which indexes belong to which
/// collection, so documents can be routed to the right indexes.
pub struct IndexManager {
    /// Indexes by name. Stored behind `Arc` so `get_index` can hand out a
    /// shared handle without keeping the manager's lock held.
    indexes: RwLock<HashMap<String, std::sync::Arc<Index>>>,
    /// Names of the indexes defined on each collection.
    by_collection: RwLock<HashMap<String, Vec<String>>>,
}

impl IndexManager {
    /// Creates a new index manager with no indexes.
    pub fn new() -> Self {
        Self {
            indexes: RwLock::new(HashMap::new()),
            by_collection: RwLock::new(HashMap::new()),
        }
    }

    /// Returns `false` for a value that the index is configured to skip:
    /// a sparse index does not store JSON `null`.
    fn should_index(index: &Index, value: &JsonValue) -> bool {
        !(index.config.sparse && value.is_null())
    }

    /// Creates a new index from `config` and registers it for its collection.
    ///
    /// # Errors
    ///
    /// Returns `DatabaseError::AlreadyExists` (carrying the index name) when an
    /// index with the same name is already registered.
    pub fn create_index(&self, config: IndexConfig) -> Result<(), DatabaseError> {
        let mut indexes = self.indexes.write();
        if indexes.contains_key(&config.name) {
            return Err(DatabaseError::AlreadyExists(config.name));
        }

        let name = config.name.clone();
        let collection = config.collection.clone();
        indexes.insert(name.clone(), std::sync::Arc::new(Index::new(config)));

        // Same lock order as `drop_index`: `indexes` first, then `by_collection`.
        self.by_collection
            .write()
            .entry(collection)
            .or_default()
            .push(name);
        Ok(())
    }

    /// Drops an index and removes it from its collection's index list.
    ///
    /// Handles previously obtained through `get_index` stay usable; they just
    /// no longer receive updates from this manager.
    ///
    /// # Errors
    ///
    /// Returns `DatabaseError::IndexNotFound` when no index has this name.
    pub fn drop_index(&self, name: &str) -> Result<(), DatabaseError> {
        let mut indexes = self.indexes.write();
        let index = indexes
            .remove(name)
            .ok_or_else(|| DatabaseError::IndexNotFound(name.to_string()))?;

        // Remove from collection mapping, dropping the list once it is empty.
        let collection = &index.config.collection;
        let mut by_collection = self.by_collection.write();
        let now_empty = match by_collection.get_mut(collection) {
            Some(names) => {
                names.retain(|n| n != name);
                names.is_empty()
            }
            None => false,
        };
        if now_empty {
            by_collection.remove(collection);
        }
        Ok(())
    }

    /// Gets an index by name.
    ///
    /// Returns a shared handle to the index, or `None` when no index with
    /// this name is registered.
    pub fn get_index(&self, name: &str) -> Option<std::sync::Arc<Index>> {
        self.indexes.read().get(name).cloned()
    }

    /// Gets the names of the indexes defined on `collection`
    /// (empty when the collection has none).
    pub fn get_collection_indexes(&self, collection: &str) -> Vec<String> {
        self.by_collection
            .read()
            .get(collection)
            .cloned()
            .unwrap_or_default()
    }

    /// Indexes a document in every index of `collection`.
    ///
    /// For each index, every configured field that is present in `document`
    /// contributes its value. Missing fields are skipped, and sparse indexes
    /// additionally skip `null` values.
    ///
    /// # Errors
    ///
    /// Propagates the first error from `Index::insert` (for example a unique
    /// constraint violation). Values inserted before the failure are not
    /// rolled back.
    pub fn index_document(
        &self,
        collection: &str,
        doc_id: DocumentId,
        document: &JsonValue,
    ) -> Result<(), DatabaseError> {
        let index_names = self.get_collection_indexes(collection);
        let indexes = self.indexes.read();
        for name in index_names {
            if let Some(index) = indexes.get(&name) {
                for field in &index.config.fields {
                    if let Some(value) = document.get(field) {
                        if Self::should_index(index, value) {
                            index.insert(doc_id.clone(), value)?;
                        }
                    }
                }
            }
        }
        Ok(())
    }

    /// Removes a document from the indexes of `collection`.
    ///
    /// `document` must be the version that was indexed: the keys to remove are
    /// derived from its field values, using the same skipping rules as
    /// `index_document`.
    pub fn unindex_document(&self, collection: &str, doc_id: &DocumentId, document: &JsonValue) {
        let index_names = self.get_collection_indexes(collection);
        let indexes = self.indexes.read();
        for name in index_names {
            if let Some(index) = indexes.get(&name) {
                for field in &index.config.fields {
                    if let Some(value) = document.get(field) {
                        if Self::should_index(index, value) {
                            index.remove(doc_id, value);
                        }
                    }
                }
            }
        }
    }

    /// Lists the configuration of every registered index, in unspecified order.
    pub fn list_indexes(&self) -> Vec<IndexConfig> {
        self.indexes
            .read()
            .values()
            .map(|index| index.config.clone())
            .collect()
    }
}
impl Default for IndexManager {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// A B-tree index answers exact lookups and inclusive range queries.
    #[test]
    fn test_btree_index() {
        let index = Index::new(
            IndexConfig::new("age_idx", "users")
                .field("age")
                .index_type(IndexType::BTree),
        );

        let young = DocumentId::new();
        let middle = DocumentId::new();
        let old = DocumentId::new();
        for (id, age) in vec![(&young, 25), (&middle, 30), (&old, 35)] {
            index.insert(id.clone(), &json!(age)).unwrap();
        }

        // Exact lookup
        let exact = index.lookup(&json!(30));
        assert_eq!(exact.len(), 1);
        assert_eq!(exact[0], middle);

        // Range query
        let lower = json!(28);
        let upper = json!(36);
        let in_range = index.range(Some(&lower), Some(&upper));
        assert_eq!(in_range.len(), 2);
    }

    /// A hash index finds an inserted value and nothing for an unknown one.
    #[test]
    fn test_hash_index() {
        let index = Index::new(
            IndexConfig::new("email_idx", "users")
                .field("email")
                .index_type(IndexType::Hash),
        );

        let alice = json!("alice@example.com");
        index.insert(DocumentId::new(), &alice).unwrap();

        assert_eq!(index.lookup(&alice).len(), 1);
        assert!(index.lookup(&json!("bob@example.com")).is_empty());
    }

    /// A unique index rejects a second document with the same value.
    #[test]
    fn test_unique_index() {
        let index = Index::new(
            IndexConfig::new("email_unique", "users")
                .field("email")
                .index_type(IndexType::Unique)
                .unique(),
        );

        let email = json!("alice@example.com");
        index.insert(DocumentId::new(), &email).unwrap();

        // Should fail - duplicate
        let duplicate = index.insert(DocumentId::new(), &email);
        assert!(duplicate.is_err());
    }

    /// The manager registers an index and routes a document to it.
    #[test]
    fn test_index_manager() {
        let manager = IndexManager::new();
        manager
            .create_index(IndexConfig::new("age_idx", "users").field("age"))
            .unwrap();

        let doc_id = DocumentId::new();
        let document = json!({"name": "Alice", "age": 30});
        manager
            .index_document("users", doc_id.clone(), &document)
            .unwrap();

        assert_eq!(manager.list_indexes().len(), 1);
    }
}