From 4dc85d725e275854c0d43aa9b9b942c256b9fc4a Mon Sep 17 00:00:00 2001
From: Damian Ramirez
Date: Mon, 18 Aug 2025 20:32:02 -0300
Subject: [PATCH 01/27] feat(db): implement incremental storage of MPT

---
 src/db.rs            | 517 ++++++++++++++++++++++++++++++-------------
 src/file_manager.rs  |  15 +-
 src/serialization.rs | 194 +++++++++++++++-
 src/trie/error.rs    |   2 +
 4 files changed, 550 insertions(+), 178 deletions(-)

diff --git a/src/db.rs b/src/db.rs
index 34820a5..ef80b0a 100644
--- a/src/db.rs
+++ b/src/db.rs
@@ -1,58 +1,67 @@
 //! EthrexDB - A simple MPT database
 use crate::file_manager::FileManager;
-use crate::serialization::{Deserializer, serialize};
+use crate::serialization::{Deserializer, Serializer};
 use crate::trie::{Node, NodeHash, TrieError};
+use std::collections::HashMap;
 use std::path::PathBuf;
 
 /// Ethrex DB struct
 pub struct EthrexDB {
     /// File manager
     file_manager: FileManager,
+    /// Index mapping node hashes to their file offsets
+    node_index: HashMap<NodeHash, u64>,
 }
 
 impl EthrexDB {
     /// Create a new database
     pub fn new(file_path: PathBuf) -> Result<Self, TrieError> {
         let file_manager = FileManager::create(file_path)?;
-        Ok(Self { file_manager })
+        Ok(Self {
+            file_manager,
+            node_index: HashMap::new(),
+        })
     }
 
     /// Open an existing database
     pub fn open(file_path: PathBuf) -> Result<Self, TrieError> {
         let file_manager = FileManager::open(file_path)?;
-        Ok(Self { file_manager })
+        // TODO: Load node_index from file
+        Ok(Self {
+            file_manager,
+            node_index: HashMap::new(),
+        })
     }
 
     /// Commit a new trie to the database
     ///
-    /// Creates a new version in the database by:
-    /// 1. Reading the current latest version offset from header
-    /// 2. Serializing the trie nodes
-    /// 3. Writing [previous_offset][serialized_nodes] at the end of file
-    /// 4. Updating the header to point to this new version
-    ///
-    /// NOTE: Right now, we are storing the complete trie in the database. We should
-    /// store only the root node and the updated nodes.
+    /// Uses Copy-on-Write to only store new/modified nodes:
+    /// 1. Serializes only new nodes (NodeRef::Node)
+    /// 2. Reuses existing nodes (NodeRef::Hash) by their offset
+    /// 3. Updates the node index with new mappings
+    /// 4. Updates the header to point to the new root
     pub fn commit(&mut self, root_node: &Node) -> Result<NodeHash, TrieError> {
         let root_hash = root_node.compute_hash();
 
-        // Read the previous root offset from header
-        let previous_root_offset = self.file_manager.read_latest_root_offset()?;
+        // Get the current file size (where new data will be written)
+        let base_offset = self.file_manager.get_file_size()?;
 
-        let serialized_trie = serialize(root_node);
+        // Serialize the trie incrementally with the base offset
+        let serializer = Serializer::new_incremental(&self.node_index, base_offset);
+        let (serialized_data, new_offsets, root_offset) =
+            serializer.serialize_tree_incremental(root_node)?;
 
-        // Prepare version data: [prev_offset(8 bytes)] + [trie_data]
-        let mut data_to_write = Vec::with_capacity(8 + serialized_trie.len());
-        data_to_write.extend_from_slice(&previous_root_offset.to_le_bytes());
-        data_to_write.extend_from_slice(&serialized_trie);
+        // Write new nodes at the end of file
+        self.file_manager.write_at_end(&serialized_data)?;
 
-        // Write at the end and get the offset where this version starts
-        let new_root_offset = self.file_manager.write_at_end(&data_to_write)?;
+        // Update node index with new node offsets (they are already absolute)
+        for (hash, absolute_offset) in new_offsets {
+            self.node_index.insert(hash, absolute_offset);
+        }
 
-        // Update header to point to this new version
-        self.file_manager
-            .update_latest_root_offset(new_root_offset)?;
+        // Update header to point to the root node
+        self.file_manager.update_latest_root_offset(root_offset)?;
 
         Ok(root_hash)
     }
@@ -60,91 +69,25 @@ impl EthrexDB {
     /// Get the latest root node of the database
     pub fn root(&self) -> Result<Node, TrieError> {
         let latest_offset = self.file_manager.read_latest_root_offset()?;
-        let trie_data = self.get_trie_data_at_version(latest_offset)?;
-        let root_node = Deserializer::new(trie_data).decode_tree()?;
+        if latest_offset == 0 {
+            return Err(TrieError::Other("No root node in database".to_string()));
+        }
+
+        // Get the entire file as a buffer for the deserializer
+        let file_data = self.file_manager.get_slice_to_end(0)?;
+        let root_node = Deserializer::new(file_data).decode_node_at(latest_offset as usize)?;
         Ok(root_node)
     }
 
     /// Get the value of the node with the given key
     pub fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>, TrieError> {
         let latest_offset = self.file_manager.read_latest_root_offset()?;
-        self.get_at_version(key, latest_offset)
-    }
-
-    /// Get the value of the node with the given key at a specific version
-    pub fn get_at_version(
-        &self,
-        key: &[u8],
-        version_offset: u64,
-    ) -> Result<Option<Vec<u8>>, TrieError> {
-        if version_offset == 0 {
+        if latest_offset == 0 {
             return Ok(None);
         }
 
-        let trie_data = self.get_trie_data_at_version(version_offset)?;
-
-        Deserializer::new(trie_data).get_by_path(key)
-    }
-
-    /// Get all the roots of the database
-    /// TODO: Make this an iterator
-    pub fn iter_roots(&self) -> Result<Vec<Node>, TrieError> {
-        let mut roots = Vec::new();
-        let mut current_offset = self.file_manager.read_latest_root_offset()?;
-
-        while current_offset != 0 {
-            let trie_data = self.get_trie_data_at_version(current_offset)?;
-
-            // Deserialize the trie at this version
-            let root_node = Deserializer::new(trie_data).decode_tree()?;
-            roots.push(root_node);
-            current_offset = self.read_previous_offset_at_version(current_offset)?;
-        }
-
-        Ok(roots)
-    }
-
-    /// Get trie data slice at a specific version
-    ///
-    /// Each version has format: [prev_offset: 8 bytes][trie_data]
-    /// This function skips the prev_offset and returns only the trie_data portion
-    fn
get_trie_data_at_version(&self, version_offset: u64) -> Result<&[u8], TrieError> { - // Skip the previous offset (8 bytes) to get to the trie data - let trie_data_start = version_offset + 8; - let next_version_offset = self.find_next_version_offset(version_offset)?; - - match next_version_offset { - Some(next_offset) => { - let size = (next_offset - trie_data_start) as usize; - self.file_manager.get_slice_at(trie_data_start, size) - } - None => { - // Last version, read until the end - self.file_manager.get_slice_to_end(trie_data_start) - } - } - } - - /// Read the previous offset at a specific version - fn read_previous_offset_at_version(&self, version_offset: u64) -> Result { - let prev_offset_slice = self.file_manager.get_slice_at(version_offset, 8)?; - Ok(u64::from_le_bytes(prev_offset_slice.try_into().unwrap())) - } - - /// Find the offset of the next version after the given offset - fn find_next_version_offset(&self, current_offset: u64) -> Result, TrieError> { - let mut offset = self.file_manager.read_latest_root_offset()?; - let mut next_offset = None; - - // Walk the linked list to find the smallest offset greater than current_offset - while offset != 0 { - if offset > current_offset && (next_offset.is_none() || offset < next_offset.unwrap()) { - next_offset = Some(offset); - } - offset = self.read_previous_offset_at_version(offset)?; - } - - Ok(next_offset) + let file_data = self.file_manager.get_slice_to_end(0)?; + Deserializer::new(file_data).get_by_path_at(key, latest_offset as usize) } } @@ -218,83 +161,82 @@ mod tests { } #[test] - fn test_multi_version_trie() { - let temp_dir = TempDir::new("ethrex_db_test").unwrap(); - let db_path = temp_dir.path().join("test.edb"); + fn test_simple_serialization_debug() { + let temp_dir = TempDir::new("ethrex_db_simple_test").unwrap(); + let db_path = temp_dir.path().join("simple.edb"); let mut db = EthrexDB::new(db_path.clone()).unwrap(); - let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - trie.insert(b"key1".to_vec(), b"value1".to_vec()).unwrap(); - trie.insert(b"common".to_vec(), b"v1".to_vec()).unwrap(); + + // Simple test with just one key + trie.insert(b"key".to_vec(), b"value".to_vec()).unwrap(); let root_node = trie.root_node().unwrap().unwrap(); + + // Commit to DB db.commit(&root_node).unwrap(); - // Note: We can't call trie.commit() because it converts NodeRef::Node to NodeRef::Hash - // and our serialization doesn't support hash references yet - // trie.commit().unwrap(); + // Read back from DB + let _recovered_root = db.root().unwrap(); - assert_eq!(db.root().unwrap(), root_node); + // Test that we can read the value + let value = db.get(b"key").unwrap(); + assert_eq!(value, Some(b"value".to_vec())); + } + + #[test] + fn test_incremental_commit() { + let temp_dir = TempDir::new("ethrex_db_test").unwrap(); + let db_path = temp_dir.path().join("test.edb"); - // let mut trie2 = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + let mut db = EthrexDB::new(db_path.clone()).unwrap(); + let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + + // First commit: Add initial keys + trie.insert(b"key1".to_vec(), b"value1".to_vec()).unwrap(); trie.insert(b"key2".to_vec(), b"value2".to_vec()).unwrap(); - trie.insert(b"common".to_vec(), b"v2".to_vec()).unwrap(); let root_node = trie.root_node().unwrap().unwrap(); + let initial_file_size = db.file_manager.get_file_size().unwrap(); db.commit(&root_node).unwrap(); + trie.commit().unwrap(); // Convert NodeRef::Node to NodeRef::Hash + let recovered_root = 
db.root().unwrap(); + assert_eq!(recovered_root, root_node); - // Note: We can't call trie.commit() because it converts NodeRef::Node to NodeRef::Hash - // and our serialization doesn't support hash references yet - // trie.commit().unwrap(); - - assert_eq!(db.root().unwrap(), root_node); + let size_after_first = db.file_manager.get_file_size().unwrap(); + assert!(size_after_first > initial_file_size); + // Second commit: Add one more key (should only store new nodes) trie.insert(b"key3".to_vec(), b"value3".to_vec()).unwrap(); - trie.insert(b"common".to_vec(), b"v3".to_vec()).unwrap(); let root_node = trie.root_node().unwrap().unwrap(); db.commit(&root_node).unwrap(); - - // Note: We can't call trie.commit() because it converts NodeRef::Node to NodeRef::Hash - // and our serialization doesn't support hash references yet - // trie.commit().unwrap(); - assert_eq!(db.root().unwrap(), root_node); - - assert_eq!(db.get(b"key3").unwrap(), Some(b"value3".to_vec())); - assert_eq!(db.get(b"common").unwrap(), Some(b"v3".to_vec())); + trie.commit().unwrap(); + + let size_after_second = db.file_manager.get_file_size().unwrap(); + // Should be smaller increment than first commit + let first_increment = size_after_first - initial_file_size; + let second_increment = size_after_second - size_after_first; + assert!( + second_increment < first_increment, + "Second commit should add less data due to CoW" + ); + + // Verify all values are still accessible assert_eq!(db.get(b"key1").unwrap(), Some(b"value1".to_vec())); assert_eq!(db.get(b"key2").unwrap(), Some(b"value2".to_vec())); + assert_eq!(db.get(b"key3").unwrap(), Some(b"value3".to_vec())); - assert_eq!(db.get(b"nonexistent").unwrap(), None); - assert_eq!(db.iter_roots().unwrap().len(), 3); - } - - #[test] - fn test_iter_roots() { - let temp_dir = TempDir::new("ethrex_db_test").unwrap(); - let db_path = temp_dir.path().join("test.edb"); - let mut db = EthrexDB::new(db_path.clone()).unwrap(); - - // Empty DB should have no roots - let roots: Vec = db.iter_roots().unwrap(); - assert_eq!(roots.len(), 0); - - for i in 1..=3 { - let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - trie.insert( - format!("key{}", i).into_bytes(), - format!("value{}", i).into_bytes(), - ) + // Third commit: Update existing key (should reuse many nodes) + trie.insert(b"key2".to_vec(), b"value2_updated".to_vec()) .unwrap(); - let root_node = trie.root_node().unwrap().unwrap(); - db.commit(&root_node).unwrap(); - } - - let roots: Vec = db.iter_roots().unwrap(); - assert_eq!(roots.len(), 3); + let root_node = trie.root_node().unwrap().unwrap(); + db.commit(&root_node).unwrap(); + trie.commit().unwrap(); + assert_eq!(db.root().unwrap(), root_node); - for root in &roots { - root.compute_hash(); - } + // Verify updated value + assert_eq!(db.get(b"key2").unwrap(), Some(b"value2_updated".to_vec())); + assert_eq!(db.get(b"key1").unwrap(), Some(b"value1".to_vec())); + assert_eq!(db.get(b"key3").unwrap(), Some(b"value3".to_vec())); } #[test] @@ -367,12 +309,271 @@ mod tests { let result = db.get(key).unwrap(); assert_eq!(result, Some(expected_value.clone())); } + } + + // Helper function to generate test data (like the benchmark) + fn generate_test_data(n: usize) -> Vec<(Vec, Vec)> { + use sha3::{Digest, Keccak256}; + + (1..=n) + .map(|i| { + // 32-byte key (hash) + let key = Keccak256::new() + .chain_update(i.to_be_bytes()) + .finalize() + .to_vec(); + + // 104-byte value (account info: 2 hashes + u256 + u64) + let mut value = Vec::with_capacity(104); + 
value.extend_from_slice( + &Keccak256::new() + .chain_update((i * 2).to_be_bytes()) + .finalize(), + ); + value.extend_from_slice( + &Keccak256::new() + .chain_update((i * 3).to_be_bytes()) + .finalize(), + ); + value.extend_from_slice(&[0u8; 24]); // u256 padding + value.extend_from_slice(&(i as u64).to_be_bytes()); // u256 value + value.extend_from_slice(&(i as u64).to_be_bytes()); // u64 + + (key, value) + }) + .collect() + } + + #[test] + fn test_blockchain_simulation_with_incremental_storage() { + let temp_dir = TempDir::new("ethrex_blockchain_sim").unwrap(); + let db_path = temp_dir.path().join("blockchain.edb"); + + let mut db = EthrexDB::new(db_path.clone()).unwrap(); + let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + + // Keep track of a persistent value from the first batch + let persistent_key = generate_test_data(1)[0].0.clone(); + let persistent_value = generate_test_data(1)[0].1.clone(); + + // Batch 1: Initial accounts (simulating genesis) + let batch1_data = generate_test_data(100); + + for (key, value) in batch1_data.iter() { + trie.insert(key.clone(), value.clone()).unwrap(); + } + + let root_node1 = trie.root_node().unwrap().unwrap(); + let trie_root_hash1 = root_node1.compute_hash(); + let db_root_hash1 = db.commit(&root_node1).unwrap(); + trie.commit().unwrap(); // Convert to NodeRef::Hash for CoW + + assert_eq!( + trie_root_hash1, db_root_hash1, + "Root hashes must match after batch 1" + ); + assert_eq!( + db.root().unwrap(), + root_node1, + "DB root must match trie root after batch 1" + ); + + // Verify persistent value exists + assert_eq!( + db.get(&persistent_key).unwrap(), + Some(persistent_value.clone()) + ); + let size_after_batch1 = db.file_manager.get_file_size().unwrap(); + + // Batch 2: New transactions + modify some existing accounts + let new_accounts_batch2 = generate_test_data(150); + + // Add 50 new accounts (indices 100-149) + for (key, value) in new_accounts_batch2[100..].iter() { + trie.insert(key.clone(), value.clone()).unwrap(); + } + + // Modify some existing accounts from batch 1 + for i in [10, 25, 50, 75].iter() { + if *i < batch1_data.len() { + let (key, _) = &batch1_data[*i]; + let new_value = format!("modified_account_{}", i).into_bytes(); + trie.insert(key.clone(), new_value).unwrap(); + } + } + + let root_node2 = trie.root_node().unwrap().unwrap(); + let trie_root_hash2 = root_node2.compute_hash(); + let db_root_hash2 = db.commit(&root_node2).unwrap(); + trie.commit().unwrap(); + + assert_eq!( + trie_root_hash2, db_root_hash2, + "Root hashes must match after batch 2" + ); + assert_eq!( + db.root().unwrap(), + root_node2, + "DB root must match trie root after batch 2" + ); + assert_eq!( + db.get(&persistent_key).unwrap(), + Some(persistent_value.clone()) + ); + + let size_after_batch2 = db.file_manager.get_file_size().unwrap(); + let batch2_increment = size_after_batch2 - size_after_batch1; + + // Batch 3: More transactions + let new_accounts_batch3 = generate_test_data(200); + + // Add 50 more new accounts (indices 150-199) + for (key, value) in &new_accounts_batch3[150..] 
{ + trie.insert(key.clone(), value.clone()).unwrap(); + } - let roots: Vec = db.iter_roots().unwrap(); - assert_eq!(roots.len(), 3); + // Modify more existing accounts + for i in [5, 15, 35, 45, 110, 125].iter() { + if *i < 150 { + let test_data = generate_test_data(*i + 1); + let (key, _) = &test_data[*i]; + let new_value = format!("batch3_modified_{}", i).into_bytes(); + trie.insert(key.clone(), new_value).unwrap(); + } + } + + let root_node3 = trie.root_node().unwrap().unwrap(); + let trie_root_hash3 = root_node3.compute_hash(); + let db_root_hash3 = db.commit(&root_node3).unwrap(); + trie.commit().unwrap(); + + assert_eq!( + trie_root_hash3, db_root_hash3, + "Root hashes must match after batch 3" + ); + assert_eq!( + db.root().unwrap(), + root_node3, + "DB root must match trie root after batch 3" + ); + assert_eq!( + db.get(&persistent_key).unwrap(), + Some(persistent_value.clone()) + ); + + let size_after_batch3 = db.file_manager.get_file_size().unwrap(); + let _batch3_increment = size_after_batch3 - size_after_batch2; + + // Batch 4: Large update batch + let new_accounts_batch4 = generate_test_data(250); + + // Add 50 more new accounts (indices 200-249) + for (key, value) in &new_accounts_batch4[200..] { + trie.insert(key.clone(), value.clone()).unwrap(); + } + + // Modify many existing accounts + for i in [1, 20, 30, 40, 60, 80, 90, 105, 115, 135, 145, 170, 180].iter() { + if *i < 200 { + let test_data = generate_test_data(*i + 1); + let (key, _) = &test_data[*i]; + let new_value = format!("batch4_update_{}", i).into_bytes(); + trie.insert(key.clone(), new_value).unwrap(); + } + } - for root in roots.iter() { - root.compute_hash(); + let root_node4 = trie.root_node().unwrap().unwrap(); + let trie_root_hash4 = root_node4.compute_hash(); + let db_root_hash4 = db.commit(&root_node4).unwrap(); + trie.commit().unwrap(); + + assert_eq!( + trie_root_hash4, db_root_hash4, + "Root hashes must match after batch 4" + ); + assert_eq!( + db.root().unwrap(), + root_node4, + "DB root must match trie root after batch 4" + ); + assert_eq!( + db.get(&persistent_key).unwrap(), + Some(persistent_value.clone()) + ); + + let size_after_batch4 = db.file_manager.get_file_size().unwrap(); + let _batch4_increment = size_after_batch4 - size_after_batch3; + + // Batch 5: Final verification batch + let new_accounts_batch5 = generate_test_data(300); + + // Add 50 final accounts (indices 250-299) + for (key, value) in &new_accounts_batch5[250..] 
{ + trie.insert(key.clone(), value.clone()).unwrap(); + } + + // Few more modifications + for i in [8, 28, 58, 88, 128, 158, 188, 218].iter() { + if *i < 250 { + let test_data = generate_test_data(*i + 1); + let (key, _) = &test_data[*i]; + let new_value = format!("final_update_{}", i).into_bytes(); + trie.insert(key.clone(), new_value).unwrap(); + } + } + + let root_node5 = trie.root_node().unwrap().unwrap(); + let trie_root_hash5 = root_node5.compute_hash(); + let db_root_hash5 = db.commit(&root_node5).unwrap(); + trie.commit().unwrap(); + + assert_eq!( + trie_root_hash5, db_root_hash5, + "Root hashes must match after batch 5" + ); + assert_eq!( + db.root().unwrap(), + root_node5, + "DB root must match trie root after batch 5" + ); + + // Final verification: The persistent value from batch 1 should still be accessible + assert_eq!( + db.get(&persistent_key).unwrap(), + Some(persistent_value.clone()), + "Persistent value from genesis batch must still be accessible after all updates" + ); + + // Verify file size increased over time (CoW should minimize growth) + let final_file_size = db.file_manager.get_file_size().unwrap(); + let _batch5_increment = final_file_size - size_after_batch4; + + let average_account_size = (size_after_batch1 - 8) / 100; // Remove header + + // If we were writing the full trie each time, batch 2 would be ~150 accounts worth + let expected_if_full_rewrite = average_account_size * 150; + + // Verify CoW is working - batch 2 should be significantly smaller than full rewrite + // Use 70% as threshold (accounting for trie structure overhead) + let cow_threshold = (expected_if_full_rewrite as f64 * 0.7) as u64; + assert!( + batch2_increment < cow_threshold, + "CoW not working effectively: {} >= {} (70% of full rewrite)", + batch2_increment, + cow_threshold + ); + + // Random verification of some accounts + for batch_num in 1..=5 { + let test_data = generate_test_data(batch_num * 50); + if let Some((key, _)) = test_data.get(batch_num * 10) { + let db_value = db.get(key).unwrap(); + assert!( + db_value.is_some(), + "Account from batch {} should be accessible", + batch_num + ); + } } } } diff --git a/src/file_manager.rs b/src/file_manager.rs index 7c86510..a6652a1 100644 --- a/src/file_manager.rs +++ b/src/file_manager.rs @@ -98,6 +98,11 @@ impl FileManager { Ok(offset) } + /// Get the current file size + pub fn get_file_size(&self) -> Result { + Ok(self.mmap.len() as u64) + } + /// Get slice from a specific offset to the end of the file pub fn get_slice_to_end(&self, offset: u64) -> Result<&[u8], TrieError> { if offset as usize >= self.mmap.len() { @@ -106,16 +111,6 @@ impl FileManager { Ok(&self.mmap[offset as usize..]) } - - /// Get slice of exactly n bytes from a specific offset - pub fn get_slice_at(&self, offset: u64, size: usize) -> Result<&[u8], TrieError> { - let start = offset as usize; - let end = start + size; - - assert!(end <= self.mmap.len(), "Offset out of bounds"); - - Ok(&self.mmap[start..end]) - } } #[cfg(test)] diff --git a/src/serialization.rs b/src/serialization.rs index 9fe39c1..d2ded40 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -10,15 +10,21 @@ //! - Extension -> Extend with child but no value (value_offset = 0) //! 
- Branch -> Branch (unchanged)
 
+use std::collections::HashMap;
 use std::sync::{Arc, OnceLock};
 
-use crate::trie::{BranchNode, ExtensionNode, LeafNode, Nibbles, Node, NodeRef, TrieError};
+use crate::trie::{
+    BranchNode, ExtensionNode, LeafNode, Nibbles, Node, NodeHash, NodeRef, TrieError,
+};
 
 /// Tag for Branch node (16 children + 1 value)
 const TAG_BRANCH: u8 = 0;
 /// Tag for Extend node (combines Extension and Leaf)
 const TAG_EXTEND: u8 = 1;
 
+/// Type alias for incremental serialization result
+type IncrementalResult = Result<(Vec<u8>, HashMap<NodeHash, u64>, u64), TrieError>;
+
 /// Serializes a Merkle Patricia Trie into a byte buffer using the two node format
 ///
 /// - Branch: 16 node offsets + 1 value offset
@@ -27,20 +33,88 @@ const TAG_EXTEND: u8 = 1;
 pub struct Serializer {
     /// Buffer where serialized data is accumulated
     buffer: Vec<u8>,
+    /// Index of existing nodes (hash -> file offset)
+    node_index: HashMap<NodeHash, u64>,
+    /// New nodes added during this serialization (hash -> absolute offset)
+    new_nodes: HashMap<NodeHash, u64>,
+    /// Base offset where new data will be written in file
+    base_offset: u64,
 }
 
 impl Serializer {
+    #[allow(dead_code)]
     pub fn new() -> Self {
         Self::default()
     }
 
+    /// Create a new incremental serializer with existing node index
+    pub fn new_incremental(node_index: &HashMap<NodeHash, u64>, base_offset: u64) -> Self {
+        Self {
+            buffer: Vec::new(),
+            node_index: node_index.clone(),
+            new_nodes: HashMap::new(),
+            base_offset,
+        }
+    }
+
     /// Serializes a trie using the two node format
+    #[allow(dead_code)]
     pub fn serialize_tree(mut self, root: &Node) -> Result<Vec<u8>, TrieError> {
         self.serialize_node(root)?;
         Ok(self.buffer)
     }
 
+    /// Serializes a trie incrementally, only storing new nodes
+    /// Returns the serialized data, a map of new node hashes to their offsets, and the root offset
+    pub fn serialize_tree_incremental(mut self, root: &Node) -> IncrementalResult {
+        let root_offset = self.serialize_node_or_ref(root)?;
+        Ok((self.buffer, self.new_nodes, root_offset))
+    }
+
+    /// Serializes a node or returns existing offset if already stored
+    fn serialize_node_or_ref(&mut self, node: &Node) -> Result<u64, TrieError> {
+        let hash = node.compute_hash();
+
+        // Check if node already exists in the database
+        if let Some(&existing_offset) = self.node_index.get(&hash) {
+            return Ok(existing_offset);
+        }
+
+        // Check if we already serialized this node in this batch
+        if let Some(&absolute_offset) = self.new_nodes.get(&hash) {
+            return Ok(absolute_offset);
+        }
+
+        // Node is new, serialize it
+        let buffer_offset = self.buffer.len() as u64;
+        let absolute_offset = self.base_offset + buffer_offset;
+        self.new_nodes.insert(hash, absolute_offset);
+        self.serialize_node_internal(node)?;
+        Ok(absolute_offset)
+    }
+
+    /// Handles NodeRef serialization - returns offset for both Hash and Node variants
+    fn serialize_noderef(&mut self, noderef: &NodeRef) -> Result<u64, TrieError> {
+        match noderef {
+            NodeRef::Hash(hash) if hash.is_valid() => {
+                // Look up the offset for this hash - this is an existing node
+                if let Some(&offset) = self.node_index.get(hash) {
+                    Ok(offset) // Return absolute offset in file
+                } else {
+                    // This shouldn't happen if trie.commit() was called properly
+                    panic!("Hash reference not found in index: {:?}", hash);
+                }
+            }
+            NodeRef::Hash(_) => Ok(0), // Empty/invalid hash
+            NodeRef::Node(node, _) => {
+                // This is a new node, serialize it
+                self.serialize_node_or_ref(node)
+            }
+        }
+    }
+
     /// Serializes a node, converting from 3 node to 2 node system
+    #[allow(dead_code)]
     fn serialize_node(&mut self, node: &Node) -> Result<u64, TrieError> {
         let offset = self.buffer.len() as
u64; @@ -63,8 +137,6 @@ impl Serializer { // Go back and write the actual value offset self.buffer[value_offset_pos + 8..value_offset_pos + 16] .copy_from_slice(&value_offset.to_le_bytes()); - - Ok(offset) } Node::Extension(ext) => { // Extension becomes Extend with only child @@ -93,8 +165,6 @@ impl Serializer { self.buffer[child_offset_pos..child_offset_pos + 8] .copy_from_slice(&child_offset.to_le_bytes()); } - - Ok(offset) } Node::Branch(branch) => { // Branch stays Branch but with offsets @@ -137,8 +207,93 @@ impl Serializer { pos += 8; } self.buffer[pos..pos + 8].copy_from_slice(&value_offset.to_le_bytes()); + } + } + + Ok(offset) + } + + /// Internal node serialization (used by both serialize_node and serialize_node_or_ref) + fn serialize_node_internal(&mut self, node: &Node) -> Result<(), TrieError> { + match node { + Node::Leaf(leaf) => { + // Leaf becomes Extend with only value + self.buffer.push(TAG_EXTEND); + + let compact_nibbles = leaf.partial.encode_compact(); + self.write_bytes_with_len(&compact_nibbles); + + // Reserve space for offsets + let value_offset_pos = self.buffer.len(); + self.buffer.extend_from_slice(&0u64.to_le_bytes()); // node offset = 0 + self.buffer.extend_from_slice(&0u64.to_le_bytes()); // value offset placeholder + + let value_offset = self.base_offset + self.buffer.len() as u64; // Absolute offset + self.write_bytes_with_len(&leaf.value); - Ok(offset) + // Go back and write the actual value offset + self.buffer[value_offset_pos + 8..value_offset_pos + 16] + .copy_from_slice(&value_offset.to_le_bytes()); + + Ok(()) + } + Node::Extension(ext) => { + // Extension becomes Extend with only child + self.buffer.push(TAG_EXTEND); + + let compact_prefix = ext.prefix.encode_compact(); + self.write_bytes_with_len(&compact_prefix); + + // Reserve space for offsets + let child_offset_pos = self.buffer.len(); + self.buffer.extend_from_slice(&0u64.to_le_bytes()); // child offset placeholder + self.buffer.extend_from_slice(&0u64.to_le_bytes()); // value offset = 0 + + let child_offset = self.serialize_noderef(&ext.child)?; + + // Go back and write the actual child offset + if child_offset > 0 { + self.buffer[child_offset_pos..child_offset_pos + 8] + .copy_from_slice(&child_offset.to_le_bytes()); + } + + Ok(()) + } + Node::Branch(branch) => { + // Branch stays Branch but with offsets + self.buffer.push(TAG_BRANCH); + + // Reserve space for all offsets + let offsets_start = self.buffer.len(); + // 16 child offsets + 1 value offset + for _ in 0..17 { + self.buffer.extend_from_slice(&0u64.to_le_bytes()); + } + + // Serialize all children and collect their offsets + let mut child_offsets = [0u64; 16]; + for (i, child) in branch.choices.iter().enumerate() { + child_offsets[i] = self.serialize_noderef(child)?; + } + + // Serialize value if present + let value_offset = if branch.value.is_empty() { + 0u64 + } else { + let offset = self.base_offset + self.buffer.len() as u64; // Absolute offset + self.write_bytes_with_len(&branch.value); + offset + }; + + // Go back and write all the actual offsets + let mut pos = offsets_start; + for &child_offset in &child_offsets { + self.buffer[pos..pos + 8].copy_from_slice(&child_offset.to_le_bytes()); + pos += 8; + } + self.buffer[pos..pos + 8].copy_from_slice(&value_offset.to_le_bytes()); + + Ok(()) } } } @@ -166,6 +321,7 @@ impl<'a> Deserializer<'a> { } /// Deserializes a tree from the two node format back to standard 3 node format + #[allow(dead_code)] pub fn decode_tree(&self) -> Result { let node = self.decode_node_at(0)?; 
node.compute_hash(); @@ -173,7 +329,7 @@ impl<'a> Deserializer<'a> { } /// Decodes a node from the two node format at specific position - fn decode_node_at(&self, pos: usize) -> Result { + pub fn decode_node_at(&self, pos: usize) -> Result { if pos >= self.buffer.len() { panic!("Invalid buffer position"); } @@ -242,6 +398,7 @@ impl<'a> Deserializer<'a> { let mut children: [NodeRef; 16] = Default::default(); for (i, &offset) in child_offsets.iter().enumerate() { if offset > 0 { + // All offsets are absolute in the file let child = self.decode_node_at(offset as usize)?; children[i] = NodeRef::Node(Arc::new(child), OnceLock::new()); } @@ -249,8 +406,12 @@ impl<'a> Deserializer<'a> { // Read value if present let value = if value_offset > 0 { - self.read_value_at_offset(value_offset as usize)? - .unwrap_or_default() + if (value_offset as usize) >= self.buffer.len() { + vec![] + } else { + self.read_value_at_offset(value_offset as usize)? + .unwrap_or_default() + } } else { vec![] }; @@ -264,6 +425,7 @@ impl<'a> Deserializer<'a> { } /// Gets a value by path without copying data + #[allow(dead_code)] pub fn get_by_path(&self, path: &[u8]) -> Result>, TrieError> { if self.buffer.is_empty() { return Ok(None); @@ -273,6 +435,16 @@ impl<'a> Deserializer<'a> { self.get_by_path_inner(nibbles, 0) } + /// Gets a value by path starting at a specific offset + pub fn get_by_path_at(&self, path: &[u8], offset: usize) -> Result>, TrieError> { + if self.buffer.is_empty() { + return Ok(None); + } + + let nibbles = Nibbles::from_raw(path, false); + self.get_by_path_inner(nibbles, offset) + } + /// Internal helper for get_by_path with position tracking fn get_by_path_inner( &self, @@ -404,7 +576,8 @@ impl<'a> Deserializer<'a> { return Ok(None); } - Ok(Some(self.buffer[data_start..data_start + len].to_vec())) + let value = self.buffer[data_start..data_start + len].to_vec(); + Ok(Some(value)) } /// Read a u64 value from buffer at position @@ -429,6 +602,7 @@ impl<'a> Deserializer<'a> { } /// Helper function to serialize a Merkle Patricia Trie node to bytes. 
+#[allow(dead_code)] pub fn serialize(node: &Node) -> Vec { Serializer::new().serialize_tree(node).unwrap() } diff --git a/src/trie/error.rs b/src/trie/error.rs index 24ffca4..4f2d9db 100644 --- a/src/trie/error.rs +++ b/src/trie/error.rs @@ -13,4 +13,6 @@ pub enum TrieError { LockError, #[error("DB Error: {0}")] DbError(String), + #[error("Other Error: {0}")] + Other(String), } From bb12b5f4deb6d550cfdb0e310a691d7b0b600d6d Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Mon, 18 Aug 2025 21:05:31 -0300 Subject: [PATCH 02/27] feat(bench): update benchmarks with new serializer --- benches/db_benchmark.rs | 84 +++++++++++++++++++++++++++-- examples/profiling.rs | 116 ++++++++++++++++++++++++++++++---------- 2 files changed, 168 insertions(+), 32 deletions(-) diff --git a/benches/db_benchmark.rs b/benches/db_benchmark.rs index c3db371..a22806d 100644 --- a/benches/db_benchmark.rs +++ b/benches/db_benchmark.rs @@ -212,10 +212,11 @@ fn insert_benchmark(c: &mut Criterion) { || { let temp_dir = TempDir::new("ethrex_bench").unwrap(); let file_path = temp_dir.path().join("test.edb"); - EthrexDB::new(file_path).unwrap() + let db = EthrexDB::new(file_path).unwrap(); + let trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + (db, trie) }, - |mut db| { - let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + |(mut db, mut trie)| { for (key, value) in data { trie.insert(key.clone(), value.clone()).unwrap(); } @@ -319,5 +320,80 @@ fn random_get_benchmark(c: &mut Criterion) { group.finish(); } -criterion_group!(benches, insert_benchmark, random_get_benchmark); +fn cow_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("cow_updates"); + group.measurement_time(Duration::from_secs(15)); + group.sample_size(10); + + for size in [1_000, 10_000, 50_000] { + let base_data = generate_test_data(size); + let update_data = generate_test_data(size / 2); // 50% new data to add + + // LibmdbxHashDB CoW test + group.bench_with_input( + BenchmarkId::new("libmdbx_hash_cow", size), + &(&base_data, &update_data), + |b, (base, updates)| { + b.iter_with_setup( + || { + let temp_dir = TempDir::new("libmdbx_cow_bench").unwrap(); + let mut db = LibmdbxHashDB::new(temp_dir.path()); + // Pre-populate with base data + db.insert_batch(base); + db + }, + |mut db| { + // Now add the update data (this should reuse existing trie structure) + db.insert_batch(black_box(updates)); + black_box(db) + }, + ); + }, + ); + + // EthrexDB CoW test + group.bench_with_input( + BenchmarkId::new("ethrex_db_cow", size), + &(&base_data, &update_data), + |b, (base, updates)| { + b.iter_with_setup( + || { + let temp_dir = TempDir::new("ethrex_cow_bench").unwrap(); + let file_path = temp_dir.path().join("cow_test.edb"); + let mut db = EthrexDB::new(file_path).unwrap(); + let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + + // Pre-populate with base data + for (key, value) in base.iter() { + trie.insert(key.clone(), value.clone()).unwrap(); + } + let root_node = trie.root_node().unwrap().unwrap(); + db.commit(&root_node).unwrap(); + trie.commit().unwrap(); // Convert to CoW references + + (db, trie) + }, + |(mut db, mut trie)| { + // Now add the update data with CoW + for (key, value) in updates.iter() { + trie.insert(key.clone(), value.clone()).unwrap(); + } + let root_node = trie.root_node().unwrap().unwrap(); + db.commit(&root_node).unwrap(); + db + }, + ); + }, + ); + } + + group.finish(); +} + +criterion_group!( + benches, + insert_benchmark, + random_get_benchmark, + cow_benchmark +); 
criterion_main!(benches); diff --git a/examples/profiling.rs b/examples/profiling.rs index db9de6d..9e2fde2 100644 --- a/examples/profiling.rs +++ b/examples/profiling.rs @@ -1,3 +1,6 @@ +//! Profile of EthrexDB when inserting 100k keys and then 20 batches of 5k keys. +//! Then, it does 500k random gets. + use ethrexdb::{ EthrexDB, trie::{InMemoryTrieDB, Trie}, @@ -6,38 +9,91 @@ use rand::{Rng, thread_rng}; use std::time::Instant; fn main() { - let db_path = std::env::temp_dir().join("profile_gets.edb"); - let mut db = EthrexDB::new(db_path).unwrap(); - - println!("Phase 1: Inserting 1,000,000 keys..."); + let db_path = std::env::temp_dir().join("profile_ethrexdb.db"); + let mut db = EthrexDB::new(db_path.clone()).unwrap(); let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); let mut keys = Vec::new(); - for i in 0..1_000_000 { - let key = format!("benchmark_key_{:08}", i); - let value = format!("value_for_key_{:08}", i); + // Phase 1: Initial population (100k keys) + print!("Initial population (100k keys)... "); + let start_phase1 = Instant::now(); + + for i in 0..100_000 { + let key = format!("initial_key_{:08}", i); + let value = format!("initial_value_{:08}", i); trie.insert(key.as_bytes().to_vec(), value.as_bytes().to_vec()) .unwrap(); keys.push(key); } - // Single commit with all data - let start_insert = Instant::now(); let root_node = trie.root_node().unwrap().unwrap(); + let initial_file_size = std::fs::metadata(&db_path).map(|m| m.len()).unwrap(); db.commit(&root_node).unwrap(); - println!("Insert phase completed in {:?}", start_insert.elapsed()); + let after_initial_size = std::fs::metadata(&db_path).map(|m| m.len()).unwrap(); + trie.commit().unwrap(); // Convert to CoW references + + println!( + "Done in {:?} - DB size: {:.1} MB", + start_phase1.elapsed(), + (after_initial_size - initial_file_size) as f64 / 1_048_576.0 + ); + + print!("Incremental updates (20 batches of 5k keys)... 
"); + let start_phase2 = Instant::now(); + + for batch in 0..20 { + let batch_start = Instant::now(); + let pre_batch_size = std::fs::metadata(&db_path).map(|m| m.len()).unwrap(); + + // Add 5,000 new keys + for i in 0..5_000 { + let key = format!("batch_{}_key_{:08}", batch, i); + let value = format!("batch_{}_value_{:08}", batch, i); + + trie.insert(key.as_bytes().to_vec(), value.as_bytes().to_vec()) + .unwrap(); + keys.push(key); + } + + // Also update some existing keys to demonstrate CoW efficiency + let mut rng = thread_rng(); + for _ in 0..100 { + let idx = rng.gen_range(0..keys.len().min(100_000)); // Only update initial keys + let updated_value = format!("updated_in_batch_{}_value", batch); + trie.insert( + keys[idx].as_bytes().to_vec(), + updated_value.as_bytes().to_vec(), + ) + .unwrap(); + } - // === PHASE 2: Random gets (this is what we want to profile) === - println!("Phase 2: Performing 1,000,000 random gets..."); + let root_node = trie.root_node().unwrap().unwrap(); + db.commit(&root_node).unwrap(); + let post_batch_size = std::fs::metadata(&db_path).map(|m| m.len()).unwrap(); + trie.commit().unwrap(); // Convert to CoW references + + let _batch_time = batch_start.elapsed(); + let _batch_growth = post_batch_size - pre_batch_size; + } + + let phase2_duration = start_phase2.elapsed(); + let final_file_size = std::fs::metadata(&db_path).map(|m| m.len()).unwrap(); + let incremental_growth = final_file_size - after_initial_size; + + println!( + "Done in {:?} - DB grew: {:.1} MB", + phase2_duration, + incremental_growth as f64 / 1_048_576.0 + ); + + print!("Performance test (500k random gets)... "); let start_gets = Instant::now(); let mut rng = thread_rng(); - let mut hit_count = 0; - let mut miss_count = 0; - for i in 0..1_000_000 { + for i in 0..500_000 { let key = if i % 10 == 0 { // 10% misses - random non-existent keys format!("nonexistent_key_{}", rng.r#gen::()) @@ -45,20 +101,24 @@ fn main() { // 90% hits - existing keys keys[rng.gen_range(0..keys.len())].clone() }; - - match db.get(key.as_bytes()).unwrap() { - Some(_) => hit_count += 1, - None => miss_count += 1, - } - - if i % 10_000 == 0 { - println!("Completed {} gets", i); - } + db.get(key.as_bytes()).unwrap(); } let gets_duration = start_gets.elapsed(); - println!("Gets phase completed in {:?}", gets_duration); - println!("Hits: {}, Misses: {}", hit_count, miss_count); - println!("Total get time: {:?}", gets_duration); - println!("Average get time: {:?}", gets_duration / 1_000_000); + println!( + "Done in {:?} - Avg: {:?}/get", + gets_duration, + gets_duration / 500_000 + ); + + println!("Total keys: {}", keys.len()); + println!( + "Final DB size: {:.1} MB", + final_file_size as f64 / 1_048_576.0 + ); + println!("Total time: {:?}", start_phase1.elapsed()); + + // Clean up temp file + drop(db); + let _ = std::fs::remove_file(&db_path); } From 6edf5d781bce1b2b4815a9cfcd59706caeb6bdd1 Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Mon, 18 Aug 2025 21:27:21 -0300 Subject: [PATCH 03/27] refactor(core): improve logic --- src/db.rs | 49 +-- src/file_manager.rs | 27 +- src/serialization.rs | 743 ++++++------------------------------------- 3 files changed, 124 insertions(+), 695 deletions(-) diff --git a/src/db.rs b/src/db.rs index ef80b0a..1a81e02 100644 --- a/src/db.rs +++ b/src/db.rs @@ -68,26 +68,27 @@ impl EthrexDB { /// Get the latest root node of the database pub fn root(&self) -> Result { - let latest_offset = self.file_manager.read_latest_root_offset()?; + let (latest_offset, file_data) = 
self.get_latest_data()?; if latest_offset == 0 { return Err(TrieError::Other("No root node in database".to_string())); } - - // Get the entire file as a buffer for deserializer - let file_data = self.file_manager.get_slice_to_end(0)?; - let root_node = Deserializer::new(file_data).decode_node_at(latest_offset as usize)?; - Ok(root_node) + Deserializer::new(file_data).decode_node_at(latest_offset as usize) } /// Get the value of the node with the given key pub fn get(&self, key: &[u8]) -> Result>, TrieError> { - let latest_offset = self.file_manager.read_latest_root_offset()?; + let (latest_offset, file_data) = self.get_latest_data()?; if latest_offset == 0 { return Ok(None); } + Deserializer::new(file_data).get_by_path_at(key, latest_offset as usize) + } + /// Helper to get latest offset and file data + fn get_latest_data(&self) -> Result<(u64, &[u8]), TrieError> { + let latest_offset = self.file_manager.read_latest_root_offset()?; let file_data = self.file_manager.get_slice_to_end(0)?; - Deserializer::new(file_data).get_by_path_at(key, latest_offset as usize) + Ok((latest_offset, file_data)) } } @@ -176,7 +177,7 @@ mod tests { db.commit(&root_node).unwrap(); // Read back from DB - let _recovered_root = db.root().unwrap(); + assert_eq!(db.root().unwrap(), root_node); // Test that we can read the value let value = db.get(b"key").unwrap(); @@ -383,7 +384,6 @@ mod tests { db.get(&persistent_key).unwrap(), Some(persistent_value.clone()) ); - let size_after_batch1 = db.file_manager.get_file_size().unwrap(); // Batch 2: New transactions + modify some existing accounts let new_accounts_batch2 = generate_test_data(150); @@ -421,9 +421,6 @@ mod tests { Some(persistent_value.clone()) ); - let size_after_batch2 = db.file_manager.get_file_size().unwrap(); - let batch2_increment = size_after_batch2 - size_after_batch1; - // Batch 3: More transactions let new_accounts_batch3 = generate_test_data(200); @@ -461,9 +458,6 @@ mod tests { Some(persistent_value.clone()) ); - let size_after_batch3 = db.file_manager.get_file_size().unwrap(); - let _batch3_increment = size_after_batch3 - size_after_batch2; - // Batch 4: Large update batch let new_accounts_batch4 = generate_test_data(250); @@ -501,9 +495,6 @@ mod tests { Some(persistent_value.clone()) ); - let size_after_batch4 = db.file_manager.get_file_size().unwrap(); - let _batch4_increment = size_after_batch4 - size_after_batch3; - // Batch 5: Final verification batch let new_accounts_batch5 = generate_test_data(300); @@ -541,26 +532,6 @@ mod tests { assert_eq!( db.get(&persistent_key).unwrap(), Some(persistent_value.clone()), - "Persistent value from genesis batch must still be accessible after all updates" - ); - - // Verify file size increased over time (CoW should minimize growth) - let final_file_size = db.file_manager.get_file_size().unwrap(); - let _batch5_increment = final_file_size - size_after_batch4; - - let average_account_size = (size_after_batch1 - 8) / 100; // Remove header - - // If we were writing the full trie each time, batch 2 would be ~150 accounts worth - let expected_if_full_rewrite = average_account_size * 150; - - // Verify CoW is working - batch 2 should be significantly smaller than full rewrite - // Use 70% as threshold (accounting for trie structure overhead) - let cow_threshold = (expected_if_full_rewrite as f64 * 0.7) as u64; - assert!( - batch2_increment < cow_threshold, - "CoW not working effectively: {} >= {} (70% of full rewrite)", - batch2_increment, - cow_threshold ); // Random verification of some accounts diff --git 
a/src/file_manager.rs b/src/file_manager.rs index a6652a1..b15377f 100644 --- a/src/file_manager.rs +++ b/src/file_manager.rs @@ -8,21 +8,13 @@ use std::path::PathBuf; /// /// File format: /// ```text -/// [header: 8 bytes] -> points to latest root version -/// [version 1: [prev_offset: 8 bytes][nodes]] -/// [version 2: [prev_offset: 8 bytes][nodes]] -/// ... -/// [version N: [prev_offset: 8 bytes][nodes]] <- latest version +/// [header: 8 bytes] -> points to latest root offset +/// [serialized trie nodes...] /// ``` -/// -/// Each version contains: -/// - prev_offset: Points to the previous version -/// - nodes: Serialized trie nodes pub struct FileManager { /// File where the data is stored file: File, /// Memory-mapped of the file - /// TODO: Handle case when adding new nodes mmap: Mmap, } @@ -74,15 +66,12 @@ impl FileManager { Ok(u64::from_le_bytes(offset_bytes)) } - /// Update the header to point to the new latest root version + /// Update the header to point to the new latest root offset pub fn update_latest_root_offset(&mut self, new_offset: u64) -> Result<(), TrieError> { self.file.seek(SeekFrom::Start(0)).unwrap(); self.file.write_all(&new_offset.to_le_bytes()).unwrap(); self.file.flush().unwrap(); - - // TODO: Check if this is needed - self.mmap = unsafe { MmapOptions::new().map(&self.file).unwrap() }; - + self.refresh_mmap(); Ok(()) } @@ -91,11 +80,13 @@ impl FileManager { let offset = self.file.seek(SeekFrom::End(0)).unwrap(); self.file.write_all(data).unwrap(); self.file.flush().unwrap(); + self.refresh_mmap(); + Ok(offset) + } - // TODO: Check if this is needed + /// Refresh memory map after file modifications + fn refresh_mmap(&mut self) { self.mmap = unsafe { MmapOptions::new().map(&self.file).unwrap() }; - - Ok(offset) } /// Get the current file size diff --git a/src/serialization.rs b/src/serialization.rs index d2ded40..5a0ab63 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -17,36 +17,24 @@ use crate::trie::{ BranchNode, ExtensionNode, LeafNode, Nibbles, Node, NodeHash, NodeRef, TrieError, }; +/// Result type for incremental serialization: (serialized_data, new_node_offsets, root_offset) +type SerializationResult = Result<(Vec, HashMap, u64), TrieError>; + /// Tag for Branch node (16 children + 1 value) const TAG_BRANCH: u8 = 0; /// Tag for Extend node (combines Extension and Leaf) const TAG_EXTEND: u8 = 1; -/// Type alias for incremental serialization result -type IncrementalResult = Result<(Vec, HashMap, u64), TrieError>; - /// Serializes a Merkle Patricia Trie into a byte buffer using the two node format -/// -/// - Branch: 16 node offsets + 1 value offset -/// - Extend: 1 node offset + 1 value offset (combines Extension and Leaf) #[derive(Default)] pub struct Serializer { - /// Buffer where serialized data is accumulated buffer: Vec, - /// Index of existing nodes (hash -> file offset) node_index: HashMap, - /// New nodes added during this serialization (hash -> absolute offset) new_nodes: HashMap, - /// Base offset where new data will be written in file base_offset: u64, } impl Serializer { - #[allow(dead_code)] - pub fn new() -> Self { - Self::default() - } - /// Create a new incremental serializer with existing node index pub fn new_incremental(node_index: &HashMap, base_offset: u64) -> Self { Self { @@ -57,25 +45,17 @@ impl Serializer { } } - /// Serializes a trie using the two node format - #[allow(dead_code)] - pub fn serialize_tree(mut self, root: &Node) -> Result, TrieError> { - self.serialize_node(root)?; - Ok(self.buffer) - } - /// Serializes 
a trie incrementally, only storing new nodes - /// Returns the serialized data, a map of new node hashes to their offsets, and the root offset - pub fn serialize_tree_incremental(mut self, root: &Node) -> IncrementalResult { - let root_offset = self.serialize_node_or_ref(root)?; + pub fn serialize_tree_incremental(mut self, root: &Node) -> SerializationResult { + let root_offset = self.serialize_node(root)?; Ok((self.buffer, self.new_nodes, root_offset)) } - /// Serializes a node or returns existing offset if already stored - fn serialize_node_or_ref(&mut self, node: &Node) -> Result { + /// Serializes a node, checking CoW first + fn serialize_node(&mut self, node: &Node) -> Result { let hash = node.compute_hash(); - // Check if node already exists in the database + // Check if node already exists (CoW) if let Some(&existing_offset) = self.node_index.get(&hash) { return Ok(existing_offset); } @@ -89,212 +69,99 @@ impl Serializer { let buffer_offset = self.buffer.len() as u64; let absolute_offset = self.base_offset + buffer_offset; self.new_nodes.insert(hash, absolute_offset); - self.serialize_node_internal(node)?; - Ok(absolute_offset) - } - - /// Handles NodeRef serialization - returns offset for both Hash and Node variants - fn serialize_noderef(&mut self, noderef: &NodeRef) -> Result { - match noderef { - NodeRef::Hash(hash) if hash.is_valid() => { - // Look up the offset for this hash - this is an existing node - if let Some(&offset) = self.node_index.get(hash) { - Ok(offset) // Return absolute offset in file - } else { - // This shouldn't happen if trie.commit() was called properly - panic!("Hash reference not found in index: {:?}", hash); - } - } - NodeRef::Hash(_) => Ok(0), // Empty/invalid hash - NodeRef::Node(node, _) => { - // This is a new node, serialize it - self.serialize_node_or_ref(node) - } - } - } - - /// Serializes a node, converting from 3 node to 2 node system - #[allow(dead_code)] - fn serialize_node(&mut self, node: &Node) -> Result { - let offset = self.buffer.len() as u64; match node { - Node::Leaf(leaf) => { - // Leaf becomes Extend with only value - self.buffer.push(TAG_EXTEND); - - let compact_nibbles = leaf.partial.encode_compact(); - self.write_bytes_with_len(&compact_nibbles); - - // Reserve space for offsets - let value_offset_pos = self.buffer.len(); - self.buffer.extend_from_slice(&0u64.to_le_bytes()); // node offset = 0 - self.buffer.extend_from_slice(&0u64.to_le_bytes()); // value offset placeholder - - let value_offset = self.buffer.len() as u64; - self.write_bytes_with_len(&leaf.value); - - // Go back and write the actual value offset - self.buffer[value_offset_pos + 8..value_offset_pos + 16] - .copy_from_slice(&value_offset.to_le_bytes()); - } - Node::Extension(ext) => { - // Extension becomes Extend with only child - self.buffer.push(TAG_EXTEND); - - let compact_prefix = ext.prefix.encode_compact(); - self.write_bytes_with_len(&compact_prefix); - - // Reserve space for offsets - let child_offset_pos = self.buffer.len(); - self.buffer.extend_from_slice(&0u64.to_le_bytes()); // child offset placeholder - self.buffer.extend_from_slice(&0u64.to_le_bytes()); // value offset = 0 - - let child_offset = match &ext.child { - NodeRef::Hash(hash) => { - if hash.is_valid() { - panic!("Hash references not supported in serialization"); - } - 0u64 // Empty child - } - NodeRef::Node(node, _) => self.serialize_node(node)?, - }; - - // Go back and write the actual child offset - if child_offset > 0 { - self.buffer[child_offset_pos..child_offset_pos + 8] - 
.copy_from_slice(&child_offset.to_le_bytes()); - } - } - Node::Branch(branch) => { - // Branch stays Branch but with offsets - self.buffer.push(TAG_BRANCH); - - // Reserve space for all offsets - let offsets_start = self.buffer.len(); - // 16 child offsets + 1 value offset - for _ in 0..17 { - self.buffer.extend_from_slice(&0u64.to_le_bytes()); - } - - // Serialize all children and collect their offsets - let mut child_offsets = [0u64; 16]; - for (i, child) in branch.choices.iter().enumerate() { - child_offsets[i] = match child { - NodeRef::Hash(hash) => { - if hash.is_valid() { - panic!("Hash references not supported in serialization"); - } - 0u64 - } - NodeRef::Node(node, _) => self.serialize_node(node)?, - }; - } - - // Serialize value if present - let value_offset = if branch.value.is_empty() { - 0u64 - } else { - let offset = self.buffer.len() as u64; - self.write_bytes_with_len(&branch.value); - offset - }; - - // Go back and write all the actual offsets - let mut pos = offsets_start; - for &child_offset in &child_offsets { - self.buffer[pos..pos + 8].copy_from_slice(&child_offset.to_le_bytes()); - pos += 8; - } - self.buffer[pos..pos + 8].copy_from_slice(&value_offset.to_le_bytes()); - } + Node::Leaf(leaf) => self.serialize_leaf(leaf)?, + Node::Extension(ext) => self.serialize_extension(ext)?, + Node::Branch(branch) => self.serialize_branch(branch)?, } - Ok(offset) + Ok(absolute_offset) } - /// Internal node serialization (used by both serialize_node and serialize_node_or_ref) - fn serialize_node_internal(&mut self, node: &Node) -> Result<(), TrieError> { - match node { - Node::Leaf(leaf) => { - // Leaf becomes Extend with only value - self.buffer.push(TAG_EXTEND); + fn serialize_leaf(&mut self, leaf: &LeafNode) -> Result<(), TrieError> { + self.buffer.push(TAG_EXTEND); + let compact_nibbles = leaf.partial.encode_compact(); + self.write_bytes_with_len(&compact_nibbles); - let compact_nibbles = leaf.partial.encode_compact(); - self.write_bytes_with_len(&compact_nibbles); + // Child offset = 0, value offset will be filled + self.buffer.extend_from_slice(&0u64.to_le_bytes()); + let value_offset_pos = self.buffer.len(); + self.buffer.extend_from_slice(&0u64.to_le_bytes()); - // Reserve space for offsets - let value_offset_pos = self.buffer.len(); - self.buffer.extend_from_slice(&0u64.to_le_bytes()); // node offset = 0 - self.buffer.extend_from_slice(&0u64.to_le_bytes()); // value offset placeholder + let value_offset = self.base_offset + self.buffer.len() as u64; + self.write_bytes_with_len(&leaf.value); - let value_offset = self.base_offset + self.buffer.len() as u64; // Absolute offset - self.write_bytes_with_len(&leaf.value); + // Write actual value offset + self.buffer[value_offset_pos..value_offset_pos + 8] + .copy_from_slice(&value_offset.to_le_bytes()); - // Go back and write the actual value offset - self.buffer[value_offset_pos + 8..value_offset_pos + 16] - .copy_from_slice(&value_offset.to_le_bytes()); + Ok(()) + } - Ok(()) - } - Node::Extension(ext) => { - // Extension becomes Extend with only child - self.buffer.push(TAG_EXTEND); + fn serialize_extension(&mut self, ext: &ExtensionNode) -> Result<(), TrieError> { + self.buffer.push(TAG_EXTEND); + let compact_prefix = ext.prefix.encode_compact(); + self.write_bytes_with_len(&compact_prefix); - let compact_prefix = ext.prefix.encode_compact(); - self.write_bytes_with_len(&compact_prefix); + // Child offset will be filled, value offset = 0 + let child_offset_pos = self.buffer.len(); + 
self.buffer.extend_from_slice(&0u64.to_le_bytes()); + self.buffer.extend_from_slice(&0u64.to_le_bytes()); - // Reserve space for offsets - let child_offset_pos = self.buffer.len(); - self.buffer.extend_from_slice(&0u64.to_le_bytes()); // child offset placeholder - self.buffer.extend_from_slice(&0u64.to_le_bytes()); // value offset = 0 + let child_offset = self.serialize_noderef(&ext.child)?; - let child_offset = self.serialize_noderef(&ext.child)?; + // Write actual child offset + self.buffer[child_offset_pos..child_offset_pos + 8] + .copy_from_slice(&child_offset.to_le_bytes()); - // Go back and write the actual child offset - if child_offset > 0 { - self.buffer[child_offset_pos..child_offset_pos + 8] - .copy_from_slice(&child_offset.to_le_bytes()); - } + Ok(()) + } - Ok(()) - } - Node::Branch(branch) => { - // Branch stays Branch but with offsets - self.buffer.push(TAG_BRANCH); - - // Reserve space for all offsets - let offsets_start = self.buffer.len(); - // 16 child offsets + 1 value offset - for _ in 0..17 { - self.buffer.extend_from_slice(&0u64.to_le_bytes()); - } + fn serialize_branch(&mut self, branch: &BranchNode) -> Result<(), TrieError> { + self.buffer.push(TAG_BRANCH); - // Serialize all children and collect their offsets - let mut child_offsets = [0u64; 16]; - for (i, child) in branch.choices.iter().enumerate() { - child_offsets[i] = self.serialize_noderef(child)?; - } + // Reserve space for 16 child offsets + 1 value offset + let offsets_start = self.buffer.len(); + for _ in 0..17 { + self.buffer.extend_from_slice(&0u64.to_le_bytes()); + } - // Serialize value if present - let value_offset = if branch.value.is_empty() { - 0u64 - } else { - let offset = self.base_offset + self.buffer.len() as u64; // Absolute offset - self.write_bytes_with_len(&branch.value); - offset - }; + // Serialize children + let mut child_offsets = [0u64; 16]; + for (i, child) in branch.choices.iter().enumerate() { + child_offsets[i] = self.serialize_noderef(child)?; + } - // Go back and write all the actual offsets - let mut pos = offsets_start; - for &child_offset in &child_offsets { - self.buffer[pos..pos + 8].copy_from_slice(&child_offset.to_le_bytes()); - pos += 8; - } - self.buffer[pos..pos + 8].copy_from_slice(&value_offset.to_le_bytes()); + // Serialize value + let value_offset = if branch.value.is_empty() { + 0u64 + } else { + let offset = self.base_offset + self.buffer.len() as u64; + self.write_bytes_with_len(&branch.value); + offset + }; + + // Write all offsets + let mut pos = offsets_start; + for &child_offset in &child_offsets { + self.buffer[pos..pos + 8].copy_from_slice(&child_offset.to_le_bytes()); + pos += 8; + } + self.buffer[pos..pos + 8].copy_from_slice(&value_offset.to_le_bytes()); + + Ok(()) + } - Ok(()) + fn serialize_noderef(&mut self, noderef: &NodeRef) -> Result { + match noderef { + NodeRef::Hash(hash) if hash.is_valid() => { + self.node_index.get(hash).copied().ok_or_else(|| { + TrieError::Other(format!("Hash reference not found: {:?}", hash)) + }) } + NodeRef::Hash(_) => Ok(0), // Empty/invalid hash + NodeRef::Node(node, _) => self.serialize_node(node), } } @@ -306,32 +173,19 @@ impl Serializer { } /// Deserializes a Merkle Patricia Trie from a byte buffer. -/// -/// The deserializer reads the binary format produced by [`Serializer`]. -/// It uses the two node format and converts back to the standard 3 node format. 
pub struct Deserializer<'a> { - /// The byte buffer containing serialized trie data buffer: &'a [u8], } impl<'a> Deserializer<'a> { - /// Creates a new deserializer for the given buffer pub fn new(buffer: &'a [u8]) -> Self { Self { buffer } } - /// Deserializes a tree from the two node format back to standard 3 node format - #[allow(dead_code)] - pub fn decode_tree(&self) -> Result { - let node = self.decode_node_at(0)?; - node.compute_hash(); - Ok(node) - } - - /// Decodes a node from the two node format at specific position + /// Decodes a node at specific position pub fn decode_node_at(&self, pos: usize) -> Result { if pos >= self.buffer.len() { - panic!("Invalid buffer position"); + return Err(TrieError::Other("Invalid buffer position".to_string())); } let tag = self.buffer[pos]; @@ -339,48 +193,34 @@ impl<'a> Deserializer<'a> { match tag { TAG_EXTEND => { - // Read nibbles length let len = self.read_u32_at(position)? as usize; position += 4; - // Read nibbles - if position + len > self.buffer.len() { - panic!("Invalid buffer length"); - } let compact_nibbles = &self.buffer[position..position + len]; let nibbles = Nibbles::decode_compact(compact_nibbles); position += len; - // Read node offset let node_offset = self.read_u64_at(position)?; position += 8; - - // Read value offset let value_offset = self.read_u64_at(position)?; - // Determine node type based on what's present match (node_offset > 0, value_offset > 0) { (false, true) => { - // Only value = Leaf node + // Leaf node let value = self .read_value_at_offset(value_offset as usize)? .unwrap_or_default(); Ok(Node::Leaf(LeafNode::new(nibbles, value))) } (true, false) => { - // Only child = Extension node + // Extension node let child = self.decode_node_at(node_offset as usize)?; Ok(Node::Extension(ExtensionNode::new( nibbles, NodeRef::Node(Arc::new(child), OnceLock::new()), ))) } - (true, true) => { - panic!("Extend node with both child and value not supported"); - } - (false, false) => { - panic!("Invalid Extend node with no child or value"); - } + _ => Err(TrieError::Other("Invalid Extend node".to_string())), } } TAG_BRANCH => { @@ -390,28 +230,21 @@ impl<'a> Deserializer<'a> { *child = self.read_u64_at(position)?; position += 8; } - - // Read value offset let value_offset = self.read_u64_at(position)?; - // Build children NodeRefs + // Build children let mut children: [NodeRef; 16] = Default::default(); for (i, &offset) in child_offsets.iter().enumerate() { if offset > 0 { - // All offsets are absolute in the file let child = self.decode_node_at(offset as usize)?; children[i] = NodeRef::Node(Arc::new(child), OnceLock::new()); } } - // Read value if present + // Read value let value = if value_offset > 0 { - if (value_offset as usize) >= self.buffer.len() { - vec![] - } else { - self.read_value_at_offset(value_offset as usize)? - .unwrap_or_default() - } + self.read_value_at_offset(value_offset as usize)? 
+ .unwrap_or_default() } else { vec![] }; @@ -420,19 +253,8 @@ impl<'a> Deserializer<'a> { children, value, )))) } - _ => panic!("Invalid node tag: {}", tag), - } - } - - /// Gets a value by path without copying data - #[allow(dead_code)] - pub fn get_by_path(&self, path: &[u8]) -> Result>, TrieError> { - if self.buffer.is_empty() { - return Ok(None); + _ => Err(TrieError::Other(format!("Invalid node tag: {}", tag))), } - - let nibbles = Nibbles::from_raw(path, false); - self.get_by_path_inner(nibbles, 0) } /// Gets a value by path starting at a specific offset @@ -440,12 +262,10 @@ impl<'a> Deserializer<'a> { if self.buffer.is_empty() { return Ok(None); } - let nibbles = Nibbles::from_raw(path, false); self.get_by_path_inner(nibbles, offset) } - /// Internal helper for get_by_path with position tracking fn get_by_path_inner( &self, mut path: Nibbles, @@ -460,48 +280,30 @@ impl<'a> Deserializer<'a> { match tag { TAG_EXTEND => { - // Read nibbles length - if position + 4 > self.buffer.len() { - return Ok(None); - } let len = u32::from_le_bytes(self.buffer[position..position + 4].try_into().unwrap()) as usize; position += 4; - // Read nibbles data - if position + len > self.buffer.len() { - return Ok(None); - } let compact_nibbles = &self.buffer[position..position + len]; let nibbles = Nibbles::decode_compact(compact_nibbles); position += len; - // Read node offset - if position + 8 > self.buffer.len() { - return Ok(None); - } let node_offset = u64::from_le_bytes(self.buffer[position..position + 8].try_into().unwrap()); position += 8; - - // Read value offset - if position + 8 > self.buffer.len() { - return Ok(None); - } let value_offset = u64::from_le_bytes(self.buffer[position..position + 8].try_into().unwrap()); - // Extend has only a child or a value if node_offset == 0 && value_offset > 0 { // Leaf node - let leaf_path_without_flag = if nibbles.is_leaf() { + let leaf_path = if nibbles.is_leaf() { nibbles.slice(0, nibbles.len() - 1) } else { nibbles }; - if path == leaf_path_without_flag { + if path == leaf_path { self.read_value_at_offset(value_offset as usize) } else { Ok(None) @@ -511,20 +313,15 @@ impl<'a> Deserializer<'a> { if !path.skip_prefix(&nibbles) { return Ok(None); } - // Recurse into the child self.get_by_path_inner(path, node_offset as usize) } else { - panic!("Extend node with both child and value not supported"); + Ok(None) } } TAG_BRANCH => { if path.is_empty() { // Skip 16 child offsets position += 16 * 8; - - if position + 8 > self.buffer.len() { - return Ok(None); - } let value_offset = u64::from_le_bytes(self.buffer[position..position + 8].try_into().unwrap()); @@ -534,18 +331,10 @@ impl<'a> Deserializer<'a> { Ok(None) } } else { - // Get next nibble and find corresponding child - let next_nibble = match path.next_choice() { - Some(nibble) => nibble, - None => return Ok(None), - }; - - // Read child offset at position next_nibble + let next_nibble = path + .next_choice() + .ok_or_else(|| TrieError::Other("Invalid path".to_string()))?; let child_offset_pos = position + next_nibble * 8; - if child_offset_pos + 8 > self.buffer.len() { - return Ok(None); - } - let child_offset = u64::from_le_bytes( self.buffer[child_offset_pos..child_offset_pos + 8] .try_into() @@ -559,19 +348,18 @@ impl<'a> Deserializer<'a> { } } } - _ => panic!("Invalid node tag: {}", tag), + _ => Err(TrieError::Other(format!("Invalid node tag: {}", tag))), } } - /// Read value at specific offset fn read_value_at_offset(&self, offset: usize) -> Result>, TrieError> { if offset + 4 > self.buffer.len() { 
return Ok(None); } let len = u32::from_le_bytes(self.buffer[offset..offset + 4].try_into().unwrap()) as usize; - let data_start = offset + 4; + if data_start + len > self.buffer.len() { return Ok(None); } @@ -580,20 +368,18 @@ impl<'a> Deserializer<'a> { Ok(Some(value)) } - /// Read a u64 value from buffer at position fn read_u64_at(&self, pos: usize) -> Result { if pos + 8 > self.buffer.len() { - panic!("Invalid buffer length for u64"); + return Err(TrieError::Other("Invalid buffer length".to_string())); } Ok(u64::from_le_bytes( self.buffer[pos..pos + 8].try_into().unwrap(), )) } - /// Read a u32 value from buffer at position fn read_u32_at(&self, pos: usize) -> Result { if pos + 4 > self.buffer.len() { - panic!("Invalid buffer length for u32"); + return Err(TrieError::Other("Invalid buffer length".to_string())); } Ok(u32::from_le_bytes( self.buffer[pos..pos + 4].try_into().unwrap(), @@ -601,17 +387,10 @@ impl<'a> Deserializer<'a> { } } -/// Helper function to serialize a Merkle Patricia Trie node to bytes. -#[allow(dead_code)] -pub fn serialize(node: &Node) -> Vec { - Serializer::new().serialize_tree(node).unwrap() -} - #[cfg(test)] mod test { - use crate::trie::{InMemoryTrieDB, Trie, node_hash::NodeHash}; - use super::*; + use crate::trie::{InMemoryTrieDB, Trie, node_hash::NodeHash}; fn new_temp() -> Trie { use std::collections::HashMap; @@ -624,329 +403,17 @@ mod test { Trie::new(Box::new(db)) } - fn deserialize(buffer: &[u8]) -> Node { - Deserializer::new(buffer).decode_tree().unwrap() - } - - #[test] - fn test_serialize_deserialize_empty_leaf() { - let leaf = Node::Leaf(LeafNode { - partial: Nibbles::from_hex(vec![]), - value: vec![], - }); - - let bytes = serialize(&leaf); - let recovered = deserialize(&bytes); - - assert_eq!(leaf, recovered); - } - - #[test] - fn test_serialize_deserialize_leaf_with_long_path() { - let leaf = Node::Leaf(LeafNode { - partial: Nibbles::from_hex(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5]), - value: b"long_path_value".to_vec(), - }); - - let bytes = serialize(&leaf); - let recovered = deserialize(&bytes); - - assert_eq!(leaf, recovered); - } - - #[test] - fn test_serialize_deserialize_branch_empty() { - let branch = Node::Branch(Box::new(BranchNode { - choices: Default::default(), - value: vec![], - })); - - let bytes = serialize(&branch); - let recovered = deserialize(&bytes); - - assert_eq!(branch, recovered); - } - - #[test] - fn test_serialize_deserialize_tree_extension_to_leaf() { - let leaf = Node::Leaf(LeafNode { - partial: Nibbles::from_hex(vec![5, 6, 7]), - value: b"nested_leaf".to_vec(), - }); - - let ext = Node::Extension(ExtensionNode { - prefix: Nibbles::from_hex(vec![1, 2]), - child: NodeRef::Node(Arc::new(leaf), OnceLock::new()), - }); - - let bytes = serialize(&ext); - let recovered = deserialize(&bytes); - assert_eq!(recovered, ext); - match recovered { - Node::Extension(ext_node) => { - assert_eq!(ext_node.prefix, Nibbles::from_hex(vec![1, 2])); - match &ext_node.child { - NodeRef::Node(arc_node, _) => match &**arc_node { - Node::Leaf(leaf_node) => { - assert_eq!(leaf_node.partial, Nibbles::from_hex(vec![5, 6, 7])); - assert_eq!(leaf_node.value, b"nested_leaf"); - } - _ => panic!("Expected leaf node"), - }, - _ => panic!("Expected embedded node"), - } - } - _ => panic!("Expected extension node"), - } - } - - #[test] - fn test_serialize_deserialize_deep_tree() { - let leaf = Node::Leaf(LeafNode { - partial: Nibbles::from_hex(vec![9, 8]), - value: b"deep_leaf".to_vec(), - }); - - let inner_ext = Node::Extension(ExtensionNode { - 
prefix: Nibbles::from_hex(vec![5, 6]), - child: NodeRef::Node(Arc::new(leaf), OnceLock::new()), - }); - - let mut branch_choices: [NodeRef; 16] = Default::default(); - branch_choices[2] = NodeRef::Node(Arc::new(inner_ext), OnceLock::new()); - - let branch = Node::Branch(Box::new(BranchNode { - choices: branch_choices, - value: vec![], - })); - - let outer_ext = Node::Extension(ExtensionNode { - prefix: Nibbles::from_hex(vec![1, 2, 3]), - child: NodeRef::Node(Arc::new(branch), OnceLock::new()), - }); - - let bytes = serialize(&outer_ext); - let recovered = deserialize(&bytes); - - assert_eq!(recovered, outer_ext); - } - #[test] - fn test_trie_serialization_empty() { - let trie = new_temp(); - let root = trie.root_node().unwrap(); - assert!(root.is_none()); - } - - #[test] - fn test_trie_serialization_single_insert() { + fn test_simple_serialization() { let mut trie = new_temp(); trie.insert(b"key".to_vec(), b"value".to_vec()).unwrap(); let root = trie.root_node().unwrap().unwrap(); - let bytes = serialize(&root); - let recovered = deserialize(&bytes); - - assert_eq!(root, recovered); - } - - #[test] - fn test_trie_serialization_multiple_inserts() { - let mut trie = new_temp(); - - let test_data = vec![ - (b"do".to_vec(), b"verb".to_vec()), - (b"dog".to_vec(), b"puppy".to_vec()), - (b"doge".to_vec(), b"coin".to_vec()), - (b"horse".to_vec(), b"stallion".to_vec()), - ]; - - for (key, value) in &test_data { - trie.insert(key.clone(), value.clone()).unwrap(); - } - - let root = trie.root_node().unwrap().unwrap(); - let bytes = serialize(&root); - let recovered = deserialize(&bytes); - - assert_eq!(recovered, root); - } - - #[test] - fn test_file_io() { - use std::fs; - - // Create trie - let mut trie = new_temp(); - trie.insert(b"file_key".to_vec(), b"file_value".to_vec()) - .unwrap(); - - // Serialize to file - let root = trie.root_node().unwrap().unwrap(); - let serialized = serialize(&root); - - let path = "/tmp/test_trie.mpt"; - fs::write(path, &serialized).unwrap(); - - // Read from file and deserialize - let read_data = fs::read(path).unwrap(); - let deserialized = deserialize(&read_data); - - assert_eq!(root, deserialized); - fs::remove_file(path).unwrap(); - } - - #[test] - fn test_get_by_path_serialized_simple() { - let mut trie = new_temp(); - trie.insert(b"test".to_vec(), b"value".to_vec()).unwrap(); - - let root = trie.root_node().unwrap().unwrap(); - let buffer = serialize(&root); - - let deserializer = Deserializer::new(&buffer); - assert_eq!( - deserializer.get_by_path(b"test").unwrap(), - Some(b"value".to_vec()) - ); - - let deserializer = Deserializer::new(&buffer); - let recovered = deserializer.decode_tree().unwrap(); - assert_eq!(root, recovered); - } - - #[test] - fn test_get_by_path_serialized() { - let mut trie = new_temp(); - - let test_data = vec![ - (b"do".to_vec(), b"verb".to_vec()), - (b"dog".to_vec(), b"puppy".to_vec()), - (b"doge".to_vec(), b"coin".to_vec()), - (b"horse".to_vec(), b"stallion".to_vec()), - ]; - - for (key, value) in &test_data { - trie.insert(key.clone(), value.clone()).unwrap(); - } - - let root = trie.root_node().unwrap().unwrap(); - let buffer = serialize(&root); - - let deserializer = Deserializer::new(&buffer); - assert_eq!( - deserializer.get_by_path(b"horse").unwrap(), - Some(b"stallion".to_vec()) - ); - - let deserializer = Deserializer::new(&buffer); - assert_eq!( - deserializer.get_by_path(b"dog").unwrap(), - Some(b"puppy".to_vec()) - ); - - let deserializer = Deserializer::new(&buffer); - assert_eq!( - 
deserializer.get_by_path(b"doge").unwrap(), - Some(b"coin".to_vec()) - ); - - let deserializer = Deserializer::new(&buffer); - assert_eq!( - deserializer.get_by_path(b"do").unwrap(), - Some(b"verb".to_vec()) - ); - - let deserializer = Deserializer::new(&buffer); - assert_eq!(deserializer.get_by_path(b"cat").unwrap(), None); - - let deserializer = Deserializer::new(&buffer); - assert_eq!(deserializer.get_by_path(b"").unwrap(), None); - - // Reset position before decoding tree - let deserializer = Deserializer::new(&buffer); - let recovered = deserializer.decode_tree().unwrap(); - assert_eq!(root, recovered); - } - - #[test] - fn test_complex_trie_serialization() { - let mut trie = new_temp(); - - let test_data = vec![ - (b"app".to_vec(), b"application".to_vec()), - (b"apple".to_vec(), b"fruit".to_vec()), - (b"application".to_vec(), b"software".to_vec()), - (b"append".to_vec(), b"add_to_end".to_vec()), - (b"applied".to_vec(), b"past_tense".to_vec()), - (b"car".to_vec(), b"vehicle".to_vec()), - (b"card".to_vec(), b"playing_card".to_vec()), - (b"care".to_vec(), b"attention".to_vec()), - (b"career".to_vec(), b"profession".to_vec()), - (b"careful".to_vec(), b"cautious".to_vec()), - (b"test".to_vec(), b"examination".to_vec()), - (b"testing".to_vec(), b"verification".to_vec()), - (b"tester".to_vec(), b"one_who_tests".to_vec()), - (b"testament".to_vec(), b"will_document".to_vec()), - (b"a".to_vec(), b"letter_a".to_vec()), - (b"b".to_vec(), b"letter_b".to_vec()), - (b"c".to_vec(), b"letter_c".to_vec()), - (b"d".to_vec(), b"letter_d".to_vec()), - (b"e".to_vec(), b"letter_e".to_vec()), - (b"0x123456".to_vec(), b"hex_value_1".to_vec()), - (b"0x123abc".to_vec(), b"hex_value_2".to_vec()), - (b"0x124000".to_vec(), b"hex_value_3".to_vec()), - (b"0xabcdef".to_vec(), b"hex_value_4".to_vec()), - ( - b"very_long_key_that_creates_deep_structure_in_trie_1234567890".to_vec(), - b"long_value_1".to_vec(), - ), - ( - b"very_long_key_that_creates_deep_structure_in_trie_abcdefghijk".to_vec(), - b"long_value_2".to_vec(), - ), - (b"empty_value_key".to_vec(), vec![]), - (b"similar_key_1".to_vec(), b"value_1".to_vec()), - (b"similar_key_2".to_vec(), b"value_2".to_vec()), - (b"similar_key_3".to_vec(), b"value_3".to_vec()), - (b"123".to_vec(), b"number_123".to_vec()), - (b"1234".to_vec(), b"number_1234".to_vec()), - (b"12345".to_vec(), b"number_12345".to_vec()), - ]; - - for (key, value) in &test_data { - trie.insert(key.clone(), value.clone()).unwrap(); - } - - let root = trie.root_node().unwrap().unwrap(); - - let buffer = serialize(&root); - - for (key, expected_value) in &test_data { - let deserializer = Deserializer::new(&buffer); - let retrieved_value = deserializer.get_by_path(key).unwrap(); - assert_eq!(retrieved_value, Some(expected_value.clone())); - } - - let non_existent_keys = vec![ - b"nonexistent".to_vec(), - b"app_wrong".to_vec(), - b"car_wrong".to_vec(), - b"test_wrong".to_vec(), - b"0x999999".to_vec(), - b"similar_key_4".to_vec(), - b"".to_vec(), - b"very_long_nonexistent_key".to_vec(), - ]; - - for key in &non_existent_keys { - let deserializer = Deserializer::new(&buffer); - let result = deserializer.get_by_path(key).unwrap(); - assert_eq!(result, None); - } + let serializer = Serializer::new_incremental(&HashMap::new(), 0); + let (buffer, _, _) = serializer.serialize_tree_incremental(&root).unwrap(); let deserializer = Deserializer::new(&buffer); - let recovered = deserializer.decode_tree().unwrap(); + let recovered = deserializer.decode_node_at(0).unwrap(); assert_eq!(root, recovered); } From 
76e715cd089a9ea85d82d6a16a41c7748e67b69e Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Tue, 19 Aug 2025 09:24:57 -0300 Subject: [PATCH 04/27] docs(core): update old documentation --- benches/db_benchmark.rs | 77 +---------------------- src/db.rs | 132 +++++++++++++++++++++++++++++++++------- src/file_manager.rs | 18 ++++-- src/serialization.rs | 57 ++++++++++++----- 4 files changed, 166 insertions(+), 118 deletions(-) diff --git a/benches/db_benchmark.rs b/benches/db_benchmark.rs index a22806d..59dc6fb 100644 --- a/benches/db_benchmark.rs +++ b/benches/db_benchmark.rs @@ -320,80 +320,5 @@ fn random_get_benchmark(c: &mut Criterion) { group.finish(); } -fn cow_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("cow_updates"); - group.measurement_time(Duration::from_secs(15)); - group.sample_size(10); - - for size in [1_000, 10_000, 50_000] { - let base_data = generate_test_data(size); - let update_data = generate_test_data(size / 2); // 50% new data to add - - // LibmdbxHashDB CoW test - group.bench_with_input( - BenchmarkId::new("libmdbx_hash_cow", size), - &(&base_data, &update_data), - |b, (base, updates)| { - b.iter_with_setup( - || { - let temp_dir = TempDir::new("libmdbx_cow_bench").unwrap(); - let mut db = LibmdbxHashDB::new(temp_dir.path()); - // Pre-populate with base data - db.insert_batch(base); - db - }, - |mut db| { - // Now add the update data (this should reuse existing trie structure) - db.insert_batch(black_box(updates)); - black_box(db) - }, - ); - }, - ); - - // EthrexDB CoW test - group.bench_with_input( - BenchmarkId::new("ethrex_db_cow", size), - &(&base_data, &update_data), - |b, (base, updates)| { - b.iter_with_setup( - || { - let temp_dir = TempDir::new("ethrex_cow_bench").unwrap(); - let file_path = temp_dir.path().join("cow_test.edb"); - let mut db = EthrexDB::new(file_path).unwrap(); - let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - - // Pre-populate with base data - for (key, value) in base.iter() { - trie.insert(key.clone(), value.clone()).unwrap(); - } - let root_node = trie.root_node().unwrap().unwrap(); - db.commit(&root_node).unwrap(); - trie.commit().unwrap(); // Convert to CoW references - - (db, trie) - }, - |(mut db, mut trie)| { - // Now add the update data with CoW - for (key, value) in updates.iter() { - trie.insert(key.clone(), value.clone()).unwrap(); - } - let root_node = trie.root_node().unwrap().unwrap(); - db.commit(&root_node).unwrap(); - db - }, - ); - }, - ); - } - - group.finish(); -} - -criterion_group!( - benches, - insert_benchmark, - random_get_benchmark, - cow_benchmark -); +criterion_group!(benches, insert_benchmark, random_get_benchmark,); criterion_main!(benches); diff --git a/src/db.rs b/src/db.rs index 1a81e02..dfe48cc 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,4 +1,4 @@ -//! EthrexDB - A simple MPT database +//! 
EthrexDB - Copy-on-Write Merkle Patricia Trie Database use crate::file_manager::FileManager; use crate::serialization::{Deserializer, Serializer}; @@ -12,6 +12,8 @@ pub struct EthrexDB { file_manager: FileManager, /// Index mapping node hashes to their file offsets node_index: HashMap, + /// List of root offsets in chronological order (for linked list) + root_history: Vec, } impl EthrexDB { @@ -21,38 +23,32 @@ impl EthrexDB { Ok(Self { file_manager, node_index: HashMap::new(), + root_history: Vec::new(), }) } /// Open an existing database pub fn open(file_path: PathBuf) -> Result { let file_manager = FileManager::open(file_path)?; - // TODO: Load node_index from file + // TODO: Load node_index and root_history from file Ok(Self { file_manager, node_index: HashMap::new(), + root_history: Vec::new(), }) } - /// Commit a new trie to the database - /// - /// Uses Copy-on-Write to only store new/modified nodes: - /// 1. Serializes only new nodes (NodeRef::Node) - /// 2. Reuses existing nodes (NodeRef::Hash) by their offset - /// 3. Updates the node index with new mappings - /// 4. Updates the header to point to the new root + /// Commit a trie state to the database pub fn commit(&mut self, root_node: &Node) -> Result { let root_hash = root_node.compute_hash(); - // Get the current file size (where new data will be written) + let prev_root_offset = self.file_manager.read_latest_root_offset()?; let base_offset = self.file_manager.get_file_size()?; - // Serialize the trie incrementally with the base offset - let serializer = Serializer::new_incremental(&self.node_index, base_offset); + let serializer = Serializer::new(&self.node_index, base_offset); let (serialized_data, new_offsets, root_offset) = - serializer.serialize_tree_incremental(root_node)?; + serializer.serialize_tree(root_node, prev_root_offset)?; - // Write new nodes at the end of file self.file_manager.write_at_end(&serialized_data)?; // Update node index with new node offsets (they are already absolute) @@ -63,32 +59,51 @@ impl EthrexDB { // Update header to point to the root node self.file_manager.update_latest_root_offset(root_offset)?; + self.root_history.push(root_offset); + Ok(root_hash) } /// Get the latest root node of the database pub fn root(&self) -> Result { - let (latest_offset, file_data) = self.get_latest_data()?; + let latest_offset = self.file_manager.read_latest_root_offset()?; if latest_offset == 0 { return Err(TrieError::Other("No root node in database".to_string())); } - Deserializer::new(file_data).decode_node_at(latest_offset as usize) + + let file_data = self.file_manager.get_slice_to_end(0)?; + // All roots now have 8-byte prepended previous root offset + let actual_root_offset = latest_offset + 8; + + Deserializer::new(file_data).decode_node_at(actual_root_offset as usize) } /// Get the value of the node with the given key pub fn get(&self, key: &[u8]) -> Result>, TrieError> { - let (latest_offset, file_data) = self.get_latest_data()?; + let latest_offset = self.file_manager.read_latest_root_offset()?; if latest_offset == 0 { return Ok(None); } - Deserializer::new(file_data).get_by_path_at(key, latest_offset as usize) + self.get_at_root(key, latest_offset) } - /// Helper to get latest offset and file data - fn get_latest_data(&self) -> Result<(u64, &[u8]), TrieError> { - let latest_offset = self.file_manager.read_latest_root_offset()?; + /// Get the list of all root versions in chronological order (oldest first) + pub fn get_root_history(&self) -> Result, TrieError> { + Ok(self.root_history.clone()) + } + + 
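Since every commit carries the offset of the previous root, the same history that `root_history` tracks in memory could also be rebuilt by walking the file backwards from the header. A rough sketch of that traversal, assuming the commit layout described in this patch (the helper name is illustrative, not part of the crate):

```rust
// The 8-byte header points at the latest commit, and every commit starts with
// the previous root's offset; an offset of 0 terminates the chain.
// `file_data` is the whole memory-mapped file.
fn collect_root_offsets(file_data: &[u8]) -> Vec<u64> {
    let mut roots = Vec::new();
    let mut current = u64::from_le_bytes(file_data[0..8].try_into().unwrap());
    while current != 0 {
        roots.push(current);
        let pos = current as usize;
        // The first 8 bytes of each commit hold the previous root's offset.
        current = u64::from_le_bytes(file_data[pos..pos + 8].try_into().unwrap());
    }
    roots.reverse(); // oldest first, matching get_root_history()
    roots
}
```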
/// Get a value from a specific root + pub fn get_at_root(&self, key: &[u8], root_offset: u64) -> Result>, TrieError> { + if root_offset == 0 { + return Ok(None); + } + let file_data = self.file_manager.get_slice_to_end(0)?; - Ok((latest_offset, file_data)) + + // All roots have 8-byte prepended previous root offset + let actual_root_offset = root_offset + 8; + + Deserializer::new(file_data).get_by_path_at(key, actual_root_offset as usize) } } @@ -547,4 +562,77 @@ mod tests { } } } + + #[test] + fn test_root_history_linked_list() { + let temp_dir = TempDir::new("ethrex_db_test").unwrap(); + let db_path = temp_dir.path().join("test.db"); + let mut db = EthrexDB::new(db_path).unwrap(); + + // Initially no roots + assert_eq!(db.get_root_history().unwrap(), vec![]); + + // Commit 3 different states + let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + + // State 1 + trie.insert(b"key1".to_vec(), b"value1".to_vec()).unwrap(); + let root1 = trie.root_node().unwrap().unwrap(); + db.commit(&root1).unwrap(); + + // State 2 + trie.insert(b"key2".to_vec(), b"value2".to_vec()).unwrap(); + let root2 = trie.root_node().unwrap().unwrap(); + db.commit(&root2).unwrap(); + + // State 3 + trie.insert(b"key3".to_vec(), b"value3".to_vec()).unwrap(); + let root3 = trie.root_node().unwrap().unwrap(); + db.commit(&root3).unwrap(); + + // Get root history (should be in chronological order) + let history = db.get_root_history().unwrap(); + assert_eq!(history.len(), 3); + + // Verify we can read from each historical root + let (root1_offset, root2_offset, root3_offset) = (history[0], history[1], history[2]); + + // At root1: only key1 exists + assert_eq!( + db.get_at_root(b"key1", root1_offset).unwrap(), + Some(b"value1".to_vec()) + ); + assert_eq!(db.get_at_root(b"key2", root1_offset).unwrap(), None); + assert_eq!(db.get_at_root(b"key3", root1_offset).unwrap(), None); + + // At root2: key1 and key2 exist + assert_eq!( + db.get_at_root(b"key1", root2_offset).unwrap(), + Some(b"value1".to_vec()) + ); + assert_eq!( + db.get_at_root(b"key2", root2_offset).unwrap(), + Some(b"value2".to_vec()) + ); + assert_eq!(db.get_at_root(b"key3", root2_offset).unwrap(), None); + + // At root3: all keys exist + assert_eq!( + db.get_at_root(b"key1", root3_offset).unwrap(), + Some(b"value1".to_vec()) + ); + assert_eq!( + db.get_at_root(b"key2", root3_offset).unwrap(), + Some(b"value2".to_vec()) + ); + assert_eq!( + db.get_at_root(b"key3", root3_offset).unwrap(), + Some(b"value3".to_vec()) + ); + + // Current get() should return the latest state + assert_eq!(db.get(b"key1").unwrap(), Some(b"value1".to_vec())); + assert_eq!(db.get(b"key2").unwrap(), Some(b"value2".to_vec())); + assert_eq!(db.get(b"key3").unwrap(), Some(b"value3".to_vec())); + } } diff --git a/src/file_manager.rs b/src/file_manager.rs index b15377f..920b7d4 100644 --- a/src/file_manager.rs +++ b/src/file_manager.rs @@ -4,13 +4,23 @@ use std::fs::{File, OpenOptions}; use std::io::{Seek, SeekFrom, Write}; use std::path::PathBuf; -/// Responsible for file management and offsets +/// File management with Copy-on-Write and versioning support /// -/// File format: +/// Manages persistent storage for EthrexDB with append-only writes and +/// linked list versioning of root nodes. +/// +/// File Format: /// ```text -/// [header: 8 bytes] -> points to latest root offset -/// [serialized trie nodes...] 
+/// [header: 8 bytes] -> offset to latest root version +/// [commit 1: [prev_root_offset: 8 bytes][root_node][other_nodes]] +/// [commit 2: [prev_root_offset: 8 bytes][root_node][other_nodes]] +/// [commit N: [prev_root_offset: 8 bytes][root_node][other_nodes]] /// ``` +/// +/// Each root node is prepended with the offset of the previous root, creating +/// a linked list that allows traversal through all historical versions: +/// - First root: `prev_root_offset = 0` (end of chain) +/// - Subsequent roots: `prev_root_offset = previous_root_location` pub struct FileManager { /// File where the data is stored file: File, diff --git a/src/serialization.rs b/src/serialization.rs index 5a0ab63..90db78c 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -1,14 +1,28 @@ -//! Serialization and deserialization of the trie +//! Incremental serialization with Copy-on-Write optimization //! -//! Two-node serialization format: -//! Instead of the standard 3 node types (Branch, Extension, Leaf), we use 2: -//! - Branch: Has 16 children slots + 1 value slot -//! - Extend: Has 1 child slot + 1 value slot (can represent both Extension and Leaf) +//! ## Core Features: +//! - **Copy-on-Write (CoW)**: Only new/modified nodes are serialized +//! - **Linked List Versioning**: Each root has prepended offset to previous root +//! - **Append-Only Storage**: Data is only added, never overwritten +//! - **Node Reuse**: Existing nodes referenced by offset, not re-serialized //! -//! This simplifies serialization: -//! - Leaf -> Extend with value but no child (child_offset = 0) -//! - Extension -> Extend with child but no value (value_offset = 0) -//! - Branch -> Branch (unchanged) +//! ## Two-Node Serialization Format: +//! Instead of standard 3 node types (Branch, Extension, Leaf), we use 2: +//! - **Branch**: 16 children slots + 1 value slot +//! - **Extend**: 1 child slot + 1 value slot (represents both Extension and Leaf) +//! +//! Node type mapping: +//! - Leaf โ†’ Extend with value but no child (child_offset = 0) +//! - Extension โ†’ Extend with child but no value (value_offset = 0) +//! - Branch โ†’ Branch (unchanged) +//! +//! ## File Structure: +//! ```text +//! [header: 8 bytes] -> offset to latest root +//! [commit 1: [prev_root_offset: 8 bytes][root_node][other_nodes]] +//! [commit 2: [prev_root_offset: 8 bytes][root_node][other_nodes]] +//! [commit N: [prev_root_offset: 8 bytes][root_node][other_nodes]] +//! 
``` use std::collections::HashMap; use std::sync::{Arc, OnceLock}; @@ -36,7 +50,7 @@ pub struct Serializer { impl Serializer { /// Create a new incremental serializer with existing node index - pub fn new_incremental(node_index: &HashMap, base_offset: u64) -> Self { + pub fn new(node_index: &HashMap, base_offset: u64) -> Self { Self { buffer: Vec::new(), node_index: node_index.clone(), @@ -46,9 +60,20 @@ impl Serializer { } /// Serializes a trie incrementally, only storing new nodes - pub fn serialize_tree_incremental(mut self, root: &Node) -> SerializationResult { - let root_offset = self.serialize_node(root)?; - Ok((self.buffer, self.new_nodes, root_offset)) + /// Always prepends the previous root offset (0 for first root) + pub fn serialize_tree(mut self, root: &Node, prev_root_offset: u64) -> SerializationResult { + // Store where the root structure starts (including prepended offset) + let root_structure_offset = self.base_offset + self.buffer.len() as u64; + + // Always prepend the previous root offset (0 for first root) + self.buffer + .extend_from_slice(&prev_root_offset.to_le_bytes()); + + // Serialize the actual root node + self.serialize_node(root)?; + + // Return the offset to the start of the root structure (with prepended offset) + Ok((self.buffer, self.new_nodes, root_structure_offset)) } /// Serializes a node, checking CoW first @@ -409,11 +434,11 @@ mod test { trie.insert(b"key".to_vec(), b"value".to_vec()).unwrap(); let root = trie.root_node().unwrap().unwrap(); - let serializer = Serializer::new_incremental(&HashMap::new(), 0); - let (buffer, _, _) = serializer.serialize_tree_incremental(&root).unwrap(); + let serializer = Serializer::new(&HashMap::new(), 0); + let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); let deserializer = Deserializer::new(&buffer); - let recovered = deserializer.decode_node_at(0).unwrap(); + let recovered = deserializer.decode_node_at(8).unwrap(); // Skip 8-byte prev root offset assert_eq!(root, recovered); } From ceeb241f1802b6136537ecc817a5febe1dbf0657 Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Tue, 19 Aug 2025 10:01:37 -0300 Subject: [PATCH 05/27] test(fmanager): add some asserts checking the file size --- src/db.rs | 68 ++++++++++++++++++++++----------------------- src/file_manager.rs | 7 +++++ 2 files changed, 41 insertions(+), 34 deletions(-) diff --git a/src/db.rs b/src/db.rs index dfe48cc..f4373df 100644 --- a/src/db.rs +++ b/src/db.rs @@ -51,7 +51,7 @@ impl EthrexDB { self.file_manager.write_at_end(&serialized_data)?; - // Update node index with new node offsets (they are already absolute) + // Update node index with new node offsets for (hash, absolute_offset) in new_offsets { self.node_index.insert(hash, absolute_offset); } @@ -114,6 +114,39 @@ mod tests { use super::*; use tempdir::TempDir; + // Helper function to generate test data + fn generate_test_data(n: usize) -> Vec<(Vec, Vec)> { + use sha3::{Digest, Keccak256}; + + (1..=n) + .map(|i| { + // 32-byte key (hash) + let key = Keccak256::new() + .chain_update(i.to_be_bytes()) + .finalize() + .to_vec(); + + // 104-byte value (account info: 2 hashes + u256 + u64) + let mut value = Vec::with_capacity(104); + value.extend_from_slice( + &Keccak256::new() + .chain_update((i * 2).to_be_bytes()) + .finalize(), + ); + value.extend_from_slice( + &Keccak256::new() + .chain_update((i * 3).to_be_bytes()) + .finalize(), + ); + value.extend_from_slice(&[0u8; 24]); // u256 padding + value.extend_from_slice(&(i as u64).to_be_bytes()); // u256 value + 
value.extend_from_slice(&(i as u64).to_be_bytes()); // u64 + + (key, value) + }) + .collect() + } + #[test] fn test_create_and_commit() { let temp_dir = TempDir::new("ethrex_db_test").unwrap(); @@ -327,39 +360,6 @@ mod tests { } } - // Helper function to generate test data (like the benchmark) - fn generate_test_data(n: usize) -> Vec<(Vec, Vec)> { - use sha3::{Digest, Keccak256}; - - (1..=n) - .map(|i| { - // 32-byte key (hash) - let key = Keccak256::new() - .chain_update(i.to_be_bytes()) - .finalize() - .to_vec(); - - // 104-byte value (account info: 2 hashes + u256 + u64) - let mut value = Vec::with_capacity(104); - value.extend_from_slice( - &Keccak256::new() - .chain_update((i * 2).to_be_bytes()) - .finalize(), - ); - value.extend_from_slice( - &Keccak256::new() - .chain_update((i * 3).to_be_bytes()) - .finalize(), - ); - value.extend_from_slice(&[0u8; 24]); // u256 padding - value.extend_from_slice(&(i as u64).to_be_bytes()); // u256 value - value.extend_from_slice(&(i as u64).to_be_bytes()); // u64 - - (key, value) - }) - .collect() - } - #[test] fn test_blockchain_simulation_with_incremental_storage() { let temp_dir = TempDir::new("ethrex_blockchain_sim").unwrap(); diff --git a/src/file_manager.rs b/src/file_manager.rs index 920b7d4..b32de5b 100644 --- a/src/file_manager.rs +++ b/src/file_manager.rs @@ -161,8 +161,14 @@ mod tests { { let mut fm = FileManager::create(file_path.clone()).unwrap(); + assert_eq!( + fm.get_file_size().unwrap(), + 8, + "File is empty but should have 8 bytes for the header" + ); fm.update_latest_root_offset(456).unwrap(); fm.write_at_end(b"persistent data").unwrap(); + assert_ne!(fm.get_file_size().unwrap(), 8); } let fm = FileManager::open(file_path).unwrap(); @@ -170,5 +176,6 @@ mod tests { let data = fm.get_slice_to_end(8).unwrap().to_vec(); assert_eq!(data, b"persistent data"); + assert_ne!(fm.get_file_size().unwrap(), 8); } } From b75cc216bf6071c929c910a28c60df950197ffc8 Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Tue, 19 Aug 2025 10:09:37 -0300 Subject: [PATCH 06/27] refactor(db): remove root node history --- src/db.rs | 97 ++----------------------------------------------------- 1 file changed, 2 insertions(+), 95 deletions(-) diff --git a/src/db.rs b/src/db.rs index f4373df..87aff9c 100644 --- a/src/db.rs +++ b/src/db.rs @@ -12,8 +12,6 @@ pub struct EthrexDB { file_manager: FileManager, /// Index mapping node hashes to their file offsets node_index: HashMap, - /// List of root offsets in chronological order (for linked list) - root_history: Vec, } impl EthrexDB { @@ -23,18 +21,16 @@ impl EthrexDB { Ok(Self { file_manager, node_index: HashMap::new(), - root_history: Vec::new(), }) } /// Open an existing database pub fn open(file_path: PathBuf) -> Result { let file_manager = FileManager::open(file_path)?; - // TODO: Load node_index and root_history from file + // TODO: Load node_index from file Ok(Self { file_manager, node_index: HashMap::new(), - root_history: Vec::new(), }) } @@ -58,9 +54,6 @@ impl EthrexDB { // Update header to point to the root node self.file_manager.update_latest_root_offset(root_offset)?; - - self.root_history.push(root_offset); - Ok(root_hash) } @@ -84,24 +77,11 @@ impl EthrexDB { if latest_offset == 0 { return Ok(None); } - self.get_at_root(key, latest_offset) - } - - /// Get the list of all root versions in chronological order (oldest first) - pub fn get_root_history(&self) -> Result, TrieError> { - Ok(self.root_history.clone()) - } - - /// Get a value from a specific root - pub fn get_at_root(&self, key: &[u8], 
root_offset: u64) -> Result>, TrieError> { - if root_offset == 0 { - return Ok(None); - } let file_data = self.file_manager.get_slice_to_end(0)?; // All roots have 8-byte prepended previous root offset - let actual_root_offset = root_offset + 8; + let actual_root_offset = latest_offset + 8; Deserializer::new(file_data).get_by_path_at(key, actual_root_offset as usize) } @@ -562,77 +542,4 @@ mod tests { } } } - - #[test] - fn test_root_history_linked_list() { - let temp_dir = TempDir::new("ethrex_db_test").unwrap(); - let db_path = temp_dir.path().join("test.db"); - let mut db = EthrexDB::new(db_path).unwrap(); - - // Initially no roots - assert_eq!(db.get_root_history().unwrap(), vec![]); - - // Commit 3 different states - let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - - // State 1 - trie.insert(b"key1".to_vec(), b"value1".to_vec()).unwrap(); - let root1 = trie.root_node().unwrap().unwrap(); - db.commit(&root1).unwrap(); - - // State 2 - trie.insert(b"key2".to_vec(), b"value2".to_vec()).unwrap(); - let root2 = trie.root_node().unwrap().unwrap(); - db.commit(&root2).unwrap(); - - // State 3 - trie.insert(b"key3".to_vec(), b"value3".to_vec()).unwrap(); - let root3 = trie.root_node().unwrap().unwrap(); - db.commit(&root3).unwrap(); - - // Get root history (should be in chronological order) - let history = db.get_root_history().unwrap(); - assert_eq!(history.len(), 3); - - // Verify we can read from each historical root - let (root1_offset, root2_offset, root3_offset) = (history[0], history[1], history[2]); - - // At root1: only key1 exists - assert_eq!( - db.get_at_root(b"key1", root1_offset).unwrap(), - Some(b"value1".to_vec()) - ); - assert_eq!(db.get_at_root(b"key2", root1_offset).unwrap(), None); - assert_eq!(db.get_at_root(b"key3", root1_offset).unwrap(), None); - - // At root2: key1 and key2 exist - assert_eq!( - db.get_at_root(b"key1", root2_offset).unwrap(), - Some(b"value1".to_vec()) - ); - assert_eq!( - db.get_at_root(b"key2", root2_offset).unwrap(), - Some(b"value2".to_vec()) - ); - assert_eq!(db.get_at_root(b"key3", root2_offset).unwrap(), None); - - // At root3: all keys exist - assert_eq!( - db.get_at_root(b"key1", root3_offset).unwrap(), - Some(b"value1".to_vec()) - ); - assert_eq!( - db.get_at_root(b"key2", root3_offset).unwrap(), - Some(b"value2".to_vec()) - ); - assert_eq!( - db.get_at_root(b"key3", root3_offset).unwrap(), - Some(b"value3".to_vec()) - ); - - // Current get() should return the latest state - assert_eq!(db.get(b"key1").unwrap(), Some(b"value1".to_vec())); - assert_eq!(db.get(b"key2").unwrap(), Some(b"value2".to_vec())); - assert_eq!(db.get(b"key3").unwrap(), Some(b"value3".to_vec())); - } } From 6ba626772e4ed42576ad0036bc255ac401a0ab68 Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Tue, 19 Aug 2025 10:21:49 -0300 Subject: [PATCH 07/27] test(serialization): update test with new version of serialization --- src/serialization.rs | 326 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 323 insertions(+), 3 deletions(-) diff --git a/src/serialization.rs b/src/serialization.rs index 90db78c..3bad2d9 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -413,10 +413,13 @@ impl<'a> Deserializer<'a> { } #[cfg(test)] -mod test { +mod tests { use super::*; use crate::trie::{InMemoryTrieDB, Trie, node_hash::NodeHash}; + /// Offset to skip the prepended previous root offset (8 bytes) + const ROOT_DATA_OFFSET: usize = 8; + fn new_temp() -> Trie { use std::collections::HashMap; use std::sync::Arc; @@ -429,16 +432,333 @@ mod test { } 
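The `ROOT_DATA_OFFSET` constant above exists because `serialize_tree` prepends the 8-byte previous-root offset before the root node, so with `base_offset = 0` the root's first byte lands at index 8. A tiny sketch of that framing (the tag value is a stand-in, not the crate's real constant), which is why the tests below decode at offset 8:

```rust
fn main() {
    let prev_root_offset: u64 = 0; // first commit: end of the version chain
    let mut commit_buffer: Vec<u8> = Vec::new();
    commit_buffer.extend_from_slice(&prev_root_offset.to_le_bytes());
    commit_buffer.push(1); // stand-in tag byte for the root node
    // ...the rest of the root node and any new child nodes would follow...

    assert_eq!(commit_buffer[8], 1); // root node starts right after the 8-byte link
}
```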
#[test] - fn test_simple_serialization() { + fn test_serialize_leaf() { + let leaf = Node::Leaf(LeafNode { + partial: Nibbles::from_hex(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5]), + value: b"long_path_value".to_vec(), + }); + + let serializer = Serializer::new(&HashMap::new(), 0); + let (buffer, _, _) = serializer.serialize_tree(&leaf, 0).unwrap(); + + let deserializer = Deserializer::new(&buffer); + let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); + + assert_eq!(leaf, recovered); + } + + #[test] + fn test_serialize_deserialize_branch_empty() { + let branch = Node::Branch(Box::new(BranchNode { + choices: Default::default(), + value: vec![], + })); + + let serializer = Serializer::new(&HashMap::new(), 0); + let (buffer, _, _) = serializer.serialize_tree(&branch, 0).unwrap(); + + let deserializer = Deserializer::new(&buffer); + let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); + + assert_eq!(branch, recovered); + } + + #[test] + fn test_serialize_deserialize_tree_extension_to_leaf() { + let leaf = Node::Leaf(LeafNode { + partial: Nibbles::from_hex(vec![5, 6, 7]), + value: b"nested_leaf".to_vec(), + }); + + let ext = Node::Extension(ExtensionNode { + prefix: Nibbles::from_hex(vec![1, 2]), + child: NodeRef::Node(Arc::new(leaf), OnceLock::new()), + }); + + let serializer = Serializer::new(&HashMap::new(), 0); + let (buffer, _, _) = serializer.serialize_tree(&ext, 0).unwrap(); + + let deserializer = Deserializer::new(&buffer); + let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); + + assert_eq!(recovered, ext); + match recovered { + Node::Extension(ext_node) => { + assert_eq!(ext_node.prefix, Nibbles::from_hex(vec![1, 2])); + match &ext_node.child { + NodeRef::Node(arc_node, _) => match &**arc_node { + Node::Leaf(leaf_node) => { + assert_eq!(leaf_node.partial, Nibbles::from_hex(vec![5, 6, 7])); + assert_eq!(leaf_node.value, b"nested_leaf"); + } + _ => panic!("Expected leaf node"), + }, + _ => panic!("Expected embedded node"), + } + } + _ => panic!("Expected extension node"), + } + } + + #[test] + fn test_serialize_deserialize_deep_tree() { + let leaf = Node::Leaf(LeafNode { + partial: Nibbles::from_hex(vec![9, 8]), + value: b"deep_leaf".to_vec(), + }); + + let inner_ext = Node::Extension(ExtensionNode { + prefix: Nibbles::from_hex(vec![5, 6]), + child: NodeRef::Node(Arc::new(leaf), OnceLock::new()), + }); + + let mut branch_choices: [NodeRef; 16] = Default::default(); + branch_choices[2] = NodeRef::Node(Arc::new(inner_ext), OnceLock::new()); + + let branch = Node::Branch(Box::new(BranchNode { + choices: branch_choices, + value: vec![], + })); + + let outer_ext = Node::Extension(ExtensionNode { + prefix: Nibbles::from_hex(vec![1, 2, 3]), + child: NodeRef::Node(Arc::new(branch), OnceLock::new()), + }); + + let serializer = Serializer::new(&HashMap::new(), 0); + let (buffer, _, _) = serializer.serialize_tree(&outer_ext, 0).unwrap(); + + let deserializer = Deserializer::new(&buffer); + let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); + + assert_eq!(recovered, outer_ext); + } + + #[test] + fn test_trie_serialization_empty() { + let trie = new_temp(); + let root = trie.root_node().unwrap(); + assert!(root.is_none()); + } + + #[test] + fn test_trie_serialization_single_insert() { let mut trie = new_temp(); trie.insert(b"key".to_vec(), b"value".to_vec()).unwrap(); let root = trie.root_node().unwrap().unwrap(); let serializer = Serializer::new(&HashMap::new(), 0); let (buffer, _, _) = 
serializer.serialize_tree(&root, 0).unwrap(); + let deserializer = Deserializer::new(&buffer); + let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); + + assert_eq!(root, recovered); + } + + #[test] + fn test_trie_serialization_multiple_inserts() { + let mut trie = new_temp(); + + let test_data = vec![ + (b"do".to_vec(), b"verb".to_vec()), + (b"dog".to_vec(), b"puppy".to_vec()), + (b"doge".to_vec(), b"coin".to_vec()), + (b"horse".to_vec(), b"stallion".to_vec()), + ]; + + for (key, value) in &test_data { + trie.insert(key.clone(), value.clone()).unwrap(); + } + + let root = trie.root_node().unwrap().unwrap(); + let serializer = Serializer::new(&HashMap::new(), 0); + let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); + let deserializer = Deserializer::new(&buffer); + let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); + + assert_eq!(recovered, root); + } + + #[test] + fn test_file_io() { + use std::fs; + + // Create trie + let mut trie = new_temp(); + trie.insert(b"file_key".to_vec(), b"file_value".to_vec()) + .unwrap(); + + // Serialize to file + let root = trie.root_node().unwrap().unwrap(); + let serializer = Serializer::new(&HashMap::new(), 0); + let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); + + let path = "/tmp/test_trie.mpt"; + fs::write(path, &buffer).unwrap(); + + // Read from file and deserialize + let read_data = fs::read(path).unwrap(); + let deserializer = Deserializer::new(&read_data); + let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); + + assert_eq!(root, recovered); + fs::remove_file(path).unwrap(); + } + + #[test] + fn test_get_by_path_serialized_simple() { + let mut trie = new_temp(); + trie.insert(b"test".to_vec(), b"value".to_vec()).unwrap(); + + let root = trie.root_node().unwrap().unwrap(); + let serializer = Serializer::new(&HashMap::new(), 0); + let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); + + let deserializer = Deserializer::new(&buffer); + assert_eq!( + deserializer.get_by_path_at(b"test", ROOT_DATA_OFFSET).unwrap(), + Some(b"value".to_vec()) + ); + + let deserializer = Deserializer::new(&buffer); + let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); + assert_eq!(root, recovered); + } + + #[test] + fn test_get_by_path_serialized() { + let mut trie = new_temp(); + + let test_data = vec![ + (b"do".to_vec(), b"verb".to_vec()), + (b"dog".to_vec(), b"puppy".to_vec()), + (b"doge".to_vec(), b"coin".to_vec()), + (b"horse".to_vec(), b"stallion".to_vec()), + ]; + + for (key, value) in &test_data { + trie.insert(key.clone(), value.clone()).unwrap(); + } + + let root = trie.root_node().unwrap().unwrap(); + let serializer = Serializer::new(&HashMap::new(), 0); + let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); + + let deserializer = Deserializer::new(&buffer); + assert_eq!( + deserializer.get_by_path_at(b"horse", ROOT_DATA_OFFSET).unwrap(), + Some(b"stallion".to_vec()) + ); + + let deserializer = Deserializer::new(&buffer); + assert_eq!( + deserializer.get_by_path_at(b"dog", ROOT_DATA_OFFSET).unwrap(), + Some(b"puppy".to_vec()) + ); + + let deserializer = Deserializer::new(&buffer); + assert_eq!( + deserializer.get_by_path_at(b"doge", ROOT_DATA_OFFSET).unwrap(), + Some(b"coin".to_vec()) + ); + + let deserializer = Deserializer::new(&buffer); + assert_eq!( + deserializer.get_by_path_at(b"do", ROOT_DATA_OFFSET).unwrap(), + Some(b"verb".to_vec()) + ); + + let deserializer = Deserializer::new(&buffer); + 
assert_eq!(deserializer.get_by_path_at(b"cat", ROOT_DATA_OFFSET).unwrap(), None); + + let deserializer = Deserializer::new(&buffer); + assert_eq!(deserializer.get_by_path_at(b"", ROOT_DATA_OFFSET).unwrap(), None); + + // Reset position before decoding tree + let deserializer = Deserializer::new(&buffer); + let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); + assert_eq!(root, recovered); + } + + #[test] + fn test_complex_trie_serialization() { + let mut trie = new_temp(); + + let test_data = vec![ + (b"app".to_vec(), b"application".to_vec()), + (b"apple".to_vec(), b"fruit".to_vec()), + (b"application".to_vec(), b"software".to_vec()), + (b"append".to_vec(), b"add_to_end".to_vec()), + (b"applied".to_vec(), b"past_tense".to_vec()), + (b"car".to_vec(), b"vehicle".to_vec()), + (b"card".to_vec(), b"playing_card".to_vec()), + (b"care".to_vec(), b"attention".to_vec()), + (b"career".to_vec(), b"profession".to_vec()), + (b"careful".to_vec(), b"cautious".to_vec()), + (b"test".to_vec(), b"examination".to_vec()), + (b"testing".to_vec(), b"verification".to_vec()), + (b"tester".to_vec(), b"one_who_tests".to_vec()), + (b"testament".to_vec(), b"will_document".to_vec()), + (b"a".to_vec(), b"letter_a".to_vec()), + (b"b".to_vec(), b"letter_b".to_vec()), + (b"c".to_vec(), b"letter_c".to_vec()), + (b"d".to_vec(), b"letter_d".to_vec()), + (b"e".to_vec(), b"letter_e".to_vec()), + (b"0x123456".to_vec(), b"hex_value_1".to_vec()), + (b"0x123abc".to_vec(), b"hex_value_2".to_vec()), + (b"0x124000".to_vec(), b"hex_value_3".to_vec()), + (b"0xabcdef".to_vec(), b"hex_value_4".to_vec()), + ( + b"very_long_key_that_creates_deep_structure_in_trie_1234567890".to_vec(), + b"long_value_1".to_vec(), + ), + ( + b"very_long_key_that_creates_deep_structure_in_trie_abcdefghijk".to_vec(), + b"long_value_2".to_vec(), + ), + (b"empty_value_key".to_vec(), vec![]), + (b"similar_key_1".to_vec(), b"value_1".to_vec()), + (b"similar_key_2".to_vec(), b"value_2".to_vec()), + (b"similar_key_3".to_vec(), b"value_3".to_vec()), + (b"123".to_vec(), b"number_123".to_vec()), + (b"1234".to_vec(), b"number_1234".to_vec()), + (b"12345".to_vec(), b"number_12345".to_vec()), + ]; + + for (key, value) in &test_data { + trie.insert(key.clone(), value.clone()).unwrap(); + } + + let root = trie.root_node().unwrap().unwrap(); + + let serializer = Serializer::new(&HashMap::new(), 0); + let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); + + for (key, expected_value) in &test_data { + let deserializer = Deserializer::new(&buffer); + let retrieved_value = deserializer.get_by_path_at(key, ROOT_DATA_OFFSET).unwrap(); + assert_eq!(retrieved_value, Some(expected_value.clone())); + } + + let non_existent_keys = vec![ + b"nonexistent".to_vec(), + b"app_wrong".to_vec(), + b"car_wrong".to_vec(), + b"test_wrong".to_vec(), + b"0x999999".to_vec(), + b"similar_key_4".to_vec(), + b"".to_vec(), + b"very_long_nonexistent_key".to_vec(), + ]; + + for key in &non_existent_keys { + let deserializer = Deserializer::new(&buffer); + let result = deserializer.get_by_path_at(key, ROOT_DATA_OFFSET).unwrap(); + assert_eq!(result, None); + } let deserializer = Deserializer::new(&buffer); - let recovered = deserializer.decode_node_at(8).unwrap(); // Skip 8-byte prev root offset + let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); assert_eq!(root, recovered); } From 2cef7044334915bf4c7accea95dba9ca1dc836b9 Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Tue, 19 Aug 2025 14:43:19 -0300 Subject: [PATCH 08/27] test(db,serialization): 
improve test and remove duplicates --- src/db.rs | 144 +++---------------------------------------- src/serialization.rs | 40 ++++++------ 2 files changed, 29 insertions(+), 155 deletions(-) diff --git a/src/db.rs b/src/db.rs index 87aff9c..f6dc73f 100644 --- a/src/db.rs +++ b/src/db.rs @@ -140,7 +140,7 @@ mod tests { let root_node = trie.root_node().unwrap().unwrap(); let root_hash = db.commit(&root_node).unwrap(); - assert!(root_hash.as_ref() != [0u8; 32]); + assert_ne!(root_hash.as_ref(), [0u8; 32]); } #[test] @@ -189,29 +189,6 @@ mod tests { assert_eq!(db.get(b"nonexistent").unwrap(), None); } - #[test] - fn test_simple_serialization_debug() { - let temp_dir = TempDir::new("ethrex_db_simple_test").unwrap(); - let db_path = temp_dir.path().join("simple.edb"); - - let mut db = EthrexDB::new(db_path.clone()).unwrap(); - let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - - // Simple test with just one key - trie.insert(b"key".to_vec(), b"value".to_vec()).unwrap(); - let root_node = trie.root_node().unwrap().unwrap(); - - // Commit to DB - db.commit(&root_node).unwrap(); - - // Read back from DB - assert_eq!(db.root().unwrap(), root_node); - - // Test that we can read the value - let value = db.get(b"key").unwrap(); - assert_eq!(value, Some(b"value".to_vec())); - } - #[test] fn test_incremental_commit() { let temp_dir = TempDir::new("ethrex_db_test").unwrap(); @@ -268,78 +245,6 @@ mod tests { assert_eq!(db.get(b"key3").unwrap(), Some(b"value3".to_vec())); } - #[test] - fn test_complex_db_operations() { - let temp_dir = TempDir::new("ethrex_db_complex_test").unwrap(); - let db_path = temp_dir.path().join("complex_test.edb"); - - let test_data_v1 = vec![ - (b"app".to_vec(), b"application_v1".to_vec()), - (b"apple".to_vec(), b"fruit_v1".to_vec()), - (b"car".to_vec(), b"vehicle_v1".to_vec()), - (b"test".to_vec(), b"examination_v1".to_vec()), - (b"0x123456".to_vec(), b"hex_value_v1".to_vec()), - ]; - - let test_data_v2 = vec![ - (b"app".to_vec(), b"application_v2".to_vec()), - (b"apple".to_vec(), b"fruit_v2".to_vec()), - (b"banana".to_vec(), b"fruit_new".to_vec()), - (b"car".to_vec(), b"vehicle_v2".to_vec()), - (b"bike".to_vec(), b"vehicle_new".to_vec()), // New - (b"test".to_vec(), b"examination_v2".to_vec()), - (b"0x123456".to_vec(), b"hex_value_v2".to_vec()), - (b"0xabcdef".to_vec(), b"hex_new".to_vec()), - ]; - - let mut db = EthrexDB::new(db_path.clone()).unwrap(); - - let mut trie_v1 = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - for (key, value) in &test_data_v1 { - trie_v1.insert(key.clone(), value.clone()).unwrap(); - } - let root_node = trie_v1.root_node().unwrap().unwrap(); - db.commit(&root_node).unwrap(); - - let mut trie_v2 = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - for (key, value) in &test_data_v2 { - trie_v2.insert(key.clone(), value.clone()).unwrap(); - } - let root_node = trie_v2.root_node().unwrap().unwrap(); - db.commit(&root_node).unwrap(); - - for (key, expected_value) in &test_data_v2 { - let result = db.get(key).unwrap(); - assert_eq!(result, Some(expected_value.clone())); - } - - assert_eq!(db.get(b"nonexistent").unwrap(), None); - - let complex_test_data = vec![ - ( - b"very_long_key_with_complex_structure_123456789".to_vec(), - b"complex_value".to_vec(), - ), - (b"short".to_vec(), b"val".to_vec()), - (b"".to_vec(), b"empty_key_value".to_vec()), - ]; - - let mut trie_v3 = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - for (key, value) in &test_data_v2 { - trie_v3.insert(key.clone(), value.clone()).unwrap(); - } - for (key, value) 
in &complex_test_data { - trie_v3.insert(key.clone(), value.clone()).unwrap(); - } - let root_node = trie_v3.root_node().unwrap().unwrap(); - db.commit(&root_node).unwrap(); - - for (key, expected_value) in &complex_test_data { - let result = db.get(key).unwrap(); - assert_eq!(result, Some(expected_value.clone())); - } - } - #[test] fn test_blockchain_simulation_with_incremental_storage() { let temp_dir = TempDir::new("ethrex_blockchain_sim").unwrap(); @@ -348,11 +253,7 @@ mod tests { let mut db = EthrexDB::new(db_path.clone()).unwrap(); let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - // Keep track of a persistent value from the first batch - let persistent_key = generate_test_data(1)[0].0.clone(); - let persistent_value = generate_test_data(1)[0].1.clone(); - - // Batch 1: Initial accounts (simulating genesis) + // Batch 1: Initial accounts let batch1_data = generate_test_data(100); for (key, value) in batch1_data.iter() { @@ -362,7 +263,7 @@ mod tests { let root_node1 = trie.root_node().unwrap().unwrap(); let trie_root_hash1 = root_node1.compute_hash(); let db_root_hash1 = db.commit(&root_node1).unwrap(); - trie.commit().unwrap(); // Convert to NodeRef::Hash for CoW + trie.commit().unwrap(); // Convert to NodeRef::Hash assert_eq!( trie_root_hash1, db_root_hash1, @@ -374,12 +275,6 @@ mod tests { "DB root must match trie root after batch 1" ); - // Verify persistent value exists - assert_eq!( - db.get(&persistent_key).unwrap(), - Some(persistent_value.clone()) - ); - // Batch 2: New transactions + modify some existing accounts let new_accounts_batch2 = generate_test_data(150); @@ -400,7 +295,7 @@ mod tests { let root_node2 = trie.root_node().unwrap().unwrap(); let trie_root_hash2 = root_node2.compute_hash(); let db_root_hash2 = db.commit(&root_node2).unwrap(); - trie.commit().unwrap(); + trie.commit().unwrap(); // Convert to NodeRef::Hash assert_eq!( trie_root_hash2, db_root_hash2, @@ -411,10 +306,6 @@ mod tests { root_node2, "DB root must match trie root after batch 2" ); - assert_eq!( - db.get(&persistent_key).unwrap(), - Some(persistent_value.clone()) - ); // Batch 3: More transactions let new_accounts_batch3 = generate_test_data(200); @@ -437,7 +328,7 @@ mod tests { let root_node3 = trie.root_node().unwrap().unwrap(); let trie_root_hash3 = root_node3.compute_hash(); let db_root_hash3 = db.commit(&root_node3).unwrap(); - trie.commit().unwrap(); + trie.commit().unwrap(); // Convert to NodeRef::Hash assert_eq!( trie_root_hash3, db_root_hash3, @@ -448,10 +339,6 @@ mod tests { root_node3, "DB root must match trie root after batch 3" ); - assert_eq!( - db.get(&persistent_key).unwrap(), - Some(persistent_value.clone()) - ); // Batch 4: Large update batch let new_accounts_batch4 = generate_test_data(250); @@ -474,7 +361,7 @@ mod tests { let root_node4 = trie.root_node().unwrap().unwrap(); let trie_root_hash4 = root_node4.compute_hash(); let db_root_hash4 = db.commit(&root_node4).unwrap(); - trie.commit().unwrap(); + trie.commit().unwrap(); // Convert to NodeRef::Hash assert_eq!( trie_root_hash4, db_root_hash4, @@ -485,10 +372,6 @@ mod tests { root_node4, "DB root must match trie root after batch 4" ); - assert_eq!( - db.get(&persistent_key).unwrap(), - Some(persistent_value.clone()) - ); // Batch 5: Final verification batch let new_accounts_batch5 = generate_test_data(300); @@ -511,7 +394,7 @@ mod tests { let root_node5 = trie.root_node().unwrap().unwrap(); let trie_root_hash5 = root_node5.compute_hash(); let db_root_hash5 = db.commit(&root_node5).unwrap(); - 
trie.commit().unwrap(); + trie.commit().unwrap(); // Convert to NodeRef::Hash assert_eq!( trie_root_hash5, db_root_hash5, @@ -523,22 +406,11 @@ mod tests { "DB root must match trie root after batch 5" ); - // Final verification: The persistent value from batch 1 should still be accessible - assert_eq!( - db.get(&persistent_key).unwrap(), - Some(persistent_value.clone()), - ); - // Random verification of some accounts for batch_num in 1..=5 { let test_data = generate_test_data(batch_num * 50); if let Some((key, _)) = test_data.get(batch_num * 10) { - let db_value = db.get(key).unwrap(); - assert!( - db_value.is_some(), - "Account from batch {} should be accessible", - batch_num - ); + assert_eq!(db.get(key).unwrap(), trie.get(key).unwrap()); } } } diff --git a/src/serialization.rs b/src/serialization.rs index 3bad2d9..604cdc3 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -616,7 +616,9 @@ mod tests { let deserializer = Deserializer::new(&buffer); assert_eq!( - deserializer.get_by_path_at(b"test", ROOT_DATA_OFFSET).unwrap(), + deserializer + .get_by_path_at(b"test", ROOT_DATA_OFFSET) + .unwrap(), Some(b"value".to_vec()) ); @@ -646,36 +648,36 @@ mod tests { let deserializer = Deserializer::new(&buffer); assert_eq!( - deserializer.get_by_path_at(b"horse", ROOT_DATA_OFFSET).unwrap(), + deserializer + .get_by_path_at(b"horse", ROOT_DATA_OFFSET) + .unwrap(), Some(b"stallion".to_vec()) ); - - let deserializer = Deserializer::new(&buffer); assert_eq!( - deserializer.get_by_path_at(b"dog", ROOT_DATA_OFFSET).unwrap(), + deserializer + .get_by_path_at(b"dog", ROOT_DATA_OFFSET) + .unwrap(), Some(b"puppy".to_vec()) ); - - let deserializer = Deserializer::new(&buffer); assert_eq!( - deserializer.get_by_path_at(b"doge", ROOT_DATA_OFFSET).unwrap(), + deserializer + .get_by_path_at(b"doge", ROOT_DATA_OFFSET) + .unwrap(), Some(b"coin".to_vec()) ); - - let deserializer = Deserializer::new(&buffer); assert_eq!( - deserializer.get_by_path_at(b"do", ROOT_DATA_OFFSET).unwrap(), + deserializer + .get_by_path_at(b"do", ROOT_DATA_OFFSET) + .unwrap(), Some(b"verb".to_vec()) ); + assert_eq!( + deserializer + .get_by_path_at(b"cat", ROOT_DATA_OFFSET) + .unwrap(), + None + ); - let deserializer = Deserializer::new(&buffer); - assert_eq!(deserializer.get_by_path_at(b"cat", ROOT_DATA_OFFSET).unwrap(), None); - - let deserializer = Deserializer::new(&buffer); - assert_eq!(deserializer.get_by_path_at(b"", ROOT_DATA_OFFSET).unwrap(), None); - - // Reset position before decoding tree - let deserializer = Deserializer::new(&buffer); let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); assert_eq!(root, recovered); } From cd26df3fc1ca193287ba3333ac3646346d006ac8 Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Tue, 19 Aug 2025 16:02:08 -0300 Subject: [PATCH 09/27] docs(db, file_manager, serialization): enhance documentation --- src/db.rs | 9 ++++++ src/file_manager.rs | 45 +++++++++++++++------------ src/serialization.rs | 72 +++++++++++++++++++++++++++++--------------- 3 files changed, 83 insertions(+), 43 deletions(-) diff --git a/src/db.rs b/src/db.rs index f6dc73f..daa94fe 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,4 +1,13 @@ //! EthrexDB - Copy-on-Write Merkle Patricia Trie Database +//! +//! The database implements Copy-on-Write (CoW) optimization where only modified nodes +//! are written during commits. Unchanged nodes are referenced by their file offset, +//! avoiding duplication. All writes are append-only - data is never overwritten, +//! only appended to the end of the file. 
+//! +//! Each commit creates a new root that links to the previous root via a prepended +//! offset, forming a linked list of all historical states. This allows traversing +//! the entire version history if needed. use crate::file_manager::FileManager; use crate::serialization::{Deserializer, Serializer}; diff --git a/src/file_manager.rs b/src/file_manager.rs index b32de5b..cd12457 100644 --- a/src/file_manager.rs +++ b/src/file_manager.rs @@ -1,35 +1,42 @@ +//! File management +//! +//! The FileManager handles all low-level file operations for `EthrexDB`, implementing +//! an append-only storage strategy where data is never overwritten. All writes go +//! to the end of the file, preserving historical data and enabling version traversal. +//! +//! File Layout: +//! ```text +//! Offset 0: [header: 8 bytes] // Points to latest root offset +//! Offset 8: [commit_1_data...] // First commit's data +//! Offset X: [commit_2_data...] // Second commit's data +//! Offset Y: [commit_N_data...] // Latest commit's data +//! ``` +//! +//! The header at offset 0 always contains the offset of the most recent root. +//! This is the only part of the file that gets updated in-place. Everything else +//! is append-only. +//! +//! Each commit's data starts with an 8-byte link to the previous root offset, +//! creating a linked list through all versions: +//! - First commit: prev_root_offset = 0 (marks end of chain) +//! - Later commits: prev_root_offset = offset of previous root + use crate::trie::TrieError; use memmap2::{Mmap, MmapOptions}; use std::fs::{File, OpenOptions}; use std::io::{Seek, SeekFrom, Write}; use std::path::PathBuf; -/// File management with Copy-on-Write and versioning support -/// -/// Manages persistent storage for EthrexDB with append-only writes and -/// linked list versioning of root nodes. -/// -/// File Format: -/// ```text -/// [header: 8 bytes] -> offset to latest root version -/// [commit 1: [prev_root_offset: 8 bytes][root_node][other_nodes]] -/// [commit 2: [prev_root_offset: 8 bytes][root_node][other_nodes]] -/// [commit N: [prev_root_offset: 8 bytes][root_node][other_nodes]] -/// ``` -/// -/// Each root node is prepended with the offset of the previous root, creating -/// a linked list that allows traversal through all historical versions: -/// - First root: `prev_root_offset = 0` (end of chain) -/// - Subsequent roots: `prev_root_offset = previous_root_location` +/// File manager for `EthrexDB` pub struct FileManager { /// File where the data is stored file: File, - /// Memory-mapped of the file + /// Memory-mapped view of the file mmap: Mmap, } impl FileManager { - /// Create a new database file + /// Create a new file pub fn create(file_path: PathBuf) -> Result { if let Some(parent) = file_path.parent() { std::fs::create_dir_all(parent).unwrap(); diff --git a/src/serialization.rs b/src/serialization.rs index 604cdc3..8984756 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -1,28 +1,49 @@ -//! Incremental serialization with Copy-on-Write optimization +//! Serialization and deserialization of the trie. //! -//! ## Core Features: -//! - **Copy-on-Write (CoW)**: Only new/modified nodes are serialized -//! - **Linked List Versioning**: Each root has prepended offset to previous root -//! - **Append-Only Storage**: Data is only added, never overwritten -//! - **Node Reuse**: Existing nodes referenced by offset, not re-serialized +//! This module implements a two-node serialization format that reduces the standard +//! 
three MPT node types ([`BranchNode`], [`ExtensionNode`], [`LeafNode`]) to just two: +//! `Branch` and `Extend`. +//! The `Extend` node cleverly represents both `Extension` and `Leaf` nodes based on which fields are populated. //! -//! ## Two-Node Serialization Format: -//! Instead of standard 3 node types (Branch, Extension, Leaf), we use 2: -//! - **Branch**: 16 children slots + 1 value slot -//! - **Extend**: 1 child slot + 1 value slot (represents both Extension and Leaf) +//! File Structure: +//! Each commit in the file has the following layout: //! -//! Node type mapping: -//! - Leaf โ†’ Extend with value but no child (child_offset = 0) -//! - Extension โ†’ Extend with child but no value (value_offset = 0) -//! - Branch โ†’ Branch (unchanged) +//! ```text +//! [prev_root_offset: 8 bytes] // Links to previous root (0 for first commit) +//! [root_node_data] // Root node serialized first +//! [child_nodes_data...] // Children in depth-first order +//! ``` +//! +//! Node Serialization Format: //! -//! ## File Structure: +//! Branch Node: //! ```text -//! [header: 8 bytes] -> offset to latest root -//! [commit 1: [prev_root_offset: 8 bytes][root_node][other_nodes]] -//! [commit 2: [prev_root_offset: 8 bytes][root_node][other_nodes]] -//! [commit N: [prev_root_offset: 8 bytes][root_node][other_nodes]] +//! [tag: 1 byte = 0x00] +//! [child_offsets: 16 * 8 bytes] // Offsets to 16 possible children (0 if empty) +//! [value_offset: 8 bytes] // Offset to value data (0 if no value) //! ``` +//! +//! Extend Node (Extension or Leaf): +//! - If the value is empty and the child is not zero, it's an Extension node. +//! - If the value is not empty and the child is zero, it's a Leaf node. +//! +//! ```text +//! [tag: 1 byte = 0x01] +//! [nibbles_len: 4 bytes] +//! [nibbles_data: variable] +//! [child_offset: 8 bytes] // 0 for Leaf, valid offset for Extension +//! [value_offset: 8 bytes] // Valid offset for Leaf, 0 for Extension +//! ``` +//! +//! Copy-on-Write: +//! During [`Serializer::serialize_tree`], each node is checked against the +//! [`Serializer::node_index`]. If the node's hash already exists, its offset is +//! returned immediately without re-serialization. This means unchanged subtrees +//! are never duplicated - they're referenced by offset. +//! +//! The serialization order is depth-first, with the root always first after the +//! `prev_root_offset`. This allows offsets to be calculated during serialization +//! as children are written immediately after their parents (unless they already exist). 
use std::collections::HashMap; use std::sync::{Arc, OnceLock}; @@ -42,14 +63,18 @@ const TAG_EXTEND: u8 = 1; /// Serializes a Merkle Patricia Trie into a byte buffer using the two node format #[derive(Default)] pub struct Serializer { + /// Buffer to store the serialized data buffer: Vec, + /// Index of the nodes in the buffer node_index: HashMap, + /// Serialized nodes in this batch new_nodes: HashMap, + /// Base offset of the buffer base_offset: u64, } impl Serializer { - /// Create a new incremental serializer with existing node index + /// Create a new serializer with existing node index pub fn new(node_index: &HashMap, base_offset: u64) -> Self { Self { buffer: Vec::new(), @@ -59,10 +84,10 @@ impl Serializer { } } - /// Serializes a trie incrementally, only storing new nodes + /// Serializes a trie, only storing new nodes /// Always prepends the previous root offset (0 for first root) pub fn serialize_tree(mut self, root: &Node, prev_root_offset: u64) -> SerializationResult { - // Store where the root structure starts (including prepended offset) + // Store where the root structure starts let root_structure_offset = self.base_offset + self.buffer.len() as u64; // Always prepend the previous root offset (0 for first root) @@ -80,12 +105,10 @@ impl Serializer { fn serialize_node(&mut self, node: &Node) -> Result { let hash = node.compute_hash(); - // Check if node already exists (CoW) if let Some(&existing_offset) = self.node_index.get(&hash) { return Ok(existing_offset); } - // Check if we already serialized this node in this batch if let Some(&absolute_offset) = self.new_nodes.get(&hash) { return Ok(absolute_offset); } @@ -181,6 +204,7 @@ impl Serializer { fn serialize_noderef(&mut self, noderef: &NodeRef) -> Result { match noderef { NodeRef::Hash(hash) if hash.is_valid() => { + // Node was previously committed - must exist in index self.node_index.get(hash).copied().ok_or_else(|| { TrieError::Other(format!("Hash reference not found: {:?}", hash)) }) From 4551c8ea4710991f3b3e6665d648933094aab79a Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Wed, 20 Aug 2025 12:31:15 -0300 Subject: [PATCH 10/27] feat(index): add a HashMap to store hash and offset --- Cargo.lock | 1 + Cargo.toml | 11 ++++- src/db.rs | 15 ++++--- src/index.rs | 102 ++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 2 + src/serialization.rs | 46 +++++++++++-------- src/trie/node_hash.rs | 8 ++-- 7 files changed, 155 insertions(+), 30 deletions(-) create mode 100644 src/index.rs diff --git a/Cargo.lock b/Cargo.lock index 57b45e4..b7bb442 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -434,6 +434,7 @@ dependencies = [ "criterion", "ethereum-types", "hasher", + "hex", "hex-literal", "lazy_static", "libmdbx", diff --git a/Cargo.toml b/Cargo.toml index e2046dd..a2a8b55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,13 +15,21 @@ sha3 = "0.10.8" thiserror = "2.0.9" tinyvec = "1.6.0" +# Optional benchmark dependencies +libmdbx = { version = "=0.5.3", features = ["orm"], optional = true } +anyhow = { version = "1.0.86", optional = true } + +[features] +default = [] +libmdbx-benchmark = ["libmdbx", "anyhow"] + [dev-dependencies] anyhow = "1.0.86" cita_trie = "4.0.0" # used for proptest comparisons criterion = { version = "0.5", features = ["html_reports"] } hasher = "0.1.4" # cita_trie needs this +hex = "0.4.3" # for simple benchmark hex-literal = "0.4.1" -libmdbx = { version = "=0.5.3", features = ["orm"] } proptest = "1.0.0" rand = "0.8.5" tempdir = "0.3.7" @@ -29,6 +37,7 @@ tempdir = "0.3.7" [[bench]] name = 
"db_benchmark" harness = false +required-features = ["libmdbx-benchmark"] [profile.release-with-debug] inherits = "release" diff --git a/src/db.rs b/src/db.rs index daa94fe..cadb297 100644 --- a/src/db.rs +++ b/src/db.rs @@ -10,9 +10,9 @@ //! the entire version history if needed. use crate::file_manager::FileManager; +use crate::index::Index; use crate::serialization::{Deserializer, Serializer}; use crate::trie::{Node, NodeHash, TrieError}; -use std::collections::HashMap; use std::path::PathBuf; /// Ethrex DB struct @@ -20,26 +20,27 @@ pub struct EthrexDB { /// File manager file_manager: FileManager, /// Index mapping node hashes to their file offsets - node_index: HashMap, + node_index: Index, } impl EthrexDB { /// Create a new database pub fn new(file_path: PathBuf) -> Result { - let file_manager = FileManager::create(file_path)?; + let file_manager = FileManager::create(file_path.clone())?; + let node_index = Index::new(); Ok(Self { file_manager, - node_index: HashMap::new(), + node_index, }) } /// Open an existing database pub fn open(file_path: PathBuf) -> Result { - let file_manager = FileManager::open(file_path)?; - // TODO: Load node_index from file + let file_manager = FileManager::open(file_path.clone())?; + let node_index = Index::new(); Ok(Self { file_manager, - node_index: HashMap::new(), + node_index, }) } diff --git a/src/index.rs b/src/index.rs new file mode 100644 index 0000000..e11cae9 --- /dev/null +++ b/src/index.rs @@ -0,0 +1,102 @@ +//! Simple in-memory index for O(1) hash -> offset lookups +//! +//! This module provides a pure HashMap-based index with no persistence. +//! The index is rebuilt automatically when needed from the main database file. + +use crate::trie::NodeHash; +use std::collections::HashMap; + +/// Simple in-memory index - pure HashMap +#[derive(Debug, Default)] +pub struct Index { + /// Index map for O(1) lookups + data: HashMap, +} + +impl Index { + /// Create a new empty index + pub fn new() -> Self { + Self { + data: HashMap::new(), + } + } + + /// Get an offset by node hash + pub fn get(&self, hash: &NodeHash) -> Option { + self.data.get(hash).copied() + } + + /// Insert a new node hash -> offset mapping + pub fn insert(&mut self, hash: NodeHash, offset: u64) { + self.data.insert(hash, offset); + } + + /// Get the number of entries in the index + pub fn len(&self) -> usize { + self.data.len() + } + + /// Check if the index is empty + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } + + /// Clear all entries + pub fn clear(&mut self) { + self.data.clear(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_new_index() { + let index = Index::new(); + assert_eq!(index.len(), 0); + assert!(index.is_empty()); + } + + #[test] + fn test_insert_and_get() { + let mut index = Index::new(); + let hash = NodeHash::default(); + let offset = 1234u64; + + index.insert(hash, offset); + assert_eq!(index.get(&hash), Some(offset)); + assert_eq!(index.len(), 1); + assert!(!index.is_empty()); + } + + #[test] + fn test_multiple_hash_types() { + let mut index = Index::new(); + + // Test with different NodeHash variants + let inline_hash = NodeHash::from_slice(&[1, 2, 3]); + let hashed_hash = NodeHash::from_slice(&[0u8; 32]); + + index.insert(inline_hash, 100); + index.insert(hashed_hash, 200); + + assert_eq!(index.get(&inline_hash), Some(100)); + assert_eq!(index.get(&hashed_hash), Some(200)); + assert_eq!(index.len(), 2); + } + + #[test] + fn test_clear() { + let mut index = Index::new(); + let hash = NodeHash::default(); + + 
index.insert(hash, 123); + assert_eq!(index.len(), 1); + + index.clear(); + assert_eq!(index.len(), 0); + assert!(index.is_empty()); + assert_eq!(index.get(&hash), None); + } +} diff --git a/src/lib.rs b/src/lib.rs index d2256e6..dbed6de 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,8 @@ mod db; /// Interact with the file mod file_manager; +/// In-memory index with persistence +pub mod index; /// Serialization and deserialization of the trie. mod serialization; diff --git a/src/serialization.rs b/src/serialization.rs index 8984756..5069eb3 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -45,6 +45,7 @@ //! `prev_root_offset`. This allows offsets to be calculated during serialization //! as children are written immediately after their parents (unless they already exist). +use crate::index::Index; use std::collections::HashMap; use std::sync::{Arc, OnceLock}; @@ -61,24 +62,23 @@ const TAG_BRANCH: u8 = 0; const TAG_EXTEND: u8 = 1; /// Serializes a Merkle Patricia Trie into a byte buffer using the two node format -#[derive(Default)] -pub struct Serializer { +pub struct Serializer<'a> { /// Buffer to store the serialized data buffer: Vec, /// Index of the nodes in the buffer - node_index: HashMap, + node_index: &'a Index, /// Serialized nodes in this batch new_nodes: HashMap, /// Base offset of the buffer base_offset: u64, } -impl Serializer { +impl<'a> Serializer<'a> { /// Create a new serializer with existing node index - pub fn new(node_index: &HashMap, base_offset: u64) -> Self { + pub fn new(node_index: &'a Index, base_offset: u64) -> Self { Self { buffer: Vec::new(), - node_index: node_index.clone(), + node_index, new_nodes: HashMap::new(), base_offset, } @@ -105,7 +105,7 @@ impl Serializer { fn serialize_node(&mut self, node: &Node) -> Result { let hash = node.compute_hash(); - if let Some(&existing_offset) = self.node_index.get(&hash) { + if let Some(existing_offset) = self.node_index.get(&hash) { return Ok(existing_offset); } @@ -205,7 +205,7 @@ impl Serializer { match noderef { NodeRef::Hash(hash) if hash.is_valid() => { // Node was previously committed - must exist in index - self.node_index.get(hash).copied().ok_or_else(|| { + self.node_index.get(hash).ok_or_else(|| { TrieError::Other(format!("Hash reference not found: {:?}", hash)) }) } @@ -462,7 +462,8 @@ mod tests { value: b"long_path_value".to_vec(), }); - let serializer = Serializer::new(&HashMap::new(), 0); + let index = Index::new(); + let serializer = Serializer::new(&index, 0); let (buffer, _, _) = serializer.serialize_tree(&leaf, 0).unwrap(); let deserializer = Deserializer::new(&buffer); @@ -478,7 +479,8 @@ mod tests { value: vec![], })); - let serializer = Serializer::new(&HashMap::new(), 0); + let index = Index::new(); + let serializer = Serializer::new(&index, 0); let (buffer, _, _) = serializer.serialize_tree(&branch, 0).unwrap(); let deserializer = Deserializer::new(&buffer); @@ -499,7 +501,8 @@ mod tests { child: NodeRef::Node(Arc::new(leaf), OnceLock::new()), }); - let serializer = Serializer::new(&HashMap::new(), 0); + let index = Index::new(); + let serializer = Serializer::new(&index, 0); let (buffer, _, _) = serializer.serialize_tree(&ext, 0).unwrap(); let deserializer = Deserializer::new(&buffer); @@ -549,7 +552,8 @@ mod tests { child: NodeRef::Node(Arc::new(branch), OnceLock::new()), }); - let serializer = Serializer::new(&HashMap::new(), 0); + let index = Index::new(); + let serializer = Serializer::new(&index, 0); let (buffer, _, _) = serializer.serialize_tree(&outer_ext, 0).unwrap(); 
let deserializer = Deserializer::new(&buffer); @@ -571,7 +575,8 @@ mod tests { trie.insert(b"key".to_vec(), b"value".to_vec()).unwrap(); let root = trie.root_node().unwrap().unwrap(); - let serializer = Serializer::new(&HashMap::new(), 0); + let index = Index::new(); + let serializer = Serializer::new(&index, 0); let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); let deserializer = Deserializer::new(&buffer); let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); @@ -595,7 +600,8 @@ mod tests { } let root = trie.root_node().unwrap().unwrap(); - let serializer = Serializer::new(&HashMap::new(), 0); + let index = Index::new(); + let serializer = Serializer::new(&index, 0); let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); let deserializer = Deserializer::new(&buffer); let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); @@ -614,7 +620,8 @@ mod tests { // Serialize to file let root = trie.root_node().unwrap().unwrap(); - let serializer = Serializer::new(&HashMap::new(), 0); + let index = Index::new(); + let serializer = Serializer::new(&index, 0); let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); let path = "/tmp/test_trie.mpt"; @@ -635,7 +642,8 @@ mod tests { trie.insert(b"test".to_vec(), b"value".to_vec()).unwrap(); let root = trie.root_node().unwrap().unwrap(); - let serializer = Serializer::new(&HashMap::new(), 0); + let index = Index::new(); + let serializer = Serializer::new(&index, 0); let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); let deserializer = Deserializer::new(&buffer); @@ -667,7 +675,8 @@ mod tests { } let root = trie.root_node().unwrap().unwrap(); - let serializer = Serializer::new(&HashMap::new(), 0); + let index = Index::new(); + let serializer = Serializer::new(&index, 0); let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); let deserializer = Deserializer::new(&buffer); @@ -757,7 +766,8 @@ mod tests { let root = trie.root_node().unwrap().unwrap(); - let serializer = Serializer::new(&HashMap::new(), 0); + let index = Index::new(); + let serializer = Serializer::new(&index, 0); let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); for (key, expected_value) in &test_data { diff --git a/src/trie/node_hash.rs b/src/trie/node_hash.rs index f9b7229..70ffb14 100644 --- a/src/trie/node_hash.rs +++ b/src/trie/node_hash.rs @@ -1,6 +1,6 @@ use crate::rlp::{Encoder, RLPDecode, RLPDecodeError, RLPEncode}; use ethereum_types::H256; -#[cfg(test)] +#[cfg(feature = "libmdbx-benchmark")] use libmdbx::orm::{Decodable, Encodable}; use sha3::{Digest, Keccak256}; @@ -38,7 +38,7 @@ impl NodeHash { /// Converts a slice of an already hashed data (in case it's not inlineable) to a NodeHash. 
/// Panics if the slice is over 32 bytes /// If you need to hash it in case its len >= 32 see `from_encoded_raw` - pub(crate) fn from_slice(slice: &[u8]) -> NodeHash { + pub fn from_slice(slice: &[u8]) -> NodeHash { match slice.len() { 0..32 => { let mut buffer = [0; 31]; @@ -141,7 +141,7 @@ impl RLPDecode for NodeHash { } } -#[cfg(test)] +#[cfg(feature = "libmdbx-benchmark")] impl Encodable for NodeHash { type Encoded = Vec; @@ -150,7 +150,7 @@ impl Encodable for NodeHash { } } -#[cfg(test)] +#[cfg(feature = "libmdbx-benchmark")] impl Decodable for NodeHash { fn decode(b: &[u8]) -> anyhow::Result { Ok(NodeHash::from_slice(b)) From d97d99e963903f82cc7a68a0269a3cf19fe4b85f Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Wed, 20 Aug 2025 12:31:38 -0300 Subject: [PATCH 11/27] refactor(bench): change the logic of our current benchmark, to make it more real --- Makefile | 2 +- benches/db_benchmark.rs | 549 ++++++++++++++++++++++------------------ 2 files changed, 299 insertions(+), 252 deletions(-) diff --git a/Makefile b/Makefile index 25c8420..4676728 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ clean: ## ๐Ÿงน Remove build artifacts cargo clean bench: ## ๐Ÿ“Š Run benchmarks - cargo bench --bench db_benchmark + cargo bench --bench db_benchmark --features="libmdbx-benchmark" profile: ## ๐Ÿ” Run samply profile cargo build --profile release-with-debug --example profiling diff --git a/benches/db_benchmark.rs b/benches/db_benchmark.rs index 59dc6fb..b8b30ac 100644 --- a/benches/db_benchmark.rs +++ b/benches/db_benchmark.rs @@ -1,54 +1,110 @@ -use criterion::{BenchmarkId, Criterion, black_box, criterion_group, criterion_main}; +//! Ethereum mainnet-like comparison benchmark +//! +//! Compares EthrexDB vs LibMDBX Hash performance with: +//! - Random hash keys (like real accounts) +//! - 104-byte account info (2 hashes + u256 + u64) +//! - 1% random read samples (10x more reads) +//! 
- Multiple scales: 10k, 100k, 500k, 1M, 10M accounts + use ethrexdb::EthrexDB; use ethrexdb::trie::{InMemoryTrieDB, NodeHash, Trie, TrieDB, TrieError}; -use libmdbx::orm::{Database, Decodable, Encodable, Table, table_info}; -use libmdbx::{DatabaseOptions, Mode, PageSize, ReadWriteOptions, table}; +use libmdbx::orm::{Database, Table, table_info}; +use libmdbx::table; use rand::{seq::SliceRandom, thread_rng}; use sha3::{Digest, Keccak256}; -use std::{sync::Arc, time::Duration}; -use tempdir::TempDir; +use std::fs; +use std::marker::PhantomData; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +/// Generate realistic 32-byte hash key (like account address) +fn generate_account_hash(id: u64) -> Vec { + Keccak256::new() + .chain_update(id.to_be_bytes()) + .finalize() + .to_vec() +} + +/// Generate 104-byte account info: 2 hashes + u256 + u64 +fn generate_account_info(id: u64) -> Vec { + let mut value = Vec::with_capacity(104); + + // Storage hash (32 bytes) + value.extend_from_slice( + &Keccak256::new() + .chain_update((id * 2).to_be_bytes()) + .finalize(), + ); + + // Code hash (32 bytes) + value.extend_from_slice( + &Keccak256::new() + .chain_update((id * 3).to_be_bytes()) + .finalize(), + ); + + // Balance u256 (32 bytes) - deterministic based on id + let balance = (id as u128 % 1000) * 1_000_000_000_000_000_000u128; // ETH in wei + value.extend_from_slice(&[0u8; 16]); // High 128 bits + value.extend_from_slice(&balance.to_be_bytes()); // Low 128 bits + + // Nonce u64 (8 bytes) + value.extend_from_slice(&(id % 1000).to_be_bytes()); + + value +} + +table!( + /// Test table for benchmarks. + (TestNodes) NodeHash => Vec +); + +/// Creates a new temporary DB +fn new_db() -> Arc { + use libmdbx::{DatabaseOptions, Mode, ReadWriteOptions}; -fn create_libmdbx_db(path: std::path::PathBuf) -> Arc { let tables = [table_info!(T)].into_iter().collect(); let options = DatabaseOptions { - page_size: Some(PageSize::Set(4096)), mode: Mode::ReadWrite(ReadWriteOptions { - max_size: Some(1024 * 1024 * 1024), + max_size: Some(2 * 1024 * 1024 * 1024), // 2GB instead of default ..Default::default() }), ..Default::default() }; Arc::new( - Database::create_with_options(Some(path), options, &tables) - .expect("Failed to create LibMDBX database"), + Database::create_with_options(None, options, &tables).expect("Failed to create temp DB"), ) } -table!( - /// Hash-based table for storing trie nodes by their hash - (TestNodes) Vec => Vec -); - -// Simple TrieDB implementation for benchmarking -struct LibmdbxTrieDB { +pub struct LibmdbxTrieDB { db: Arc, + phantom: PhantomData, } -impl LibmdbxTrieDB { - fn new(db: Arc) -> Self { - Self { db } +impl LibmdbxTrieDB +where + T: Table>, +{ + pub fn new(db: Arc) -> Self { + Self { + db, + phantom: PhantomData, + } } } -impl TrieDB for LibmdbxTrieDB { +impl TrieDB for LibmdbxTrieDB +where + T: Table>, +{ fn get(&self, key: NodeHash) -> Result>, TrieError> { let txn = self .db .begin_read() .map_err(|e| TrieError::DbError(e.to_string()))?; - let key_bytes: Vec = key.into(); - txn.get::(key_bytes) + txn.get::(key) .map_err(|e| TrieError::DbError(e.to_string())) } @@ -58,267 +114,258 @@ impl TrieDB for LibmdbxTrieDB { .begin_readwrite() .map_err(|e| TrieError::DbError(e.to_string()))?; for (key, value) in key_values { - let key_bytes: Vec = key.into(); - txn.upsert::(key_bytes, value) + txn.upsert::(key, value) .map_err(|e| TrieError::DbError(e.to_string()))?; } txn.commit().map_err(|e| TrieError::DbError(e.to_string())) } } -struct LibmdbxHashDB { - trie: 
Trie, +#[derive(Debug)] +struct BenchmarkResults { + db_name: String, + total_accounts: usize, + write_time_ms: u64, + read_time_ms: u64, + reads_per_sec: f64, } -impl LibmdbxHashDB { - fn new(temp_dir: &std::path::Path) -> Self { - let db = create_libmdbx_db::(temp_dir.into()); - let trie = Trie::new(Box::new(LibmdbxTrieDB::new(db.clone()))); - Self { trie } - } - - fn insert_batch(&mut self, data: &[(Vec, Vec)]) { - for (key, value) in data { - self.trie.insert(key.clone(), value.clone()).unwrap(); +fn run_ethrex_benchmark( + accounts: &[(Vec, Vec)], + sample_keys: &[Vec], +) -> Result> { + println!("๐Ÿ”ฅ EthrexDB Benchmark"); + + let db_path = PathBuf::from("ethrex_bench.edb"); + let _ = fs::remove_file(&db_path); + + let mut db = EthrexDB::new(db_path.clone())?; + let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + + // Write performance test - batch processing like Ethereum blocks + let batch_size = 15_000; // ~Ethereum block size + let batches: Vec<_> = accounts.chunks(batch_size).collect(); + + println!(" ๐Ÿ“ Processing {} accounts in {} batches of ~{}", + accounts.len(), batches.len(), batch_size); + + let total_write_start = Instant::now(); + + for (batch_idx, batch) in batches.iter().enumerate() { + let batch_start = Instant::now(); + + // Insert batch into trie + for (key, value) in batch.iter() { + trie.insert(key.clone(), value.clone())?; + } + + // Commit batch (like block commit) + let root_node = trie.root_node()?.ok_or("No root node")?; + db.commit(&root_node)?; + trie.commit()?; // Convert to hashes for CoW efficiency + + let batch_time = batch_start.elapsed(); + if batch_idx % 10 == 0 || batch_idx == batches.len() - 1 { + println!(" Batch {}/{}: {}ms ({} accounts)", + batch_idx + 1, batches.len(), batch_time.as_millis(), batch.len()); } - self.trie.commit().unwrap(); - } - - fn get(&self, key: &[u8]) -> Option> { - self.trie.get(&key.to_vec()).unwrap() } -} + + let total_write_time = total_write_start.elapsed(); -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct PathKey(Vec); + // Read performance test + let read_start = Instant::now(); + let mut _successful_reads = 0; -impl Encodable for PathKey { - type Encoded = Vec; - fn encode(self) -> Self::Encoded { - self.0 + for key in sample_keys { + if db.get(key)?.is_some() { + _successful_reads += 1; + } } -} -impl Decodable for PathKey { - fn decode(b: &[u8]) -> anyhow::Result { - Ok(PathKey(b.to_vec())) - } + let read_time = read_start.elapsed(); + let reads_per_sec = sample_keys.len() as f64 / read_time.as_secs_f64(); + + println!( + " โœ… Write: {}ms, Read: {}ms ({:.0} reads/sec)", + total_write_time.as_millis(), + read_time.as_millis(), + reads_per_sec + ); + + // Cleanup + let _ = fs::remove_file(&db_path); + + Ok(BenchmarkResults { + db_name: "EthrexDB".to_string(), + total_accounts: accounts.len(), + write_time_ms: total_write_time.as_millis() as u64, + read_time_ms: read_time.as_millis() as u64, + reads_per_sec, + }) } -table!( - /// Path-based table for storing key-value pairs directly by path - (PathNodes) PathKey => Vec -); - -struct LibmdbxSnapshotPathDB { - db: Arc, -} - -impl LibmdbxSnapshotPathDB { - fn new(temp_dir: &std::path::Path) -> Self { - let db = create_libmdbx_db::(temp_dir.into()); - Self { db } +fn run_libmdbx_benchmark( + accounts: &[(Vec, Vec)], + sample_keys: &[Vec], +) -> Result> { + println!("๐Ÿ”ฅ LibMDBX Hash Benchmark"); + + let db: LibmdbxTrieDB = LibmdbxTrieDB::new(new_db::()); + let mut trie = Trie::new(Box::new(db)); + + // Write performance test - batch processing like 
Ethereum blocks + let batch_size = 15_000; // ~Ethereum block size + let batches: Vec<_> = accounts.chunks(batch_size).collect(); + + println!(" ๐Ÿ“ Processing {} accounts in {} batches of ~{}", + accounts.len(), batches.len(), batch_size); + + let total_write_start = Instant::now(); + + for (batch_idx, batch) in batches.iter().enumerate() { + let batch_start = Instant::now(); + + // Insert batch into trie + for (key, value) in batch.iter() { + trie.insert(key.clone(), value.clone())?; + } + + // Commit batch (like block commit) + trie.commit()?; + + let batch_time = batch_start.elapsed(); + if batch_idx % 10 == 0 || batch_idx == batches.len() - 1 { + println!(" Batch {}/{}: {}ms ({} accounts)", + batch_idx + 1, batches.len(), batch_time.as_millis(), batch.len()); + } } + + let total_write_time = total_write_start.elapsed(); + + // Read performance test + let read_start = Instant::now(); + let mut _successful_reads = 0; - fn insert_batch(&self, data: &[(Vec, Vec)]) { - let txn = self.db.begin_readwrite().unwrap(); - for (key, value) in data { - txn.upsert::(PathKey(key.clone()), value.clone()) - .unwrap(); + for key in sample_keys { + if trie.get(key)?.is_some() { + _successful_reads += 1; } - txn.commit().unwrap(); } - fn get(&self, key: &[u8]) -> Option> { - let txn = self.db.begin_read().unwrap(); - txn.get::(PathKey(key.to_vec())).unwrap() - } + let read_time = read_start.elapsed(); + let reads_per_sec = sample_keys.len() as f64 / read_time.as_secs_f64(); + + println!( + " โœ… Write: {}ms, Read: {}ms ({:.0} reads/sec)", + total_write_time.as_millis(), + read_time.as_millis(), + reads_per_sec + ); + + Ok(BenchmarkResults { + db_name: "LibMDBX Hash".to_string(), + total_accounts: accounts.len(), + write_time_ms: total_write_time.as_millis() as u64, + read_time_ms: read_time.as_millis() as u64, + reads_per_sec, + }) } -// Generate test data (key = hash, value = account info) -fn generate_test_data(n: usize) -> Vec<(Vec, Vec)> { - (1..=n) - .map(|i| { - // 32-byte key (hash) - let key = Keccak256::new() - .chain_update(i.to_be_bytes()) - .finalize() - .to_vec(); - - // 104-byte value (account info: 2 hashes + u256 + u64) - let mut value = Vec::with_capacity(104); - value.extend_from_slice( - &Keccak256::new() - .chain_update((i * 2).to_be_bytes()) - .finalize(), - ); - value.extend_from_slice( - &Keccak256::new() - .chain_update((i * 3).to_be_bytes()) - .finalize(), - ); - value.extend_from_slice(&[0u8; 24]); // u256 padding - value.extend_from_slice(&(i as u64).to_be_bytes()); // u256 value - value.extend_from_slice(&(i as u64).to_be_bytes()); // u64 +fn print_comparison_table(results: &[BenchmarkResults]) { + println!("\n๐Ÿ“Š COMPARISON TABLE"); + println!("====================================================================================="); + println!("Database Accounts Write Time Read Time Reads/Sec Read Sample"); + println!("--------- -------- ---------- --------- --------- -----------"); + + for result in results { + println!( + "{:<14} {:>8} {:>8}ms {:>7}ms {:>9.0} {:>8} keys", + result.db_name, + result.total_accounts, + result.write_time_ms, + result.read_time_ms, + result.reads_per_sec, + if result.total_accounts >= 100 { + result.total_accounts / 100 + } else { + result.total_accounts / 10 + } + ); + } + println!("====================================================================================="); +} +fn run_scale_benchmark( + total_accounts: usize, +) -> Result, Box> { + println!("\n๐Ÿ”ฅ Scale: {} accounts", total_accounts); + println!("========================"); + + 
let mut results = Vec::new(); + + // Generate all account data upfront (like mainnet snapshot) + println!("Generating {} account hashes...", total_accounts); + let gen_start = Instant::now(); + let mut accounts: Vec<(Vec, Vec)> = (0..total_accounts) + .map(|id| { + let key = generate_account_hash(id as u64); + let value = generate_account_info(id as u64); (key, value) }) - .collect() -} + .collect(); + println!("โœ… Generated in {:.2}s", gen_start.elapsed().as_secs_f64()); -fn insert_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("insert"); - group.measurement_time(Duration::from_secs(15)); - group.sample_size(10); - - for size in [1_000, 10_000, 100_000, 1_000_000] { - let data = generate_test_data(size); - - // Hash - group.bench_with_input(BenchmarkId::new("libmdbx_hash", size), &data, |b, data| { - b.iter_with_setup( - || { - let temp_dir = TempDir::new("libmdbx_hash_bench").unwrap(); - LibmdbxHashDB::new(temp_dir.path()) - }, - |mut db| { - db.insert_batch(black_box(data)); - black_box(db) - }, - ); - }); - - // Path - group.bench_with_input( - BenchmarkId::new("libmdbx_snapshot_path", size), - &data, - |b, data| { - b.iter_with_setup( - || { - let temp_dir = TempDir::new("libmdbx_path_bench").unwrap(); - LibmdbxSnapshotPathDB::new(temp_dir.path()) - }, - |db| { - db.insert_batch(black_box(data)); - black_box(db) - }, - ); - }, - ); + // Shuffle for random distribution (like real accounts) + let mut rng = thread_rng(); + accounts.shuffle(&mut rng); - // EthrexDB - group.bench_with_input(BenchmarkId::new("ethrex_db", size), &data, |b, data| { - b.iter_with_setup( - || { - let temp_dir = TempDir::new("ethrex_bench").unwrap(); - let file_path = temp_dir.path().join("test.edb"); - let db = EthrexDB::new(file_path).unwrap(); - let trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - (db, trie) - }, - |(mut db, mut trie)| { - for (key, value) in data { - trie.insert(key.clone(), value.clone()).unwrap(); - } - let root_node = trie.root_node().unwrap().unwrap(); - db.commit(&root_node).unwrap(); - db - }, - ); - }); - } + // Prepare read samples (1% for more reads) + let sample_size = (total_accounts / 100).clamp(1000, 50_000); + let mut sample_indices: Vec = (0..total_accounts).collect(); + sample_indices.shuffle(&mut rng); + let sample_keys: Vec<_> = sample_indices[..sample_size] + .iter() + .map(|&i| accounts[i].0.clone()) + .collect(); + + println!( + "๐Ÿ“Š Running benchmarks with {} read samples (1% of total)...", + sample_keys.len() + ); + + // Benchmark 1: EthrexDB + results.push(run_ethrex_benchmark(&accounts, &sample_keys)?); - group.finish(); + // Benchmark 2: LibMDBX Hash (Trie) + results.push(run_libmdbx_benchmark(&accounts, &sample_keys)?); + + // Print comparison table + print_comparison_table(&results); + + Ok(results) } -fn random_get_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("random_get"); - group.measurement_time(Duration::from_secs(15)); - group.sample_size(10); - - for size in [1_000, 10_000, 100_000, 1_000_000] { - let data = generate_test_data(size); - - let sample_size = std::cmp::max(1, size / 1000); - let mut indices: Vec = (0..size).collect(); - indices.shuffle(&mut thread_rng()); - let sample_keys: Vec<_> = indices[..sample_size] - .iter() - .map(|&i| data[i].0.clone()) - .collect(); - - let libmdbx_hash_temp = TempDir::new("libmdbx_hash_read").unwrap(); - let mut libmdbx_hash_db = LibmdbxHashDB::new(libmdbx_hash_temp.path()); - libmdbx_hash_db.insert_batch(&data); - - let libmdbx_path_temp = 
TempDir::new("libmdbx_path_read").unwrap(); - let libmdbx_path_db = LibmdbxSnapshotPathDB::new(libmdbx_path_temp.path()); - libmdbx_path_db.insert_batch(&data); - - let ethrex_temp = TempDir::new("ethrex_read").unwrap(); - let ethrex_file = ethrex_temp.path().join("test.edb"); - let mut ethrex_db = EthrexDB::new(ethrex_file.clone()).unwrap(); - let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - for (key, value) in &data { - trie.insert(key.clone(), value.clone()).unwrap(); - } - let root_node = trie.root_node().unwrap().unwrap(); - ethrex_db.commit(&root_node).unwrap(); - - group.bench_with_input( - BenchmarkId::new("libmdbx_hash", size), - &sample_keys, - |b, keys| { - b.iter(|| { - let mut found = 0; - for key in keys { - if libmdbx_hash_db.get(black_box(key)).is_some() { - found += 1; - } - } - black_box(found) - }); - }, - ); +fn main() -> Result<(), Box> { + println!("๐Ÿš€ EthrexDB vs LibMDBX Mainnet Benchmark"); + println!("========================================"); + println!("Simulating Ethereum account storage patterns"); + println!("Comparing EthrexDB vs LibMDBX Hash (Trie) performance"); - group.bench_with_input( - BenchmarkId::new("libmdbx_snapshot_path", size), - &sample_keys, - |b, keys| { - b.iter(|| { - let mut found = 0; - for key in keys { - if libmdbx_path_db.get(black_box(key)).is_some() { - found += 1; - } - } - black_box(found) - }); - }, - ); + // Multiple scales with more reads (1% sample = 10x more reads than before) + let scales = [10_000, 100_000, 500_000, 1_000_000, 10_000_000]; + let mut all_results = Vec::new(); - group.bench_with_input( - BenchmarkId::new("ethrex_db", size), - &sample_keys, - |b, keys| { - b.iter_with_setup( - || EthrexDB::open(ethrex_file.clone()).unwrap(), - |db| { - let mut found = 0; - for key in keys { - if db.get(black_box(key)).unwrap().is_some() { - found += 1; - } - } - black_box(found) - }, - ); - }, - ); + for &scale in &scales { + let results = run_scale_benchmark(scale)?; + all_results.extend(results); } - group.finish(); -} + println!("\n๐ŸŽฏ FINAL SUMMARY"); + println!("================="); + println!("All benchmarks completed with 1% random read samples (10x more reads than before)"); + println!("EthrexDB: mmap + CoW trie with pure HashMap index"); + println!("LibMDBX: LMDB-based persistent trie storage"); -criterion_group!(benches, insert_benchmark, random_get_benchmark,); -criterion_main!(benches); + Ok(()) +} From c481b4392481af98aff4d1f025aec80c59414afc Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Wed, 20 Aug 2025 15:31:18 -0300 Subject: [PATCH 12/27] refactor(bench): update benchmark logic with new features --- benches/db_benchmark.rs | 219 ++++++++++++++++++---------------------- src/db.rs | 2 +- src/index.rs | 9 +- 3 files changed, 103 insertions(+), 127 deletions(-) diff --git a/benches/db_benchmark.rs b/benches/db_benchmark.rs index b8b30ac..25038b1 100644 --- a/benches/db_benchmark.rs +++ b/benches/db_benchmark.rs @@ -4,7 +4,7 @@ //! - Random hash keys (like real accounts) //! - 104-byte account info (2 hashes + u256 + u64) //! - 1% random read samples (10x more reads) -//! - Multiple scales: 10k, 100k, 500k, 1M, 10M accounts +//! 
- Multiple scales: 10k, 100k, 500k and 1M accounts use ethrexdb::EthrexDB; use ethrexdb::trie::{InMemoryTrieDB, NodeHash, Trie, TrieDB, TrieError}; @@ -60,21 +60,22 @@ table!( (TestNodes) NodeHash => Vec ); -/// Creates a new temporary DB -fn new_db() -> Arc { +/// Create a libmdbx database with a specific path +fn new_db_with_path(path: PathBuf) -> Arc { use libmdbx::{DatabaseOptions, Mode, ReadWriteOptions}; let tables = [table_info!(T)].into_iter().collect(); let options = DatabaseOptions { mode: Mode::ReadWrite(ReadWriteOptions { - max_size: Some(2 * 1024 * 1024 * 1024), // 2GB instead of default + max_size: Some(2 * 1024 * 1024 * 1024), ..Default::default() }), ..Default::default() }; Arc::new( - Database::create_with_options(None, options, &tables).expect("Failed to create temp DB"), + Database::create_with_options(Some(path), options, &tables) + .expect("Failed to create DB with path"), ) } @@ -123,57 +124,39 @@ where #[derive(Debug)] struct BenchmarkResults { - db_name: String, total_accounts: usize, write_time_ms: u64, read_time_ms: u64, - reads_per_sec: f64, } fn run_ethrex_benchmark( accounts: &[(Vec, Vec)], sample_keys: &[Vec], ) -> Result> { - println!("๐Ÿ”ฅ EthrexDB Benchmark"); - let db_path = PathBuf::from("ethrex_bench.edb"); let _ = fs::remove_file(&db_path); let mut db = EthrexDB::new(db_path.clone())?; let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - // Write performance test - batch processing like Ethereum blocks - let batch_size = 15_000; // ~Ethereum block size + let batch_size = 15_000; let batches: Vec<_> = accounts.chunks(batch_size).collect(); - - println!(" ๐Ÿ“ Processing {} accounts in {} batches of ~{}", - accounts.len(), batches.len(), batch_size); - + let total_write_start = Instant::now(); - - for (batch_idx, batch) in batches.iter().enumerate() { - let batch_start = Instant::now(); - - // Insert batch into trie + + for batch in batches.iter() { for (key, value) in batch.iter() { trie.insert(key.clone(), value.clone())?; } - - // Commit batch (like block commit) - let root_node = trie.root_node()?.ok_or("No root node")?; + + // Commit db and trie (Convert NodeRef::Node to NodeRef::Hash) + let root_node = trie.root_node().unwrap().unwrap(); db.commit(&root_node)?; - trie.commit()?; // Convert to hashes for CoW efficiency - - let batch_time = batch_start.elapsed(); - if batch_idx % 10 == 0 || batch_idx == batches.len() - 1 { - println!(" Batch {}/{}: {}ms ({} accounts)", - batch_idx + 1, batches.len(), batch_time.as_millis(), batch.len()); - } + trie.commit()?; } - + let total_write_time = total_write_start.elapsed(); - // Read performance test let read_start = Instant::now(); let mut _successful_reads = 0; @@ -184,24 +167,14 @@ fn run_ethrex_benchmark( } let read_time = read_start.elapsed(); - let reads_per_sec = sample_keys.len() as f64 / read_time.as_secs_f64(); - - println!( - " โœ… Write: {}ms, Read: {}ms ({:.0} reads/sec)", - total_write_time.as_millis(), - read_time.as_millis(), - reads_per_sec - ); // Cleanup let _ = fs::remove_file(&db_path); Ok(BenchmarkResults { - db_name: "EthrexDB".to_string(), total_accounts: accounts.len(), write_time_ms: total_write_time.as_millis() as u64, read_time_ms: read_time.as_millis() as u64, - reads_per_sec, }) } @@ -209,38 +182,28 @@ fn run_libmdbx_benchmark( accounts: &[(Vec, Vec)], sample_keys: &[Vec], ) -> Result> { - println!("๐Ÿ”ฅ LibMDBX Hash Benchmark"); + // LibMDBX needs a directory path, it will create the database files inside + let libmdbx_dir = PathBuf::from("libmdbx_bench_dir"); + 
let _ = fs::remove_dir_all(&libmdbx_dir); + fs::create_dir_all(&libmdbx_dir)?; - let db: LibmdbxTrieDB = LibmdbxTrieDB::new(new_db::()); + let db: LibmdbxTrieDB = + LibmdbxTrieDB::new(new_db_with_path::(libmdbx_dir.clone())); let mut trie = Trie::new(Box::new(db)); - // Write performance test - batch processing like Ethereum blocks - let batch_size = 15_000; // ~Ethereum block size + let batch_size = 15_000; let batches: Vec<_> = accounts.chunks(batch_size).collect(); - - println!(" ๐Ÿ“ Processing {} accounts in {} batches of ~{}", - accounts.len(), batches.len(), batch_size); - + let total_write_start = Instant::now(); - - for (batch_idx, batch) in batches.iter().enumerate() { - let batch_start = Instant::now(); - - // Insert batch into trie + + for batch in batches.iter() { for (key, value) in batch.iter() { trie.insert(key.clone(), value.clone())?; } - - // Commit batch (like block commit) + trie.commit()?; - - let batch_time = batch_start.elapsed(); - if batch_idx % 10 == 0 || batch_idx == batches.len() - 1 { - println!(" Batch {}/{}: {}ms ({} accounts)", - batch_idx + 1, batches.len(), batch_time.as_millis(), batch.len()); - } } - + let total_write_time = total_write_start.elapsed(); // Read performance test @@ -254,59 +217,46 @@ fn run_libmdbx_benchmark( } let read_time = read_start.elapsed(); - let reads_per_sec = sample_keys.len() as f64 / read_time.as_secs_f64(); - println!( - " โœ… Write: {}ms, Read: {}ms ({:.0} reads/sec)", - total_write_time.as_millis(), - read_time.as_millis(), - reads_per_sec - ); + // Cleanup + let _ = fs::remove_dir_all(&libmdbx_dir); Ok(BenchmarkResults { - db_name: "LibMDBX Hash".to_string(), total_accounts: accounts.len(), write_time_ms: total_write_time.as_millis() as u64, read_time_ms: read_time.as_millis() as u64, - reads_per_sec, }) } -fn print_comparison_table(results: &[BenchmarkResults]) { - println!("\n๐Ÿ“Š COMPARISON TABLE"); - println!("====================================================================================="); - println!("Database Accounts Write Time Read Time Reads/Sec Read Sample"); - println!("--------- -------- ---------- --------- --------- -----------"); - - for result in results { - println!( - "{:<14} {:>8} {:>8}ms {:>7}ms {:>9.0} {:>8} keys", - result.db_name, - result.total_accounts, - result.write_time_ms, - result.read_time_ms, - result.reads_per_sec, - if result.total_accounts >= 100 { - result.total_accounts / 100 - } else { - result.total_accounts / 10 - } - ); - } - println!("====================================================================================="); +fn print_scale_summary(results: &[BenchmarkResults], sample_size: usize, batch_count: usize) { + let ethrex_result = &results[0]; + let libmdbx_result = &results[1]; + + let ethrex_avg_batch = ethrex_result.write_time_ms as f64 / batch_count as f64; + let libmdbx_avg_batch = libmdbx_result.write_time_ms as f64 / batch_count as f64; + + println!( + "\n{} accounts ({} batches):", + ethrex_result.total_accounts, batch_count + ); + println!( + " EthrexDB: {:.0}ms avg/batch, {}ms total write, {}ms read ({} keys)", + ethrex_avg_batch, ethrex_result.write_time_ms, ethrex_result.read_time_ms, sample_size + ); + println!( + " LibMDBX: {:.0}ms avg/batch, {}ms total write, {}ms read ({} keys)", + libmdbx_avg_batch, libmdbx_result.write_time_ms, libmdbx_result.read_time_ms, sample_size + ); } -fn run_scale_benchmark( +fn run_benchmark( total_accounts: usize, ) -> Result, Box> { - println!("\n๐Ÿ”ฅ Scale: {} accounts", total_accounts); + println!("\nBenchmark: {} 
accounts", total_accounts); println!("========================"); let mut results = Vec::new(); - // Generate all account data upfront (like mainnet snapshot) - println!("Generating {} account hashes...", total_accounts); - let gen_start = Instant::now(); let mut accounts: Vec<(Vec, Vec)> = (0..total_accounts) .map(|id| { let key = generate_account_hash(id as u64); @@ -314,9 +264,7 @@ fn run_scale_benchmark( (key, value) }) .collect(); - println!("โœ… Generated in {:.2}s", gen_start.elapsed().as_secs_f64()); - // Shuffle for random distribution (like real accounts) let mut rng = thread_rng(); accounts.shuffle(&mut rng); @@ -330,42 +278,73 @@ fn run_scale_benchmark( .collect(); println!( - "๐Ÿ“Š Running benchmarks with {} read samples (1% of total)...", + "Running benchmarks with {} read samples (1% of total)...", sample_keys.len() ); - // Benchmark 1: EthrexDB results.push(run_ethrex_benchmark(&accounts, &sample_keys)?); - - // Benchmark 2: LibMDBX Hash (Trie) results.push(run_libmdbx_benchmark(&accounts, &sample_keys)?); - // Print comparison table - print_comparison_table(&results); + let batch_count = accounts.len().div_ceil(15_000); + print_scale_summary(&results, sample_keys.len(), batch_count); Ok(results) } +fn print_final_comparison(all_results: &[BenchmarkResults], read_samples: &[usize]) { + println!("\n\nFINAL COMPARISON"); + println!("================="); + println!( + "Scale EthrexDB Write LibMDBX Write EthrexDB Read LibMDBX Read Keys Read" + ); + println!( + "------ ------------- ------------- ------------- ------------ ---------" + ); + + for (i, chunk) in all_results.chunks(2).enumerate() { + if chunk.len() == 2 { + let ethrex = &chunk[0]; + let libmdbx = &chunk[1]; + + let scale_str = if ethrex.total_accounts >= 1_000_000 { + format!("{}M", ethrex.total_accounts / 1_000_000) + } else if ethrex.total_accounts >= 1_000 { + format!("{}k", ethrex.total_accounts / 1_000) + } else { + ethrex.total_accounts.to_string() + }; + + let keys_read = read_samples[i]; + + println!( + "{:<8} {:>11}ms {:>11}ms {:>11}ms {:>10}ms {:>9}", + scale_str, + ethrex.write_time_ms, + libmdbx.write_time_ms, + ethrex.read_time_ms, + libmdbx.read_time_ms, + keys_read + ); + } + } +} + fn main() -> Result<(), Box> { - println!("๐Ÿš€ EthrexDB vs LibMDBX Mainnet Benchmark"); - println!("========================================"); - println!("Simulating Ethereum account storage patterns"); - println!("Comparing EthrexDB vs LibMDBX Hash (Trie) performance"); + println!("ETHREXDB VS LIBMDBX"); + println!("==================="); - // Multiple scales with more reads (1% sample = 10x more reads than before) - let scales = [10_000, 100_000, 500_000, 1_000_000, 10_000_000]; + let scales = [10_000, 100_000, 500_000, 1_000_000]; let mut all_results = Vec::new(); + let mut read_samples = Vec::new(); for &scale in &scales { - let results = run_scale_benchmark(scale)?; + let sample_size = (scale / 100).clamp(1000, 50_000); + read_samples.push(sample_size); + let results = run_benchmark(scale)?; all_results.extend(results); } - println!("\n๐ŸŽฏ FINAL SUMMARY"); - println!("================="); - println!("All benchmarks completed with 1% random read samples (10x more reads than before)"); - println!("EthrexDB: mmap + CoW trie with pure HashMap index"); - println!("LibMDBX: LMDB-based persistent trie storage"); + print_final_comparison(&all_results, &read_samples); Ok(()) } diff --git a/src/db.rs b/src/db.rs index cadb297..b7eacea 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,4 +1,4 @@ -//! 
EthrexDB - Copy-on-Write Merkle Patricia Trie Database +//! EthrexDB - Merkle Patricia Trie Database //! //! The database implements Copy-on-Write (CoW) optimization where only modified nodes //! are written during commits. Unchanged nodes are referenced by their file offset, diff --git a/src/index.rs b/src/index.rs index e11cae9..224117d 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,15 +1,12 @@ -//! Simple in-memory index for O(1) hash -> offset lookups -//! -//! This module provides a pure HashMap-based index with no persistence. -//! The index is rebuilt automatically when needed from the main database file. +//! Simple in-memory index: [`NodeHash`] -> offset lookups use crate::trie::NodeHash; use std::collections::HashMap; -/// Simple in-memory index - pure HashMap +/// Simple in-memory index #[derive(Debug, Default)] pub struct Index { - /// Index map for O(1) lookups + /// Index map data: HashMap, } From f5de40d0fea7302def68613920e443b38acf2fa2 Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Wed, 20 Aug 2025 15:40:01 -0300 Subject: [PATCH 13/27] revert(example): add old code again in profiling.rs --- examples/profiling.rs | 116 ++++++++++-------------------------------- src/db.rs | 2 +- src/serialization.rs | 22 ++++---- src/trie/error.rs | 2 - 4 files changed, 39 insertions(+), 103 deletions(-) diff --git a/examples/profiling.rs b/examples/profiling.rs index 9e2fde2..db9de6d 100644 --- a/examples/profiling.rs +++ b/examples/profiling.rs @@ -1,6 +1,3 @@ -//! Profile of EthrexDB when inserting 100k keys and then 20 batches of 5k keys. -//! Then, it does 500k random gets. - use ethrexdb::{ EthrexDB, trie::{InMemoryTrieDB, Trie}, @@ -9,91 +6,38 @@ use rand::{Rng, thread_rng}; use std::time::Instant; fn main() { - let db_path = std::env::temp_dir().join("profile_ethrexdb.db"); - let mut db = EthrexDB::new(db_path.clone()).unwrap(); + let db_path = std::env::temp_dir().join("profile_gets.edb"); + let mut db = EthrexDB::new(db_path).unwrap(); + + println!("Phase 1: Inserting 1,000,000 keys..."); let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); let mut keys = Vec::new(); - // Phase 1: Initial population (100k keys) - print!("Initial population (100k keys)... "); - let start_phase1 = Instant::now(); - - for i in 0..100_000 { - let key = format!("initial_key_{:08}", i); - let value = format!("initial_value_{:08}", i); + for i in 0..1_000_000 { + let key = format!("benchmark_key_{:08}", i); + let value = format!("value_for_key_{:08}", i); trie.insert(key.as_bytes().to_vec(), value.as_bytes().to_vec()) .unwrap(); keys.push(key); } + // Single commit with all data + let start_insert = Instant::now(); let root_node = trie.root_node().unwrap().unwrap(); - let initial_file_size = std::fs::metadata(&db_path).map(|m| m.len()).unwrap(); db.commit(&root_node).unwrap(); - let after_initial_size = std::fs::metadata(&db_path).map(|m| m.len()).unwrap(); - trie.commit().unwrap(); // Convert to CoW references - - println!( - "Done in {:?} - DB size: {:.1} MB", - start_phase1.elapsed(), - (after_initial_size - initial_file_size) as f64 / 1_048_576.0 - ); - - print!("Incremental updates (20 batches of 5k keys)... 
"); - let start_phase2 = Instant::now(); - - for batch in 0..20 { - let batch_start = Instant::now(); - let pre_batch_size = std::fs::metadata(&db_path).map(|m| m.len()).unwrap(); - - // Add 5,000 new keys - for i in 0..5_000 { - let key = format!("batch_{}_key_{:08}", batch, i); - let value = format!("batch_{}_value_{:08}", batch, i); - - trie.insert(key.as_bytes().to_vec(), value.as_bytes().to_vec()) - .unwrap(); - keys.push(key); - } - - // Also update some existing keys to demonstrate CoW efficiency - let mut rng = thread_rng(); - for _ in 0..100 { - let idx = rng.gen_range(0..keys.len().min(100_000)); // Only update initial keys - let updated_value = format!("updated_in_batch_{}_value", batch); - trie.insert( - keys[idx].as_bytes().to_vec(), - updated_value.as_bytes().to_vec(), - ) - .unwrap(); - } + println!("Insert phase completed in {:?}", start_insert.elapsed()); - let root_node = trie.root_node().unwrap().unwrap(); - db.commit(&root_node).unwrap(); - let post_batch_size = std::fs::metadata(&db_path).map(|m| m.len()).unwrap(); - trie.commit().unwrap(); // Convert to CoW references - - let _batch_time = batch_start.elapsed(); - let _batch_growth = post_batch_size - pre_batch_size; - } - - let phase2_duration = start_phase2.elapsed(); - let final_file_size = std::fs::metadata(&db_path).map(|m| m.len()).unwrap(); - let incremental_growth = final_file_size - after_initial_size; - - println!( - "Done in {:?} - DB grew: {:.1} MB", - phase2_duration, - incremental_growth as f64 / 1_048_576.0 - ); - - print!("Performance test (500k random gets)... "); + // === PHASE 2: Random gets (this is what we want to profile) === + println!("Phase 2: Performing 1,000,000 random gets..."); let start_gets = Instant::now(); let mut rng = thread_rng(); + let mut hit_count = 0; + let mut miss_count = 0; - for i in 0..500_000 { + for i in 0..1_000_000 { let key = if i % 10 == 0 { // 10% misses - random non-existent keys format!("nonexistent_key_{}", rng.r#gen::()) @@ -101,24 +45,20 @@ fn main() { // 90% hits - existing keys keys[rng.gen_range(0..keys.len())].clone() }; - db.get(key.as_bytes()).unwrap(); - } - let gets_duration = start_gets.elapsed(); - println!( - "Done in {:?} - Avg: {:?}/get", - gets_duration, - gets_duration / 500_000 - ); + match db.get(key.as_bytes()).unwrap() { + Some(_) => hit_count += 1, + None => miss_count += 1, + } - println!("Total keys: {}", keys.len()); - println!( - "Final DB size: {:.1} MB", - final_file_size as f64 / 1_048_576.0 - ); - println!("Total time: {:?}", start_phase1.elapsed()); + if i % 10_000 == 0 { + println!("Completed {} gets", i); + } + } - // Clean up temp file - drop(db); - let _ = std::fs::remove_file(&db_path); + let gets_duration = start_gets.elapsed(); + println!("Gets phase completed in {:?}", gets_duration); + println!("Hits: {}, Misses: {}", hit_count, miss_count); + println!("Total get time: {:?}", gets_duration); + println!("Average get time: {:?}", gets_duration / 1_000_000); } diff --git a/src/db.rs b/src/db.rs index b7eacea..b6ac0cd 100644 --- a/src/db.rs +++ b/src/db.rs @@ -71,7 +71,7 @@ impl EthrexDB { pub fn root(&self) -> Result { let latest_offset = self.file_manager.read_latest_root_offset()?; if latest_offset == 0 { - return Err(TrieError::Other("No root node in database".to_string())); + panic!("No root node in database"); } let file_data = self.file_manager.get_slice_to_end(0)?; diff --git a/src/serialization.rs b/src/serialization.rs index 5069eb3..30a5c7e 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -205,9 +205,9 
@@ impl<'a> Serializer<'a> { match noderef { NodeRef::Hash(hash) if hash.is_valid() => { // Node was previously committed - must exist in index - self.node_index.get(hash).ok_or_else(|| { - TrieError::Other(format!("Hash reference not found: {:?}", hash)) - }) + self.node_index + .get(hash) + .ok_or_else(|| panic!("Hash reference not found: {:?}", hash)) } NodeRef::Hash(_) => Ok(0), // Empty/invalid hash NodeRef::Node(node, _) => self.serialize_node(node), @@ -234,7 +234,7 @@ impl<'a> Deserializer<'a> { /// Decodes a node at specific position pub fn decode_node_at(&self, pos: usize) -> Result { if pos >= self.buffer.len() { - return Err(TrieError::Other("Invalid buffer position".to_string())); + panic!("Invalid buffer position"); } let tag = self.buffer[pos]; @@ -269,7 +269,7 @@ impl<'a> Deserializer<'a> { NodeRef::Node(Arc::new(child), OnceLock::new()), ))) } - _ => Err(TrieError::Other("Invalid Extend node".to_string())), + _ => panic!("Invalid Extend node with both child and value"), } } TAG_BRANCH => { @@ -302,7 +302,7 @@ impl<'a> Deserializer<'a> { children, value, )))) } - _ => Err(TrieError::Other(format!("Invalid node tag: {}", tag))), + _ => panic!("Invalid node tag: {tag}"), } } @@ -380,9 +380,7 @@ impl<'a> Deserializer<'a> { Ok(None) } } else { - let next_nibble = path - .next_choice() - .ok_or_else(|| TrieError::Other("Invalid path".to_string()))?; + let next_nibble = path.next_choice().unwrap(); let child_offset_pos = position + next_nibble * 8; let child_offset = u64::from_le_bytes( self.buffer[child_offset_pos..child_offset_pos + 8] @@ -397,7 +395,7 @@ impl<'a> Deserializer<'a> { } } } - _ => Err(TrieError::Other(format!("Invalid node tag: {}", tag))), + _ => panic!("Invalid node tag: {tag}"), } } @@ -419,7 +417,7 @@ impl<'a> Deserializer<'a> { fn read_u64_at(&self, pos: usize) -> Result { if pos + 8 > self.buffer.len() { - return Err(TrieError::Other("Invalid buffer length".to_string())); + panic!("Invalid buffer length"); } Ok(u64::from_le_bytes( self.buffer[pos..pos + 8].try_into().unwrap(), @@ -428,7 +426,7 @@ impl<'a> Deserializer<'a> { fn read_u32_at(&self, pos: usize) -> Result { if pos + 4 > self.buffer.len() { - return Err(TrieError::Other("Invalid buffer length".to_string())); + panic!("Invalid buffer length"); } Ok(u32::from_le_bytes( self.buffer[pos..pos + 4].try_into().unwrap(), diff --git a/src/trie/error.rs b/src/trie/error.rs index 4f2d9db..24ffca4 100644 --- a/src/trie/error.rs +++ b/src/trie/error.rs @@ -13,6 +13,4 @@ pub enum TrieError { LockError, #[error("DB Error: {0}")] DbError(String), - #[error("Other Error: {0}")] - Other(String), } From 41c8e1f56eb271e5f4881a70041e704c400e7352 Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Wed, 20 Aug 2025 15:45:36 -0300 Subject: [PATCH 14/27] test(db): format test to avoid big diff --- src/db.rs | 189 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 123 insertions(+), 66 deletions(-) diff --git a/src/db.rs b/src/db.rs index b6ac0cd..94f2e68 100644 --- a/src/db.rs +++ b/src/db.rs @@ -104,39 +104,6 @@ mod tests { use super::*; use tempdir::TempDir; - // Helper function to generate test data - fn generate_test_data(n: usize) -> Vec<(Vec, Vec)> { - use sha3::{Digest, Keccak256}; - - (1..=n) - .map(|i| { - // 32-byte key (hash) - let key = Keccak256::new() - .chain_update(i.to_be_bytes()) - .finalize() - .to_vec(); - - // 104-byte value (account info: 2 hashes + u256 + u64) - let mut value = Vec::with_capacity(104); - value.extend_from_slice( - &Keccak256::new() - .chain_update((i * 
2).to_be_bytes()) - .finalize(), - ); - value.extend_from_slice( - &Keccak256::new() - .chain_update((i * 3).to_be_bytes()) - .finalize(), - ); - value.extend_from_slice(&[0u8; 24]); // u256 padding - value.extend_from_slice(&(i as u64).to_be_bytes()); // u256 value - value.extend_from_slice(&(i as u64).to_be_bytes()); // u64 - - (key, value) - }) - .collect() - } - #[test] fn test_create_and_commit() { let temp_dir = TempDir::new("ethrex_db_test").unwrap(); @@ -150,7 +117,7 @@ mod tests { let root_node = trie.root_node().unwrap().unwrap(); let root_hash = db.commit(&root_node).unwrap(); - assert_ne!(root_hash.as_ref(), [0u8; 32]); + assert!(root_hash.as_ref() != [0u8; 32]); } #[test] @@ -200,59 +167,149 @@ mod tests { } #[test] - fn test_incremental_commit() { + fn test_multi_version_trie() { let temp_dir = TempDir::new("ethrex_db_test").unwrap(); let db_path = temp_dir.path().join("test.edb"); let mut db = EthrexDB::new(db_path.clone()).unwrap(); - let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - // First commit: Add initial keys + let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); trie.insert(b"key1".to_vec(), b"value1".to_vec()).unwrap(); + trie.insert(b"common".to_vec(), b"v1".to_vec()).unwrap(); + let root_node = trie.root_node().unwrap().unwrap(); + db.commit(&root_node).unwrap(); + trie.commit().unwrap(); + + assert_eq!(db.root().unwrap(), root_node); + + // let mut trie2 = Trie::new(Box::new(InMemoryTrieDB::new_empty())); trie.insert(b"key2".to_vec(), b"value2".to_vec()).unwrap(); + trie.insert(b"common".to_vec(), b"v2".to_vec()).unwrap(); let root_node = trie.root_node().unwrap().unwrap(); - let initial_file_size = db.file_manager.get_file_size().unwrap(); db.commit(&root_node).unwrap(); - trie.commit().unwrap(); // Convert NodeRef::Node to NodeRef::Hash - let recovered_root = db.root().unwrap(); - assert_eq!(recovered_root, root_node); + trie.commit().unwrap(); - let size_after_first = db.file_manager.get_file_size().unwrap(); - assert!(size_after_first > initial_file_size); + assert_eq!(db.root().unwrap(), root_node); - // Second commit: Add one more key (should only store new nodes) trie.insert(b"key3".to_vec(), b"value3".to_vec()).unwrap(); + trie.insert(b"common".to_vec(), b"v3".to_vec()).unwrap(); let root_node = trie.root_node().unwrap().unwrap(); db.commit(&root_node).unwrap(); - assert_eq!(db.root().unwrap(), root_node); trie.commit().unwrap(); - let size_after_second = db.file_manager.get_file_size().unwrap(); - // Should be smaller increment than first commit - let first_increment = size_after_first - initial_file_size; - let second_increment = size_after_second - size_after_first; - assert!( - second_increment < first_increment, - "Second commit should add less data due to CoW" - ); + assert_eq!(db.root().unwrap(), root_node); - // Verify all values are still accessible + assert_eq!(db.get(b"key3").unwrap(), Some(b"value3".to_vec())); + assert_eq!(db.get(b"common").unwrap(), Some(b"v3".to_vec())); assert_eq!(db.get(b"key1").unwrap(), Some(b"value1".to_vec())); assert_eq!(db.get(b"key2").unwrap(), Some(b"value2".to_vec())); - assert_eq!(db.get(b"key3").unwrap(), Some(b"value3".to_vec())); - // Third commit: Update existing key (should reuse many nodes) - trie.insert(b"key2".to_vec(), b"value2_updated".to_vec()) - .unwrap(); - let root_node = trie.root_node().unwrap().unwrap(); + assert_eq!(db.get(b"nonexistent").unwrap(), None); + } + + #[test] + fn test_complex_db_operations() { + let temp_dir = 
TempDir::new("ethrex_db_complex_test").unwrap(); + let db_path = temp_dir.path().join("complex_test.edb"); + + let test_data_v1 = vec![ + (b"app".to_vec(), b"application_v1".to_vec()), + (b"apple".to_vec(), b"fruit_v1".to_vec()), + (b"car".to_vec(), b"vehicle_v1".to_vec()), + (b"test".to_vec(), b"examination_v1".to_vec()), + (b"0x123456".to_vec(), b"hex_value_v1".to_vec()), + ]; + + let test_data_v2 = vec![ + (b"app".to_vec(), b"application_v2".to_vec()), + (b"apple".to_vec(), b"fruit_v2".to_vec()), + (b"banana".to_vec(), b"fruit_new".to_vec()), + (b"car".to_vec(), b"vehicle_v2".to_vec()), + (b"bike".to_vec(), b"vehicle_new".to_vec()), // New + (b"test".to_vec(), b"examination_v2".to_vec()), + (b"0x123456".to_vec(), b"hex_value_v2".to_vec()), + (b"0xabcdef".to_vec(), b"hex_new".to_vec()), + ]; + + let mut db = EthrexDB::new(db_path.clone()).unwrap(); + + let mut trie_v1 = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + for (key, value) in &test_data_v1 { + trie_v1.insert(key.clone(), value.clone()).unwrap(); + } + let root_node = trie_v1.root_node().unwrap().unwrap(); db.commit(&root_node).unwrap(); - trie.commit().unwrap(); - assert_eq!(db.root().unwrap(), root_node); - // Verify updated value - assert_eq!(db.get(b"key2").unwrap(), Some(b"value2_updated".to_vec())); - assert_eq!(db.get(b"key1").unwrap(), Some(b"value1".to_vec())); - assert_eq!(db.get(b"key3").unwrap(), Some(b"value3".to_vec())); + let mut trie_v2 = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + for (key, value) in &test_data_v2 { + trie_v2.insert(key.clone(), value.clone()).unwrap(); + } + let root_node = trie_v2.root_node().unwrap().unwrap(); + db.commit(&root_node).unwrap(); + + for (key, expected_value) in &test_data_v2 { + let result = db.get(key).unwrap(); + assert_eq!(result, Some(expected_value.clone())); + } + + assert_eq!(db.get(b"nonexistent").unwrap(), None); + + let complex_test_data = vec![ + ( + b"very_long_key_with_complex_structure_123456789".to_vec(), + b"complex_value".to_vec(), + ), + (b"short".to_vec(), b"val".to_vec()), + (b"".to_vec(), b"empty_key_value".to_vec()), + ]; + + let mut trie_v3 = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + for (key, value) in &test_data_v2 { + trie_v3.insert(key.clone(), value.clone()).unwrap(); + } + for (key, value) in &complex_test_data { + trie_v3.insert(key.clone(), value.clone()).unwrap(); + } + let root_node = trie_v3.root_node().unwrap().unwrap(); + db.commit(&root_node).unwrap(); + + for (key, expected_value) in &complex_test_data { + let result = db.get(key).unwrap(); + assert_eq!(result, Some(expected_value.clone())); + } + } + + // Helper function to generate test data + fn generate_test_data(n: usize) -> Vec<(Vec, Vec)> { + use sha3::{Digest, Keccak256}; + + (1..=n) + .map(|i| { + // 32-byte key (hash) + let key = Keccak256::new() + .chain_update(i.to_be_bytes()) + .finalize() + .to_vec(); + + // 104-byte value (account info: 2 hashes + u256 + u64) + let mut value = Vec::with_capacity(104); + value.extend_from_slice( + &Keccak256::new() + .chain_update((i * 2).to_be_bytes()) + .finalize(), + ); + value.extend_from_slice( + &Keccak256::new() + .chain_update((i * 3).to_be_bytes()) + .finalize(), + ); + value.extend_from_slice(&[0u8; 24]); // u256 padding + value.extend_from_slice(&(i as u64).to_be_bytes()); // u256 value + value.extend_from_slice(&(i as u64).to_be_bytes()); // u64 + + (key, value) + }) + .collect() } #[test] @@ -305,7 +362,7 @@ mod tests { let root_node2 = trie.root_node().unwrap().unwrap(); let trie_root_hash2 = 
root_node2.compute_hash(); let db_root_hash2 = db.commit(&root_node2).unwrap(); - trie.commit().unwrap(); // Convert to NodeRef::Hash + trie.commit().unwrap(); // Convert to NodeRef::Hash assert_eq!( trie_root_hash2, db_root_hash2, From 554cfc4c885a7a3c131adbf2a51b1d3b74a8e03b Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Wed, 20 Aug 2025 16:00:03 -0300 Subject: [PATCH 15/27] refactor(serialization): reduce diff --- src/file_manager.rs | 3 +- src/serialization.rs | 132 +++++++++++++++++++++++++++---------------- 2 files changed, 84 insertions(+), 51 deletions(-) diff --git a/src/file_manager.rs b/src/file_manager.rs index cd12457..c47d6fd 100644 --- a/src/file_manager.rs +++ b/src/file_manager.rs @@ -31,7 +31,8 @@ use std::path::PathBuf; pub struct FileManager { /// File where the data is stored file: File, - /// Memory-mapped view of the file + /// Memory-mapped of the file + /// TODO: Handle case when adding new nodes mmap: Mmap, } diff --git a/src/serialization.rs b/src/serialization.rs index 30a5c7e..78853ec 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -47,6 +47,7 @@ use crate::index::Index; use std::collections::HashMap; +use std::ops::Not; use std::sync::{Arc, OnceLock}; use crate::trie::{ @@ -109,10 +110,6 @@ impl<'a> Serializer<'a> { return Ok(existing_offset); } - if let Some(&absolute_offset) = self.new_nodes.get(&hash) { - return Ok(absolute_offset); - } - // Node is new, serialize it let buffer_offset = self.buffer.len() as u64; let absolute_offset = self.base_offset + buffer_offset; @@ -181,14 +178,17 @@ impl<'a> Serializer<'a> { child_offsets[i] = self.serialize_noderef(child)?; } - // Serialize value - let value_offset = if branch.value.is_empty() { - 0u64 - } else { - let offset = self.base_offset + self.buffer.len() as u64; - self.write_bytes_with_len(&branch.value); - offset - }; + // Serialize offset + let value_offset = branch + .value + .is_empty() + .not() + .then(|| { + let offset = self.base_offset + self.buffer.len() as u64; + self.write_bytes_with_len(&branch.value); + offset + }) + .unwrap_or_default(); // Write all offsets let mut pos = offsets_start; @@ -203,6 +203,7 @@ impl<'a> Serializer<'a> { fn serialize_noderef(&mut self, noderef: &NodeRef) -> Result { match noderef { + NodeRef::Node(node, _) => self.serialize_node(node), NodeRef::Hash(hash) if hash.is_valid() => { // Node was previously committed - must exist in index self.node_index @@ -210,7 +211,6 @@ impl<'a> Serializer<'a> { .ok_or_else(|| panic!("Hash reference not found: {:?}", hash)) } NodeRef::Hash(_) => Ok(0), // Empty/invalid hash - NodeRef::Node(node, _) => self.serialize_node(node), } } @@ -222,11 +222,16 @@ impl<'a> Serializer<'a> { } /// Deserializes a Merkle Patricia Trie from a byte buffer. +/// +/// The deserializer reads the binary format produced by [`Serializer`]. +/// It uses the two node format and converts back to the standard three node format. 
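+///
+/// A minimal usage sketch (the `file_data` buffer and `root_offset` value below are
+/// illustrative placeholders, mirroring how `EthrexDB::root` drives the deserializer;
+/// they are not part of this patch):
+///
+/// ```ignore
+/// // `file_data` is the memory-mapped database file; `root_offset` is the absolute
+/// // offset of the latest root node, read from the file header.
+/// let deserializer = Deserializer::new(&file_data);
+/// let root_node = deserializer.decode_node_at(root_offset as usize)?;
+/// ```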
pub struct Deserializer<'a> { + /// The byte buffer containing serialized trie data buffer: &'a [u8], } impl<'a> Deserializer<'a> { + /// Creates a new deserializer for the given buffer pub fn new(buffer: &'a [u8]) -> Self { Self { buffer } } @@ -329,15 +334,27 @@ impl<'a> Deserializer<'a> { match tag { TAG_EXTEND => { + // Read nibbles length + if position + 4 > self.buffer.len() { + return Ok(None); + } let len = u32::from_le_bytes(self.buffer[position..position + 4].try_into().unwrap()) as usize; position += 4; + // Read nibbles data + if position + len > self.buffer.len() { + return Ok(None); + } let compact_nibbles = &self.buffer[position..position + len]; let nibbles = Nibbles::decode_compact(compact_nibbles); position += len; + // Read node offset + if position + 8 > self.buffer.len() { + return Ok(None); + } let node_offset = u64::from_le_bytes(self.buffer[position..position + 8].try_into().unwrap()); position += 8; @@ -346,13 +363,13 @@ impl<'a> Deserializer<'a> { if node_offset == 0 && value_offset > 0 { // Leaf node - let leaf_path = if nibbles.is_leaf() { + let leaf_path_without_flag = if nibbles.is_leaf() { nibbles.slice(0, nibbles.len() - 1) } else { nibbles }; - if path == leaf_path { + if path == leaf_path_without_flag { self.read_value_at_offset(value_offset as usize) } else { Ok(None) @@ -374,14 +391,25 @@ impl<'a> Deserializer<'a> { let value_offset = u64::from_le_bytes(self.buffer[position..position + 8].try_into().unwrap()); + if position + 8 > self.buffer.len() { + return Ok(None); + } if value_offset > 0 { self.read_value_at_offset(value_offset as usize) } else { Ok(None) } } else { - let next_nibble = path.next_choice().unwrap(); + let Some(next_nibble) = path.next_choice() else { + return Ok(None); + }; + + // Read child offset at position next_nibble let child_offset_pos = position + next_nibble * 8; + if child_offset_pos + 8 > self.buffer.len() { + return Ok(None); + } + let child_offset = u64::from_le_bytes( self.buffer[child_offset_pos..child_offset_pos + 8] .try_into() @@ -399,34 +427,37 @@ impl<'a> Deserializer<'a> { } } + /// Read value at specific offset fn read_value_at_offset(&self, offset: usize) -> Result>, TrieError> { if offset + 4 > self.buffer.len() { return Ok(None); } let len = u32::from_le_bytes(self.buffer[offset..offset + 4].try_into().unwrap()) as usize; + let data_start = offset + 4; if data_start + len > self.buffer.len() { return Ok(None); } - let value = self.buffer[data_start..data_start + len].to_vec(); - Ok(Some(value)) + Ok(Some(self.buffer[data_start..data_start + len].to_vec())) } + /// Read a u64 value from buffer at position fn read_u64_at(&self, pos: usize) -> Result { if pos + 8 > self.buffer.len() { - panic!("Invalid buffer length"); + panic!("Invalid buffer length for u64"); } Ok(u64::from_le_bytes( self.buffer[pos..pos + 8].try_into().unwrap(), )) } + /// Read a u32 value from buffer at position fn read_u32_at(&self, pos: usize) -> Result { if pos + 4 > self.buffer.len() { - panic!("Invalid buffer length"); + panic!("Invalid buffer length for u32"); } Ok(u32::from_le_bytes( self.buffer[pos..pos + 4].try_into().unwrap(), @@ -442,6 +473,12 @@ mod tests { /// Offset to skip the prepended previous root offset (8 bytes) const ROOT_DATA_OFFSET: usize = 8; + fn serialize(root: &Node) -> (Vec, HashMap, u64) { + let index = Index::new(); + let serializer = Serializer::new(&index, 0); + serializer.serialize_tree(root, 0).unwrap() + } + fn new_temp() -> Trie { use std::collections::HashMap; use std::sync::Arc; @@ -453,6 +490,21 @@ mod 
tests { Trie::new(Box::new(db)) } + #[test] + fn test_serialize_deserialize_empty_leaf() { + let leaf = Node::Leaf(LeafNode { + partial: Nibbles::from_hex(vec![]), + value: vec![], + }); + + let (buffer, _, _) = serialize(&leaf); + + let deserializer = Deserializer::new(&buffer); + let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); + + assert_eq!(leaf, recovered); + } + #[test] fn test_serialize_leaf() { let leaf = Node::Leaf(LeafNode { @@ -460,9 +512,7 @@ mod tests { value: b"long_path_value".to_vec(), }); - let index = Index::new(); - let serializer = Serializer::new(&index, 0); - let (buffer, _, _) = serializer.serialize_tree(&leaf, 0).unwrap(); + let (buffer, _, _) = serialize(&leaf); let deserializer = Deserializer::new(&buffer); let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); @@ -477,9 +527,7 @@ mod tests { value: vec![], })); - let index = Index::new(); - let serializer = Serializer::new(&index, 0); - let (buffer, _, _) = serializer.serialize_tree(&branch, 0).unwrap(); + let (buffer, _, _) = serialize(&branch); let deserializer = Deserializer::new(&buffer); let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); @@ -499,9 +547,7 @@ mod tests { child: NodeRef::Node(Arc::new(leaf), OnceLock::new()), }); - let index = Index::new(); - let serializer = Serializer::new(&index, 0); - let (buffer, _, _) = serializer.serialize_tree(&ext, 0).unwrap(); + let (buffer, _, _) = serialize(&ext); let deserializer = Deserializer::new(&buffer); let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); @@ -550,9 +596,7 @@ mod tests { child: NodeRef::Node(Arc::new(branch), OnceLock::new()), }); - let index = Index::new(); - let serializer = Serializer::new(&index, 0); - let (buffer, _, _) = serializer.serialize_tree(&outer_ext, 0).unwrap(); + let (buffer, _, _) = serialize(&outer_ext); let deserializer = Deserializer::new(&buffer); let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); @@ -573,9 +617,7 @@ mod tests { trie.insert(b"key".to_vec(), b"value".to_vec()).unwrap(); let root = trie.root_node().unwrap().unwrap(); - let index = Index::new(); - let serializer = Serializer::new(&index, 0); - let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); + let (buffer, _, _) = serialize(&root); let deserializer = Deserializer::new(&buffer); let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); @@ -598,9 +640,7 @@ mod tests { } let root = trie.root_node().unwrap().unwrap(); - let index = Index::new(); - let serializer = Serializer::new(&index, 0); - let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); + let (buffer, _, _) = serialize(&root); let deserializer = Deserializer::new(&buffer); let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap(); @@ -618,9 +658,7 @@ mod tests { // Serialize to file let root = trie.root_node().unwrap().unwrap(); - let index = Index::new(); - let serializer = Serializer::new(&index, 0); - let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); + let (buffer, _, _) = serialize(&root); let path = "/tmp/test_trie.mpt"; fs::write(path, &buffer).unwrap(); @@ -640,9 +678,7 @@ mod tests { trie.insert(b"test".to_vec(), b"value".to_vec()).unwrap(); let root = trie.root_node().unwrap().unwrap(); - let index = Index::new(); - let serializer = Serializer::new(&index, 0); - let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); + let (buffer, _, _) = serialize(&root); let deserializer = Deserializer::new(&buffer); 
assert_eq!( @@ -673,9 +709,7 @@ mod tests { } let root = trie.root_node().unwrap().unwrap(); - let index = Index::new(); - let serializer = Serializer::new(&index, 0); - let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); + let (buffer, _, _) = serialize(&root); let deserializer = Deserializer::new(&buffer); assert_eq!( @@ -764,9 +798,7 @@ mod tests { let root = trie.root_node().unwrap().unwrap(); - let index = Index::new(); - let serializer = Serializer::new(&index, 0); - let (buffer, _, _) = serializer.serialize_tree(&root, 0).unwrap(); + let (buffer, _, _) = serialize(&root); for (key, expected_value) in &test_data { let deserializer = Deserializer::new(&buffer); From 81796e087d8c48f6d18e810fa16ea35fdc914ebf Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Wed, 20 Aug 2025 16:34:49 -0300 Subject: [PATCH 16/27] refactor(core): reduce diff again0 --- src/index.rs | 4 ++++ src/lib.rs | 2 +- src/serialization.rs | 13 +++++++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/index.rs b/src/index.rs index 224117d..3ad7fbf 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,4 +1,8 @@ //! Simple in-memory index: [`NodeHash`] -> offset lookups +//! +//! This is used to store the absolute offset of the node in the file +//! for each node hash. With this information, we can create new nodes and be able +//! to point to nodes that didn't change and exist in the file. use crate::trie::NodeHash; use std::collections::HashMap; diff --git a/src/lib.rs b/src/lib.rs index dbed6de..30cce52 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,7 @@ mod db; /// Interact with the file mod file_manager; -/// In-memory index with persistence +/// In-memory index pub mod index; /// Serialization and deserialization of the trie. mod serialization; diff --git a/src/serialization.rs b/src/serialization.rs index 78853ec..62e610f 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -320,6 +320,7 @@ impl<'a> Deserializer<'a> { self.get_by_path_inner(nibbles, offset) } + /// Internal helper for get_by_path with position tracking fn get_by_path_inner( &self, mut path: Nibbles, @@ -358,9 +359,15 @@ impl<'a> Deserializer<'a> { let node_offset = u64::from_le_bytes(self.buffer[position..position + 8].try_into().unwrap()); position += 8; + + // Read value offset + if position + 8 > self.buffer.len() { + return Ok(None); + } let value_offset = u64::from_le_bytes(self.buffer[position..position + 8].try_into().unwrap()); + // Extend has only a child or a value if node_offset == 0 && value_offset > 0 { // Leaf node let leaf_path_without_flag = if nibbles.is_leaf() { @@ -379,6 +386,7 @@ impl<'a> Deserializer<'a> { if !path.skip_prefix(&nibbles) { return Ok(None); } + // Recurse into the child self.get_by_path_inner(path, node_offset as usize) } else { Ok(None) @@ -436,7 +444,6 @@ impl<'a> Deserializer<'a> { let len = u32::from_le_bytes(self.buffer[offset..offset + 4].try_into().unwrap()) as usize; let data_start = offset + 4; - if data_start + len > self.buffer.len() { return Ok(None); } @@ -473,6 +480,8 @@ mod tests { /// Offset to skip the prepended previous root offset (8 bytes) const ROOT_DATA_OFFSET: usize = 8; + /// Helper function to create [`Index`] and [`Serializer`] structures + /// and serialize a tree fn serialize(root: &Node) -> (Vec, HashMap, u64) { let index = Index::new(); let serializer = Serializer::new(&index, 0); @@ -506,7 +515,7 @@ mod tests { } #[test] - fn test_serialize_leaf() { + fn test_serialize_deserialize_leaf_with_long_path() { let leaf = 
Node::Leaf(LeafNode { partial: Nibbles::from_hex(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5]), value: b"long_path_value".to_vec(), From 54e9f4370649c4f54fc6e9b6c2f4e7aaab88288e Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Wed, 20 Aug 2025 16:38:11 -0300 Subject: [PATCH 17/27] feat(bench): compare EthrexDB and Libmdbx root hash --- benches/db_benchmark.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/benches/db_benchmark.rs b/benches/db_benchmark.rs index 25038b1..6c5255d 100644 --- a/benches/db_benchmark.rs +++ b/benches/db_benchmark.rs @@ -127,6 +127,7 @@ struct BenchmarkResults { total_accounts: usize, write_time_ms: u64, read_time_ms: u64, + root_hash: NodeHash, } fn run_ethrex_benchmark( @@ -168,6 +169,9 @@ fn run_ethrex_benchmark( let read_time = read_start.elapsed(); + // Get root hash for validation + let root_hash = db.root().unwrap().compute_hash(); + // Cleanup let _ = fs::remove_file(&db_path); @@ -175,6 +179,7 @@ fn run_ethrex_benchmark( total_accounts: accounts.len(), write_time_ms: total_write_time.as_millis() as u64, read_time_ms: read_time.as_millis() as u64, + root_hash, }) } @@ -218,6 +223,9 @@ fn run_libmdbx_benchmark( let read_time = read_start.elapsed(); + // Get root hash for validation + let root_hash = trie.root_node().unwrap().unwrap().compute_hash(); + // Cleanup let _ = fs::remove_dir_all(&libmdbx_dir); @@ -225,6 +233,7 @@ fn run_libmdbx_benchmark( total_accounts: accounts.len(), write_time_ms: total_write_time.as_millis() as u64, read_time_ms: read_time.as_millis() as u64, + root_hash, }) } @@ -247,6 +256,12 @@ fn print_scale_summary(results: &[BenchmarkResults], sample_size: usize, batch_c " LibMDBX: {:.0}ms avg/batch, {}ms total write, {}ms read ({} keys)", libmdbx_avg_batch, libmdbx_result.write_time_ms, libmdbx_result.read_time_ms, sample_size ); + + // Validate root hashes match + assert_eq!( + ethrex_result.root_hash, libmdbx_result.root_hash, + "Root hashes mismatch" + ); } fn run_benchmark( From 08442c1e33f46f74878ac4df93a78717b65b874d Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Wed, 20 Aug 2025 17:13:24 -0300 Subject: [PATCH 18/27] chore(gitignore): add .vscode --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 5c9a5de..fb5ab95 100644 --- a/.gitignore +++ b/.gitignore @@ -20,5 +20,8 @@ target/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
#.idea/ +# VSCode +.vscode/ + # Samply json profile.json.gz From cbec76362117745a0b24e75a98edcd0ae8e4b94a Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Fri, 22 Aug 2025 10:23:23 -0300 Subject: [PATCH 19/27] docs(index): add TODOs --- src/db.rs | 2 ++ src/index.rs | 2 ++ src/serialization.rs | 4 ---- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/db.rs b/src/db.rs index 94f2e68..702413a 100644 --- a/src/db.rs +++ b/src/db.rs @@ -20,6 +20,7 @@ pub struct EthrexDB { /// File manager file_manager: FileManager, /// Index mapping node hashes to their file offsets + /// TODO: Read from file if it exists to node_index: Index, } @@ -37,6 +38,7 @@ impl EthrexDB { /// Open an existing database pub fn open(file_path: PathBuf) -> Result { let file_manager = FileManager::open(file_path.clone())?; + // TODO: Read node index from file if it exists let node_index = Index::new(); Ok(Self { file_manager, diff --git a/src/index.rs b/src/index.rs index 3ad7fbf..415acda 100644 --- a/src/index.rs +++ b/src/index.rs @@ -11,6 +11,8 @@ use std::collections::HashMap; #[derive(Debug, Default)] pub struct Index { /// Index map + /// TODO: Use a better data structure + /// TODO: Read from file if it exists data: HashMap, } diff --git a/src/serialization.rs b/src/serialization.rs index 62e610f..9a40422 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -40,10 +40,6 @@ //! [`Serializer::node_index`]. If the node's hash already exists, its offset is //! returned immediately without re-serialization. This means unchanged subtrees //! are never duplicated - they're referenced by offset. -//! -//! The serialization order is depth-first, with the root always first after the -//! `prev_root_offset`. This allows offsets to be calculated during serialization -//! as children are written immediately after their parents (unless they already exist). 
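+//!
+//! As a rough sketch of that lookup (simplified names, not the exact implementation),
+//! every node goes through the index before any bytes are appended:
+//!
+//! ```ignore
+//! let hash = node.compute_hash();
+//! if let Some(&offset) = node_index.get(&hash) {
+//!     return Ok(offset); // unchanged subtree: reuse the bytes already on disk
+//! }
+//! let offset = base_offset + buffer.len() as u64; // new node: append it here
+//! ```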
use crate::index::Index; use std::collections::HashMap; From 54861c4be41052a410b6bd6c4c388baf8a1b89f9 Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Mon, 25 Aug 2025 09:13:06 -0300 Subject: [PATCH 20/27] refactor(libmdbx): libmdbx encode to benchmark and move to dev-dep --- Cargo.toml | 12 ++---------- Makefile | 2 +- benches/db_benchmark.rs | 40 ++++++++++++++++++++++++++++++++++------ src/trie/node_hash.rs | 18 ------------------ 4 files changed, 37 insertions(+), 35 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a2a8b55..69734cf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,21 +15,14 @@ sha3 = "0.10.8" thiserror = "2.0.9" tinyvec = "1.6.0" -# Optional benchmark dependencies -libmdbx = { version = "=0.5.3", features = ["orm"], optional = true } -anyhow = { version = "1.0.86", optional = true } - -[features] -default = [] -libmdbx-benchmark = ["libmdbx", "anyhow"] - [dev-dependencies] -anyhow = "1.0.86" +anyhow = { version = "1.0.86" } cita_trie = "4.0.0" # used for proptest comparisons criterion = { version = "0.5", features = ["html_reports"] } hasher = "0.1.4" # cita_trie needs this hex = "0.4.3" # for simple benchmark hex-literal = "0.4.1" +libmdbx = { version = "=0.5.3", features = ["orm"] } proptest = "1.0.0" rand = "0.8.5" tempdir = "0.3.7" @@ -37,7 +30,6 @@ tempdir = "0.3.7" [[bench]] name = "db_benchmark" harness = false -required-features = ["libmdbx-benchmark"] [profile.release-with-debug] inherits = "release" diff --git a/Makefile b/Makefile index 4676728..25c8420 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ clean: ## ๐Ÿงน Remove build artifacts cargo clean bench: ## ๐Ÿ“Š Run benchmarks - cargo bench --bench db_benchmark --features="libmdbx-benchmark" + cargo bench --bench db_benchmark profile: ## ๐Ÿ” Run samply profile cargo build --profile release-with-debug --example profiling diff --git a/benches/db_benchmark.rs b/benches/db_benchmark.rs index 6c5255d..769ca43 100644 --- a/benches/db_benchmark.rs +++ b/benches/db_benchmark.rs @@ -8,7 +8,7 @@ use ethrexdb::EthrexDB; use ethrexdb::trie::{InMemoryTrieDB, NodeHash, Trie, TrieDB, TrieError}; -use libmdbx::orm::{Database, Table, table_info}; +use libmdbx::orm::{Database, Decodable, Encodable, Table, table_info}; use libmdbx::table; use rand::{seq::SliceRandom, thread_rng}; use sha3::{Digest, Keccak256}; @@ -18,6 +18,34 @@ use std::path::PathBuf; use std::sync::Arc; use std::time::Instant; +/// Wrapper for NodeHash to implement external traits in benchmarks +/// This is needed due to Rust's orphan rule: we can't implement +/// external traits (Encodable/Decodable from libmdbx) for external types (NodeHash) +/// With this wrapper, we can move libmdbx dependecy to dev-dependencies and +/// avoid creating a new feature flag +#[derive(Clone, Copy)] +pub struct NodeHashWrapper(NodeHash); + +impl From for NodeHashWrapper { + fn from(hash: NodeHash) -> Self { + NodeHashWrapper(hash) + } +} + +impl Encodable for NodeHashWrapper { + type Encoded = Vec; + + fn encode(self) -> Self::Encoded { + self.0.into() + } +} + +impl Decodable for NodeHashWrapper { + fn decode(b: &[u8]) -> anyhow::Result { + Ok(NodeHashWrapper(NodeHash::from_slice(b))) + } +} + /// Generate realistic 32-byte hash key (like account address) fn generate_account_hash(id: u64) -> Vec { Keccak256::new() @@ -57,7 +85,7 @@ fn generate_account_info(id: u64) -> Vec { table!( /// Test table for benchmarks. 
- (TestNodes) NodeHash => Vec + (TestNodes) NodeHashWrapper => Vec ); /// Create a libmdbx database with a specific path @@ -86,7 +114,7 @@ pub struct LibmdbxTrieDB { impl LibmdbxTrieDB where - T: Table>, + T: Table>, { pub fn new(db: Arc) -> Self { Self { @@ -98,14 +126,14 @@ where impl TrieDB for LibmdbxTrieDB where - T: Table>, + T: Table>, { fn get(&self, key: NodeHash) -> Result>, TrieError> { let txn = self .db .begin_read() .map_err(|e| TrieError::DbError(e.to_string()))?; - txn.get::(key) + txn.get::(key.into()) .map_err(|e| TrieError::DbError(e.to_string())) } @@ -115,7 +143,7 @@ where .begin_readwrite() .map_err(|e| TrieError::DbError(e.to_string()))?; for (key, value) in key_values { - txn.upsert::(key, value) + txn.upsert::(key.into(), value) .map_err(|e| TrieError::DbError(e.to_string()))?; } txn.commit().map_err(|e| TrieError::DbError(e.to_string())) diff --git a/src/trie/node_hash.rs b/src/trie/node_hash.rs index 70ffb14..e20756e 100644 --- a/src/trie/node_hash.rs +++ b/src/trie/node_hash.rs @@ -1,7 +1,5 @@ use crate::rlp::{Encoder, RLPDecode, RLPDecodeError, RLPEncode}; use ethereum_types::H256; -#[cfg(feature = "libmdbx-benchmark")] -use libmdbx::orm::{Decodable, Encodable}; use sha3::{Digest, Keccak256}; /// Struct representing a trie node hash @@ -140,19 +138,3 @@ impl RLPDecode for NodeHash { Ok((hash, rest)) } } - -#[cfg(feature = "libmdbx-benchmark")] -impl Encodable for NodeHash { - type Encoded = Vec; - - fn encode(self) -> Self::Encoded { - self.into() - } -} - -#[cfg(feature = "libmdbx-benchmark")] -impl Decodable for NodeHash { - fn decode(b: &[u8]) -> anyhow::Result { - Ok(NodeHash::from_slice(b)) - } -} From ba37a5ce15aaee00d1b8f2d095fd833efce8a2df Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Mon, 25 Aug 2025 09:15:29 -0300 Subject: [PATCH 21/27] fix(Cargo.toml): correct syntax for anyhow dependency declaration --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 69734cf..db91698 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ thiserror = "2.0.9" tinyvec = "1.6.0" [dev-dependencies] -anyhow = { version = "1.0.86" } +anyhow = version = "1.0.86" cita_trie = "4.0.0" # used for proptest comparisons criterion = { version = "0.5", features = ["html_reports"] } hasher = "0.1.4" # cita_trie needs this From 46d423af75415e8f282a1f9f16221111de0eddd8 Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Mon, 25 Aug 2025 09:18:09 -0300 Subject: [PATCH 22/27] chore(Cargo): remove unused dependencies --- Cargo.lock | 376 +---------------------------------------------------- Cargo.toml | 3 +- 2 files changed, 2 insertions(+), 377 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b7bb442..6f20629 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,18 +11,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "anes" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" - -[[package]] -name = "anstyle" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" - [[package]] name = "anyhow" version = "1.0.99" @@ -56,7 +44,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools 0.13.0", + "itertools", "proc-macro2", "quote", "regex", @@ -107,12 +95,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "bumpalo" -version = "3.19.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" - [[package]] name = "byte-slice-cast" version = "1.2.3" @@ -134,12 +116,6 @@ dependencies = [ "serde", ] -[[package]] -name = "cast" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" - [[package]] name = "cc" version = "1.2.32" @@ -164,33 +140,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" -[[package]] -name = "ciborium" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" -dependencies = [ - "ciborium-io", - "ciborium-ll", - "serde", -] - -[[package]] -name = "ciborium-io" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" - -[[package]] -name = "ciborium-ll" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" -dependencies = [ - "ciborium-io", - "half", -] - [[package]] name = "cita_trie" version = "4.1.0" @@ -213,31 +162,6 @@ dependencies = [ "libloading", ] -[[package]] -name = "clap" -version = "4.5.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fc0e74a703892159f5ae7d3aac52c8e6c392f5ae5f359c70b5881d60aaac318" -dependencies = [ - "clap_builder", -] - -[[package]] -name = "clap_builder" -version = "4.5.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3e7f4214277f3c7aa526a59dd3fbe306a370daee1f8b7b8c987069cd8e888a8" -dependencies = [ - "anstyle", - "clap_lex", -] - -[[package]] -name = "clap_lex" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" - [[package]] name = "const_format" version = "0.2.34" @@ -267,67 +191,6 @@ dependencies = [ "libc", ] -[[package]] -name = "criterion" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" -dependencies = [ - "anes", - "cast", - "ciborium", - "clap", - "criterion-plot", - "is-terminal", - "itertools 0.10.5", - "num-traits", - "once_cell", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_derive", - "serde_json", - "tinytemplate", - "walkdir", -] - -[[package]] -name = "criterion-plot" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" -dependencies = [ - "cast", - "itertools 0.10.5", -] - -[[package]] -name = "crossbeam-deque" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" -dependencies = [ - "crossbeam-epoch", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" -dependencies = [ - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-utils" 
-version = "0.8.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" - [[package]] name = "crunchy" version = "0.2.4" @@ -431,7 +294,6 @@ dependencies = [ "anyhow", "bytes", "cita_trie", - "criterion", "ethereum-types", "hasher", "hex", @@ -522,16 +384,6 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" -[[package]] -name = "half" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" -dependencies = [ - "cfg-if", - "crunchy", -] - [[package]] name = "hashbrown" version = "0.15.5" @@ -547,12 +399,6 @@ dependencies = [ "tiny-keccak", ] -[[package]] -name = "hermit-abi" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" - [[package]] name = "hex" version = "0.4.3" @@ -619,26 +465,6 @@ dependencies = [ "hashbrown", ] -[[package]] -name = "is-terminal" -version = "0.4.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" -dependencies = [ - "hermit-abi", - "libc", - "windows-sys 0.59.0", -] - -[[package]] -name = "itertools" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.13.0" @@ -648,22 +474,6 @@ dependencies = [ "either", ] -[[package]] -name = "itoa" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" - -[[package]] -name = "js-sys" -version = "0.3.77" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" -dependencies = [ - "once_cell", - "wasm-bindgen", -] - [[package]] name = "keccak" version = "0.1.5" @@ -732,12 +542,6 @@ dependencies = [ "scopeguard", ] -[[package]] -name = "log" -version = "0.4.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" - [[package]] name = "mdbx-sys" version = "12.12.0" @@ -795,12 +599,6 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" -[[package]] -name = "oorandom" -version = "11.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" - [[package]] name = "parity-scale-codec" version = "3.7.5" @@ -852,34 +650,6 @@ dependencies = [ "windows-targets 0.52.6", ] -[[package]] -name = "plotters" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" -dependencies = [ - "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "plotters-backend" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" - -[[package]] -name = "plotters-svg" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" -dependencies = [ - "plotters-backend", -] - [[package]] name = "ppv-lite86" version = "0.2.21" @@ -1063,26 +833,6 @@ dependencies = [ "rand_core 0.9.3", ] -[[package]] -name = "rayon" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" -dependencies = [ - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" -dependencies = [ - "crossbeam-deque", - "crossbeam-utils", -] - [[package]] name = "rdrand" version = "0.4.0" @@ -1202,21 +952,6 @@ dependencies = [ "wait-timeout", ] -[[package]] -name = "ryu" -version = "1.0.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" - -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - [[package]] name = "scopeguard" version = "1.2.0" @@ -1254,18 +989,6 @@ dependencies = [ "syn", ] -[[package]] -name = "serde_json" -version = "1.0.142" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "030fedb782600dcbd6f02d479bf0d817ac3bb40d644745b769d6a96bc3afc5a7" -dependencies = [ - "itoa", - "memchr", - "ryu", - "serde", -] - [[package]] name = "sha3" version = "0.10.8" @@ -1363,16 +1086,6 @@ dependencies = [ "crunchy", ] -[[package]] -name = "tinytemplate" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" -dependencies = [ - "serde", - "serde_json", -] - [[package]] name = "tinyvec" version = "1.9.0" @@ -1447,16 +1160,6 @@ dependencies = [ "libc", ] -[[package]] -name = "walkdir" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" -dependencies = [ - "same-file", - "winapi-util", -] - [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -1472,74 +1175,6 @@ dependencies = [ "wit-bindgen-rt", ] -[[package]] -name = "wasm-bindgen" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" -dependencies = [ - "cfg-if", - "once_cell", - "rustversion", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] 
-name = "wasm-bindgen-macro-support" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "web-sys" -version = "0.3.77" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - [[package]] name = "winapi" version = "0.3.9" @@ -1556,15 +1191,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -[[package]] -name = "winapi-util" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" -dependencies = [ - "windows-sys 0.59.0", -] - [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index db91698..b8de305 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,9 +16,8 @@ thiserror = "2.0.9" tinyvec = "1.6.0" [dev-dependencies] -anyhow = version = "1.0.86" +anyhow = "1.0.86" cita_trie = "4.0.0" # used for proptest comparisons -criterion = { version = "0.5", features = ["html_reports"] } hasher = "0.1.4" # cita_trie needs this hex = "0.4.3" # for simple benchmark hex-literal = "0.4.1" From 5ba5c9d84fa03e28d2718338913b76fa69ff0397 Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Mon, 25 Aug 2025 10:28:56 -0300 Subject: [PATCH 23/27] test(db): add test for file size after inserts --- src/db.rs | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/db.rs b/src/db.rs index 702413a..7999624 100644 --- a/src/db.rs +++ b/src/db.rs @@ -483,4 +483,39 @@ mod tests { } } } + + #[test] + fn test_file_size() { + let temp_dir = TempDir::new("ethrex_db_test").unwrap(); + let db_path = temp_dir.path().join("test.edb"); + + let mut db = EthrexDB::new(db_path.clone()).unwrap(); + + let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + + // Insert 100,000 keys + for i in 0..100_000 { + let key = format!("key_{}", i); + let value = format!("value_{}", i); + trie.insert(key.as_bytes().to_vec(), value.as_bytes().to_vec()) + .unwrap(); + } + let root_node = trie.root_node().unwrap().unwrap(); + db.commit(&root_node).unwrap(); + trie.commit().unwrap(); + // Check file size after inserting 100,000 keys + let insert_file_size = std::fs::metadata(db_path.clone()).unwrap().len(); + + // Update a single key + trie.insert(b"key_1".to_vec(), b"updated_value".to_vec()) + .unwrap(); + let root_node = trie.root_node().unwrap().unwrap(); + db.commit(&root_node).unwrap(); + // Check file size after updating a single key + let update_file_size = std::fs::metadata(db_path).unwrap().len(); + + // File after update should have a very small increase + assert!(insert_file_size < update_file_size); + assert!(update_file_size < insert_file_size + 1000); + } } From f5b07baf2b7b978301292062634116b531e88994 Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Mon, 25 Aug 2025 10:45:19 -0300 Subject: [PATCH 
24/27] refactor(bench): remove unused variable --- benches/db_benchmark.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/benches/db_benchmark.rs b/benches/db_benchmark.rs index 769ca43..8b9c19a 100644 --- a/benches/db_benchmark.rs +++ b/benches/db_benchmark.rs @@ -187,12 +187,9 @@ fn run_ethrex_benchmark( let total_write_time = total_write_start.elapsed(); let read_start = Instant::now(); - let mut _successful_reads = 0; for key in sample_keys { - if db.get(key)?.is_some() { - _successful_reads += 1; - } + db.get(key).unwrap().unwrap(); } let read_time = read_start.elapsed(); From f8c7c24f83980786e1018d96ceb14a877eb09055 Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Mon, 25 Aug 2025 10:46:18 -0300 Subject: [PATCH 25/27] refactor(profiling): batch insert instead of full --- examples/profiling.rs | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/examples/profiling.rs b/examples/profiling.rs index db9de6d..b6adc5d 100644 --- a/examples/profiling.rs +++ b/examples/profiling.rs @@ -14,22 +14,32 @@ fn main() { let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); let mut keys = Vec::new(); - for i in 0..1_000_000 { - let key = format!("benchmark_key_{:08}", i); - let value = format!("value_for_key_{:08}", i); + let total_insert_time = Instant::now(); + println!("Inserting 100,000 keys 10 times"); + for batch in 0..10 { + let start_insert = Instant::now(); + for i in 0..100_000 { + let key = format!("benchmark_key_{:08}", batch * 100_000 + i); + let value = format!("value_for_key_{:08}", i); - trie.insert(key.as_bytes().to_vec(), value.as_bytes().to_vec()) - .unwrap(); - keys.push(key); - } + trie.insert(key.as_bytes().to_vec(), value.as_bytes().to_vec()) + .unwrap(); + keys.push(key); + } + let root_node = trie.root_node().unwrap().unwrap(); + let trie_hash = root_node.compute_hash(); + let db_hash = db.commit(&root_node).unwrap(); + trie.commit().unwrap(); + assert_eq!(trie_hash, db_hash); - // Single commit with all data - let start_insert = Instant::now(); - let root_node = trie.root_node().unwrap().unwrap(); - db.commit(&root_node).unwrap(); - println!("Insert phase completed in {:?}", start_insert.elapsed()); + println!( + "Insert 100,000 keys in batch {batch}. 
Time taken: {:?}", + start_insert.elapsed() + ); + } + println!("Total insert time: {:?}", total_insert_time.elapsed()); - // === PHASE 2: Random gets (this is what we want to profile) === + // === PHASE 2: Random gets === println!("Phase 2: Performing 1,000,000 random gets..."); let start_gets = Instant::now(); From d54030d3f9ba2230b9c052215fa1a774f9ca89ea Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Mon, 25 Aug 2025 10:47:41 -0300 Subject: [PATCH 26/27] refactor(bench): remove unused variable in libmdbx --- benches/db_benchmark.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/benches/db_benchmark.rs b/benches/db_benchmark.rs index 8b9c19a..f2b25a8 100644 --- a/benches/db_benchmark.rs +++ b/benches/db_benchmark.rs @@ -238,12 +238,9 @@ fn run_libmdbx_benchmark( // Read performance test let read_start = Instant::now(); - let mut _successful_reads = 0; for key in sample_keys { - if trie.get(key)?.is_some() { - _successful_reads += 1; - } + trie.get(key).unwrap().unwrap(); } let read_time = read_start.elapsed(); From 269c1b1b73cc79f2ffca2b610e60fbe52f0034dc Mon Sep 17 00:00:00 2001 From: Damian Ramirez Date: Mon, 25 Aug 2025 16:57:38 -0300 Subject: [PATCH 27/27] refactor(bench): use keccak digest instead of chai update Co-authored-by: Mario Rugiero --- benches/db_benchmark.rs | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/benches/db_benchmark.rs b/benches/db_benchmark.rs index f2b25a8..166fb35 100644 --- a/benches/db_benchmark.rs +++ b/benches/db_benchmark.rs @@ -48,10 +48,7 @@ impl Decodable for NodeHashWrapper { /// Generate realistic 32-byte hash key (like account address) fn generate_account_hash(id: u64) -> Vec { - Keccak256::new() - .chain_update(id.to_be_bytes()) - .finalize() - .to_vec() + Keccak256::digest(id.to_be_bytes()).to_vec() } /// Generate 104-byte account info: 2 hashes + u256 + u64 @@ -59,18 +56,10 @@ fn generate_account_info(id: u64) -> Vec { let mut value = Vec::with_capacity(104); // Storage hash (32 bytes) - value.extend_from_slice( - &Keccak256::new() - .chain_update((id * 2).to_be_bytes()) - .finalize(), - ); + value.extend_from_slice(&Keccak256::digest((id * 2).to_be_bytes())); // Code hash (32 bytes) - value.extend_from_slice( - &Keccak256::new() - .chain_update((id * 3).to_be_bytes()) - .finalize(), - ); + value.extend_from_slice(&Keccak256::digest((id * 3).to_be_bytes())); // Balance u256 (32 bytes) - deterministic based on id let balance = (id as u128 % 1000) * 1_000_000_000_000_000_000u128; // ETH in wei