diff --git a/.gitignore b/.gitignore index f5debf0..777a947 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,9 @@ target/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +# VSCode +.vscode/ + # Samply json profile.json.gz diff --git a/Cargo.lock b/Cargo.lock index f179683..c2afdf9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -35,12 +35,6 @@ dependencies = [ "libc", ] -[[package]] -name = "anes" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" - [[package]] name = "ansi_term" version = "0.12.1" @@ -144,7 +138,7 @@ dependencies = [ "bitflags 2.9.1", "cexpr", "clang-sys", - "itertools 0.13.0", + "itertools", "proc-macro2", "quote", "regex", @@ -238,12 +232,6 @@ dependencies = [ "serde", ] -[[package]] -name = "cast" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" - [[package]] name = "cc" version = "1.2.32" @@ -302,33 +290,6 @@ dependencies = [ "phf_codegen", ] -[[package]] -name = "ciborium" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" -dependencies = [ - "ciborium-io", - "ciborium-ll", - "serde", -] - -[[package]] -name = "ciborium-io" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" - -[[package]] -name = "ciborium-ll" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" -dependencies = [ - "ciborium-io", - "half", -] - [[package]] name = "cita_trie" version = "4.1.0" @@ -466,42 +427,6 @@ dependencies = [ "libc", ] -[[package]] -name = "criterion" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" -dependencies = [ - "anes", - "cast", - "ciborium", - "clap 4.5.45", - "criterion-plot", - "is-terminal", - "itertools 0.10.5", - "num-traits", - "once_cell", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_derive", - "serde_json", - "tinytemplate", - "walkdir", -] - -[[package]] -name = "criterion-plot" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" -dependencies = [ - "cast", - "itertools 0.10.5", -] - [[package]] name = "crossbeam-channel" version = "0.5.15" @@ -755,9 +680,9 @@ dependencies = [ "anyhow", "bytes", "cita_trie", - "criterion", "ethereum-types", "hasher", + "hex", "hex-literal", "lazy_static", "libmdbx", @@ -893,16 +818,6 @@ dependencies = [ "memmap2", ] -[[package]] -name = "half" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" -dependencies = [ - "cfg-if", - "crunchy", -] - [[package]] name = "hashbrown" version = "0.15.5" @@ -1095,15 +1010,6 @@ dependencies = [ "either", ] -[[package]] -name = "itertools" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" -dependencies = [ - "either", 
-] - [[package]] name = "itoa" version = "1.0.15" @@ -1307,12 +1213,6 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" -[[package]] -name = "oorandom" -version = "11.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" - [[package]] name = "parity-scale-codec" version = "3.7.5" @@ -1492,34 +1392,6 @@ dependencies = [ "siphasher", ] -[[package]] -name = "plotters" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" -dependencies = [ - "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "plotters-backend" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" - -[[package]] -name = "plotters-svg" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" -dependencies = [ - "plotters-backend", -] - [[package]] name = "ppv-lite86" version = "0.2.21" @@ -1719,9 +1591,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" dependencies = [ "either", "rayon-core", @@ -1729,9 +1601,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.12.1" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" dependencies = [ "crossbeam-deque", "crossbeam-utils", @@ -2168,16 +2040,6 @@ dependencies = [ "crunchy", ] -[[package]] -name = "tinytemplate" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" -dependencies = [ - "serde", - "serde_json", -] - [[package]] name = "tinyvec" version = "1.9.0" @@ -2446,16 +2308,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "web-sys" -version = "0.3.77" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - [[package]] name = "winapi" version = "0.3.9" @@ -2474,9 +2326,9 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +checksum = "0978bf7171b3d90bac376700cb56d606feb40f251a475a5d6634613564460b22" dependencies = [ "windows-sys 0.59.0", ] diff --git a/Cargo.toml b/Cargo.toml index ac804c6..a9e2fcb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,8 +18,8 @@ tinyvec = "1.6.0" [dev-dependencies] anyhow = "1.0.86" cita_trie = "4.0.0" # used for proptest comparisons -criterion = { version = "0.5", features = ["html_reports"] } hasher = 
"0.1.4" # cita_trie needs this +hex = "0.4.3" # for simple benchmark hex-literal = "0.4.1" libmdbx = { version = "=0.5.3", features = ["orm"] } proptest = "1.0.0" diff --git a/benches/db_benchmark.rs b/benches/db_benchmark.rs index c3db371..166fb35 100644 --- a/benches/db_benchmark.rs +++ b/benches/db_benchmark.rs @@ -1,19 +1,90 @@ -use criterion::{BenchmarkId, Criterion, black_box, criterion_group, criterion_main}; +//! Ethereum mainnet-like comparison benchmark +//! +//! Compares EthrexDB vs LibMDBX Hash performance with: +//! - Random hash keys (like real accounts) +//! - 104-byte account info (2 hashes + u256 + u64) +//! - 1% random read samples (10x more reads) +//! - Multiple scales: 10k, 100k, 500k and 1M accounts + use ethrexdb::EthrexDB; use ethrexdb::trie::{InMemoryTrieDB, NodeHash, Trie, TrieDB, TrieError}; use libmdbx::orm::{Database, Decodable, Encodable, Table, table_info}; -use libmdbx::{DatabaseOptions, Mode, PageSize, ReadWriteOptions, table}; +use libmdbx::table; use rand::{seq::SliceRandom, thread_rng}; use sha3::{Digest, Keccak256}; -use std::{sync::Arc, time::Duration}; -use tempdir::TempDir; +use std::fs; +use std::marker::PhantomData; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +/// Wrapper for NodeHash to implement external traits in benchmarks +/// This is needed due to Rust's orphan rule: we can't implement +/// external traits (Encodable/Decodable from libmdbx) for external types (NodeHash) +/// With this wrapper, we can move libmdbx dependecy to dev-dependencies and +/// avoid creating a new feature flag +#[derive(Clone, Copy)] +pub struct NodeHashWrapper(NodeHash); + +impl From for NodeHashWrapper { + fn from(hash: NodeHash) -> Self { + NodeHashWrapper(hash) + } +} + +impl Encodable for NodeHashWrapper { + type Encoded = Vec; + + fn encode(self) -> Self::Encoded { + self.0.into() + } +} + +impl Decodable for NodeHashWrapper { + fn decode(b: &[u8]) -> anyhow::Result { + Ok(NodeHashWrapper(NodeHash::from_slice(b))) + } +} + +/// Generate realistic 32-byte hash key (like account address) +fn generate_account_hash(id: u64) -> Vec { + Keccak256::digest(id.to_be_bytes()).to_vec() +} + +/// Generate 104-byte account info: 2 hashes + u256 + u64 +fn generate_account_info(id: u64) -> Vec { + let mut value = Vec::with_capacity(104); + + // Storage hash (32 bytes) + value.extend_from_slice(&Keccak256::digest((id * 2).to_be_bytes())); + + // Code hash (32 bytes) + value.extend_from_slice(&Keccak256::digest((id * 3).to_be_bytes())); + + // Balance u256 (32 bytes) - deterministic based on id + let balance = (id as u128 % 1000) * 1_000_000_000_000_000_000u128; // ETH in wei + value.extend_from_slice(&[0u8; 16]); // High 128 bits + value.extend_from_slice(&balance.to_be_bytes()); // Low 128 bits + + // Nonce u64 (8 bytes) + value.extend_from_slice(&(id % 1000).to_be_bytes()); + + value +} + +table!( + /// Test table for benchmarks. 
+ (TestNodes) NodeHashWrapper => Vec +); + +/// Create a libmdbx database with a specific path +fn new_db_with_path(path: PathBuf) -> Arc { + use libmdbx::{DatabaseOptions, Mode, ReadWriteOptions}; -fn create_libmdbx_db(path: std::path::PathBuf) -> Arc { let tables = [table_info!(T)].into_iter().collect(); let options = DatabaseOptions { - page_size: Some(PageSize::Set(4096)), mode: Mode::ReadWrite(ReadWriteOptions { - max_size: Some(1024 * 1024 * 1024), + max_size: Some(2 * 1024 * 1024 * 1024), ..Default::default() }), ..Default::default() @@ -21,34 +92,37 @@ fn create_libmdbx_db(path: std::path::PathBuf) -> Arc { Arc::new( Database::create_with_options(Some(path), options, &tables) - .expect("Failed to create LibMDBX database"), + .expect("Failed to create DB with path"), ) } -table!( - /// Hash-based table for storing trie nodes by their hash - (TestNodes) Vec => Vec -); - -// Simple TrieDB implementation for benchmarking -struct LibmdbxTrieDB { +pub struct LibmdbxTrieDB { db: Arc, + phantom: PhantomData, } -impl LibmdbxTrieDB { - fn new(db: Arc) -> Self { - Self { db } +impl LibmdbxTrieDB +where + T: Table>, +{ + pub fn new(db: Arc) -> Self { + Self { + db, + phantom: PhantomData, + } } } -impl TrieDB for LibmdbxTrieDB { +impl TrieDB for LibmdbxTrieDB +where + T: Table>, +{ fn get(&self, key: NodeHash) -> Result>, TrieError> { let txn = self .db .begin_read() .map_err(|e| TrieError::DbError(e.to_string()))?; - let key_bytes: Vec = key.into(); - txn.get::(key_bytes) + txn.get::(key.into()) .map_err(|e| TrieError::DbError(e.to_string())) } @@ -58,266 +132,245 @@ impl TrieDB for LibmdbxTrieDB { .begin_readwrite() .map_err(|e| TrieError::DbError(e.to_string()))?; for (key, value) in key_values { - let key_bytes: Vec = key.into(); - txn.upsert::(key_bytes, value) + txn.upsert::(key.into(), value) .map_err(|e| TrieError::DbError(e.to_string()))?; } txn.commit().map_err(|e| TrieError::DbError(e.to_string())) } } -struct LibmdbxHashDB { - trie: Trie, +#[derive(Debug)] +struct BenchmarkResults { + total_accounts: usize, + write_time_ms: u64, + read_time_ms: u64, + root_hash: NodeHash, } -impl LibmdbxHashDB { - fn new(temp_dir: &std::path::Path) -> Self { - let db = create_libmdbx_db::(temp_dir.into()); - let trie = Trie::new(Box::new(LibmdbxTrieDB::new(db.clone()))); - Self { trie } - } +fn run_ethrex_benchmark( + accounts: &[(Vec, Vec)], + sample_keys: &[Vec], +) -> Result> { + let db_path = PathBuf::from("ethrex_bench.edb"); + let _ = fs::remove_file(&db_path); + + let mut db = EthrexDB::new(db_path.clone())?; + let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + + let batch_size = 15_000; + let batches: Vec<_> = accounts.chunks(batch_size).collect(); - fn insert_batch(&mut self, data: &[(Vec, Vec)]) { - for (key, value) in data { - self.trie.insert(key.clone(), value.clone()).unwrap(); + let total_write_start = Instant::now(); + + for batch in batches.iter() { + for (key, value) in batch.iter() { + trie.insert(key.clone(), value.clone())?; } - self.trie.commit().unwrap(); - } - fn get(&self, key: &[u8]) -> Option> { - self.trie.get(&key.to_vec()).unwrap() + // Commit db and trie (Convert NodeRef::Node to NodeRef::Hash) + let root_node = trie.root_node().unwrap().unwrap(); + db.commit(&root_node)?; + trie.commit()?; } -} -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct PathKey(Vec); + let total_write_time = total_write_start.elapsed(); -impl Encodable for PathKey { - type Encoded = Vec; - fn encode(self) -> Self::Encoded { - self.0 - } -} + let read_start = Instant::now(); 
-impl Decodable for PathKey { - fn decode(b: &[u8]) -> anyhow::Result { - Ok(PathKey(b.to_vec())) + for key in sample_keys { + db.get(key).unwrap().unwrap(); } -} -table!( - /// Path-based table for storing key-value pairs directly by path - (PathNodes) PathKey => Vec -); + let read_time = read_start.elapsed(); -struct LibmdbxSnapshotPathDB { - db: Arc, + // Get root hash for validation + let root_hash = db.root().unwrap().compute_hash(); + + // Cleanup + let _ = fs::remove_file(&db_path); + + Ok(BenchmarkResults { + total_accounts: accounts.len(), + write_time_ms: total_write_time.as_millis() as u64, + read_time_ms: read_time.as_millis() as u64, + root_hash, + }) } -impl LibmdbxSnapshotPathDB { - fn new(temp_dir: &std::path::Path) -> Self { - let db = create_libmdbx_db::(temp_dir.into()); - Self { db } - } +fn run_libmdbx_benchmark( + accounts: &[(Vec, Vec)], + sample_keys: &[Vec], +) -> Result> { + // LibMDBX needs a directory path, it will create the database files inside + let libmdbx_dir = PathBuf::from("libmdbx_bench_dir"); + let _ = fs::remove_dir_all(&libmdbx_dir); + fs::create_dir_all(&libmdbx_dir)?; + + let db: LibmdbxTrieDB = + LibmdbxTrieDB::new(new_db_with_path::(libmdbx_dir.clone())); + let mut trie = Trie::new(Box::new(db)); - fn insert_batch(&self, data: &[(Vec, Vec)]) { - let txn = self.db.begin_readwrite().unwrap(); - for (key, value) in data { - txn.upsert::(PathKey(key.clone()), value.clone()) - .unwrap(); + let batch_size = 15_000; + let batches: Vec<_> = accounts.chunks(batch_size).collect(); + + let total_write_start = Instant::now(); + + for batch in batches.iter() { + for (key, value) in batch.iter() { + trie.insert(key.clone(), value.clone())?; } - txn.commit().unwrap(); + + trie.commit()?; } - fn get(&self, key: &[u8]) -> Option> { - let txn = self.db.begin_read().unwrap(); - txn.get::(PathKey(key.to_vec())).unwrap() + let total_write_time = total_write_start.elapsed(); + + // Read performance test + let read_start = Instant::now(); + + for key in sample_keys { + trie.get(key).unwrap().unwrap(); } + + let read_time = read_start.elapsed(); + + // Get root hash for validation + let root_hash = trie.root_node().unwrap().unwrap().compute_hash(); + + // Cleanup + let _ = fs::remove_dir_all(&libmdbx_dir); + + Ok(BenchmarkResults { + total_accounts: accounts.len(), + write_time_ms: total_write_time.as_millis() as u64, + read_time_ms: read_time.as_millis() as u64, + root_hash, + }) } -// Generate test data (key = hash, value = account info) -fn generate_test_data(n: usize) -> Vec<(Vec, Vec)> { - (1..=n) - .map(|i| { - // 32-byte key (hash) - let key = Keccak256::new() - .chain_update(i.to_be_bytes()) - .finalize() - .to_vec(); - - // 104-byte value (account info: 2 hashes + u256 + u64) - let mut value = Vec::with_capacity(104); - value.extend_from_slice( - &Keccak256::new() - .chain_update((i * 2).to_be_bytes()) - .finalize(), - ); - value.extend_from_slice( - &Keccak256::new() - .chain_update((i * 3).to_be_bytes()) - .finalize(), - ); - value.extend_from_slice(&[0u8; 24]); // u256 padding - value.extend_from_slice(&(i as u64).to_be_bytes()); // u256 value - value.extend_from_slice(&(i as u64).to_be_bytes()); // u64 +fn print_scale_summary(results: &[BenchmarkResults], sample_size: usize, batch_count: usize) { + let ethrex_result = &results[0]; + let libmdbx_result = &results[1]; + + let ethrex_avg_batch = ethrex_result.write_time_ms as f64 / batch_count as f64; + let libmdbx_avg_batch = libmdbx_result.write_time_ms as f64 / batch_count as f64; + + println!( + "\n{} 
accounts ({} batches):", + ethrex_result.total_accounts, batch_count + ); + println!( + " EthrexDB: {:.0}ms avg/batch, {}ms total write, {}ms read ({} keys)", + ethrex_avg_batch, ethrex_result.write_time_ms, ethrex_result.read_time_ms, sample_size + ); + println!( + " LibMDBX: {:.0}ms avg/batch, {}ms total write, {}ms read ({} keys)", + libmdbx_avg_batch, libmdbx_result.write_time_ms, libmdbx_result.read_time_ms, sample_size + ); + + // Validate root hashes match + assert_eq!( + ethrex_result.root_hash, libmdbx_result.root_hash, + "Root hashes mismatch" + ); +} + +fn run_benchmark( + total_accounts: usize, +) -> Result, Box> { + println!("\nBenchmark: {} accounts", total_accounts); + println!("========================"); + let mut results = Vec::new(); + + let mut accounts: Vec<(Vec, Vec)> = (0..total_accounts) + .map(|id| { + let key = generate_account_hash(id as u64); + let value = generate_account_info(id as u64); (key, value) }) - .collect() + .collect(); + + let mut rng = thread_rng(); + accounts.shuffle(&mut rng); + + // Prepare read samples (1% for more reads) + let sample_size = (total_accounts / 100).clamp(1000, 50_000); + let mut sample_indices: Vec = (0..total_accounts).collect(); + sample_indices.shuffle(&mut rng); + let sample_keys: Vec<_> = sample_indices[..sample_size] + .iter() + .map(|&i| accounts[i].0.clone()) + .collect(); + + println!( + "Running benchmarks with {} read samples (1% of total)...", + sample_keys.len() + ); + + results.push(run_ethrex_benchmark(&accounts, &sample_keys)?); + results.push(run_libmdbx_benchmark(&accounts, &sample_keys)?); + + let batch_count = accounts.len().div_ceil(15_000); + print_scale_summary(&results, sample_keys.len(), batch_count); + + Ok(results) } -fn insert_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("insert"); - group.measurement_time(Duration::from_secs(15)); - group.sample_size(10); - - for size in [1_000, 10_000, 100_000, 1_000_000] { - let data = generate_test_data(size); - - // Hash - group.bench_with_input(BenchmarkId::new("libmdbx_hash", size), &data, |b, data| { - b.iter_with_setup( - || { - let temp_dir = TempDir::new("libmdbx_hash_bench").unwrap(); - LibmdbxHashDB::new(temp_dir.path()) - }, - |mut db| { - db.insert_batch(black_box(data)); - black_box(db) - }, +fn print_final_comparison(all_results: &[BenchmarkResults], read_samples: &[usize]) { + println!("\n\nFINAL COMPARISON"); + println!("================="); + println!( + "Scale EthrexDB Write LibMDBX Write EthrexDB Read LibMDBX Read Keys Read" + ); + println!( + "------ ------------- ------------- ------------- ------------ ---------" + ); + + for (i, chunk) in all_results.chunks(2).enumerate() { + if chunk.len() == 2 { + let ethrex = &chunk[0]; + let libmdbx = &chunk[1]; + + let scale_str = if ethrex.total_accounts >= 1_000_000 { + format!("{}M", ethrex.total_accounts / 1_000_000) + } else if ethrex.total_accounts >= 1_000 { + format!("{}k", ethrex.total_accounts / 1_000) + } else { + ethrex.total_accounts.to_string() + }; + + let keys_read = read_samples[i]; + + println!( + "{:<8} {:>11}ms {:>11}ms {:>11}ms {:>10}ms {:>9}", + scale_str, + ethrex.write_time_ms, + libmdbx.write_time_ms, + ethrex.read_time_ms, + libmdbx.read_time_ms, + keys_read ); - }); - - // Path - group.bench_with_input( - BenchmarkId::new("libmdbx_snapshot_path", size), - &data, - |b, data| { - b.iter_with_setup( - || { - let temp_dir = TempDir::new("libmdbx_path_bench").unwrap(); - LibmdbxSnapshotPathDB::new(temp_dir.path()) - }, - |db| { - 
db.insert_batch(black_box(data)); - black_box(db) - }, - ); - }, - ); - - // EthrexDB - group.bench_with_input(BenchmarkId::new("ethrex_db", size), &data, |b, data| { - b.iter_with_setup( - || { - let temp_dir = TempDir::new("ethrex_bench").unwrap(); - let file_path = temp_dir.path().join("test.edb"); - EthrexDB::new(file_path).unwrap() - }, - |mut db| { - let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - for (key, value) in data { - trie.insert(key.clone(), value.clone()).unwrap(); - } - let root_node = trie.root_node().unwrap().unwrap(); - db.commit(&root_node).unwrap(); - db - }, - ); - }); + } } - - group.finish(); } -fn random_get_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("random_get"); - group.measurement_time(Duration::from_secs(15)); - group.sample_size(10); - - for size in [1_000, 10_000, 100_000, 1_000_000] { - let data = generate_test_data(size); - - let sample_size = std::cmp::max(1, size / 1000); - let mut indices: Vec = (0..size).collect(); - indices.shuffle(&mut thread_rng()); - let sample_keys: Vec<_> = indices[..sample_size] - .iter() - .map(|&i| data[i].0.clone()) - .collect(); - - let libmdbx_hash_temp = TempDir::new("libmdbx_hash_read").unwrap(); - let mut libmdbx_hash_db = LibmdbxHashDB::new(libmdbx_hash_temp.path()); - libmdbx_hash_db.insert_batch(&data); - - let libmdbx_path_temp = TempDir::new("libmdbx_path_read").unwrap(); - let libmdbx_path_db = LibmdbxSnapshotPathDB::new(libmdbx_path_temp.path()); - libmdbx_path_db.insert_batch(&data); - - let ethrex_temp = TempDir::new("ethrex_read").unwrap(); - let ethrex_file = ethrex_temp.path().join("test.edb"); - let mut ethrex_db = EthrexDB::new(ethrex_file.clone()).unwrap(); - let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - for (key, value) in &data { - trie.insert(key.clone(), value.clone()).unwrap(); - } - let root_node = trie.root_node().unwrap().unwrap(); - ethrex_db.commit(&root_node).unwrap(); - - group.bench_with_input( - BenchmarkId::new("libmdbx_hash", size), - &sample_keys, - |b, keys| { - b.iter(|| { - let mut found = 0; - for key in keys { - if libmdbx_hash_db.get(black_box(key)).is_some() { - found += 1; - } - } - black_box(found) - }); - }, - ); - - group.bench_with_input( - BenchmarkId::new("libmdbx_snapshot_path", size), - &sample_keys, - |b, keys| { - b.iter(|| { - let mut found = 0; - for key in keys { - if libmdbx_path_db.get(black_box(key)).is_some() { - found += 1; - } - } - black_box(found) - }); - }, - ); - - group.bench_with_input( - BenchmarkId::new("ethrex_db", size), - &sample_keys, - |b, keys| { - b.iter_with_setup( - || EthrexDB::open(ethrex_file.clone()).unwrap(), - |db| { - let mut found = 0; - for key in keys { - if db.get(black_box(key)).unwrap().is_some() { - found += 1; - } - } - black_box(found) - }, - ); - }, - ); +fn main() -> Result<(), Box> { + println!("ETHREXDB VS LIBMDBX"); + println!("==================="); + + let scales = [10_000, 100_000, 500_000, 1_000_000]; + let mut all_results = Vec::new(); + let mut read_samples = Vec::new(); + + for &scale in &scales { + let sample_size = (scale / 100).clamp(1000, 50_000); + read_samples.push(sample_size); + let results = run_benchmark(scale)?; + all_results.extend(results); } - group.finish(); -} + print_final_comparison(&all_results, &read_samples); -criterion_group!(benches, insert_benchmark, random_get_benchmark); -criterion_main!(benches); + Ok(()) +} diff --git a/examples/profiling.rs b/examples/profiling.rs index db9de6d..b6adc5d 100644 --- a/examples/profiling.rs +++ 
b/examples/profiling.rs @@ -14,22 +14,32 @@ fn main() { let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); let mut keys = Vec::new(); - for i in 0..1_000_000 { - let key = format!("benchmark_key_{:08}", i); - let value = format!("value_for_key_{:08}", i); + let total_insert_time = Instant::now(); + println!("Inserting 100,000 keys 10 times"); + for batch in 0..10 { + let start_insert = Instant::now(); + for i in 0..100_000 { + let key = format!("benchmark_key_{:08}", batch * 100_000 + i); + let value = format!("value_for_key_{:08}", i); - trie.insert(key.as_bytes().to_vec(), value.as_bytes().to_vec()) - .unwrap(); - keys.push(key); - } + trie.insert(key.as_bytes().to_vec(), value.as_bytes().to_vec()) + .unwrap(); + keys.push(key); + } + let root_node = trie.root_node().unwrap().unwrap(); + let trie_hash = root_node.compute_hash(); + let db_hash = db.commit(&root_node).unwrap(); + trie.commit().unwrap(); + assert_eq!(trie_hash, db_hash); - // Single commit with all data - let start_insert = Instant::now(); - let root_node = trie.root_node().unwrap().unwrap(); - db.commit(&root_node).unwrap(); - println!("Insert phase completed in {:?}", start_insert.elapsed()); + println!( + "Insert 100,000 keys in batch {batch}. Time taken: {:?}", + start_insert.elapsed() + ); + } + println!("Total insert time: {:?}", total_insert_time.elapsed()); - // === PHASE 2: Random gets (this is what we want to profile) === + // === PHASE 2: Random gets === println!("Phase 2: Performing 1,000,000 random gets..."); let start_gets = Instant::now(); diff --git a/src/db.rs b/src/db.rs index 34820a5..7999624 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,7 +1,17 @@ -//! EthrexDB - A simple MPT database +//! EthrexDB - Merkle Patricia Trie Database +//! +//! The database implements Copy-on-Write (CoW) optimization where only modified nodes +//! are written during commits. Unchanged nodes are referenced by their file offset, +//! avoiding duplication. All writes are append-only - data is never overwritten, +//! only appended to the end of the file. +//! +//! Each commit creates a new root that links to the previous root via a prepended +//! offset, forming a linked list of all historical states. This allows traversing +//! the entire version history if needed. use crate::file_manager::FileManager; -use crate::serialization::{Deserializer, serialize}; +use crate::index::Index; +use crate::serialization::{Deserializer, Serializer}; use crate::trie::{Node, NodeHash, TrieError}; use std::path::PathBuf; @@ -9,142 +19,83 @@ use std::path::PathBuf; pub struct EthrexDB { /// File manager file_manager: FileManager, + /// Index mapping node hashes to their file offsets + /// TODO: Read from file if it exists to + node_index: Index, } impl EthrexDB { /// Create a new database pub fn new(file_path: PathBuf) -> Result { - let file_manager = FileManager::create(file_path)?; - Ok(Self { file_manager }) + let file_manager = FileManager::create(file_path.clone())?; + let node_index = Index::new(); + Ok(Self { + file_manager, + node_index, + }) } /// Open an existing database pub fn open(file_path: PathBuf) -> Result { - let file_manager = FileManager::open(file_path)?; - Ok(Self { file_manager }) + let file_manager = FileManager::open(file_path.clone())?; + // TODO: Read node index from file if it exists + let node_index = Index::new(); + Ok(Self { + file_manager, + node_index, + }) } - /// Commit a new trie to the database - /// - /// Creates a new version in the database by: - /// 1. 
Reading the current latest version offset from header - /// 2. Serializing the trie nodes - /// 3. Writing [previous_offset][serialized_nodes] at the end of file - /// 4. Updating the header to point to this new version - /// - /// NOTE: Right now, we are storing the complete trie in the database. We should - /// store only the root node and the updated nodes. + /// Commit a trie state to the database pub fn commit(&mut self, root_node: &Node) -> Result { let root_hash = root_node.compute_hash(); - // Read the previous root offset from header - let previous_root_offset = self.file_manager.read_latest_root_offset()?; + let prev_root_offset = self.file_manager.read_latest_root_offset()?; + let base_offset = self.file_manager.get_file_size()?; - let serialized_trie = serialize(root_node); + let serializer = Serializer::new(&self.node_index, base_offset); + let (serialized_data, new_offsets, root_offset) = + serializer.serialize_tree(root_node, prev_root_offset)?; - // Prepare version data: [prev_offset(8 bytes)] + [trie_data] - let mut data_to_write = Vec::with_capacity(8 + serialized_trie.len()); - data_to_write.extend_from_slice(&previous_root_offset.to_le_bytes()); - data_to_write.extend_from_slice(&serialized_trie); + self.file_manager.write_at_end(&serialized_data)?; - // Write at the end and get the offset where this version starts - let new_root_offset = self.file_manager.write_at_end(&data_to_write)?; - - // Update header to point to this new version - self.file_manager - .update_latest_root_offset(new_root_offset)?; + // Update node index with new node offsets + for (hash, absolute_offset) in new_offsets { + self.node_index.insert(hash, absolute_offset); + } + // Update header to point to the root node + self.file_manager.update_latest_root_offset(root_offset)?; Ok(root_hash) } /// Get the latest root node of the database pub fn root(&self) -> Result { let latest_offset = self.file_manager.read_latest_root_offset()?; - let trie_data = self.get_trie_data_at_version(latest_offset)?; - let root_node = Deserializer::new(trie_data).decode_tree()?; - Ok(root_node) + if latest_offset == 0 { + panic!("No root node in database"); + } + + let file_data = self.file_manager.get_slice_to_end(0)?; + // All roots now have 8-byte prepended previous root offset + let actual_root_offset = latest_offset + 8; + + Deserializer::new(file_data).decode_node_at(actual_root_offset as usize) } /// Get the value of the node with the given key pub fn get(&self, key: &[u8]) -> Result>, TrieError> { let latest_offset = self.file_manager.read_latest_root_offset()?; - self.get_at_version(key, latest_offset) - } - - /// Get the value of the node with the given key at a specific version - pub fn get_at_version( - &self, - key: &[u8], - version_offset: u64, - ) -> Result>, TrieError> { - if version_offset == 0 { + if latest_offset == 0 { return Ok(None); } - let trie_data = self.get_trie_data_at_version(version_offset)?; - - Deserializer::new(trie_data).get_by_path(key) - } - - /// Get all the roots of the database - /// TODO: Make this an iterator - pub fn iter_roots(&self) -> Result, TrieError> { - let mut roots = Vec::new(); - let mut current_offset = self.file_manager.read_latest_root_offset()?; - - while current_offset != 0 { - let trie_data = self.get_trie_data_at_version(current_offset)?; - - // Deserialize the trie at this version - let root_node = Deserializer::new(trie_data).decode_tree()?; - roots.push(root_node); - current_offset = self.read_previous_offset_at_version(current_offset)?; - } - - Ok(roots) - } 
+ let file_data = self.file_manager.get_slice_to_end(0)?; - /// Get trie data slice at a specific version - /// - /// Each version has format: [prev_offset: 8 bytes][trie_data] - /// This function skips the prev_offset and returns only the trie_data portion - fn get_trie_data_at_version(&self, version_offset: u64) -> Result<&[u8], TrieError> { - // Skip the previous offset (8 bytes) to get to the trie data - let trie_data_start = version_offset + 8; - let next_version_offset = self.find_next_version_offset(version_offset)?; - - match next_version_offset { - Some(next_offset) => { - let size = (next_offset - trie_data_start) as usize; - self.file_manager.get_slice_at(trie_data_start, size) - } - None => { - // Last version, read until the end - self.file_manager.get_slice_to_end(trie_data_start) - } - } - } + // All roots have 8-byte prepended previous root offset + let actual_root_offset = latest_offset + 8; - /// Read the previous offset at a specific version - fn read_previous_offset_at_version(&self, version_offset: u64) -> Result { - let prev_offset_slice = self.file_manager.get_slice_at(version_offset, 8)?; - Ok(u64::from_le_bytes(prev_offset_slice.try_into().unwrap())) - } - - /// Find the offset of the next version after the given offset - fn find_next_version_offset(&self, current_offset: u64) -> Result, TrieError> { - let mut offset = self.file_manager.read_latest_root_offset()?; - let mut next_offset = None; - - // Walk the linked list to find the smallest offset greater than current_offset - while offset != 0 { - if offset > current_offset && (next_offset.is_none() || offset < next_offset.unwrap()) { - next_offset = Some(offset); - } - offset = self.read_previous_offset_at_version(offset)?; - } - - Ok(next_offset) + Deserializer::new(file_data).get_by_path_at(key, actual_root_offset as usize) } } @@ -229,10 +180,7 @@ mod tests { trie.insert(b"common".to_vec(), b"v1".to_vec()).unwrap(); let root_node = trie.root_node().unwrap().unwrap(); db.commit(&root_node).unwrap(); - - // Note: We can't call trie.commit() because it converts NodeRef::Node to NodeRef::Hash - // and our serialization doesn't support hash references yet - // trie.commit().unwrap(); + trie.commit().unwrap(); assert_eq!(db.root().unwrap(), root_node); @@ -241,10 +189,7 @@ mod tests { trie.insert(b"common".to_vec(), b"v2".to_vec()).unwrap(); let root_node = trie.root_node().unwrap().unwrap(); db.commit(&root_node).unwrap(); - - // Note: We can't call trie.commit() because it converts NodeRef::Node to NodeRef::Hash - // and our serialization doesn't support hash references yet - // trie.commit().unwrap(); + trie.commit().unwrap(); assert_eq!(db.root().unwrap(), root_node); @@ -252,10 +197,7 @@ mod tests { trie.insert(b"common".to_vec(), b"v3".to_vec()).unwrap(); let root_node = trie.root_node().unwrap().unwrap(); db.commit(&root_node).unwrap(); - - // Note: We can't call trie.commit() because it converts NodeRef::Node to NodeRef::Hash - // and our serialization doesn't support hash references yet - // trie.commit().unwrap(); + trie.commit().unwrap(); assert_eq!(db.root().unwrap(), root_node); @@ -265,36 +207,6 @@ mod tests { assert_eq!(db.get(b"key2").unwrap(), Some(b"value2".to_vec())); assert_eq!(db.get(b"nonexistent").unwrap(), None); - assert_eq!(db.iter_roots().unwrap().len(), 3); - } - - #[test] - fn test_iter_roots() { - let temp_dir = TempDir::new("ethrex_db_test").unwrap(); - let db_path = temp_dir.path().join("test.edb"); - let mut db = EthrexDB::new(db_path.clone()).unwrap(); - - // Empty DB should have 
no roots - let roots: Vec = db.iter_roots().unwrap(); - assert_eq!(roots.len(), 0); - - for i in 1..=3 { - let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); - trie.insert( - format!("key{}", i).into_bytes(), - format!("value{}", i).into_bytes(), - ) - .unwrap(); - let root_node = trie.root_node().unwrap().unwrap(); - db.commit(&root_node).unwrap(); - } - - let roots: Vec = db.iter_roots().unwrap(); - assert_eq!(roots.len(), 3); - - for root in &roots { - root.compute_hash(); - } } #[test] @@ -367,12 +279,243 @@ mod tests { let result = db.get(key).unwrap(); assert_eq!(result, Some(expected_value.clone())); } + } - let roots: Vec = db.iter_roots().unwrap(); - assert_eq!(roots.len(), 3); + // Helper function to generate test data + fn generate_test_data(n: usize) -> Vec<(Vec, Vec)> { + use sha3::{Digest, Keccak256}; + + (1..=n) + .map(|i| { + // 32-byte key (hash) + let key = Keccak256::new() + .chain_update(i.to_be_bytes()) + .finalize() + .to_vec(); + + // 104-byte value (account info: 2 hashes + u256 + u64) + let mut value = Vec::with_capacity(104); + value.extend_from_slice( + &Keccak256::new() + .chain_update((i * 2).to_be_bytes()) + .finalize(), + ); + value.extend_from_slice( + &Keccak256::new() + .chain_update((i * 3).to_be_bytes()) + .finalize(), + ); + value.extend_from_slice(&[0u8; 24]); // u256 padding + value.extend_from_slice(&(i as u64).to_be_bytes()); // u256 value + value.extend_from_slice(&(i as u64).to_be_bytes()); // u64 + + (key, value) + }) + .collect() + } - for root in roots.iter() { - root.compute_hash(); + #[test] + fn test_blockchain_simulation_with_incremental_storage() { + let temp_dir = TempDir::new("ethrex_blockchain_sim").unwrap(); + let db_path = temp_dir.path().join("blockchain.edb"); + + let mut db = EthrexDB::new(db_path.clone()).unwrap(); + let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + + // Batch 1: Initial accounts + let batch1_data = generate_test_data(100); + + for (key, value) in batch1_data.iter() { + trie.insert(key.clone(), value.clone()).unwrap(); + } + + let root_node1 = trie.root_node().unwrap().unwrap(); + let trie_root_hash1 = root_node1.compute_hash(); + let db_root_hash1 = db.commit(&root_node1).unwrap(); + trie.commit().unwrap(); // Convert to NodeRef::Hash + + assert_eq!( + trie_root_hash1, db_root_hash1, + "Root hashes must match after batch 1" + ); + assert_eq!( + db.root().unwrap(), + root_node1, + "DB root must match trie root after batch 1" + ); + + // Batch 2: New transactions + modify some existing accounts + let new_accounts_batch2 = generate_test_data(150); + + // Add 50 new accounts (indices 100-149) + for (key, value) in new_accounts_batch2[100..].iter() { + trie.insert(key.clone(), value.clone()).unwrap(); + } + + // Modify some existing accounts from batch 1 + for i in [10, 25, 50, 75].iter() { + if *i < batch1_data.len() { + let (key, _) = &batch1_data[*i]; + let new_value = format!("modified_account_{}", i).into_bytes(); + trie.insert(key.clone(), new_value).unwrap(); + } + } + + let root_node2 = trie.root_node().unwrap().unwrap(); + let trie_root_hash2 = root_node2.compute_hash(); + let db_root_hash2 = db.commit(&root_node2).unwrap(); + trie.commit().unwrap(); // Convert to NodeRef::Hash + + assert_eq!( + trie_root_hash2, db_root_hash2, + "Root hashes must match after batch 2" + ); + assert_eq!( + db.root().unwrap(), + root_node2, + "DB root must match trie root after batch 2" + ); + + // Batch 3: More transactions + let new_accounts_batch3 = generate_test_data(200); + + // Add 50 more 
new accounts (indices 150-199) + for (key, value) in &new_accounts_batch3[150..] { + trie.insert(key.clone(), value.clone()).unwrap(); + } + + // Modify more existing accounts + for i in [5, 15, 35, 45, 110, 125].iter() { + if *i < 150 { + let test_data = generate_test_data(*i + 1); + let (key, _) = &test_data[*i]; + let new_value = format!("batch3_modified_{}", i).into_bytes(); + trie.insert(key.clone(), new_value).unwrap(); + } + } + + let root_node3 = trie.root_node().unwrap().unwrap(); + let trie_root_hash3 = root_node3.compute_hash(); + let db_root_hash3 = db.commit(&root_node3).unwrap(); + trie.commit().unwrap(); // Convert to NodeRef::Hash + + assert_eq!( + trie_root_hash3, db_root_hash3, + "Root hashes must match after batch 3" + ); + assert_eq!( + db.root().unwrap(), + root_node3, + "DB root must match trie root after batch 3" + ); + + // Batch 4: Large update batch + let new_accounts_batch4 = generate_test_data(250); + + // Add 50 more new accounts (indices 200-249) + for (key, value) in &new_accounts_batch4[200..] { + trie.insert(key.clone(), value.clone()).unwrap(); + } + + // Modify many existing accounts + for i in [1, 20, 30, 40, 60, 80, 90, 105, 115, 135, 145, 170, 180].iter() { + if *i < 200 { + let test_data = generate_test_data(*i + 1); + let (key, _) = &test_data[*i]; + let new_value = format!("batch4_update_{}", i).into_bytes(); + trie.insert(key.clone(), new_value).unwrap(); + } + } + + let root_node4 = trie.root_node().unwrap().unwrap(); + let trie_root_hash4 = root_node4.compute_hash(); + let db_root_hash4 = db.commit(&root_node4).unwrap(); + trie.commit().unwrap(); // Convert to NodeRef::Hash + + assert_eq!( + trie_root_hash4, db_root_hash4, + "Root hashes must match after batch 4" + ); + assert_eq!( + db.root().unwrap(), + root_node4, + "DB root must match trie root after batch 4" + ); + + // Batch 5: Final verification batch + let new_accounts_batch5 = generate_test_data(300); + + // Add 50 final accounts (indices 250-299) + for (key, value) in &new_accounts_batch5[250..] 
{ + trie.insert(key.clone(), value.clone()).unwrap(); + } + + // Few more modifications + for i in [8, 28, 58, 88, 128, 158, 188, 218].iter() { + if *i < 250 { + let test_data = generate_test_data(*i + 1); + let (key, _) = &test_data[*i]; + let new_value = format!("final_update_{}", i).into_bytes(); + trie.insert(key.clone(), new_value).unwrap(); + } + } + + let root_node5 = trie.root_node().unwrap().unwrap(); + let trie_root_hash5 = root_node5.compute_hash(); + let db_root_hash5 = db.commit(&root_node5).unwrap(); + trie.commit().unwrap(); // Convert to NodeRef::Hash + + assert_eq!( + trie_root_hash5, db_root_hash5, + "Root hashes must match after batch 5" + ); + assert_eq!( + db.root().unwrap(), + root_node5, + "DB root must match trie root after batch 5" + ); + + // Random verification of some accounts + for batch_num in 1..=5 { + let test_data = generate_test_data(batch_num * 50); + if let Some((key, _)) = test_data.get(batch_num * 10) { + assert_eq!(db.get(key).unwrap(), trie.get(key).unwrap()); + } } } + + #[test] + fn test_file_size() { + let temp_dir = TempDir::new("ethrex_db_test").unwrap(); + let db_path = temp_dir.path().join("test.edb"); + + let mut db = EthrexDB::new(db_path.clone()).unwrap(); + + let mut trie = Trie::new(Box::new(InMemoryTrieDB::new_empty())); + + // Insert 100,000 keys + for i in 0..100_000 { + let key = format!("key_{}", i); + let value = format!("value_{}", i); + trie.insert(key.as_bytes().to_vec(), value.as_bytes().to_vec()) + .unwrap(); + } + let root_node = trie.root_node().unwrap().unwrap(); + db.commit(&root_node).unwrap(); + trie.commit().unwrap(); + // Check file size after inserting 100,000 keys + let insert_file_size = std::fs::metadata(db_path.clone()).unwrap().len(); + + // Update a single key + trie.insert(b"key_1".to_vec(), b"updated_value".to_vec()) + .unwrap(); + let root_node = trie.root_node().unwrap().unwrap(); + db.commit(&root_node).unwrap(); + // Check file size after updating a single key + let update_file_size = std::fs::metadata(db_path).unwrap().len(); + + // File after update should have a very small increase + assert!(insert_file_size < update_file_size); + assert!(update_file_size < insert_file_size + 1000); + } } diff --git a/src/file_manager.rs b/src/file_manager.rs index 7c86510..c47d6fd 100644 --- a/src/file_manager.rs +++ b/src/file_manager.rs @@ -1,23 +1,33 @@ +//! File management +//! +//! The FileManager handles all low-level file operations for `EthrexDB`, implementing +//! an append-only storage strategy where data is never overwritten. All writes go +//! to the end of the file, preserving historical data and enabling version traversal. +//! +//! File Layout: +//! ```text +//! Offset 0: [header: 8 bytes] // Points to latest root offset +//! Offset 8: [commit_1_data...] // First commit's data +//! Offset X: [commit_2_data...] // Second commit's data +//! Offset Y: [commit_N_data...] // Latest commit's data +//! ``` +//! +//! The header at offset 0 always contains the offset of the most recent root. +//! This is the only part of the file that gets updated in-place. Everything else +//! is append-only. +//! +//! Each commit's data starts with an 8-byte link to the previous root offset, +//! creating a linked list through all versions: +//! - First commit: prev_root_offset = 0 (marks end of chain) +//! 
- Later commits: prev_root_offset = offset of previous root + use crate::trie::TrieError; use memmap2::{Mmap, MmapOptions}; use std::fs::{File, OpenOptions}; use std::io::{Seek, SeekFrom, Write}; use std::path::PathBuf; -/// Responsible for file management and offsets -/// -/// File format: -/// ```text -/// [header: 8 bytes] -> points to latest root version -/// [version 1: [prev_offset: 8 bytes][nodes]] -/// [version 2: [prev_offset: 8 bytes][nodes]] -/// ... -/// [version N: [prev_offset: 8 bytes][nodes]] <- latest version -/// ``` -/// -/// Each version contains: -/// - prev_offset: Points to the previous version -/// - nodes: Serialized trie nodes +/// File manager for `EthrexDB` pub struct FileManager { /// File where the data is stored file: File, @@ -27,7 +37,7 @@ pub struct FileManager { } impl FileManager { - /// Create a new database file + /// Create a new file pub fn create(file_path: PathBuf) -> Result { if let Some(parent) = file_path.parent() { std::fs::create_dir_all(parent).unwrap(); @@ -74,15 +84,12 @@ impl FileManager { Ok(u64::from_le_bytes(offset_bytes)) } - /// Update the header to point to the new latest root version + /// Update the header to point to the new latest root offset pub fn update_latest_root_offset(&mut self, new_offset: u64) -> Result<(), TrieError> { self.file.seek(SeekFrom::Start(0)).unwrap(); self.file.write_all(&new_offset.to_le_bytes()).unwrap(); self.file.flush().unwrap(); - - // TODO: Check if this is needed - self.mmap = unsafe { MmapOptions::new().map(&self.file).unwrap() }; - + self.refresh_mmap(); Ok(()) } @@ -91,11 +98,18 @@ impl FileManager { let offset = self.file.seek(SeekFrom::End(0)).unwrap(); self.file.write_all(data).unwrap(); self.file.flush().unwrap(); + self.refresh_mmap(); + Ok(offset) + } - // TODO: Check if this is needed + /// Refresh memory map after file modifications + fn refresh_mmap(&mut self) { self.mmap = unsafe { MmapOptions::new().map(&self.file).unwrap() }; + } - Ok(offset) + /// Get the current file size + pub fn get_file_size(&self) -> Result { + Ok(self.mmap.len() as u64) } /// Get slice from a specific offset to the end of the file @@ -106,16 +120,6 @@ impl FileManager { Ok(&self.mmap[offset as usize..]) } - - /// Get slice of exactly n bytes from a specific offset - pub fn get_slice_at(&self, offset: u64, size: usize) -> Result<&[u8], TrieError> { - let start = offset as usize; - let end = start + size; - - assert!(end <= self.mmap.len(), "Offset out of bounds"); - - Ok(&self.mmap[start..end]) - } } #[cfg(test)] @@ -165,8 +169,14 @@ mod tests { { let mut fm = FileManager::create(file_path.clone()).unwrap(); + assert_eq!( + fm.get_file_size().unwrap(), + 8, + "File is empty but should have 8 bytes for the header" + ); fm.update_latest_root_offset(456).unwrap(); fm.write_at_end(b"persistent data").unwrap(); + assert_ne!(fm.get_file_size().unwrap(), 8); } let fm = FileManager::open(file_path).unwrap(); @@ -174,5 +184,6 @@ mod tests { let data = fm.get_slice_to_end(8).unwrap().to_vec(); assert_eq!(data, b"persistent data"); + assert_ne!(fm.get_file_size().unwrap(), 8); } } diff --git a/src/index.rs b/src/index.rs new file mode 100644 index 0000000..415acda --- /dev/null +++ b/src/index.rs @@ -0,0 +1,105 @@ +//! Simple in-memory index: [`NodeHash`] -> offset lookups +//! +//! This is used to store the absolute offset of the node in the file +//! for each node hash. With this information, we can create new nodes and be able +//! to point to nodes that didn't change and exist in the file. 
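+//!
+//! A minimal sketch of the intended flow (illustrative only; the real wiring lives in
+//! `EthrexDB::commit` and `Serializer::serialize_node`, and `node_hash`/`absolute_offset`
+//! are placeholder names):
+//!
+//! ```ignore
+//! let mut index = Index::new();
+//! // After a node is written at `absolute_offset`, remember where it lives:
+//! index.insert(node_hash, absolute_offset);
+//! // On a later commit, an unchanged node is found here and referenced by its
+//! // offset instead of being serialized again:
+//! if let Some(offset) = index.get(&node_hash) {
+//!     // reuse `offset` rather than re-writing the node
+//! }
+//! ```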
+ +use crate::trie::NodeHash; +use std::collections::HashMap; + +/// Simple in-memory index +#[derive(Debug, Default)] +pub struct Index { + /// Index map + /// TODO: Use a better data structure + /// TODO: Read from file if it exists + data: HashMap, +} + +impl Index { + /// Create a new empty index + pub fn new() -> Self { + Self { + data: HashMap::new(), + } + } + + /// Get an offset by node hash + pub fn get(&self, hash: &NodeHash) -> Option { + self.data.get(hash).copied() + } + + /// Insert a new node hash -> offset mapping + pub fn insert(&mut self, hash: NodeHash, offset: u64) { + self.data.insert(hash, offset); + } + + /// Get the number of entries in the index + pub fn len(&self) -> usize { + self.data.len() + } + + /// Check if the index is empty + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } + + /// Clear all entries + pub fn clear(&mut self) { + self.data.clear(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_new_index() { + let index = Index::new(); + assert_eq!(index.len(), 0); + assert!(index.is_empty()); + } + + #[test] + fn test_insert_and_get() { + let mut index = Index::new(); + let hash = NodeHash::default(); + let offset = 1234u64; + + index.insert(hash, offset); + assert_eq!(index.get(&hash), Some(offset)); + assert_eq!(index.len(), 1); + assert!(!index.is_empty()); + } + + #[test] + fn test_multiple_hash_types() { + let mut index = Index::new(); + + // Test with different NodeHash variants + let inline_hash = NodeHash::from_slice(&[1, 2, 3]); + let hashed_hash = NodeHash::from_slice(&[0u8; 32]); + + index.insert(inline_hash, 100); + index.insert(hashed_hash, 200); + + assert_eq!(index.get(&inline_hash), Some(100)); + assert_eq!(index.get(&hashed_hash), Some(200)); + assert_eq!(index.len(), 2); + } + + #[test] + fn test_clear() { + let mut index = Index::new(); + let hash = NodeHash::default(); + + index.insert(hash, 123); + assert_eq!(index.len(), 1); + + index.clear(); + assert_eq!(index.len(), 0); + assert!(index.is_empty()); + assert_eq!(index.get(&hash), None); + } +} diff --git a/src/lib.rs b/src/lib.rs index d2256e6..30cce52 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,8 @@ mod db; /// Interact with the file mod file_manager; +/// In-memory index +pub mod index; /// Serialization and deserialization of the trie. mod serialization; diff --git a/src/serialization.rs b/src/serialization.rs index 9fe39c1..9a40422 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -1,18 +1,57 @@ -//! Serialization and deserialization of the trie +//! Serialization and deserialization of the trie. //! -//! Two-node serialization format: -//! Instead of the standard 3 node types (Branch, Extension, Leaf), we use 2: -//! - Branch: Has 16 children slots + 1 value slot -//! - Extend: Has 1 child slot + 1 value slot (can represent both Extension and Leaf) +//! This module implements a two-node serialization format that reduces the standard +//! three MPT node types ([`BranchNode`], [`ExtensionNode`], [`LeafNode`]) to just two: +//! `Branch` and `Extend`. +//! The `Extend` node cleverly represents both `Extension` and `Leaf` nodes based on which fields are populated. //! -//! This simplifies serialization: -//! - Leaf -> Extend with value but no child (child_offset = 0) -//! - Extension -> Extend with child but no value (value_offset = 0) -//! - Branch -> Branch (unchanged) - +//! File Structure: +//! Each commit in the file has the following layout: +//! +//! ```text +//! 
[prev_root_offset: 8 bytes] // Links to previous root (0 for first commit) +//! [root_node_data] // Root node serialized first +//! [child_nodes_data...] // Children in depth-first order +//! ``` +//! +//! Node Serialization Format: +//! +//! Branch Node: +//! ```text +//! [tag: 1 byte = 0x00] +//! [child_offsets: 16 * 8 bytes] // Offsets to 16 possible children (0 if empty) +//! [value_offset: 8 bytes] // Offset to value data (0 if no value) +//! ``` +//! +//! Extend Node (Extension or Leaf): +//! - If the value is empty and the child is not zero, it's an Extension node. +//! - If the value is not empty and the child is zero, it's a Leaf node. +//! +//! ```text +//! [tag: 1 byte = 0x01] +//! [nibbles_len: 4 bytes] +//! [nibbles_data: variable] +//! [child_offset: 8 bytes] // 0 for Leaf, valid offset for Extension +//! [value_offset: 8 bytes] // Valid offset for Leaf, 0 for Extension +//! ``` +//! +//! Copy-on-Write: +//! During [`Serializer::serialize_tree`], each node is checked against the +//! [`Serializer::node_index`]. If the node's hash already exists, its offset is +//! returned immediately without re-serialization. This means unchanged subtrees +//! are never duplicated - they're referenced by offset. + +use crate::index::Index; +use std::collections::HashMap; +use std::ops::Not; use std::sync::{Arc, OnceLock}; -use crate::trie::{BranchNode, ExtensionNode, LeafNode, Nibbles, Node, NodeRef, TrieError}; +use crate::trie::{ + BranchNode, ExtensionNode, LeafNode, Nibbles, Node, NodeHash, NodeRef, TrieError, +}; + +/// Result type for incremental serialization: (serialized_data, new_node_offsets, root_offset) +type SerializationResult = Result<(Vec, HashMap, u64), TrieError>; /// Tag for Branch node (16 children + 1 value) const TAG_BRANCH: u8 = 0; @@ -20,126 +59,154 @@ const TAG_BRANCH: u8 = 0; const TAG_EXTEND: u8 = 1; /// Serializes a Merkle Patricia Trie into a byte buffer using the two node format -/// -/// - Branch: 16 node offsets + 1 value offset -/// - Extend: 1 node offset + 1 value offset (combines Extension and Leaf) -#[derive(Default)] -pub struct Serializer { - /// Buffer where serialized data is accumulated +pub struct Serializer<'a> { + /// Buffer to store the serialized data buffer: Vec, + /// Index of the nodes in the buffer + node_index: &'a Index, + /// Serialized nodes in this batch + new_nodes: HashMap, + /// Base offset of the buffer + base_offset: u64, } -impl Serializer { - pub fn new() -> Self { - Self::default() +impl<'a> Serializer<'a> { + /// Create a new serializer with existing node index + pub fn new(node_index: &'a Index, base_offset: u64) -> Self { + Self { + buffer: Vec::new(), + node_index, + new_nodes: HashMap::new(), + base_offset, + } } - /// Serializes a trie using the two node format - pub fn serialize_tree(mut self, root: &Node) -> Result, TrieError> { + /// Serializes a trie, only storing new nodes + /// Always prepends the previous root offset (0 for first root) + pub fn serialize_tree(mut self, root: &Node, prev_root_offset: u64) -> SerializationResult { + // Store where the root structure starts + let root_structure_offset = self.base_offset + self.buffer.len() as u64; + + // Always prepend the previous root offset (0 for first root) + self.buffer + .extend_from_slice(&prev_root_offset.to_le_bytes()); + + // Serialize the actual root node self.serialize_node(root)?; - Ok(self.buffer) + + // Return the offset to the start of the root structure (with prepended offset) + Ok((self.buffer, self.new_nodes, root_structure_offset)) } - /// 
Serializes a node, converting from 3 node to 2 node system + /// Serializes a node, checking CoW first fn serialize_node(&mut self, node: &Node) -> Result { - let offset = self.buffer.len() as u64; + let hash = node.compute_hash(); + + if let Some(existing_offset) = self.node_index.get(&hash) { + return Ok(existing_offset); + } + + // Node is new, serialize it + let buffer_offset = self.buffer.len() as u64; + let absolute_offset = self.base_offset + buffer_offset; + self.new_nodes.insert(hash, absolute_offset); match node { - Node::Leaf(leaf) => { - // Leaf becomes Extend with only value - self.buffer.push(TAG_EXTEND); + Node::Leaf(leaf) => self.serialize_leaf(leaf)?, + Node::Extension(ext) => self.serialize_extension(ext)?, + Node::Branch(branch) => self.serialize_branch(branch)?, + } - let compact_nibbles = leaf.partial.encode_compact(); - self.write_bytes_with_len(&compact_nibbles); + Ok(absolute_offset) + } - // Reserve space for offsets - let value_offset_pos = self.buffer.len(); - self.buffer.extend_from_slice(&0u64.to_le_bytes()); // node offset = 0 - self.buffer.extend_from_slice(&0u64.to_le_bytes()); // value offset placeholder + fn serialize_leaf(&mut self, leaf: &LeafNode) -> Result<(), TrieError> { + self.buffer.push(TAG_EXTEND); + let compact_nibbles = leaf.partial.encode_compact(); + self.write_bytes_with_len(&compact_nibbles); - let value_offset = self.buffer.len() as u64; - self.write_bytes_with_len(&leaf.value); + // Child offset = 0, value offset will be filled + self.buffer.extend_from_slice(&0u64.to_le_bytes()); + let value_offset_pos = self.buffer.len(); + self.buffer.extend_from_slice(&0u64.to_le_bytes()); - // Go back and write the actual value offset - self.buffer[value_offset_pos + 8..value_offset_pos + 16] - .copy_from_slice(&value_offset.to_le_bytes()); + let value_offset = self.base_offset + self.buffer.len() as u64; + self.write_bytes_with_len(&leaf.value); - Ok(offset) - } - Node::Extension(ext) => { - // Extension becomes Extend with only child - self.buffer.push(TAG_EXTEND); - - let compact_prefix = ext.prefix.encode_compact(); - self.write_bytes_with_len(&compact_prefix); - - // Reserve space for offsets - let child_offset_pos = self.buffer.len(); - self.buffer.extend_from_slice(&0u64.to_le_bytes()); // child offset placeholder - self.buffer.extend_from_slice(&0u64.to_le_bytes()); // value offset = 0 - - let child_offset = match &ext.child { - NodeRef::Hash(hash) => { - if hash.is_valid() { - panic!("Hash references not supported in serialization"); - } - 0u64 // Empty child - } - NodeRef::Node(node, _) => self.serialize_node(node)?, - }; + // Write actual value offset + self.buffer[value_offset_pos..value_offset_pos + 8] + .copy_from_slice(&value_offset.to_le_bytes()); - // Go back and write the actual child offset - if child_offset > 0 { - self.buffer[child_offset_pos..child_offset_pos + 8] - .copy_from_slice(&child_offset.to_le_bytes()); - } + Ok(()) + } - Ok(offset) - } - Node::Branch(branch) => { - // Branch stays Branch but with offsets - self.buffer.push(TAG_BRANCH); - - // Reserve space for all offsets - let offsets_start = self.buffer.len(); - // 16 child offsets + 1 value offset - for _ in 0..17 { - self.buffer.extend_from_slice(&0u64.to_le_bytes()); - } + fn serialize_extension(&mut self, ext: &ExtensionNode) -> Result<(), TrieError> { + self.buffer.push(TAG_EXTEND); + let compact_prefix = ext.prefix.encode_compact(); + self.write_bytes_with_len(&compact_prefix); - // Serialize all children and collect their offsets - let mut child_offsets = 
-                let mut child_offsets = [0u64; 16];
-                for (i, child) in branch.choices.iter().enumerate() {
-                    child_offsets[i] = match child {
-                        NodeRef::Hash(hash) => {
-                            if hash.is_valid() {
-                                panic!("Hash references not supported in serialization");
-                            }
-                            0u64
-                        }
-                        NodeRef::Node(node, _) => self.serialize_node(node)?,
-                    };
-                }
+        // Child offset will be filled, value offset = 0
+        let child_offset_pos = self.buffer.len();
+        self.buffer.extend_from_slice(&0u64.to_le_bytes());
+        self.buffer.extend_from_slice(&0u64.to_le_bytes());

-                // Serialize value if present
-                let value_offset = if branch.value.is_empty() {
-                    0u64
-                } else {
-                    let offset = self.buffer.len() as u64;
-                    self.write_bytes_with_len(&branch.value);
-                    offset
-                };
+        let child_offset = self.serialize_noderef(&ext.child)?;

-                // Go back and write all the actual offsets
-                let mut pos = offsets_start;
-                for &child_offset in &child_offsets {
-                    self.buffer[pos..pos + 8].copy_from_slice(&child_offset.to_le_bytes());
-                    pos += 8;
-                }
-                self.buffer[pos..pos + 8].copy_from_slice(&value_offset.to_le_bytes());
+        // Write actual child offset
+        self.buffer[child_offset_pos..child_offset_pos + 8]
+            .copy_from_slice(&child_offset.to_le_bytes());
+
+        Ok(())
+    }
+
+    fn serialize_branch(&mut self, branch: &BranchNode) -> Result<(), TrieError> {
+        self.buffer.push(TAG_BRANCH);
+
+        // Reserve space for 16 child offsets + 1 value offset
+        let offsets_start = self.buffer.len();
+        for _ in 0..17 {
+            self.buffer.extend_from_slice(&0u64.to_le_bytes());
+        }
+
+        // Serialize children
+        let mut child_offsets = [0u64; 16];
+        for (i, child) in branch.choices.iter().enumerate() {
+            child_offsets[i] = self.serialize_noderef(child)?;
+        }
+
+        // Serialize the value (if present) and record its offset
+        let value_offset = branch
+            .value
+            .is_empty()
+            .not()
+            .then(|| {
+                let offset = self.base_offset + self.buffer.len() as u64;
+                self.write_bytes_with_len(&branch.value);
+                offset
+            })
+            .unwrap_or_default();
+
+        // Write all offsets
+        let mut pos = offsets_start;
+        for &child_offset in &child_offsets {
+            self.buffer[pos..pos + 8].copy_from_slice(&child_offset.to_le_bytes());
+            pos += 8;
+        }
+        self.buffer[pos..pos + 8].copy_from_slice(&value_offset.to_le_bytes());
+
+        Ok(())
+    }

-                Ok(offset)
+    fn serialize_noderef(&mut self, noderef: &NodeRef) -> Result<u64, TrieError> {
+        match noderef {
+            NodeRef::Node(node, _) => self.serialize_node(node),
+            NodeRef::Hash(hash) if hash.is_valid() => {
+                // Node was previously committed - must exist in index
+                self.node_index
+                    .get(hash)
+                    .ok_or_else(|| panic!("Hash reference not found: {:?}", hash))
            }
+            NodeRef::Hash(_) => Ok(0), // Empty/invalid hash
        }
    }
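To make the fixed-width layout above concrete, here is a small illustrative reader for the Extend node header. It is not part of the serializer; it simply mirrors the format documented in the module comment (1-byte tag, 4-byte little-endian nibble length, the nibble bytes, then two 8-byte little-endian offsets), under the assumption that `write_bytes_with_len` writes a 4-byte length prefix as the `read_u32_at` call in the deserializer suggests.

```rust
/// Peek at an Extend node's header fields without building a Node.
/// Returns (nibbles_len, child_offset, value_offset), or None on malformed input.
fn peek_extend_header(buf: &[u8], pos: usize) -> Option<(usize, u64, u64)> {
    if buf.get(pos) != Some(&TAG_EXTEND) {
        return None;
    }
    let len = u32::from_le_bytes(buf.get(pos + 1..pos + 5)?.try_into().ok()?) as usize;
    let after_nibbles = pos + 5 + len;
    let child = u64::from_le_bytes(buf.get(after_nibbles..after_nibbles + 8)?.try_into().ok()?);
    let value = u64::from_le_bytes(buf.get(after_nibbles + 8..after_nibbles + 16)?.try_into().ok()?);
    // Per the format notes: child == 0 means Leaf, value == 0 means Extension.
    Some((len, child, value))
}
```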
@@ -153,7 +220,7 @@ impl Serializer {
 /// Deserializes a Merkle Patricia Trie from a byte buffer.
 ///
 /// The deserializer reads the binary format produced by [`Serializer`].
-/// It uses the two node format and converts back to the standard 3 node format.
+/// It uses the two node format and converts back to the standard three node format.
 pub struct Deserializer<'a> {
     /// The byte buffer containing serialized trie data
     buffer: &'a [u8],
@@ -165,15 +232,8 @@ impl<'a> Deserializer<'a> {
         Self { buffer }
     }

-    /// Deserializes a tree from the two node format back to standard 3 node format
-    pub fn decode_tree(&self) -> Result<Node, TrieError> {
-        let node = self.decode_node_at(0)?;
-        node.compute_hash();
-        Ok(node)
-    }
-
-    /// Decodes a node from the two node format at specific position
-    fn decode_node_at(&self, pos: usize) -> Result<Node, TrieError> {
+    /// Decodes a node at specific position
+    pub fn decode_node_at(&self, pos: usize) -> Result<Node, TrieError> {
         if pos >= self.buffer.len() {
             panic!("Invalid buffer position");
         }
@@ -183,48 +243,34 @@ impl<'a> Deserializer<'a> {
         match tag {
             TAG_EXTEND => {
-                // Read nibbles length
                 let len = self.read_u32_at(position)? as usize;
                 position += 4;

-                // Read nibbles
-                if position + len > self.buffer.len() {
-                    panic!("Invalid buffer length");
-                }
                 let compact_nibbles = &self.buffer[position..position + len];
                 let nibbles = Nibbles::decode_compact(compact_nibbles);
                 position += len;

-                // Read node offset
                 let node_offset = self.read_u64_at(position)?;
                 position += 8;
-
-                // Read value offset
                 let value_offset = self.read_u64_at(position)?;

-                // Determine node type based on what's present
                 match (node_offset > 0, value_offset > 0) {
                     (false, true) => {
-                        // Only value = Leaf node
+                        // Leaf node
                         let value = self
                             .read_value_at_offset(value_offset as usize)?
                             .unwrap_or_default();
                         Ok(Node::Leaf(LeafNode::new(nibbles, value)))
                     }
                     (true, false) => {
-                        // Only child = Extension node
+                        // Extension node
                         let child = self.decode_node_at(node_offset as usize)?;
                         Ok(Node::Extension(ExtensionNode::new(
                             nibbles,
                             NodeRef::Node(Arc::new(child), OnceLock::new()),
                         )))
                     }
-                    (true, true) => {
-                        panic!("Extend node with both child and value not supported");
-                    }
-                    (false, false) => {
-                        panic!("Invalid Extend node with no child or value");
-                    }
+                    _ => panic!("Invalid Extend node with both child and value"),
                 }
             }
             TAG_BRANCH => {
@@ -234,11 +280,9 @@ impl<'a> Deserializer<'a> {
                     *child = self.read_u64_at(position)?;
                     position += 8;
                 }
-
-                // Read value offset
                 let value_offset = self.read_u64_at(position)?;

-                // Build children NodeRefs
+                // Build children
                 let mut children: [NodeRef; 16] = Default::default();
                 for (i, &offset) in child_offsets.iter().enumerate() {
                     if offset > 0 {
@@ -247,7 +291,7 @@
                     }
                 }

-                // Read value if present
+                // Read value
                 let value = if value_offset > 0 {
                     self.read_value_at_offset(value_offset as usize)?
                     .unwrap_or_default()
@@ -259,18 +303,17 @@
                     children,
                     value,
                 ))))
             }
-            _ => panic!("Invalid node tag: {}", tag),
+            _ => panic!("Invalid node tag: {tag}"),
         }
     }

-    /// Gets a value by path without copying data
-    pub fn get_by_path(&self, path: &[u8]) -> Result<Option<Vec<u8>>, TrieError> {
+    /// Gets a value by path starting at a specific offset
+    pub fn get_by_path_at(&self, path: &[u8], offset: usize) -> Result<Option<Vec<u8>>, TrieError> {
         if self.buffer.is_empty() {
             return Ok(None);
         }
-
         let nibbles = Nibbles::from_raw(path, false);
-        self.get_by_path_inner(nibbles, 0)
+        self.get_by_path_inner(nibbles, offset)
     }

     /// Internal helper for get_by_path with position tracking
@@ -342,30 +385,27 @@
                     // Recurse into the child
                     self.get_by_path_inner(path, node_offset as usize)
                 } else {
-                    panic!("Extend node with both child and value not supported");
+                    Ok(None)
                 }
             }
             TAG_BRANCH => {
                 if path.is_empty() {
                     // Skip 16 child offsets
                     position += 16 * 8;
+                    let value_offset =
+                        u64::from_le_bytes(self.buffer[position..position + 8].try_into().unwrap());
                     if position + 8 > self.buffer.len() {
                         return Ok(None);
                     }
-                    let value_offset =
-                        u64::from_le_bytes(self.buffer[position..position + 8].try_into().unwrap());
-
                     if value_offset > 0 {
                         self.read_value_at_offset(value_offset as usize)
                     } else {
                         Ok(None)
                     }
                 } else {
-                    // Get next nibble and find corresponding child
-                    let next_nibble = match path.next_choice() {
-                        Some(nibble) => nibble,
-                        None => return Ok(None),
+                    let Some(next_nibble) = path.next_choice() else {
+                        return Ok(None);
                     };

                     // Read child offset at position next_nibble
@@ -387,7 +427,7 @@
                     }
                 }
             }
-            _ => panic!("Invalid node tag: {}", tag),
+            _ => panic!("Invalid node tag: {tag}"),
         }
     }
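A sketch of how a reader would use these two entry points against a persisted batch. The `root_offset` is the value returned by `serialize_tree`; the root node itself sits 8 bytes later, after the prepended previous-root link (the same skip the tests below call `ROOT_DATA_OFFSET`). Offsets here are assumed to be relative to this buffer, i.e. the batch was written with `base_offset = 0`.

```rust
/// Look up one key in a serialized batch and report the previous root link.
fn lookup(buffer: &[u8], root_offset: usize, key: &[u8]) -> Result<Option<Vec<u8>>, TrieError> {
    // First 8 bytes of the root structure: offset of the previous root (0 if none).
    let prev_root = u64::from_le_bytes(buffer[root_offset..root_offset + 8].try_into().unwrap());
    let _ = prev_root; // could be followed to reach older versions of the trie

    // The root node starts right after the 8-byte link.
    Deserializer::new(buffer).get_by_path_at(key, root_offset + 8)
}
```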
@@ -428,16 +468,21 @@
     }
 }

-/// Helper function to serialize a Merkle Patricia Trie node to bytes.
-pub fn serialize(node: &Node) -> Vec<u8> {
-    Serializer::new().serialize_tree(node).unwrap()
-}
-
 #[cfg(test)]
-mod test {
+mod tests {
+    use super::*;
+
     use crate::trie::{InMemoryTrieDB, Trie, node_hash::NodeHash};

-    use super::*;
+    /// Offset to skip the prepended previous root offset (8 bytes)
+    const ROOT_DATA_OFFSET: usize = 8;
+
+    /// Helper function to create [`Index`] and [`Serializer`] structures
+    /// and serialize a tree
+    fn serialize(root: &Node) -> (Vec<u8>, HashMap<NodeHash, u64>, u64) {
+        let index = Index::new();
+        let serializer = Serializer::new(&index, 0);
+        serializer.serialize_tree(root, 0).unwrap()
+    }

     fn new_temp() -> Trie {
         use std::collections::HashMap;
@@ -450,10 +495,6 @@ mod test {
         Trie::new(Box::new(db))
     }

-    fn deserialize(buffer: &[u8]) -> Node {
-        Deserializer::new(buffer).decode_tree().unwrap()
-    }
-
     #[test]
     fn test_serialize_deserialize_empty_leaf() {
         let leaf = Node::Leaf(LeafNode {
@@ -461,8 +502,10 @@ mod test {
             value: vec![],
         });

-        let bytes = serialize(&leaf);
-        let recovered = deserialize(&bytes);
+        let (buffer, _, _) = serialize(&leaf);
+
+        let deserializer = Deserializer::new(&buffer);
+        let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap();

         assert_eq!(leaf, recovered);
     }
@@ -474,8 +517,10 @@ mod test {
             value: b"long_path_value".to_vec(),
         });

-        let bytes = serialize(&leaf);
-        let recovered = deserialize(&bytes);
+        let (buffer, _, _) = serialize(&leaf);
+
+        let deserializer = Deserializer::new(&buffer);
+        let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap();

         assert_eq!(leaf, recovered);
     }
@@ -487,8 +532,10 @@ mod test {
             value: vec![],
         }));

-        let bytes = serialize(&branch);
-        let recovered = deserialize(&bytes);
+        let (buffer, _, _) = serialize(&branch);
+
+        let deserializer = Deserializer::new(&buffer);
+        let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap();

         assert_eq!(branch, recovered);
     }
@@ -505,8 +552,11 @@ mod test {
             child: NodeRef::Node(Arc::new(leaf), OnceLock::new()),
         });

-        let bytes = serialize(&ext);
-        let recovered = deserialize(&bytes);
+        let (buffer, _, _) = serialize(&ext);
+
+        let deserializer = Deserializer::new(&buffer);
+        let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap();
+        assert_eq!(recovered, ext);

         match recovered {
             Node::Extension(ext_node) => {
@@ -551,8 +601,10 @@ mod test {
             child: NodeRef::Node(Arc::new(branch), OnceLock::new()),
         });

-        let bytes = serialize(&outer_ext);
-        let recovered = deserialize(&bytes);
+        let (buffer, _, _) = serialize(&outer_ext);
+
+        let deserializer = Deserializer::new(&buffer);
+        let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap();

         assert_eq!(recovered, outer_ext);
     }
@@ -570,8 +622,9 @@ mod test {
         trie.insert(b"key".to_vec(), b"value".to_vec()).unwrap();
         let root = trie.root_node().unwrap().unwrap();

-        let bytes = serialize(&root);
-        let recovered = deserialize(&bytes);
+        let (buffer, _, _) = serialize(&root);
+        let deserializer = Deserializer::new(&buffer);
+        let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap();

         assert_eq!(root, recovered);
     }
@@ -592,8 +645,9 @@ mod test {
         }

         let root = trie.root_node().unwrap().unwrap();
-        let bytes = serialize(&root);
-        let recovered = deserialize(&bytes);
+        let (buffer, _, _) = serialize(&root);
+        let deserializer = Deserializer::new(&buffer);
+        let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap();

         assert_eq!(recovered, root);
     }
@@ -609,16 +663,17 @@ mod test {
         // Serialize to file
         let root = trie.root_node().unwrap().unwrap();
-        let serialized = serialize(&root);
+        let (buffer, _, _) = serialize(&root);

         let path = "/tmp/test_trie.mpt";
-        fs::write(path, &serialized).unwrap();
+        fs::write(path, &buffer).unwrap();

         // Read from file and deserialize
         let read_data = fs::read(path).unwrap();
-        let deserialized = deserialize(&read_data);
+        let deserializer = Deserializer::new(&read_data);
+        let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap();

-        assert_eq!(root, deserialized);
+        assert_eq!(root, recovered);

         fs::remove_file(path).unwrap();
     }
@@ -628,16 +683,18 @@ mod test {
         trie.insert(b"test".to_vec(), b"value".to_vec()).unwrap();
         let root = trie.root_node().unwrap().unwrap();

-        let buffer = serialize(&root);
+        let (buffer, _, _) = serialize(&root);
         let deserializer = Deserializer::new(&buffer);

         assert_eq!(
-            deserializer.get_by_path(b"test").unwrap(),
+            deserializer
+                .get_by_path_at(b"test", ROOT_DATA_OFFSET)
+                .unwrap(),
             Some(b"value".to_vec())
         );

         let deserializer = Deserializer::new(&buffer);
-        let recovered = deserializer.decode_tree().unwrap();
+        let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap();
         assert_eq!(root, recovered);
     }
@@ -657,41 +714,41 @@ mod test {
         }

         let root = trie.root_node().unwrap().unwrap();
-        let buffer = serialize(&root);
+        let (buffer, _, _) = serialize(&root);
         let deserializer = Deserializer::new(&buffer);

         assert_eq!(
-            deserializer.get_by_path(b"horse").unwrap(),
+            deserializer
+                .get_by_path_at(b"horse", ROOT_DATA_OFFSET)
+                .unwrap(),
             Some(b"stallion".to_vec())
         );
-
-        let deserializer = Deserializer::new(&buffer);
         assert_eq!(
-            deserializer.get_by_path(b"dog").unwrap(),
+            deserializer
+                .get_by_path_at(b"dog", ROOT_DATA_OFFSET)
+                .unwrap(),
             Some(b"puppy".to_vec())
         );
-
-        let deserializer = Deserializer::new(&buffer);
         assert_eq!(
-            deserializer.get_by_path(b"doge").unwrap(),
+            deserializer
+                .get_by_path_at(b"doge", ROOT_DATA_OFFSET)
+                .unwrap(),
             Some(b"coin".to_vec())
         );
-
-        let deserializer = Deserializer::new(&buffer);
         assert_eq!(
-            deserializer.get_by_path(b"do").unwrap(),
+            deserializer
+                .get_by_path_at(b"do", ROOT_DATA_OFFSET)
+                .unwrap(),
             Some(b"verb".to_vec())
        );
+        assert_eq!(
+            deserializer
+                .get_by_path_at(b"cat", ROOT_DATA_OFFSET)
+                .unwrap(),
+            None
+        );

-        let deserializer = Deserializer::new(&buffer);
-        assert_eq!(deserializer.get_by_path(b"cat").unwrap(), None);
-
-        let deserializer = Deserializer::new(&buffer);
-        assert_eq!(deserializer.get_by_path(b"").unwrap(), None);
-
-        // Reset position before decoding tree
-        let deserializer = Deserializer::new(&buffer);
-        let recovered = deserializer.decode_tree().unwrap();
+        let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap();
         assert_eq!(root, recovered);
     }
@@ -746,11 +803,11 @@ mod test {

         let root = trie.root_node().unwrap().unwrap();

-        let buffer = serialize(&root);
+        let (buffer, _, _) = serialize(&root);

         for (key, expected_value) in &test_data {
             let deserializer = Deserializer::new(&buffer);
-            let retrieved_value = deserializer.get_by_path(key).unwrap();
+            let retrieved_value = deserializer.get_by_path_at(key, ROOT_DATA_OFFSET).unwrap();
             assert_eq!(retrieved_value, Some(expected_value.clone()));
         }
@@ -767,12 +824,12 @@ mod test {

         for key in &non_existent_keys {
             let deserializer = Deserializer::new(&buffer);
-            let result = deserializer.get_by_path(key).unwrap();
+            let result = deserializer.get_by_path_at(key, ROOT_DATA_OFFSET).unwrap();
             assert_eq!(result, None);
         }

         let deserializer = Deserializer::new(&buffer);
-        let recovered = deserializer.decode_tree().unwrap();
+        let recovered = deserializer.decode_node_at(ROOT_DATA_OFFSET).unwrap();

         assert_eq!(root, recovered);
     }
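The tests above only exercise single batches. As a sketch of what an incremental test could look like under the new API, the following serializes two versions of a trie and checks that unchanged nodes are not stored twice. It is illustrative only: `index.insert` is an assumed `Index` API for publishing the first batch's offsets, and the node-count assertion relies on the two keys diverging at their first nibble so one leaf survives the update untouched.

```rust
#[test]
fn test_second_commit_only_stores_changed_nodes() {
    let mut trie = new_temp();
    trie.insert(b"aaa".to_vec(), b"first".to_vec()).unwrap();
    trie.insert(b"zzz".to_vec(), b"second".to_vec()).unwrap();
    let root_v1 = trie.root_node().unwrap().unwrap();

    let mut index = Index::new();
    let serializer = Serializer::new(&index, 0);
    let (buf_v1, new_v1, root_off_v1) = serializer.serialize_tree(&root_v1, 0).unwrap();
    for (hash, offset) in new_v1.clone() {
        index.insert(hash, offset); // assumed Index API
    }

    // Change one key; the untouched leaf under the other branch child should
    // be found in the index and therefore not serialized again.
    trie.insert(b"zzz".to_vec(), b"updated".to_vec()).unwrap();
    let root_v2 = trie.root_node().unwrap().unwrap();
    let serializer = Serializer::new(&index, buf_v1.len() as u64);
    let (_buf_v2, new_v2, _root_off_v2) =
        serializer.serialize_tree(&root_v2, root_off_v1).unwrap();

    assert!(new_v2.len() < new_v1.len());
}
```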
diff --git a/src/trie/node_hash.rs b/src/trie/node_hash.rs
index f9b7229..e20756e 100644
--- a/src/trie/node_hash.rs
+++ b/src/trie/node_hash.rs
@@ -1,7 +1,5 @@
 use crate::rlp::{Encoder, RLPDecode, RLPDecodeError, RLPEncode};
 use ethereum_types::H256;
-#[cfg(test)]
-use libmdbx::orm::{Decodable, Encodable};
 use sha3::{Digest, Keccak256};

 /// Struct representing a trie node hash
@@ -38,7 +36,7 @@ impl NodeHash {
     /// Converts a slice of an already hashed data (in case it's not inlineable) to a NodeHash.
     /// Panics if the slice is over 32 bytes
     /// If you need to hash it in case its len >= 32 see `from_encoded_raw`
-    pub(crate) fn from_slice(slice: &[u8]) -> NodeHash {
+    pub fn from_slice(slice: &[u8]) -> NodeHash {
         match slice.len() {
             0..32 => {
                 let mut buffer = [0; 31];
@@ -140,19 +138,3 @@ impl RLPDecode for NodeHash {
         Ok((hash, rest))
     }
 }
-
-#[cfg(test)]
-impl Encodable for NodeHash {
-    type Encoded = Vec<u8>;
-
-    fn encode(self) -> Self::Encoded {
-        self.into()
-    }
-}
-
-#[cfg(test)]
-impl Decodable for NodeHash {
-    fn decode(b: &[u8]) -> anyhow::Result<Self> {
-        Ok(NodeHash::from_slice(b))
-    }
-}
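Widening `from_slice` to `pub` lets code outside the trie module, such as an on-disk node index keyed by hash, rebuild a `NodeHash` from raw bytes without the removed libmdbx impls. A rough round-trip sketch; the `Vec<u8>` conversion is assumed from the removed `Encodable` impl, which relied on `self.into()`:

```rust
// Illustrative only: persist a NodeHash as plain bytes and rebuild it later.
fn roundtrip(hash: NodeHash) -> NodeHash {
    let bytes: Vec<u8> = hash.into(); // assumed Into<Vec<u8>> conversion
    NodeHash::from_slice(&bytes)
}
```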