diff --git a/bectl/Cargo.toml b/bectl/Cargo.toml index 7d9f9161e..221d71091 100644 --- a/bectl/Cargo.toml +++ b/bectl/Cargo.toml @@ -3,7 +3,7 @@ name = "bectl" version = "0.1.0" authors = ["tilpner ", "Johannes Wünsche "] edition = "2021" -rust-version = "1.66.1" +rust-version = "1.82.0" [dependencies] betree_storage_stack = { path = "../betree", features = [ "internal-api" ] } @@ -14,7 +14,7 @@ serde_json = "1.0" jemallocator = { version = "0.3", features = [ "background_threads" ] } -figment = { version = "0.10", features = [ "json" ] } +figment = { version = "0.10", features = ["json", "yaml"] } log = "0.4" env_logger = "0.9" diff --git a/bectl/src/main.rs b/bectl/src/main.rs index 3ff286fe6..76ddb3e8b 100644 --- a/bectl/src/main.rs +++ b/bectl/src/main.rs @@ -8,8 +8,9 @@ use std::{ use betree_storage_stack::{ cow_bytes::CowBytes, database::{Database, DatabaseConfiguration, Superblock}, + storage_pool::DiskOffset, tree::{DefaultMessageAction, TreeLayer}, - StoragePreference, storage_pool::DiskOffset, + StoragePreference, }; use chrono::{DateTime, Utc}; use figment::providers::Format; @@ -126,7 +127,7 @@ enum ObjMode { }, Mv { name: String, - new_name: String + new_name: String, }, Meta { obj_name: String, @@ -179,7 +180,8 @@ fn bectl_main() -> Result<(), Error> { let cfg: DatabaseConfiguration = figment::Figment::new() .merge(DatabaseConfiguration::figment_default()) - .merge(figment::providers::Json::file(opt.database_config)) + .merge(figment::providers::Yaml::file(&opt.database_config)) + .merge(figment::providers::Json::file(&opt.database_config)) .merge(DatabaseConfiguration::figment_env()) .extract()?; diff --git a/betree/Cargo.toml b/betree/Cargo.toml index d9f116a30..432c93cd8 100644 --- a/betree/Cargo.toml +++ b/betree/Cargo.toml @@ -4,7 +4,7 @@ version = "0.3.1-alpha" authors = ["Felix Wiedemann ", "Till Hoeppner ", "Johannes Wünsche "] license = "MIT OR Apache-2.0" edition = "2021" -rust-version = "1.66.1" +rust-version = "1.82.0" [lib] crate-type = ["rlib", "staticlib", "cdylib"] @@ -61,6 +61,9 @@ rand = { version = "0.8", features = ["std_rng"] } pmdk = { path = "./pmdk", optional = true } rustc-hash = "1.1.0" gxhash = "3.1.1" +rkyv = { version = "0.7.43", features = ["validation"] } +lazy_static = "1.4.0" +serde_yaml = "0.9.34" [dev-dependencies] rand_xorshift = "0.3" @@ -81,8 +84,7 @@ figment_config = ["figment"] # leaf vdev. This requires additional system calls due to time measuring and is # therefore safeguarded into it's own feature latency_metrics = [] -experimental-api = [] nvm = ["pmdk"] # Log the allocations and deallocations done for later analysis allocation_log = [] - +cache-paranoia = [] diff --git a/betree/haura-benchmarks/Cargo.toml b/betree/haura-benchmarks/Cargo.toml index 31b0a053c..38563d254 100644 --- a/betree/haura-benchmarks/Cargo.toml +++ b/betree/haura-benchmarks/Cargo.toml @@ -8,10 +8,10 @@ edition = "2018" members = ["."] [dependencies] -betree_storage_stack = { path = "..", features = ["experimental-api"]} +betree_storage_stack = { path = ".." 
} structopt = "0.3" -figment = { version = "0.10", features = [ "json" ] } +figment = { version = "0.10", features = [ "json", "yaml" ] } serde_json = "1" libmedium = "0.7" procfs = "0.16" diff --git a/betree/haura-benchmarks/haura-plots/haura_plots/ycsb_plots.py b/betree/haura-benchmarks/haura-plots/haura_plots/ycsb_plots.py index c2a908fee..cab2a1376 100644 --- a/betree/haura-benchmarks/haura-plots/haura_plots/ycsb_plots.py +++ b/betree/haura-benchmarks/haura-plots/haura_plots/ycsb_plots.py @@ -42,6 +42,7 @@ def plot_grouped_c(path, runs, overall=False): return fig, ax = plt.subplots() + runs = sorted(runs, key=lambda run: max(run["results"])) off = 1 / (len(runs) + 1) for idx, run in enumerate(runs): if not overall: @@ -60,8 +61,8 @@ def plot_grouped_c(path, runs, overall=False): group = runs[0]["group"].split('/')[-1:][0] ax.set_title(f'YCSB Scaling | {group}') else: - ax.set_title(f'YCSB Scaling') + ax.set_title(f'YCSB-C-esque Write Scaling (Key-Value)') ax.set_ylabel("Throughput [op/s]") ax.set_xlabel("Threads [#]") extra = fig.legend(loc="upper left", bbox_to_anchor=(0.9, 0.89)) - fig.savefig(f"{path}/ycsb_c_comparison.svg", bbox_extra_artists=(extra,), bbox_inches="tight") + fig.savefig(f"{path}/ycsb_c_comparison.svg", bbox_extra_artists=(extra,), bbox_inches="tight", transparent=True) diff --git a/betree/haura-benchmarks/run.sh b/betree/haura-benchmarks/run.sh index ebe1cb16f..b94945f19 100755 --- a/betree/haura-benchmarks/run.sh +++ b/betree/haura-benchmarks/run.sh @@ -2,247 +2,243 @@ # shellcheck disable=SC2030,SC2031 # we exploit this characteristic to start several test scenarios - merging them would lead to pollution function ensure_zip { - local url - url="https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.15.58.tar.xz" - - if [ ! -e "$ZIP_ARCHIVE" ] - then - mkdir data - pushd data || exit - - curl "$url" -o linux.tar.xz - tar xf linux.tar.xz - rm linux.tar.xz - zip -0 -r linux.zip linux-* - rm -r linux-* - - popd || exit - fi + local url + url="https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.15.58.tar.xz" + + if [ ! -e "$ZIP_ARCHIVE" ]; then + mkdir data + pushd data || exit + + curl "$url" -o linux.tar.xz + tar xf linux.tar.xz + rm linux.tar.xz + zip -0 -r linux.zip linux-* + rm -r linux-* + + popd || exit + fi } function ensure_bectl { - pushd ../../bectl || exit - cargo build --release - popd || return + pushd ../../bectl || exit + cargo build --release + popd || return } function ensure_config { - if [ ! -e "$BETREE_CONFIG" ] - then - echo "No Haura configuration found at: ${BETREE_CONFIG}" - exit 1 - fi + if [ ! 
-e "$BETREE_CONFIG" ]; then + echo "No Haura configuration found at: ${BETREE_CONFIG}" + exit 1 + fi } total_runs=0 function run { - local vdev_type="$1" - local name="$2" - local mode="$3" - shift 3 - - if [ "$total_runs" -gt 0 ] - then - sleep 60 - fi - total_runs=$((total_runs + 1)) - - local out_path - out_path="results/$(date -I)_${vdev_type}/${name}_$(date +%s)" - mkdir -p "$out_path" - - pushd "$out_path" || return - - echo "running $mode with these settings:" - env | grep BETREE__ - env > "env" - "$ROOT/../../target/release/bectl" config print-active > "config" - "$ROOT/target/release/betree-perf" "$mode" "$@" - - echo "merging results into $out_path/out.jsonl" - "$ROOT/target/release/json-merge" \ - --timestamp-key epoch_ms \ - ./betree-metrics.jsonl \ - ./proc.jsonl \ - ./sysinfo.jsonl \ - | "$ROOT/target/release/json-flatten" > "out.jsonl" - - popd || return + local vdev_type="$1" + local name="$2" + local mode="$3" + shift 3 + + if [ "$total_runs" -gt 0 ]; then + sleep 60 + fi + total_runs=$((total_runs + 1)) + + local out_path + out_path="results/$(date -I)_${vdev_type}/${name}_$(date +%s)" + mkdir -p "$out_path" + + pushd "$out_path" || return + + echo "running $mode with these settings:" + env | grep BETREE__ + env >"env" + "$ROOT/../../target/release/bectl" config print-active >"config" + "$ROOT/target/release/betree-perf" "$mode" "$@" + + echo "merging results into $out_path/out.jsonl" + "$ROOT/target/release/json-merge" \ + --timestamp-key epoch_ms \ + ./betree-metrics.jsonl \ + ./proc.jsonl \ + ./sysinfo.jsonl | + "$ROOT/target/release/json-flatten" >"out.jsonl" + + popd || return } function tiered() { - ( - export BETREE__ALLOC_STRATEGY='[[0],[0],[],[]]' - run "$RUN_IDENT" tiered1_all0_alloc tiered1 - ) - - ( - export BETREE__ALLOC_STRATEGY='[[0],[1],[],[]]' - run "$RUN_IDENT" tiered1_id_alloc tiered1 - ) - - ( - export BETREE__ALLOC_STRATEGY='[[1],[1],[],[]]' - run "$RUN_IDENT" tiered1_all1_alloc tiered1 - ) + ( + export BETREE__ALLOC_STRATEGY='[[0],[0],[],[]]' + run "$RUN_IDENT" tiered1_all0_alloc tiered1 + ) + + ( + export BETREE__ALLOC_STRATEGY='[[0],[1],[],[]]' + run "$RUN_IDENT" tiered1_id_alloc tiered1 + ) + + ( + export BETREE__ALLOC_STRATEGY='[[1],[1],[],[]]' + run "$RUN_IDENT" tiered1_all1_alloc tiered1 + ) } function scientific_evaluation() { - # Invocation: - run "$RUN_IDENT" random_evaluation_read evaluation-read 30 $((25 * 1024 * 1024 * 1024)) $((8192)) $((1 * 1024)) $((12 * 1024 * 1024)) + # Invocation: + run "$RUN_IDENT" random_evaluation_read evaluation-read 30 $((25 * 1024 * 1024 * 1024)) $((8192)) $((1 * 1024)) $((12 * 1024 * 1024)) } function evaluation_rw() { - # Invocation: - run "$RUN_IDENT" random_evaluation_rw evaluation-rw 30 $((25 * 1024 * 1024 * 1024)) $((8192)) $((1 * 1024)) $((12 * 1024 * 1024)) + # Invocation: + run "$RUN_IDENT" random_evaluation_rw evaluation-rw 30 $((25 * 1024 * 1024 * 1024)) $((8192)) $((1 * 1024)) $((12 * 1024 * 1024)) } function filesystem_zip() { - export BETREE__ALLOC_STRATEGY='[[0],[1],[2],[]]' - run "$RUN_IDENT" file_system_three "$ZIP_ARCHIVE" + export BETREE__ALLOC_STRATEGY='[[0],[1],[2],[]]' + run "$RUN_IDENT" file_system_three "$ZIP_ARCHIVE" } function checkpoints() { - export BETREE__ALLOC_STRATEGY='[[0, 1],[1],[],[]]' - run "$RUN_IDENT" checkpoints_fastest checkpoints + export BETREE__ALLOC_STRATEGY='[[0, 1],[1],[],[]]' + run "$RUN_IDENT" checkpoints_fastest checkpoints } function filesystem() { - export BETREE__ALLOC_STRATEGY='[[0],[1],[2],[]]' - run "$RUN_IDENT" file_system_three filesystem + export 
BETREE__ALLOC_STRATEGY='[[0],[1],[2],[]]' + run "$RUN_IDENT" file_system_three filesystem } function zip_cache() { - local F_CD_START=1040032667 + local F_CD_START=1040032667 - for cache_mib in 32 128 512 2048; do - ( - export BETREE__CACHE_SIZE=$((cache_mib * 1024 * 1024)) - run "$RUN_IDENT" "zip_cache_$cache_mib" zip 4 100 10 "$ZIP_ARCHIVE" "$F_CD_START" - ) - done + for cache_mib in 32 128 512 2048; do + ( + export BETREE__CACHE_SIZE=$((cache_mib * 1024 * 1024)) + run "$RUN_IDENT" "zip_cache_$cache_mib" zip 4 100 10 "$ZIP_ARCHIVE" "$F_CD_START" + ) + done } function zip_mt() { - local F="$PWD/data/linux.zip" - local F_CD_START=1 + local F="$PWD/data/linux.zip" + local F_CD_START=1 - for cache_mib in 256 512 1024 2048; do - echo "using $cache_mib MiB of cache" - ( - export BETREE__CACHE_SIZE=$((cache_mib * 1024 * 1024)) + for cache_mib in 256 512 1024 2048; do + echo "using $cache_mib MiB of cache" + ( + export BETREE__CACHE_SIZE=$((cache_mib * 1024 * 1024)) - local total=10000 + local total=10000 - for num_workers in 1 2 3 4 5 6 7 8 9 10; do - echo "running with $num_workers workers" - local per_worker=$((total / num_workers)) - local per_run=$((per_worker / 10)) + for num_workers in 1 2 3 4 5 6 7 8 9 10; do + echo "running with $num_workers workers" + local per_worker=$((total / num_workers)) + local per_run=$((per_worker / 10)) - run "$RUN_IDENT" "zip_mt_${cache_mib}_${num_workers}_${per_run}_10" zip "$num_workers" "$per_run" 10 "$F" "$F_CD_START" - done - ) - done + run "$RUN_IDENT" "zip_mt_${cache_mib}_${num_workers}_${per_run}_10" zip "$num_workers" "$per_run" 10 "$F" "$F_CD_START" + done + ) + done } function zip_tiered() { - local F_CD_START=1 #242415017 #1040032667 - # for cache_mib in 256 512 1024; do - for cache_mib in 32 64; do - echo "using $cache_mib MiB of cache" - ( - export BETREE__CACHE_SIZE=$((cache_mib * 1024 * 1024)) + local F_CD_START=1 #242415017 #1040032667 + # for cache_mib in 256 512 1024; do + for cache_mib in 32 64; do + echo "using $cache_mib MiB of cache" + ( + export BETREE__CACHE_SIZE=$((cache_mib * 1024 * 1024)) - local total=10000 + local total=10000 - for num_workers in 1 2 3 4 5 6 7 8; do - echo "running with $num_workers workers" - local per_worker=$((total / num_workers)) - local per_run=$((per_worker / 10)) + for num_workers in 1 2 3 4 5 6 7 8; do + echo "running with $num_workers workers" + local per_worker=$((total / num_workers)) + local per_run=$((per_worker / 10)) - ( - export BETREE__ALLOC_STRATEGY='[[0],[0],[],[]]' - run "$RUN_IDENT" "zip_tiered_all0_${cache_mib}_${num_workers}_${per_run}_10" zip "$num_workers" "$per_run" 10 "$ZIP_ARCHIVE" "$F_CD_START" - ) + ( + export BETREE__ALLOC_STRATEGY='[[0],[0],[],[]]' + run "$RUN_IDENT" "zip_tiered_all0_${cache_mib}_${num_workers}_${per_run}_10" zip "$num_workers" "$per_run" 10 "$ZIP_ARCHIVE" "$F_CD_START" + ) - ( - export BETREE__ALLOC_STRATEGY='[[0],[1],[],[]]' - run "$RUN_IDENT" "zip_tiered_id_${cache_mib}_${num_workers}_${per_run}_10" zip "$num_workers" "$per_run" 10 "$ZIP_ARCHIVE" "$F_CD_START" - ) + ( + export BETREE__ALLOC_STRATEGY='[[0],[1],[],[]]' + run "$RUN_IDENT" "zip_tiered_id_${cache_mib}_${num_workers}_${per_run}_10" zip "$num_workers" "$per_run" 10 "$ZIP_ARCHIVE" "$F_CD_START" + ) - ( - export BETREE__ALLOC_STRATEGY='[[1],[1],[],[]]' - run "$RUN_IDENT" "zip_tiered_all1_${cache_mib}_${num_workers}_${per_run}_10" zip "$num_workers" "$per_run" 10 "$ZIP_ARCHIVE" "$F_CD_START" - ) + ( + export BETREE__ALLOC_STRATEGY='[[1],[1],[],[]]' + run "$RUN_IDENT" 
"zip_tiered_all1_${cache_mib}_${num_workers}_${per_run}_10" zip "$num_workers" "$per_run" 10 "$ZIP_ARCHIVE" "$F_CD_START" + ) - done - ) - done + done + ) + done } function ingest() { - ( ( - export BETREE__COMPRESSION="None" - run "$RUN_IDENT" ingest_hdd_none ingest "$ZIP_ARCHIVE" - ) + ( + export BETREE__COMPRESSION="None" + run "$RUN_IDENT" ingest_hdd_none ingest "$ZIP_ARCHIVE" + ) - for level in $(seq 1 16); do - ( - export BETREE__COMPRESSION="{ Zstd = { level = $level } }" - run "$RUN_IDENT" "ingest_hdd_zstd_$level" ingest "$ZIP_ARCHIVE" - ) - done - ) + for level in $(seq 1 16); do + ( + export BETREE__COMPRESSION="{ Zstd = { level = $level } }" + run "$RUN_IDENT" "ingest_hdd_zstd_$level" ingest "$ZIP_ARCHIVE" + ) + done + ) } function switchover() { - run "$RUN_IDENT" switchover_tiny switchover 32 "$((32 * 1024 * 1024))" - run "$RUN_IDENT" switchover_small switchover 8 "$((128 * 1024 * 1024))" - run "$RUN_IDENT" switchover_medium switchover 4 "$((2 * 1024 * 1024 * 1024))" - run "$RUN_IDENT" switchover_large switchover 4 "$((8 * 1024 * 1024 * 1024))" + run "$RUN_IDENT" switchover_tiny switchover 32 "$((32 * 1024 * 1024))" + run "$RUN_IDENT" switchover_small switchover 8 "$((128 * 1024 * 1024))" + run "$RUN_IDENT" switchover_medium switchover 4 "$((2 * 1024 * 1024 * 1024))" + run "$RUN_IDENT" switchover_large switchover 4 "$((8 * 1024 * 1024 * 1024))" } function ci() { - run "$RUN_IDENT" switchover_small switchover 4 "$((128 * 1024 * 1024))" + run "$RUN_IDENT" switchover_small switchover 4 "$((128 * 1024 * 1024))" } function ycsb_a() { - run "$RUN_IDENT" ycsb_a_block ycsb-a "$((8 * 1024 * 1024 * 1024))" 0 8 - run "$RUN_IDENT" ycsb_a_memory ycsb-a "$((8 * 1024 * 1024 * 1024))" 1 8 + run "$RUN_IDENT" ycsb_a_block ycsb-a "$((8 * 1024 * 1024 * 1024))" 0 8 + run "$RUN_IDENT" ycsb_a_memory ycsb-a "$((8 * 1024 * 1024 * 1024))" 1 8 } function ycsb_b() { - run "$RUN_IDENT" ycsb_b_block ycsb-b "$((8 * 1024 * 1024 * 1024))" 0 8 - run "$RUN_IDENT" ycsb_b_memory ycsb-b "$((8 * 1024 * 1024 * 1024))" 1 8 + run "$RUN_IDENT" ycsb_b_block ycsb-b "$((8 * 1024 * 1024 * 1024))" 0 8 + run "$RUN_IDENT" ycsb_b_memory ycsb-b "$((8 * 1024 * 1024 * 1024))" 1 8 } function ycsb_c() { - run "$RUN_IDENT" ycsb_c_block ycsb-c "$((8 * 1024 * 1024 * 1024))" 0 8 - run "$RUN_IDENT" ycsb_c_memory ycsb-c "$((8 * 1024 * 1024 * 1024))" 1 8 + run "$RUN_IDENT" ycsb_c_block ycsb-c "$((8 * 1024 * 1024 * 1024))" 0 8 + run "$RUN_IDENT" ycsb_c_memory ycsb-c "$((8 * 1024 * 1024 * 1024))" 1 8 } function ycsb_d() { - run "$RUN_IDENT" ycsb_d_block ycsb-d "$((8 * 1024 * 1024 * 1024))" 0 8 - run "$RUN_IDENT" ycsb_d_memory ycsb-d "$((8 * 1024 * 1024 * 1024))" 1 8 + run "$RUN_IDENT" ycsb_d_block ycsb-d "$((8 * 1024 * 1024 * 1024))" 0 8 + run "$RUN_IDENT" ycsb_d_memory ycsb-d "$((8 * 1024 * 1024 * 1024))" 1 8 } function ycsb_e() { - run "$RUN_IDENT" ycsb_e_block ycsb-e "$((8 * 1024 * 1024 * 1024))" 0 8 - run "$RUN_IDENT" ycsb_e_memory ycsb-e "$((8 * 1024 * 1024 * 1024))" 1 8 + run "$RUN_IDENT" ycsb_e_block ycsb-e "$((8 * 1024 * 1024 * 1024))" 0 8 + run "$RUN_IDENT" ycsb_e_memory ycsb-e "$((8 * 1024 * 1024 * 1024))" 1 8 } function ycsb_f() { - run "$RUN_IDENT" ycsb_f_block ycsb-f "$((8 * 1024 * 1024 * 1024))" 0 8 - run "$RUN_IDENT" ycsb_f_memory ycsb-f "$((8 * 1024 * 1024 * 1024))" 1 8 + run "$RUN_IDENT" ycsb_f_block ycsb-f "$((8 * 1024 * 1024 * 1024))" 0 8 + run "$RUN_IDENT" ycsb_f_memory ycsb-f "$((8 * 1024 * 1024 * 1024))" 1 8 } cargo build --release -if [ -z "$BETREE_CONFIG" ] -then - export BETREE_CONFIG="$PWD/perf-config.json" +if [ 
-z "$BETREE_CONFIG" ]; then + export BETREE_CONFIG="$PWD/perf-config.json" fi export ROOT="$PWD" @@ -251,15 +247,13 @@ export ZIP_ARCHIVE="$PWD/data/linux.zip" # this if multiple categories are needed. export RUN_IDENT="default" -if [ "$1" == "-h" ] || [ "$1" == "--help" ] || [ "$1" = "help" ] -then +if [ "$1" == "-h" ] || [ "$1" == "--help" ] || [ "$1" = "help" ]; then echo "Usage:" echo " $0 [identifier]" exit 0 fi -if [ -n "$*" ] -then +if [ -n "$*" ]; then export RUN_IDENT=$* fi diff --git a/betree/haura-benchmarks/src/lib.rs b/betree/haura-benchmarks/src/lib.rs index ae8949f4f..cc1790f42 100644 --- a/betree/haura-benchmarks/src/lib.rs +++ b/betree/haura-benchmarks/src/lib.rs @@ -33,19 +33,28 @@ impl Control { pub fn with_custom_config(modify_cfg: impl Fn(&mut DatabaseConfiguration)) -> Self { init_env_logger(); - let conf_path = env::var("BETREE_CONFIG").expect("Didn't provide a BETREE_CONFIG"); + let conf_path = + PathBuf::from(env::var("BETREE_CONFIG").expect("Didn't provide a BETREE_CONFIG")); + + let mut cfg = figment::Figment::new().merge(DatabaseConfiguration::figment_default()); + + match conf_path.extension() { + Some(ext) if ext == "yml" || ext == "yaml" => { + cfg = cfg.merge(figment::providers::Yaml::file(conf_path.clone())) + } + Some(ext) if ext == "json" => { + cfg = cfg.merge(figment::providers::Json::file(conf_path.clone())) + } + _ => todo!(), + } - let mut cfg: DatabaseConfiguration = figment::Figment::new() - .merge(DatabaseConfiguration::figment_default()) - .merge(figment::providers::Json::file(conf_path)) + let mut cfg: DatabaseConfiguration = cfg .merge(DatabaseConfiguration::figment_env()) .extract() .expect("Failed to extract DatabaseConfiguration"); cfg.access_mode = AccessMode::AlwaysCreateNew; - cfg.sync_interval_ms = None; - cfg.metrics = Some(metrics::MetricsConfiguration { enabled: true, interval_ms: 500, diff --git a/betree/haura-benchmarks/src/ycsb.rs b/betree/haura-benchmarks/src/ycsb.rs index 1f31b9f35..4d1ed2494 100644 --- a/betree/haura-benchmarks/src/ycsb.rs +++ b/betree/haura-benchmarks/src/ycsb.rs @@ -184,7 +184,7 @@ pub fn b(mut client: KvClient, size: u64, threads: usize, runtime: u64) { /// C - Read heavy /// Operations: Read 100% /// Distribution: Zipfian -/// Application example: User profile cache, where profiles are constructed elsewhere (e.g., Hadoop) +/// Access Size: 1000 bytes pub fn c(mut client: KvClient, size: u64, threads: usize, runtime: u64) { println!("Running YCSB Workload C"); println!("Filling KV store..."); diff --git a/betree/include/betree.h b/betree/include/betree.h index d7131aece..a14be14ab 100644 --- a/betree/include/betree.h +++ b/betree/include/betree.h @@ -1,7 +1,7 @@ #ifndef betree_h #define betree_h -/* Generated with cbindgen:0.24.3 */ +/* Generated with cbindgen:0.27.0 */ /* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */ @@ -16,6 +16,8 @@ */ #define BLOCK_SIZE 4096 +#define BUFFER_STATIC_SIZE HEADER + #define CHUNK_MAX (UINT32_MAX - 1024) /** @@ -144,10 +146,19 @@ typedef struct byte_slice_t { const struct byte_slice_rc_t *arc; } byte_slice_t; +/** + * Highest storage preference. + */ #define STORAGE_PREF_FASTEST (storage_pref_t){ ._0 = StoragePreference_FASTEST } +/** + * Default storage preference. + */ #define STORAGE_PREF_NONE (storage_pref_t){ ._0 = StoragePreference_NONE } +/** + * Lowest storage preference. 
+ */
#define STORAGE_PREF_SLOWEST (storage_pref_t){ ._0 = StoragePreference_SLOWEST }

/**
@@ -225,7 +236,7 @@ int betree_create_ds(struct db_t *db,
                      struct err_t **err);

/**
- * Create an object store interface.
+ * Create an object store.
 */
struct obj_store_t *betree_create_object_store(struct db_t *db,
                                               const char *name,
@@ -437,6 +448,14 @@ struct obj_t *betree_object_create(struct obj_store_t *os,
 */
int betree_object_delete(struct obj_t *obj, struct err_t **err);

+/**
+ * Fetch the size of the given object if it exists. Returns 0 on error.
+ */
+unsigned long long betree_object_get_size(struct obj_store_t *os,
+                                          const char *key,
+                                          unsigned int key_len,
+                                          struct err_t **err);
+
 /**
 * Open an existing object.
 */
@@ -575,4 +594,4 @@ struct range_iter_t *betree_snapshot_range(const struct ss_t *ss,
 */
int betree_sync_db(struct db_t *db, struct err_t **err);

-#endif /* betree_h */
+#endif /* betree_h */
diff --git a/betree/pmdk/src/lib.rs b/betree/pmdk/src/lib.rs
index 5d4d8421e..5f50f635e 100644
--- a/betree/pmdk/src/lib.rs
+++ b/betree/pmdk/src/lib.rs
@@ -23,7 +23,7 @@ pub struct PMem {
 impl Drop for PMem {
     fn drop(&mut self) {
-        self.close()
+        self.close().unwrap()
     }
 }
@@ -67,6 +67,17 @@ impl PMem {
         Self::new(ptr, mapped_len, is_pmem)
     }

+    pub unsafe fn get_slice(
+        &self,
+        offset: usize,
+        len: usize,
+    ) -> Result<&'static [u8], std::io::Error> {
+        Ok(std::slice::from_raw_parts(
+            self.ptr.as_ptr().add(offset) as *const u8,
+            len,
+        ))
+    }
+
     fn new(ptr: *mut c_void, len: usize, is_pmem: i32) -> Result<Self, std::io::Error> {
         NonNull::new(ptr)
             .map(|valid| PMem {
@@ -87,14 +98,12 @@ impl PMem {
     /// Read a range of bytes from the specified offset.
     pub fn read(&self, offset: usize, data: &mut [u8]) {
-        let _ = unsafe {
-            pmem_memcpy(
-                data.as_ptr() as *mut c_void,
-                self.ptr.as_ptr().add(offset),
-                data.len(),
-                PMEM_F_MEM_NOFLUSH,
-            )
-        };
+        unsafe {
+            self.ptr
+                .as_ptr()
+                .add(offset)
+                .copy_to(data.as_mut_ptr() as *mut c_void, data.len())
+        }
     }

     /// Write a range of bytes to the specified offset.
@@ -125,11 +134,18 @@ impl PMem {
         self.len
     }

-    fn close(&mut self) {
+    fn close(&mut self) -> Result<(), std::io::Error> {
         unsafe {
-            // TODO: Read out error correctly. Atleast let the output know that something went wrong.
-            pmem_unmap(self.ptr.as_ptr(), self.len);
+            if -1 == pmem_unmap(self.ptr.as_ptr(), self.len) {
+                // Borrow libpmem's static error message; taking ownership of
+                // it would free memory we do not own.
+                let err = std::ffi::CStr::from_ptr(pmem_errormsg());
+                let err_msg = format!(
+                    "Failed to close persistent memory pool. Reason: {}",
+                    err.to_string_lossy()
+                );
+                return Err(std::io::Error::new(std::io::ErrorKind::Other, err_msg));
+            }
         }
+        Ok(())
     }
 }
diff --git a/betree/src/buffer.rs b/betree/src/buffer.rs
index b8308d34b..97c5346b0 100644
--- a/betree/src/buffer.rs
+++ b/betree/src/buffer.rs
@@ -9,11 +9,15 @@
 //!
 //! [MutBuf] does not support growing with [io::Write] because the semantics of growing an inner split buffer are unclear.

-use crate::vdev::{Block, BLOCK_SIZE};
+use crate::{
+    cow_bytes::{CowBytes, SlicedCowBytes},
+    vdev::{Block, BLOCK_SIZE},
+};
 use std::{
     alloc::{self, Layout},
     cell::UnsafeCell,
-    fmt, io,
+    fmt,
+    io::{self, Write},
     mem::ManuallyDrop,
     ops::{Deref, Range},
     ptr::NonNull,
@@ -32,18 +36,6 @@ fn split_range_at(
     range: &Range<Block<u32>>,
     mid: Block<u32>,
 ) -> (Range<Block<u32>>, Range<Block<u32>>) {
-    // NOTE:
-    // jwuensche: I've changed the behavior off this function back to the
-    // original implementation whereas we take the relative mid from the current
-    // view. In the redesign of some of the modules this was changed to relate
-    // to a total mid which we do not calculate in some of the vdev structs.
-    // This only became a problem when performing multiple splits as the buffer
-    // returned would contain the total position in the original buffer.
-    // To keep the performance advantage we simply offset the mid from the known
-    // total start.
-    //
-    // No further implications should be expected as the only sequentially use of
-    // this structure is in the [crate::vdev::Parity1] code.
     if range.start + mid < range.end {
         // mid is in range
         (range.start..range.start + mid, range.start + mid..range.end)
@@ -74,7 +66,7 @@ impl AlignedStorage {
             ptr: unsafe {
                 let new_layout =
                     Layout::from_size_align_unchecked(capacity.to_bytes() as usize, BLOCK_SIZE);
-                NonNull::new(alloc::alloc_zeroed(new_layout)).expect("Allocation failed.")
+                NonNull::new(alloc::alloc(new_layout)).expect("Allocation failed.")
             },
             capacity,
         }
@@ -114,8 +106,7 @@ impl AlignedStorage {
             );
             self.ptr = NonNull::new(realloc_ptr).unwrap_or_else(|| {
-                let new_ptr =
-                    NonNull::new(alloc::alloc_zeroed(new_layout)).expect("Allocation failed.");
+                let new_ptr = NonNull::new(alloc::alloc(new_layout)).expect("Allocation failed.");
                 self.ptr
                     .as_ptr()
                     .copy_to_nonoverlapping(new_ptr.as_ptr(), self.capacity.to_bytes() as usize);
@@ -157,10 +148,6 @@ impl From<Box<[u8]>> for AlignedStorage {
         log::warn!("Unaligned buffer, copying {} bytes", b.len());
         let size = Block::round_up_from_bytes(b.len() as u32);
         let storage = AlignedStorage::zeroed(size);
-        let align = std::mem::align_of::();
-        assert!(!b.as_ptr().is_null());
-        assert!(storage.ptr.as_ptr().align_offset(align) == 0);
-        assert!(b.as_ptr().align_offset(align) == 0);
         unsafe {
             storage
                 .ptr
@@ -226,10 +213,38 @@ impl From<Box<[u8]>> for AlignedBuf {
     }
 }

+#[derive(Clone)]
+enum BufSource {
+    Allocated(AlignedBuf),
+    Foreign(Arc<UnsafeCell<NonNull<u8>>>, Block<u32>),
+}
+
+impl BufSource {
+    fn as_ptr(&self) -> *mut u8 {
+        match self {
+            BufSource::Allocated(buf) => unsafe { (*buf.buf.get()).ptr.as_ptr() },
+            BufSource::Foreign(ptr, _) => unsafe { (*ptr.get()).as_ptr() },
+        }
+    }
+
+    fn len(&self) -> usize {
+        match self {
+            BufSource::Allocated(buf) => unsafe { (*buf.buf.get()).capacity.to_bytes() as usize },
+            BufSource::Foreign(_, s) => s.to_bytes() as usize,
+        }
+    }
+
+    fn as_slice(&self) -> &[u8] {
+        unsafe { slice::from_raw_parts(self.as_ptr(), self.len()) }
+    }
+}
+
+unsafe impl Send for BufSource {}
+
 /// A shared read-only buffer, internally using block-aligned allocations.
 #[derive(Clone)]
 pub struct Buf {
-    buf: AlignedBuf,
+    buf: BufSource,
     range: Range<Block<u32>>,
 }
@@ -269,11 +284,30 @@ impl BufWrite {
     /// This is always safe because [BufWrite] can't be split,
     /// and therefore no aliasing writable pieces can remain.
     pub fn into_buf(self) -> Buf {
+        // NOTE: This section has been commented out because it is detrimental
+        // to performance: these operations can happen on the hot path during
+        // evictions. All it changes is the total memory footprint, as
+        // *technically* we might hold on to unused capacity; with accurate
+        // size reporting this should not happen.
+
+        // let curr_layout = unsafe {
+        //     Layout::from_size_align_unchecked(self.buf.capacity.to_bytes() as usize, BLOCK_SIZE)
+        // };
+        // let new_cap = Block::round_up_from_bytes(self.size);
+        // self.buf.capacity = new_cap;
+        // let new_ptr = unsafe {
+        //     alloc::realloc(
+        //         self.buf.ptr.as_ptr(),
+        //         curr_layout,
+        //         new_cap.to_bytes() as usize,
+        //     )
+        // };
+        // // If return value is null, old value remains valid.
+        // if let Some(new_ptr) = NonNull::new(new_ptr) {
+        //     self.buf.ptr = new_ptr;
+        // }
         Buf::from_aligned(AlignedBuf {
             buf: Arc::new(UnsafeCell::new(self.buf)),
         })
     }

+    /// Return the size of this buffer. The capacity may be larger.
     pub fn len(&self) -> usize {
         self.size as usize
     }
@@ -364,7 +398,14 @@ impl Buf {
     fn from_aligned(aligned: AlignedBuf) -> Self {
         Self {
             range: aligned.full_range(),
-            buf: aligned,
+            buf: BufSource::Allocated(aligned),
         }
     }
+
+    pub(crate) unsafe fn from_raw(ptr: NonNull<u8>, size: Block<u32>) -> Self {
+        Self {
+            buf: BufSource::Foreign(Arc::new(UnsafeCell::new(ptr)), size),
+            range: Block(0)..size,
+        }
+    }
@@ -383,39 +424,76 @@ impl Buf {
     /// Panics if Buf was not unique, to ensure no readable references remain
     pub fn into_full_mut(self) -> MutBuf {
-        let range = self.buf.full_range();
-        MutBuf {
-            buf: self.buf.unwrap_unique(),
-            range,
+        match self.buf {
+            BufSource::Allocated(buf) => {
+                let range = buf.full_range();
+
+                MutBuf {
+                    buf: buf.unwrap_unique(),
+                    range,
+                }
+            }
+            BufSource::Foreign(_, _) => self.into_buf_write().into_buf().into_full_mut(),
         }
     }

     /// Convert to a mutable [BufWrite], if this is the only [Buf] referencing the backing storage.
     /// Panics if this [Buf] was not unique.
     pub fn into_buf_write(self) -> BufWrite {
-        let storage = Arc::try_unwrap(self.buf.buf)
-            .expect("AlignedBuf was not unique")
-            .into_inner();
-        BufWrite {
-            buf: storage,
-            size: self.range.end.to_bytes(),
+        match self.buf {
+            BufSource::Allocated(buf) => {
+                let storage = Arc::try_unwrap(buf.buf)
+                    .expect("AlignedBuf was not unique")
+                    .into_inner();
+                BufWrite {
+                    buf: storage,
+                    size: self.range.end.to_bytes(),
+                }
+            }
+            BufSource::Foreign(_, _) => {
+                let mut tmp = BufWrite::with_capacity(self.range.end);
+                tmp.write(self.buf.as_slice()).unwrap();
+                tmp
+            }
+        }
+    }
+
+    /// Convert to [SlicedCowBytes]. When this [Buf] refers to a foreign,
+    /// non-self-managed memory range, ownership of that range is transferred;
+    /// otherwise a new [CowBytes] is created.
+    pub fn into_sliced_cow_bytes(self) -> SlicedCowBytes {
+        match self.buf {
+            BufSource::Allocated(_) => CowBytes::from(self.into_boxed_slice()).into(),
+            BufSource::Foreign(stg, size) => {
+                let ptr = ManuallyDrop::new(
+                    Arc::try_unwrap(stg)
+                        .expect("RawBuf was not unique")
+                        .into_inner(),
+                );

+                unsafe { SlicedCowBytes::from_raw(ptr.as_ptr(), size.to_bytes() as usize) }
+            }
         }
     }

     /// If this [Buf] is unique, return its backing buffer without reallocation or copying.
     /// Panics if this [Buf] was not unique.
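For orientation, a minimal sketch of the intended use of the BufSource-backed Buf, written as it could appear in a unit test next to this module (visibility of these items outside the crate is an assumption):

    #[test]
    fn buf_roundtrip() -> std::io::Result<()> {
        use std::io::Write;
        // Write four bytes into a block-aligned buffer.
        let mut w = BufWrite::with_capacity(Block(1));
        w.write_all(&[1, 2, 3, 4])?;
        // `into_buf` is zero-copy for allocated storage.
        let buf = w.into_buf();
        assert_eq!(&buf.as_ref()[..4], &[1, 2, 3, 4]);
        // For allocated storage this copies into a fresh CowBytes; for
        // foreign memory (BufSource::Foreign) ownership is transferred.
        let bytes = buf.into_sliced_cow_bytes();
        assert_eq!(&bytes[..4], &[1, 2, 3, 4]);
        Ok(())
    }
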
pub fn into_boxed_slice(self) -> Box<[u8]> {
-        let storage = ManuallyDrop::new(
-            Arc::try_unwrap(self.buf.buf)
-                .expect("AlignedBuf was not unique")
-                .into_inner(),
-        );
-
-        unsafe {
-            Box::from_raw(slice::from_raw_parts_mut(
-                storage.ptr.as_ptr(),
-                storage.capacity.to_bytes() as usize,
-            ))
+        match self.buf {
+            BufSource::Allocated(buf) => {
+                let storage = ManuallyDrop::new(
+                    Arc::try_unwrap(buf.buf)
+                        .expect("AlignedBuf was not unique")
+                        .into_inner(),
+                );
+                unsafe {
+                    Box::from_raw(slice::from_raw_parts_mut(
+                        storage.ptr.as_ptr(),
+                        storage.capacity.to_bytes() as usize,
+                    ))
+                }
+            }
+            BufSource::Foreign(_, _) => self.buf.as_slice().to_vec().into_boxed_slice(),
         }
     }
@@ -483,13 +561,10 @@ impl Deref for Buf {
 impl AsRef<[u8]> for Buf {
     fn as_ref(&self) -> &[u8] {
-        unsafe {
-            let start = self.range.start.to_bytes() as usize;
-            let end = self.range.end.to_bytes() as usize;
-            let buf = &*self.buf.buf.get();
-            let slice = slice::from_raw_parts(buf.ptr.as_ptr(), buf.capacity.to_bytes() as usize);
-            &slice[start..end]
-        }
+        let start = self.range.start.to_bytes() as usize;
+        let end = self.range.end.to_bytes() as usize;
+        let slice = self.buf.as_slice();
+        &slice[start..end]
     }
 }
@@ -518,7 +593,7 @@ impl From<Box<[u8]>> for Buf {
         let aligned = AlignedBuf::from(b);
         Buf {
             range: aligned.full_range(),
-            buf: aligned,
+            buf: BufSource::Allocated(aligned),
         }
     }
 }
diff --git a/betree/src/c_interface.rs b/betree/src/c_interface.rs
index b4717b08d..3a693e16b 100644
--- a/betree/src/c_interface.rs
+++ b/betree/src/c_interface.rs
@@ -1,21 +1,15 @@
 //! This module provides the C interface to the database.
 #![allow(non_camel_case_types)]
 use std::{
-    env::SplitPaths,
-    ffi::{CStr, OsStr},
-    io::{stderr, BufReader, Write},
-    os::{
-        raw::{c_char, c_int, c_uint, c_ulong},
-        unix::prelude::OsStrExt,
-    },
+    ffi::CStr,
+    io::{stderr, Write},
+    os::raw::{c_char, c_int, c_uint, c_ulong, c_ulonglong},
     process::abort,
     ptr::{null_mut, read, write},
     slice::{from_raw_parts, from_raw_parts_mut},
     sync::Arc,
 };

-use libc::{c_void, memcpy};
-
 use crate::{
     cow_bytes::{CowBytes, SlicedCowBytes},
     database::{AccessMode, Database, Dataset, Error, Snapshot},
@@ -48,8 +42,11 @@ pub struct obj_store_t(ObjectStore);
 /// The handle of an object in the corresponding object store
 pub struct obj_t<'os>(ObjectHandle<'os>);

+/// Default storage preference.
 pub const STORAGE_PREF_NONE: storage_pref_t = storage_pref_t(StoragePreference::NONE);
+/// Highest storage preference.
 pub const STORAGE_PREF_FASTEST: storage_pref_t = storage_pref_t(StoragePreference::FASTEST);
+/// Lowest storage preference.
 pub const STORAGE_PREF_SLOWEST: storage_pref_t = storage_pref_t(StoragePreference::SLOWEST);

 /// A reference counted byte slice
@@ -73,7 +70,7 @@ impl From<SlicedCowBytes> for byte_slice_t {
     fn from(x: SlicedCowBytes) -> Self {
         let ptr = &x[..] as *const [u8] as *const u8 as *const c_char;
         let len = x.len() as c_uint;
-        let arc = Arc::into_raw(x.data.inner) as *const byte_slice_rc_t;
+        let arc = x.into_raw() as *const byte_slice_rc_t;
         byte_slice_t { ptr, len, arc }
     }
 }
@@ -260,7 +257,7 @@ pub unsafe extern "C" fn betree_parse_configuration(
 /// On error, return null. If `err` is not null, store an error in `err`.
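The function below dispatches on the configuration file's extension. For reference, the same logic as a safe sketch (figment 0.10 API, with the Yaml provider this patch enables; the error type here is a placeholder):

    use figment::providers::{Format, Json, Yaml};
    use std::path::Path;

    fn config_figment(path: &Path) -> Result<figment::Figment, String> {
        let fig = figment::Figment::new();
        match path.extension().and_then(|e| e.to_str()) {
            Some("yml") | Some("yaml") => Ok(fig.merge(Yaml::file(path))),
            Some("json") => Ok(fig.merge(Json::file(path))),
            _ => Err("file has no known extension, pick 'json', 'yaml' or 'yml'".to_string()),
        }
    }
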
#[no_mangle] pub unsafe extern "C" fn betree_configuration_from_env(err: *mut *mut err_t) -> *mut cfg_t { - let path = match std::env::var_os("BETREE_CONFIG") { + let path = std::path::PathBuf::from(match std::env::var_os("BETREE_CONFIG") { Some(val) => val, None => { handle_err( @@ -269,11 +266,30 @@ pub unsafe extern "C" fn betree_configuration_from_env(err: *mut *mut err_t) -> ); return null_mut(); } - }; - let file = std::fs::OpenOptions::new().read(true).open(path).unwrap(); - serde_json::from_reader::<_, DatabaseConfiguration>(BufReader::new(file)) - .map_err(Error::from) - .handle_result(err) + }); + + let file = std::fs::OpenOptions::new() + .read(true) + .open(path.clone()) + .unwrap(); + + match path.extension() { + Some(ext) if ext == "yml" || ext == "yaml" => { + serde_yaml::from_reader::<_, DatabaseConfiguration>(file) + .map_err(Error::from) + .handle_result(err) + } + Some(ext) if ext == "json" => serde_json::from_reader::<_, DatabaseConfiguration>(file) + .map_err(Error::from) + .handle_result(err), + _ => { + handle_err( + Error::Generic("File has no common extension, pick 'json', 'yaml' or 'yml'".into()), + err, + ); + return null_mut(); + } + } } /// Enable the global env_logger, configured via environment variables. @@ -828,7 +844,7 @@ pub unsafe extern "C" fn betree_print_error(err: *mut err_t) { } } -/// Create an object store interface. +/// Create an object store. #[no_mangle] pub unsafe extern "C" fn betree_create_object_store( db: *mut db_t, @@ -912,6 +928,23 @@ pub unsafe extern "C" fn betree_object_close(obj: *mut obj_t, err: *mut *mut err obj.close().handle_result(err) } +/// Fetch the size of the given object if it exists. Returns 0 on error. +#[no_mangle] +pub unsafe extern "C" fn betree_object_get_size( + os: *mut obj_store_t, + key: *const c_char, + key_len: c_uint, + err: *mut *mut err_t, +) -> c_ulonglong { + let os = &mut (*os).0; + if let Ok(Some(info)) = os.read_object_info(from_raw_parts(key as *const u8, key_len as usize)) { + info.size as u64 + } else { + *err = Box::into_raw(Box::new(err_t(Error::DoesNotExist))); + 0 + } +} + /// Try to read `buf_len` bytes of `obj` into `buf`, starting at `offset` bytes into the objects /// data. The actually read number of bytes is written into `n_read` if and only if the read /// succeeded. diff --git a/betree/src/cache/clock_cache.rs b/betree/src/cache/clock_cache.rs index db53565e1..241a1d615 100644 --- a/betree/src/cache/clock_cache.rs +++ b/betree/src/cache/clock_cache.rs @@ -136,7 +136,7 @@ impl Stats for CacheStats { } } -impl AddSize for PinnedEntry { +impl AddSize for PinnedEntry { fn add_size(&self, size_delta: isize) { if size_delta >= 0 { self.size.fetch_add(size_delta as usize, Ordering::Relaxed); @@ -301,12 +301,15 @@ impl>)| { let p: *mut CacheEntry<_> = Arc::as_ptr(&v) as *mut CacheEntry<_>; let v2: &mut CacheEntry = unsafe { &mut *p }; - v2.value.size() + v2.value.cache_size() }) .sum::(); @@ -396,7 +399,7 @@ impl Self::Stats; + /// Debug feature to compare actual size requirements with tracked delta + /// changes. 
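The `cache_size` accounting introduced above separates two notions of size. A sketch of the distinction on an illustrative type (the real trait methods live in crate::size and are not shown in this diff):

    struct Example(Vec<u8>);

    impl Example {
        /// Serialized on-disk footprint, as `size` estimates it.
        fn size(&self) -> usize {
            8 + self.0.len()
        }
        /// Resident in-memory footprint, which the cache now accounts with.
        fn cache_size(&self) -> usize {
            std::mem::size_of::<Self>() + self.0.capacity()
        }
    }
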
fn verify(&mut self);
 }
diff --git a/betree/src/checksum/fxhash.rs b/betree/src/checksum/fxhash.rs
index 3837d947d..2638f166a 100644
--- a/betree/src/checksum/fxhash.rs
+++ b/betree/src/checksum/fxhash.rs
@@ -22,6 +22,9 @@ impl Checksum for FxHash {
         &self,
         data: I,
     ) -> Result<(), ChecksumError> {
+        if self.0 == 0 {
+            return Ok(());
+        }
         let mut state = FxHashBuilder.build();
         for x in data {
             state.ingest(x.as_ref());
@@ -49,6 +52,10 @@ impl Builder<FxHash> for FxHashBuilder {
     fn build(&self) -> Self::State {
         FxHashState(FxHasher::default())
     }
+
+    fn empty(&self) -> FxHash {
+        FxHash(0)
+    }
 }

 /// The internal state of `FxHash`.
diff --git a/betree/src/checksum/gxhash.rs b/betree/src/checksum/gxhash.rs
index f0ce0a1cc..8b4408d43 100644
--- a/betree/src/checksum/gxhash.rs
+++ b/betree/src/checksum/gxhash.rs
@@ -22,6 +22,9 @@ impl Checksum for GxHash {
         &self,
         data: I,
     ) -> Result<(), ChecksumError> {
+        if self.0 == 0 {
+            return Ok(());
+        }
         let mut state = GxHashBuilder.build();
         for x in data {
             state.ingest(x.as_ref());
@@ -51,6 +54,10 @@ impl Builder<GxHash> for GxHashBuilder {
         // does not work for us, therefore, use pinned seed.
         GxHashState(GxHasher::with_seed(0))
     }
+
+    fn empty(&self) -> GxHash {
+        GxHash(0)
+    }
 }

 /// The internal state of `GxHash`.
diff --git a/betree/src/checksum/mod.rs b/betree/src/checksum/mod.rs
index 755cc2eac..1bc3ce386 100644
--- a/betree/src/checksum/mod.rs
+++ b/betree/src/checksum/mod.rs
@@ -1,7 +1,7 @@
 //! This module provides a `Checksum` trait and implementors for verifying data
 //! integrity.

-use crate::size::Size;
+use crate::size::{Size, StaticSize};
 use serde::{de::DeserializeOwned, Serialize};
 use std::{error::Error, fmt, iter::once};
@@ -15,7 +15,7 @@ pub use xxhash::{XxHash, XxHashBuilder};
 /// A checksum to verify data integrity.
 pub trait Checksum:
-    Serialize + DeserializeOwned + Size + Clone + Send + Sync + fmt::Debug + 'static
+    Serialize + DeserializeOwned + Size + StaticSize + Clone + Send + Sync + fmt::Debug + 'static
 {
     /// Builds a new `Checksum`.
     type Builder: Builder<Self>;
@@ -45,6 +45,10 @@ pub trait Builder<C: Checksum>:
     /// Create a new state to build a checksum.
     fn build(&self) -> Self::State;
+
+    /// Return an empty Checksum. This variant skips the verification steps
+    /// when applied to a new buffer.
+    fn empty(&self) -> C;
 }

 /// Holds a state for building a new `Checksum`.
diff --git a/betree/src/checksum/xxhash.rs b/betree/src/checksum/xxhash.rs
index 839c07956..5c4b04c46 100644
--- a/betree/src/checksum/xxhash.rs
+++ b/betree/src/checksum/xxhash.rs
@@ -23,6 +23,9 @@ impl Checksum for XxHash {
         &self,
         data: I,
     ) -> Result<(), ChecksumError> {
+        if self.0 == 0 {
+            return Ok(());
+        }
         let mut state = XxHashBuilder.build();
         for x in data {
             state.ingest(x.as_ref());
@@ -50,6 +53,10 @@ impl Builder<XxHash> for XxHashBuilder {
     fn build(&self) -> Self::State {
         XxHashState(twox_hash::XxHash::with_seed(0))
     }
+
+    fn empty(&self) -> XxHash {
+        XxHash(0)
+    }
 }

 /// The internal state of `XxHash`.
diff --git a/betree/src/compression/mod.rs b/betree/src/compression/mod.rs
index 42807a67f..69203770c 100644
--- a/betree/src/compression/mod.rs
+++ b/betree/src/compression/mod.rs
@@ -3,26 +3,30 @@
 //! `None` and `Lz4` are provided as implementation.

 use crate::{
-    buffer::{Buf, BufWrite},
+    buffer::Buf,
     size::{Size, StaticSize},
     vdev::Block,
 };
 use serde::{Deserialize, Serialize};
-use std::{fmt::Debug, io::Write, mem};
+use std::{fmt::Debug, mem};

 mod errors;
 pub use errors::*;

 const DEFAULT_BUFFER_SIZE: Block<u32> = Block(1);

+/// Determine the used compression algorithm.
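A sketch of the escape hatch the empty checksum adds above (XxHash shown; the enclosing method name `verify_buffer` is an assumption, since the hunk only shows the method body):

    let csum = {
        let mut state = XxHashBuilder.build();
        state.ingest(b"payload");
        state.finish()
    };
    assert!(csum.verify_buffer(std::iter::once(b"payload")).is_ok());

    // The empty checksum is the all-zero marker; it verifies any buffer
    // without hashing it, so readers can skip whole-node verification.
    let skip = XxHashBuilder.empty();
    assert!(skip.verify_buffer(std::iter::once(b"anything")).is_ok());
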
#[derive(Debug, Serialize, Deserialize, Clone)]
 pub enum CompressionConfiguration {
+    /// No-op.
     None,
     // Lz4,
+    /// Configurable Zstd algorithm.
     Zstd(Zstd),
 }

 impl CompressionConfiguration {
+    /// Construct the builder belonging to this configuration.
     pub fn to_builder(&self) -> Box<dyn CompressionBuilder> {
         match self {
             CompressionConfiguration::None => Box::new(None),
@@ -35,15 +39,32 @@ impl CompressionConfiguration {
 /// method. This differs from a CompressionConfiguration, in that it is not configurable, as
 /// all methods will decompress just fine without knowing at which compression level it was
 /// originally written, so there's no advantage in storing the compression level with each object.
-#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
+#[derive(
+    Debug,
+    Copy,
+    Clone,
+    Serialize,
+    Deserialize,
+    PartialEq,
+    Eq,
+    Hash,
+    rkyv::Archive,
+    rkyv::Serialize,
+    rkyv::Deserialize,
+)]
+#[archive(check_bytes)]
 #[repr(u8)]
 pub enum DecompressionTag {
+    /// No-op.
     None,
+    /// Decompress using Lz4.
     Lz4,
+    /// Decompress using Zstd.
     Zstd,
 }

 impl DecompressionTag {
+    /// Start a new decompression. The resulting structure consumes a buffer to decompress the data.
     pub fn new_decompression(&self) -> Result<Box<dyn DecompressionState>> {
         use DecompressionTag as Tag;
         match self {
@@ -65,17 +86,20 @@ impl StaticSize for DecompressionTag {
 pub trait CompressionBuilder: Debug + Size + Send + Sync + 'static {
     /// Returns an object for compressing data into a `Box<[u8]>`.
     fn new_compression(&self) -> Result<Box<dyn CompressionState>>;
+    /// Which decompression algorithm needs to be used.
     fn decompression_tag(&self) -> DecompressionTag;
 }

 /// Trait for the object that compresses data.
-pub trait CompressionState: Write {
+pub trait CompressionState {
     /// Finishes the compression stream and returns a buffer that contains the
     /// compressed data.
     fn finish(&mut self, data: Buf) -> Result<Buf>;
 }

+/// An implementation of consumption-based decompression.
 pub trait DecompressionState {
+    /// Decompress the given [Buf]. For the no-op algorithm this is a simple pass-through; no memory is copied.
     fn decompress(&mut self, data: Buf) -> Result<Buf>;
 }
diff --git a/betree/src/compression/none.rs b/betree/src/compression/none.rs
index a1bb77457..cb84d9100 100644
--- a/betree/src/compression/none.rs
+++ b/betree/src/compression/none.rs
@@ -7,7 +7,7 @@ use crate::{
     size::StaticSize,
 };
 use serde::{Deserialize, Serialize};
-use std::{io, mem};
+use std::io;

 /// No-op compression.
 #[derive(Debug, Clone, Serialize, Deserialize, Copy)]
@@ -36,6 +36,7 @@ impl CompressionBuilder for None {
 }

 impl None {
+    /// Start no-op decompression.
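How the pieces fit together, as a sketch (the `level` field name mirrors the `{ Zstd = { level = N } }` env syntax used by the benchmark scripts):

    let cfg = CompressionConfiguration::Zstd(Zstd { level: 3 });
    let builder = cfg.to_builder();
    // Only the tag is persisted with each object; the level is write-side only.
    assert!(matches!(builder.decompression_tag(), DecompressionTag::Zstd));
    let _decompressor = builder.decompression_tag().new_decompression();
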
pub fn new_decompression() -> Result<Box<dyn DecompressionState>> {
         Ok(Box::new(NoneDecompression))
     }
@@ -63,7 +64,6 @@ impl CompressionState for NoneCompression {
 impl DecompressionState for NoneDecompression {
     fn decompress(&mut self, data: Buf) -> Result<Buf> {
-        // FIXME: pass-through Buf, reusing alloc
         Ok(data)
     }
 }
diff --git a/betree/src/compression/zstd.rs b/betree/src/compression/zstd.rs
index 606422a4c..23459ef7f 100644
--- a/betree/src/compression/zstd.rs
+++ b/betree/src/compression/zstd.rs
@@ -1,26 +1,13 @@
-use super::{
-    CompressionBuilder, CompressionState, DecompressionState, DecompressionTag, Result,
-    DEFAULT_BUFFER_SIZE,
-};
+use super::{CompressionBuilder, CompressionState, DecompressionState, DecompressionTag, Result};
 use crate::{
     buffer::{Buf, BufWrite},
-    database,
     size::StaticSize,
     vdev::Block,
 };
 use serde::{Deserialize, Serialize};
-use std::{
-    io::{self, Cursor, Write},
-    mem,
-};
-use zstd::{
-    block::{Compressor, Decompressor},
-    stream::{
-        raw::{CParameter, DParameter, Decoder, Encoder},
-        zio::{Reader, Writer},
-    },
-};
-use zstd_safe::{FrameFormat, InBuffer, OutBuffer, WriteBuf};
+use std::{io::Write, mem};
+use zstd::stream::raw::{CParameter, DParameter, Decoder, Encoder};
+use zstd_safe::{FrameFormat, WriteBuf};

 // TODO: investigate pre-created dictionary payoff

@@ -67,6 +54,7 @@ impl CompressionBuilder for Zstd {
 }

 impl Zstd {
+    /// Start Zstd decompression. The decompression level is encoded in the received data stream by default.
     pub fn new_decompression() -> Result<Box<dyn DecompressionState>> {
         let mut decoder = Decoder::new()?;
         decoder.set_parameter(DParameter::Format(FrameFormat::Magicless))?;
@@ -76,20 +64,6 @@ impl Zstd {
     }
 }

-impl io::Write for ZstdCompression {
-    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        unimplemented!()
-    }
-
-    fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
-        unimplemented!()
-    }
-
-    fn flush(&mut self) -> io::Result<()> {
-        unimplemented!()
-    }
-}
-
 use speedy::{Readable, Writable};

 const DATA_OFF: usize = mem::size_of::<u32>();
diff --git a/betree/src/cow_bytes.rs b/betree/src/cow_bytes.rs
index e1a8bb1ea..e532345bf 100644
--- a/betree/src/cow_bytes.rs
+++ b/betree/src/cow_bytes.rs
@@ -221,11 +221,39 @@ impl<'a> Extend<&'a u8> for CowBytes {
 /// Reference-counted pointer which points to a subslice of the referenced data.
 #[derive(Debug, Default, Clone)]
 pub struct SlicedCowBytes {
-    pub(super) data: CowBytes,
+    pub(super) data: ByteSource,
     pos: u32,
     len: u32,
 }

+#[derive(Debug, Clone)]
+pub(super) enum ByteSource {
+    Cow(CowBytes),
+    Raw { ptr: *const u8, len: usize },
+}
+
+impl Deref for ByteSource {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        match self {
+            ByteSource::Cow(data) => &data,
+            ByteSource::Raw { ptr, len } => unsafe {
+                std::slice::from_raw_parts(*ptr, *len)
+            },
+        }
+    }
+}
+
+impl Default for ByteSource {
+    fn default() -> Self {
+        Self::Cow(CowBytes::default())
+    }
+}
+
+unsafe impl Send for ByteSource {}
+unsafe impl Sync for ByteSource {}
+
 impl PartialEq for SlicedCowBytes {
     fn eq(&self, other: &Self) -> bool {
         **self == **other
@@ -258,13 +286,20 @@ impl Size for SlicedCowBytes {
     fn size(&self) -> usize {
         8 + self.len as usize
     }
+
+    fn cache_size(&self) -> usize {
+        match self.data {
+            ByteSource::Cow(ref cow_bytes) => cow_bytes.cache_size(),
+            ByteSource::Raw { .. } => std::mem::size_of::<*const u8>() + std::mem::size_of::<usize>(),
+        }
+    }
 }

 impl SlicedCowBytes {
     /// Returns a new subslice which points to `self[pos..pos+len]`.
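Why the assert moves in `subslice` below: the bounds check must run against the view-relative `pos` before it is shifted by `self.pos`, otherwise nested subslices trip the assertion spuriously. A sketch (assuming the existing From<Vec<u8>> conversion for CowBytes):

    let bytes = SlicedCowBytes::from(CowBytes::from(vec![10u8, 11, 12, 13]));
    let tail = bytes.subslice(1, 3); // view of [11, 12, 13]
    // Old order: computed pos = 1 + 2 = 3 first, then asserted 3 + 1 <= 3
    // and panicked. New order: checks 2 + 1 <= 3 against the view's length.
    let last = tail.subslice(2, 1);
    assert_eq!(&last[..], &[13]);
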
pub fn subslice(self, pos: u32, len: u32) -> Self {
-        let pos = self.pos + pos;
         assert!(pos + len <= self.len);
+        let pos = self.pos + pos;
         SlicedCowBytes {
             data: self.data,
             pos,
@@ -281,6 +316,35 @@ impl SlicedCowBytes {
             len: self.len - pos,
         }
     }
+
+    pub(crate) fn into_raw(self) -> *const Vec<u8> {
+        match self.data {
+            ByteSource::Cow(data) => Arc::into_raw(data.inner),
+            ByteSource::Raw { ptr, len } => unsafe {
+                // FIXME: This currently copies the data when the original
+                // buffer is from a raw source, to avoid breaking behavior
+                // from outside.
+                let mut buf = Vec::with_capacity(len);
+                buf.as_mut_ptr().copy_from(ptr, len);
+                buf.set_len(len);
+                // Hand out a refcounted allocation, matching the Cow branch;
+                // returning a pointer to a stack-local would dangle.
+                Arc::into_raw(Arc::new(buf))
+            },
+        }
+    }
+
+    pub(crate) unsafe fn from_raw(ptr: *const u8, len: usize) -> Self {
+        Self {
+            data: ByteSource::Raw { ptr, len },
+            pos: 0,
+            len: len.try_into().expect("Capacity too large."),
+        }
+    }
+
+    pub(crate) fn into_cow_bytes(self) -> Result<CowBytes, Self> {
+        match self.data {
+            ByteSource::Cow(cow_bytes) if self.pos == 0 => Ok(cow_bytes),
+            _ => Err(self),
+        }
+    }
 }

 impl From<CowBytes> for SlicedCowBytes {
     fn from(data: CowBytes) -> Self {
         SlicedCowBytes {
             pos: 0,
             len: data.len() as u32,
-            data,
+            data: ByteSource::Cow(data),
         }
     }
 }
diff --git a/betree/src/data_management/cache_value.rs b/betree/src/data_management/cache_value.rs
index aa680c2e3..c727b9324 100644
--- a/betree/src/data_management/cache_value.rs
+++ b/betree/src/data_management/cache_value.rs
@@ -112,4 +112,8 @@ impl SizeMut for TaggedCacheValue {
     fn size(&mut self) -> usize {
         self.value.size()
     }
+
+    fn cache_size(&mut self) -> usize {
+        self.value.cache_size()
+    }
 }
diff --git a/betree/src/data_management/delegation.rs b/betree/src/data_management/delegation.rs
index e4634ad07..9d484b28b 100644
--- a/betree/src/data_management/delegation.rs
+++ b/betree/src/data_management/delegation.rs
@@ -87,7 +87,7 @@ where
         (**self).prefetch(or)
     }

-    fn finish_prefetch(&self, p: Self::Prefetch) -> Result<(), Error> {
+    fn finish_prefetch(&self, p: Self::Prefetch) -> Result<Self::CacheValueRef, Error> {
         (**self).finish_prefetch(p)
     }
diff --git a/betree/src/data_management/dmu.rs b/betree/src/data_management/dmu.rs
index 8b5a27eb1..19a931c68 100644
--- a/betree/src/data_management/dmu.rs
+++ b/betree/src/data_management/dmu.rs
@@ -11,7 +11,7 @@ use crate::{
     cache::{Cache, ChangeKeyError, RemoveError},
     checksum::{Builder, Checksum, State},
     compression::CompressionBuilder,
-    data_management::CopyOnWriteReason,
+    data_management::{CopyOnWriteReason, IntegrityMode},
     database::{DatasetId, Generation, Handler},
     migration::DmlMsg,
     size::{Size, SizeMut, StaticSize},
@@ -278,24 +278,21 @@ where
     /// Fetches synchronously an object from disk and inserts it into the
     /// cache.
-    fn fetch(&self, op: &<Self as Dml>::ObjectPointer, pivot_key: PivotKey) -> Result<(), Error> {
+    fn fetch(
+        &self,
+        op: &<Self as Dml>::ObjectPointer,
+        pivot_key: PivotKey,
+    ) -> Result<E::ValueRef, Error> {
         // FIXME: reuse decompression_state
         debug!("Fetching {op:?}");
         let offset = op.offset();
         let generation = op.generation();
-        let compressed_data = self
-            .pool
-            .read(op.size(), op.offset(), op.checksum().clone())?;
-
-        let object: Node<ObjRef<ObjectPointer<SPL::Checksum>>> = {
-            let data = decompression_state.decompress(compressed_data)?;
-            Object::unpack_at(op.offset(), op.info(), data.into_boxed_slice())?
-        };
+        let object = op.fetch(self.pool())?;
+        // FIXME: The NVM node is only available when no compression is used.
let key = ObjectKey::Unmodified { offset, generation }; - self.insert_object_into_cache(key, TaggedCacheValue::new(RwLock::new(object), pivot_key)); - Ok(()) + Ok(self + .insert_object_into_cache(key, TaggedCacheValue::new(RwLock::new(object), pivot_key))) } /// Fetches asynchronously an object from disk and inserts it into the @@ -317,19 +314,26 @@ where > { let ptr = op.clone(); + let size = op.size(); + Ok(self .pool - .read_async(op.size(), op.offset(), op.checksum().clone())? + .read_async(size, op.offset(), op.checksum().clone())? .map_err(Error::from) .and_then(move |data| ok((ptr, data, pivot_key)))) } - fn insert_object_into_cache(&self, key: ObjectKey, mut object: E::Value) { - let size = object.value_mut().get_mut().size(); + fn insert_object_into_cache( + &self, + key: ObjectKey, + mut object: E::Value, + ) -> E::ValueRef { + let size = object.value_mut().get_mut().cache_size(); let mut cache = self.cache.write(); if !cache.contains_key(&key) { cache.insert(key, object, size); } + cache.get(&key, false).unwrap() } fn evict(&self, mut cache: RwLockWriteGuard) -> Result<(), Error> { @@ -376,7 +380,7 @@ where .is_ok(), }; if can_be_evicted { - Some(object.size()) + Some(object.cache_size()) } else { None } @@ -392,7 +396,7 @@ where ObjectKey::Modified(mid) => mid, }; - let size = object.value_mut().get_mut().size(); + let size = object.value_mut().get_mut().cache_size(); cache.insert(ObjectKey::InWriteback(mid), object, size); let entry = cache.get(&ObjectKey::InWriteback(mid), false).unwrap(); @@ -421,6 +425,7 @@ where super::Size::size(&*object) } }; + let cache_size = object.cache_size(); log::trace!("Entering write back of {:?}", &mid); if object_size > 4 * 1024 * 1024 { @@ -440,36 +445,49 @@ where .unwrap_or(self.default_storage_class); let compression = &self.default_compression; - let compressed_data = { + let (integrity_mode, compressed_data) = { // FIXME: cache this let mut state = compression.new_compression()?; - let mut buf = crate::buffer::BufWrite::with_capacity(Block(128)); - { - object.pack(&mut buf)?; + let mut buf = crate::buffer::BufWrite::with_capacity(Block::round_up_from_bytes( + object_size as u32, + )); + let integrity_mode = { + let pp = object.prepare_pack( + self.spl().storage_kind_map()[storage_class as usize], + &pivot_key, + )?; + let part = object.pack(&mut buf, pp, |bytes| { + let mut builder = self.default_checksum_builder.build(); + builder.ingest(bytes); + builder.finish() + })?; drop(object); - } - state.finish(buf.into_buf())? + part + }; + (integrity_mode, state.finish(buf.into_buf())?) }; assert!(compressed_data.len() <= u32::max_value() as usize); let size = compressed_data.len(); + // FIXME + if size > Block::round_up_from_bytes(object_size).to_bytes() { + warn!("anticipated size deviated from actual size, realloc necessary in writes... 
(Expected {}, Actual {})", Block::round_up_from_bytes(object_size).to_bytes(), size); + } debug!("Compressed object size is {size} bytes"); let size = Block(((size + BLOCK_SIZE - 1) / BLOCK_SIZE) as u32); assert!(size.to_bytes() as usize >= compressed_data.len()); let offset = self.allocate(storage_class, size)?; assert_eq!(size.to_bytes() as usize, compressed_data.len()); - /*if size.to_bytes() as usize != compressed_data.len() { - let mut v = compressed_data.into_vec(); - v.resize(size.to_bytes() as usize, 0); - compressed_data = v.into_boxed_slice(); - }*/ let info = self.modified_info.lock().remove(&mid).unwrap(); - let checksum = { - let mut state = self.default_checksum_builder.build(); - state.ingest(compressed_data.as_ref()); - state.finish() + let checksum = match integrity_mode { + IntegrityMode::External => { + let mut state = self.default_checksum_builder.build(); + state.ingest(compressed_data.as_ref()); + state.finish() + } + IntegrityMode::Internal { .. } => self.default_checksum_builder.empty(), }; self.pool.begin_write(compressed_data, offset)?; @@ -481,6 +499,7 @@ where decompression_tag: compression.decompression_tag(), generation, info, + integrity_mode, }; let was_present; @@ -489,7 +508,7 @@ where // We can safely ignore pins. // If it's pinned, it must be a readonly request. was_present = if evict { - cache.force_remove(&ObjectKey::InWriteback(mid), object_size) + cache.force_remove(&ObjectKey::InWriteback(mid), cache_size) } else { cache.force_change_key( &ObjectKey::InWriteback(mid), @@ -823,7 +842,7 @@ where if let ObjRef::Unmodified(ref ptr, ref pk) = *or { drop(cache); - self.fetch(ptr, pk.clone())?; + let _ = self.fetch(ptr, pk.clone())?; if let Some(report_tx) = &self.report_tx { let _ = report_tx .send(DmlMsg::fetch(ptr.offset(), ptr.size(), pk.clone())) @@ -870,7 +889,7 @@ where }; self.modified_info.lock().insert(mid, info); let key = ObjectKey::Modified(mid); - let size = object.size(); + let size = object.cache_size(); self.cache.write().insert( key, TaggedCacheValue::new(RwLock::new(object), pk.clone()), @@ -891,7 +910,7 @@ where }; self.modified_info.lock().insert(mid, info); let key = ObjectKey::Modified(mid); - let size = object.size(); + let size = object.cache_size(); let entry = { let mut cache = self.cache.write(); cache.insert( @@ -905,10 +924,18 @@ where } fn remove(&self, or: Self::ObjectRef) { - match self.cache.write().remove(&or.as_key(), |obj| obj.size()) { + match self + .cache + .write() + .remove(&or.as_key(), |obj| obj.cache_size()) + { Ok(_) | Err(RemoveError::NotPresent) => {} // TODO - Err(RemoveError::Pinned) => unimplemented!(), + Err(RemoveError::Pinned) => { + let bt = std::backtrace::Backtrace::force_capture(); + println!("{}", bt); + unimplemented!() + } }; if let ObjRef::Unmodified(ref ptr, ..) = or { self.copy_on_write(ptr.clone(), CopyOnWriteReason::Remove, or.index().clone()); @@ -921,7 +948,11 @@ where ) -> Result>>, Error> { let obj = loop { self.get(&mut or)?; - match self.cache.write().remove(&or.as_key(), |obj| obj.size()) { + match self + .cache + .write() + .remove(&or.as_key(), |obj| obj.cache_size()) + { Ok(obj) => break obj, Err(RemoveError::NotPresent) => {} // TODO @@ -1032,26 +1063,27 @@ where }) } - fn finish_prefetch(&self, p: Self::Prefetch) -> Result<(), Error> { + fn finish_prefetch(&self, p: Self::Prefetch) -> Result { let (ptr, compressed_data, pk) = block_on(p)?; let object: Node>> = { let data = ptr .decompression_tag() .new_decompression()? 
.decompress(compressed_data)?; - Object::unpack_at(ptr.offset(), ptr.info(), data.into_boxed_slice())? + Object::unpack_at(ptr.info(), data, ptr.integrity_mode.clone())? }; let key = ObjectKey::Unmodified { offset: ptr.offset(), generation: ptr.generation(), }; - self.insert_object_into_cache(key, TaggedCacheValue::new(RwLock::new(object), pk.clone())); + let cache_ref = self + .insert_object_into_cache(key, TaggedCacheValue::new(RwLock::new(object), pk.clone())); if let Some(report_tx) = &self.report_tx { let _ = report_tx .send(DmlMsg::fetch(ptr.offset(), ptr.size(), pk)) .map_err(|_| warn!("Channel Receiver has been dropped.")); } - Ok(()) + Ok(CacheValueRef::read(cache_ref)) } // Cache depending methods @@ -1069,7 +1101,7 @@ where .filter(|&key| matches!(key, ObjectKey::Unmodified { .. })) .collect(); for key in keys { - let _ = cache.remove(&key, |obj| obj.size()); + let _ = cache.remove(&key, |obj| obj.cache_size()); } } } diff --git a/betree/src/data_management/errors.rs b/betree/src/data_management/errors.rs index 4bbb7d34d..78dfb4159 100644 --- a/betree/src/data_management/errors.rs +++ b/betree/src/data_management/errors.rs @@ -4,12 +4,12 @@ use thiserror::Error; #[derive(Error, Debug)] pub enum Error { - #[error("The storage pool encountered an error.")] + #[error("VDev failed: {source}")] VdevError { #[from] source: crate::vdev::Error, }, - #[error("The chosen compression type encountered an error.")] + #[error("Compression failed: {source}")] CompressionError { #[from] source: crate::compression::Error, @@ -22,7 +22,7 @@ pub enum Error { SerializationError, #[error("The allocation handler encountered an error.")] HandlerError(String), - #[error("Input/Output procedure encountered an error.")] + #[error("Io failed: {source}")] IoError { #[from] source: std::io::Error, @@ -31,7 +31,7 @@ pub enum Error { OutOfSpaceError, #[error("A callback function to the cache has errored.")] CallbackError, - #[error("A raw allocation has failed.")] + #[error("A raw allocation of size {size} as {at} has failed.")] RawAllocationError { at: DiskOffset, size: Block }, } diff --git a/betree/src/data_management/impls.rs b/betree/src/data_management/impls.rs index bf18854b2..846137912 100644 --- a/betree/src/data_management/impls.rs +++ b/betree/src/data_management/impls.rs @@ -1,14 +1,9 @@ use super::{object_ptr::ObjectPointer, HasStoragePreference}; use crate::{ - database::Generation, - size::{StaticSize}, - storage_pool::DiskOffset, - tree::PivotKey, + database::Generation, size::StaticSize, storage_pool::DiskOffset, tree::PivotKey, StoragePreference, }; -use serde::{ - de::DeserializeOwned, ser::Error as SerError, Deserialize, Deserializer, Serialize, Serializer, -}; +use serde::{de::DeserializeOwned, ser::Error as SerError}; #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct ModifiedObjectId { @@ -41,7 +36,7 @@ pub enum ObjRef
<P>
{ impl<D> super::ObjectReference for ObjRef<ObjectPointer<D>> where D: std::fmt::Debug + 'static, - ObjectPointer<D>: Serialize + DeserializeOwned + StaticSize + Clone, + ObjectPointer<D>: serde::Serialize + DeserializeOwned + StaticSize + Clone, { type ObjectPointer = ObjectPointer<D>; fn get_unmodified(&self) -> Option<&ObjectPointer<D>> { @@ -62,7 +57,7 @@ where ObjRef::Unmodified(_, o_pk) | ObjRef::Modified(_, o_pk) => *o_pk = pk, // NOTE: An object reference may never need to be modified when // performing a write back. - ObjRef::InWriteback(..) => unreachable!(), + ObjRef::InWriteback(..) => {}, } } @@ -129,10 +124,10 @@ impl<D: StaticSize> StaticSize for ObjRef
<ObjectPointer<D>>
{ } } -impl<D> Serialize for ObjRef
<ObjectPointer<D>>
{ +impl<D> serde::Serialize for ObjRef
<ObjectPointer<D>>
{ fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where - S: Serializer, + S: serde::Serializer, { match *self { ObjRef::Modified(..) => Err(S::Error::custom( @@ -148,13 +143,13 @@ impl<D> Serialize for ObjRef
<ObjectPointer<D>>
{ } } -impl<'de, D> Deserialize<'de> for ObjRef<ObjectPointer<D>> +impl<'de, D> serde::Deserialize<'de> for ObjRef<ObjectPointer<D>> where - ObjectPointer<D>: Deserialize<'de>, + ObjectPointer<D>: serde::Deserialize<'de>, { fn deserialize<E>(deserializer: E) -> Result<Self, E::Error> where - E: Deserializer<'de>, + E: serde::Deserializer<'de>, { ObjectPointer::<D>::deserialize(deserializer).map(ObjRef::Incomplete) } diff --git a/betree/src/data_management/mod.rs index 1e3dd3c65..da591826f 100644 --- a/betree/src/data_management/mod.rs +++ b/betree/src/data_management/mod.rs @@ -13,17 +13,18 @@ //! data blobs as in the [crate::object] module. use crate::{ + buffer::Buf, cache::AddSize, + checksum::{Builder, Checksum}, database::DatasetId, migration::DmlMsg, size::{Size, StaticSize}, - storage_pool::{DiskOffset, GlobalDiskId, StoragePoolLayer}, - tree::PivotKey, - vdev::Block, + storage_pool::StoragePoolLayer, + tree::{PivotKey, StorageKind}, StoragePreference, }; use parking_lot::Mutex; -use serde::{de::DeserializeOwned, Serialize}; +use serde::{de::DeserializeOwned, Deserialize, Serialize}; use stable_deref_trait::StableDeref; use std::{ collections::HashMap, @@ -108,15 +109,71 @@ pub trait HasStoragePreference { // fn flood_storage_preference(&self, pref: StoragePreference); } +/// Intermediary structure to prove that media constraints have been checked. +/// This is more of a hack since I don't want to pull apart the trait. +pub struct PreparePack(); + +/// Which integrity mode is used by the nodes. Can be used to skip the +/// processing of an entire node if it is not required to ensure integrity of +/// data. +#[derive(Serialize, Deserialize, Debug, Clone, Copy)] +pub enum IntegrityMode<C> { + /// The default mode. Checksums are stored with the object pointers. All + /// data is processed initially. + External, + /// Integrity is ensured by the node implementation itself. + Internal { csum: C, len: u32 }, +} + +impl<C: StaticSize> StaticSize for IntegrityMode<C> { + fn static_size() -> usize { + // FIXME: this only works if we abandon the other integrity mode + C::static_size() + std::mem::size_of::<u32>() + } +} + +impl<C> IntegrityMode<C> { + pub fn checksum(&self) -> Option<&C> { + match self { + IntegrityMode::Internal { csum, .. } => Some(csum), + _ => None, + } + } + + pub fn length(&self) -> Option<u32> { + match self { + IntegrityMode::Internal { len, .. } => Some(*len), + _ => None, + } + } +} + /// An object managed by a [Dml]. pub trait Object<R>: Size + Sized + HasStoragePreference { - /// Packs the object into the given `writer`. - fn pack<W: io::Write>(&self, writer: W) -> Result<(), io::Error>; + /// Informs the object about the kind of storage it will be placed upon. + /// This allows for optimizations within the node for different kinds of + /// storage media. + fn prepare_pack( + &mut self, + storage_kind: StorageKind, + pivot_key: &PivotKey, + ) -> Result<PreparePack, crate::data_management::Error> + where + R: ObjectReference; + + /// Packs the object into the given `writer`. Returns an option if the node + /// can be read with a subset of data starting from the start of the range. + fn pack<W: io::Write, F: Fn(&[u8]) -> C, C: Checksum>( + &self, + writer: W, + pp: PreparePack, + csum_builder: F, + ) -> Result<Option<IntegrityMode<C>>, io::Error>; /// Unpacks the object from the given `data`. - fn unpack_at( - disk_offset: DiskOffset, + fn unpack_at<C: Checksum>( d_id: DatasetId, - data: Box<[u8]>, + data: Buf, + integrity_mode: IntegrityMode<C>, ) -> Result<Self, io::Error>; /// Returns debug information about an object.
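The write-back path in dmu.rs above only computes a full checksum for `IntegrityMode::External`; an internally verified node stores an empty checksum and skips the extra pass over the buffer. A minimal, self-contained sketch of that decision, using toy stand-ins rather than the crate's `Checksum`/`Builder` machinery:

```rust
// Toy stand-ins, not the crate's types: External digests the whole compressed
// buffer, Internal defers verification to the node itself.
enum IntegrityMode {
    External,
    Internal { len: u32 },
}

// Returns Some(digest) only when the outer pass is actually required.
fn checksum_for_write(mode: &IntegrityMode, compressed: &[u8]) -> Option<u64> {
    match mode {
        // External: hash everything before it goes to disk (Haura uses GxHash;
        // a simple FNV-1a fold stands in here).
        IntegrityMode::External => Some(compressed.iter().fold(
            0xcbf2_9ce4_8422_2325u64,
            |h, b| (h ^ *b as u64).wrapping_mul(0x100_0000_01b3),
        )),
        // Internal: skip the pass; an "empty" checksum is stored instead.
        IntegrityMode::Internal { .. } => None,
    }
}

fn main() {
    let buf = [1u8, 2, 3, 4];
    assert!(checksum_for_write(&IntegrityMode::External, &buf).is_some());
    assert!(checksum_for_write(&IntegrityMode::Internal { len: 4 }, &buf).is_none());
}
```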
@@ -175,7 +232,7 @@ pub trait Dml: Sized { info: DatasetId, ) -> Result<Self::CacheValueRefMut, Error>; - /// Provides mutable access to the object + /// Provides mutable access to the object /// if this object is already mutable. fn try_get_mut(&self, or: &Self::ObjectRef) -> Option<Self::CacheValueRefMut>; @@ -215,7 +272,7 @@ pub trait Dml: Sized { fn prefetch(&self, or: &Self::ObjectRef) -> Result<Option<Self::Prefetch>, Error>; /// Finishes the prefetching. - fn finish_prefetch(&self, p: Self::Prefetch) -> Result<(), Error>; + fn finish_prefetch(&self, p: Self::Prefetch) -> Result<Self::CacheValueRef, Error>; /// Which format the cache statistics are represented in. For example a simple struct. type CacheStats: serde::Serialize; @@ -254,8 +311,10 @@ pub enum CopyOnWriteReason { /// Denotes if an implementor of the [Dml] can utilize an allocation handler. pub trait DmlWithHandler { + /// Precise type of handler used. type Handler; + /// Return the inner allocation handler. fn handler(&self) -> &Self::Handler; } diff --git a/betree/src/data_management/object_ptr.rs index 0dbbd6d13..c805f99c6 100644 --- a/betree/src/data_management/object_ptr.rs +++ b/betree/src/data_management/object_ptr.rs @@ -1,9 +1,9 @@ -use super::HasStoragePreference; +use super::{HasStoragePreference, IntegrityMode}; use crate::{ compression::DecompressionTag, database::{DatasetId, Generation}, size::StaticSize, - storage_pool::DiskOffset, + storage_pool::{DiskOffset, StoragePoolLayer}, vdev::Block, StoragePreference, }; @@ -16,6 +16,7 @@ pub struct ObjectPointer<D> { pub(super) checksum: D, pub(super) offset: DiskOffset, pub(super) size: Block<u32>, + pub(super) integrity_mode: IntegrityMode<D>, pub(super) info: DatasetId, pub(super) generation: Generation, } @@ -51,6 +52,7 @@ impl<D: StaticSize> StaticSize for ObjectPointer<D> { + Generation::static_size() + <D as StaticSize>::static_size() + Block::<u32>::static_size() + + std::mem::size_of::<IntegrityMode<D>>() } } @@ -67,6 +69,7 @@ impl<D> ObjectPointer<D> { pub fn offset(&self) -> DiskOffset { self.offset } + /// Get the size in blocks of the serialized object. pub fn size(&self) -> Block<u32> { self.size } @@ -80,4 +83,26 @@ impl<D> ObjectPointer<D> { pub fn info(&self) -> DatasetId { self.info } + + /// Instantiate the object. + pub fn fetch<SPL>( + &self, + pool: &SPL, + ) -> Result< + crate::tree::Node<super::impls::ObjRef<ObjectPointer<D>>>, + super::errors::Error, + > + where + SPL: StoragePoolLayer, + D: crate::size::StaticSize + crate::checksum::Checksum, + { + let mut decompression_state = self.decompression_tag().new_decompression()?; + let compressed_data = pool.read(self.size(), self.offset(), self.checksum.clone())?; + let data = decompression_state.decompress(compressed_data)?; + Ok(super::Object::unpack_at( + self.info(), + data, + self.integrity_mode.clone(), + )?)
+ } } diff --git a/betree/src/database/dataset.rs b/betree/src/database/dataset.rs index 76a43c124..cb951a2c5 100644 --- a/betree/src/database/dataset.rs +++ b/betree/src/database/dataset.rs @@ -1,7 +1,7 @@ use super::root_tree_msg::dataset; use super::{ errors::*, fetch_ds_data, Database, DatasetData, DatasetId, DatasetTree, Generation, - MessageTree, StorageInfo, RootDmu, + MessageTree, RootDmu, StorageInfo, }; use crate::{ cow_bytes::{CowBytes, SlicedCowBytes}, @@ -152,6 +152,7 @@ impl Database { Err(e) => return Err(e), }; let ds_id = self.allocate_ds_id()?; + let tree = DatasetTree::empty_tree( ds_id, DefaultMessageAction, @@ -174,7 +175,7 @@ impl Database { let mut key = vec![1]; key.extend(name); self.root_tree.insert( - key, + &key[..], DefaultMessageAction::insert_msg(&ds_id.pack()), StoragePreference::NONE, )?; @@ -616,6 +617,7 @@ impl Dataset { } #[cfg(feature = "internal-api")] + /// Fetch a node by it's pivot key. For testing purposes. pub fn test_get_node_pivot( &self, pk: &PivotKey, diff --git a/betree/src/database/errors.rs b/betree/src/database/errors.rs index 4dcee1ca7..b628354c8 100644 --- a/betree/src/database/errors.rs +++ b/betree/src/database/errors.rs @@ -17,7 +17,7 @@ pub enum Error { #[from] source: crate::storage_pool::Error, }, - #[error("A tree operation encountered an error. This is likely an internal error.")] + #[error("TreeError: {source}")] TreeError { #[from] source: crate::tree::Error, @@ -50,17 +50,20 @@ pub enum Error { DoesNotExist, #[error("Dataset name already occupied. Try to `.open()` the dataset instead.")] AlreadyExists, - // TODO: This should anyway not happen, as there are no problems occuring - // anymore when two instances are opened. Remove? #[error("Given dataset is already in use. Try to close another instance first before opening a new one.")] InUse, #[error("Message surpasses the maximum length. If you cannot shrink your value, use an object store instead.")] MessageTooLarge, - #[error("Could not serialize the given data. This is an internal error.")] + #[error("Could not serialize the given data. This is an internal error. Backtrace: {source}")] SerializeFailed { #[from] source: serde_json::Error, }, + #[error("Could not deserialize the given data. Backtrace: {source}")] + YamlConfigFailed { + #[from] + source: serde_yaml::Error, + }, #[error("Migration is not possible as {1:?} blocks are not available in tier {0}.")] MigrationWouldExceedStorage(u8, Block), #[error("Migration is not possible as the given tier does not exist.")] diff --git a/betree/src/database/handler.rs b/betree/src/database/handler.rs index 71bbbcab6..4e2ffdc9d 100644 --- a/betree/src/database/handler.rs +++ b/betree/src/database/handler.rs @@ -107,16 +107,19 @@ impl<'a> SegmentAllocatorGuard<'a> { } impl Handler { + /// Return current generation. pub fn current_generation(&self) -> Generation { self.current_generation.read() } + /// Push alloc or dealloc messages to the current in-memory allocation + /// bitmap or message queue for synchronization. pub fn update_allocation_bitmap( &self, offset: DiskOffset, size: Block, action: Action, - dmu: &X, + _dmu: &X, ) -> Result<()> where X: Dml, ObjectRef = OR, ObjectPointer = OR::ObjectPointer>, @@ -159,6 +162,7 @@ impl Handler { Ok(()) } + /// Fetch and return segment bitmap from cache or disk. 
pub fn get_allocation_bitmap(&self, id: SegmentId, dmu: &X) -> Result where X: Dml, ObjectRef = OR, ObjectPointer = OR::ObjectPointer>, @@ -210,10 +214,12 @@ impl Handler { Ok(SegmentAllocatorGuard { inner: foo, id }) } + /// Return space information of a single disk. pub fn free_space_disk(&self, disk_id: GlobalDiskId) -> Option { self.free_space.get(&disk_id).map(|elem| elem.into()) } + /// Return space information of a single tier. pub fn free_space_tier(&self, class: u8) -> Option { self.free_space_tier .get(class as usize) diff --git a/betree/src/database/mod.rs b/betree/src/database/mod.rs index bc9f37e0f..b32788476 100644 --- a/betree/src/database/mod.rs +++ b/betree/src/database/mod.rs @@ -5,9 +5,7 @@ use crate::{ checksum::GxHash, compression::CompressionConfiguration, cow_bytes::SlicedCowBytes, - data_management::{ - self, Dml, DmlWithHandler, DmlWithReport, DmlWithStorageHints, Dmu, TaggedCacheValue, - }, + data_management::{self, Dml, DmlWithReport, DmlWithStorageHints, Dmu, TaggedCacheValue}, metrics::{metrics_init, MetricsConfiguration}, migration::{DatabaseMsg, DmlMsg, GlobalObjectId, MigrationPolicies}, size::StaticSize, @@ -69,7 +67,7 @@ const DEFAULT_SYNC_INTERVAL_MS: u64 = 1000; // This is the hash used overall in the entire database. For reconfiguration // recompilation is necessary and this type changed. -type Checksum = GxHash; +pub(crate) type Checksum = GxHash; type ObjectPointer = data_management::ObjectPointer; pub(crate) type ObjectRef = data_management::impls::ObjRef; @@ -107,8 +105,11 @@ pub enum AccessMode { pub enum SyncMode { /// No automatic sync, only on user call Explicit, - /// Every `interval_ms` milliseconds, sync is called - Periodic { interval_ms: u64 }, + /// Repeatedly call sync, wall clock dependent. + Periodic { + /// Every `interval_ms` milliseconds, sync is called + interval_ms: u64, + }, } /// A bundle type of component configuration types, used during [Database::build] @@ -134,7 +135,7 @@ pub struct DatabaseConfiguration { pub default_storage_class: u8, /// Which compression type to use, and the type-specific compression parameters pub compression: CompressionConfiguration, - /// Size of cache in TODO + /// Size of cache in bytes pub cache_size: usize, /// Whether to check for and open an existing database, or overwrite it pub access_mode: AccessMode, @@ -184,10 +185,15 @@ impl DatabaseConfiguration { } impl DatabaseConfiguration { + /// Create new [StoragePoolUnit] instance. This is the first step of the DB initialization. pub fn new_spu(&self) -> Result { - Ok(StoragePoolUnit::::new(&self.storage)?) + Ok(StoragePoolUnit::::new( + &self.storage, + self.default_storage_class, + )?) } + /// Create new [Handler] instance. This is the second step of the DB initialization. pub fn new_handler(&self, spu: &RootSpu) -> DbHandler { Handler { root_tree_inner: AtomicOption::new(), @@ -218,6 +224,7 @@ impl DatabaseConfiguration { } } + /// Create a new [Dmu] instance. This is the third step of the DB initialization. 
pub fn new_dmu(&self, spu: RootSpu, handler: DbHandler) -> RootDmu { let mut strategy: [[Option; NUM_STORAGE_CLASSES]; NUM_STORAGE_CLASSES] = [[None; NUM_STORAGE_CLASSES]; NUM_STORAGE_CLASSES]; @@ -679,8 +686,22 @@ impl DeadListData { /// Internal identifier for a dataset #[derive( - Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize, + Debug, + Default, + Clone, + Copy, + PartialEq, + Eq, + PartialOrd, + Ord, + Hash, + Serialize, + Deserialize, + rkyv::Archive, + rkyv::Serialize, + rkyv::Deserialize, )] +#[archive(check_bytes)] pub struct DatasetId(u64); use std::fmt::Display; @@ -706,6 +727,7 @@ impl DatasetId { DatasetId(self.0 + 1) } + /// Return the raw integer used as ID. pub fn as_u64(&self) -> u64 { self.0 } @@ -774,7 +796,22 @@ impl DatasetData
<P>
{ } /// Internal identifier of a generation -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + PartialOrd, + Ord, + Hash, + Serialize, + Deserialize, + rkyv::Archive, + rkyv::Serialize, + rkyv::Deserialize, +)] +#[archive(check_bytes)] pub struct Generation(u64); impl StaticSize for Generation { diff --git a/betree/src/database/snapshot.rs b/betree/src/database/snapshot.rs index 5e65cf9e3..e87147300 100644 --- a/betree/src/database/snapshot.rs +++ b/betree/src/database/snapshot.rs @@ -1,12 +1,11 @@ use super::{ dataset::Dataset, errors::*, fetch_ds_data, fetch_ss_data, root_tree_msg::dataset, root_tree_msg::deadlist, root_tree_msg::snapshot, Database, DatasetData, DatasetId, - DatasetTree, DeadListData, Generation, ObjectPointer, RootDmu + DatasetTree, DeadListData, Generation, ObjectPointer, RootDmu, }; use crate::{ allocator::Action, cow_bytes::{CowBytes, SlicedCowBytes}, - data_management::DmlWithHandler, tree::{DefaultMessageAction, Tree, TreeLayer}, StoragePreference, }; diff --git a/betree/src/database/storage_info.rs b/betree/src/database/storage_info.rs index b14526393..46bdbf437 100644 --- a/betree/src/database/storage_info.rs +++ b/betree/src/database/storage_info.rs @@ -32,6 +32,7 @@ impl StorageInfo { ) } + /// Returns the amount of blocks needed to fill the storage space to the given threshold (0 <= t <= 1). pub fn blocks_until_filled_to(&self, threshold: f32) -> Block { let threshold = threshold.clamp(0.0, 1.0); Block( diff --git a/betree/src/metrics/mod.rs b/betree/src/metrics/mod.rs index 5e7010143..54b584536 100644 --- a/betree/src/metrics/mod.rs +++ b/betree/src/metrics/mod.rs @@ -1,7 +1,7 @@ //! A naive metrics system, logging newline-delimited JSON to a configurable file. use crate::{ - data_management::{Dml, DmlWithHandler}, + data_management::Dml, database::{RootDmu, StorageInfo}, storage_pool::{StoragePoolLayer, NUM_STORAGE_CLASSES}, }; diff --git a/betree/src/migration/mod.rs b/betree/src/migration/mod.rs index 680dcf161..cf439a098 100644 --- a/betree/src/migration/mod.rs +++ b/betree/src/migration/mod.rs @@ -82,8 +82,8 @@ use serde::{Deserialize, Serialize}; use std::{collections::HashMap, sync::Arc}; use crate::{ - data_management::DmlWithHandler, database::RootDmu, storage_pool::NUM_STORAGE_CLASSES, - tree::PivotKey, vdev::Block, Database, StoragePreference, + database::RootDmu, storage_pool::NUM_STORAGE_CLASSES, tree::PivotKey, vdev::Block, Database, + StoragePreference, }; use self::{lfu::Lfu, reinforcment_learning::ZhangHellanderToor}; diff --git a/betree/src/migration/reinforcment_learning.rs b/betree/src/migration/reinforcment_learning.rs index 3bff0af15..799933e17 100644 --- a/betree/src/migration/reinforcment_learning.rs +++ b/betree/src/migration/reinforcment_learning.rs @@ -3,7 +3,7 @@ use parking_lot::RwLock; use crate::{ cow_bytes::CowBytes, - data_management::{DmlWithHandler, DmlWithStorageHints}, + data_management::DmlWithStorageHints, database::{RootDmu, StorageInfo}, object::{ObjectStore, ObjectStoreId}, vdev::Block, diff --git a/betree/src/object/mod.rs b/betree/src/object/mod.rs index 1e8588fe2..9cf090afe 100644 --- a/betree/src/object/mod.rs +++ b/betree/src/object/mod.rs @@ -303,6 +303,7 @@ impl Database { } /// Create a namespaced object store, with the datasets "{name}\0data" and "{name}\0meta". + /// This method will open a block style object store. 
pub fn open_named_object_store( &mut self, name: &[u8], @@ -646,7 +647,7 @@ impl<'os> ObjectStore { Ok(Box::new(iter)) } - fn read_object_info(&'os self, key: &[u8]) -> Result<Option<ObjectInfo>> { + pub fn read_object_info(&'os self, key: &[u8]) -> Result<Option<ObjectInfo>> { if let Some(meta) = self.metadata.get(key)? { Ok(Some( ObjectInfo::read_from_buffer_with_ctx(meta::ENDIAN, &meta).unwrap(), @@ -757,7 +758,6 @@ impl<'ds> ObjectHandle<'ds> { )) .map_err(|_| warn!("Channel Receiver has been dropped.")); } - // no-op for now Ok(()) } diff --git a/betree/src/size.rs index 4fc5f69e8..66e875482 100644 --- a/betree/src/size.rs +++ b/betree/src/size.rs @@ -5,18 +5,22 @@ use parking_lot::RwLock; -/// A trait which represents an serializable object -/// that can quickly calculate the size of it's -/// [`bincode`](../../bincode/index.html) representation. +/// A trait which represents a serializable object that can quickly calculate +/// the size of its [`bincode`](../../bincode/index.html) representation and +/// the current size occupied in memory. pub trait Size { /// Returns the size (number of bytes) that this object would have /// if serialized using [`bincode`](../../bincode/index.html). fn size(&self) -> usize; + /// Return the possibly recomputed size of the current state of the + /// object. fn actual_size(&self) -> Option<usize> { None } + /// Return and verify the serialized size of the object based on + /// [Size::size] and [Size::actual_size]. fn checked_size(&self) -> Result<usize, (usize, usize)> { match (self.size(), self.actual_size()) { (predicted, Some(actual)) if predicted == actual => Ok(actual), @@ -24,6 +28,11 @@ pub trait Size { (predicted, None) => Ok(predicted), } } + + /// Current memory footprint of an object. + fn cache_size(&self) -> usize { + self.size() + } } /// A trait which represents an serializable object @@ -33,6 +42,9 @@ pub trait SizeMut { /// Returns the size (number of bytes) that this object would have /// if serialized using [`bincode`](../../bincode/index.html). fn size(&mut self) -> usize; + + /// Current memory footprint of an object. + fn cache_size(&mut self) -> usize; } /// A trait which represents an serializable object @@ -54,6 +66,11 @@ impl<T: Size> SizeMut for T { fn size(&mut self) -> usize { Size::size(self) } + + /// Current memory footprint of an object. + fn cache_size(&mut self) -> usize { + Size::cache_size(self) + } } impl Size for T { @@ -66,4 +83,9 @@ impl<T: SizeMut> SizeMut for RwLock<T> { fn size(&mut self) -> usize { self.get_mut().size() } + + /// Current memory footprint of an object. + fn cache_size(&mut self) -> usize { + self.get_mut().cache_size() + } } diff --git a/betree/src/storage_pool/configuration.rs index 67de854f8..b62986864 100644 --- a/betree/src/storage_pool/configuration.rs +++ b/betree/src/storage_pool/configuration.rs @@ -2,7 +2,10 @@ #[cfg(feature = "nvm")] use pmdk; -use crate::vdev::{self, Dev, Leaf}; +use crate::{ + tree::StorageKind, + vdev::{self, Dev, Leaf}, +}; use itertools::Itertools; use libc; use serde::{Deserialize, Serialize}; @@ -71,6 +74,8 @@ pub struct TierConfiguration { /// Which storage access is preferred to be used with this tier. See /// [PreferredAccessType] for all variants. pub preferred_access_type: PreferredAccessType, + /// Which medium this layer is made of. + pub storage_kind: StorageKind, } /// Configuration for the storage pool unit.
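The `cache_size` additions above separate on-disk serialized size from in-memory footprint: the default falls back to `size()`, and the DMU's cache accounting (the `obj.cache_size()` calls in dmu.rs) relies on overrides wherever the two diverge. A minimal sketch with a simplified trait, not the crate's exact definitions:

```rust
// Simplified version of the Size/cache_size split: serialized size and
// in-memory footprint can differ, e.g. for nodes keeping unpacked buffers.
trait Size {
    /// Serialized (on-disk) size in bytes.
    fn size(&self) -> usize;
    /// In-memory footprint; defaults to the serialized size.
    fn cache_size(&self) -> usize {
        self.size()
    }
}

struct PackedBlob {
    on_disk: Vec<u8>,   // serialized form
    unpacked: Vec<u64>, // decoded representation kept alive in the cache
}

impl Size for PackedBlob {
    fn size(&self) -> usize {
        self.on_disk.len()
    }
    // The cache must account for the decoded data, not just the packed bytes.
    fn cache_size(&self) -> usize {
        self.on_disk.len() + self.unpacked.len() * std::mem::size_of::<u64>()
    }
}

fn main() {
    let b = PackedBlob { on_disk: vec![0; 16], unpacked: vec![0; 4] };
    assert_eq!(b.size(), 16);
    assert_eq!(b.cache_size(), 16 + 32);
}
```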
@@ -157,6 +162,7 @@ impl TierConfiguration { TierConfiguration { top_level_vdevs, preferred_access_type: PreferredAccessType::Unknown, + storage_kind: StorageKind::Hdd, } } @@ -207,6 +213,7 @@ impl TierConfiguration { Ok(TierConfiguration { top_level_vdevs: v, preferred_access_type: PreferredAccessType::Unknown, + storage_kind: StorageKind::Hdd, }) } @@ -252,6 +259,7 @@ impl FromIterator for TierConfiguration { TierConfiguration { top_level_vdevs: iter.into_iter().collect(), preferred_access_type: PreferredAccessType::Unknown, + storage_kind: StorageKind::Hdd, } } } @@ -318,39 +326,25 @@ impl LeafVdev { format!("memory-{mem}"), )?)), #[cfg(feature = "nvm")] - LeafVdev::PMemFile { .. } => { - let (path, len) = match self { - LeafVdev::File(path) => unreachable!(), - LeafVdev::FileWithOpts { .. } => unreachable!(), - LeafVdev::Memory { .. } => unreachable!(), - LeafVdev::PMemFile { path, len } => (path, len), - }; - - let mut file = match path.to_str() { - Some(filepath_str) => { - match pmdk::PMem::open(format!("{}\0", filepath_str).as_str()) { - Ok(handle) => handle, - Err(e) => match pmdk::PMem::create( - format!("{}\0", filepath_str).as_str(), - *len, - ) { - Ok(handle) => handle, - Err(e) => { - return Err(io::Error::new(io::ErrorKind::Other, - format!("Failed to create or open handle for pmem file. Path: {}", filepath_str))); - } - }, + LeafVdev::PMemFile { ref path, len } => { + let file = match pmdk::PMem::open(path) { + Ok(handle) => handle, + Err(open_err) => match pmdk::PMem::create(path, len) { + Ok(handle) => handle, + Err(create_err) => { + return Err(io::Error::new( + io::ErrorKind::Other, + format!( + "Failed to create or open handle for pmem file. Path: {} - Open Error {} -Create Error {}", + path.display(), + open_err, + create_err, + ), + )); } - } - None => { - return Err(io::Error::new( - io::ErrorKind::Other, - format!("Invalid file path: {:?}", path), - )); - } + }, }; - - if file.len() != *len { + if file.len() != len { return Err(io::Error::new(io::ErrorKind::Other, format!("The file already exists with a different length. Provided length: {}, File's length: {}", len, file.len()))); diff --git a/betree/src/storage_pool/disk_offset.rs b/betree/src/storage_pool/disk_offset.rs index 948a0f8fb..11671c4f5 100644 --- a/betree/src/storage_pool/disk_offset.rs +++ b/betree/src/storage_pool/disk_offset.rs @@ -4,9 +4,32 @@ use std::{fmt, mem}; /// 2-bit storage class, 10-bit disk ID, 52-bit block offset (see /// [`BLOCK_SIZE`](../vdev/constant.BLOCK_SIZE.html)) -#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive( + Clone, + Copy, + PartialEq, + Eq, + Hash, + Serialize, + Deserialize, + rkyv::Archive, + rkyv::Serialize, + rkyv::Deserialize, +)] +#[archive(check_bytes)] pub struct DiskOffset(u64); +impl std::fmt::Display for DiskOffset { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_fmt(format_args!( + "Offset({},{},{})", + self.storage_class(), + self.disk_id(), + self.block_offset() + )) + } +} + const MASK_STORAGE_CLASS: u64 = ((1 << 2) - 1) << (10 + 52); const MASK_DISK_ID: u64 = ((1 << 10) - 1) << 52; const MASK_OFFSET: u64 = (1 << 52) - 1; @@ -65,7 +88,7 @@ impl DiskOffset { DiskOffset(x) } - // Glue together a class identifier with a class depdendent disk_id. + /// Glue together a class identifier with a class depdendent disk_id. 
pub fn construct_disk_id(class: u8, disk_id: u16) -> GlobalDiskId { GlobalDiskId(((class as u16) << 10) | disk_id) } diff --git a/betree/src/storage_pool/mod.rs b/betree/src/storage_pool/mod.rs index 66bbe0f60..cafdba748 100644 --- a/betree/src/storage_pool/mod.rs +++ b/betree/src/storage_pool/mod.rs @@ -5,6 +5,7 @@ use crate::{ buffer::Buf, checksum::Checksum, + tree::StorageKind, vdev::{Block, Error as VdevError, Result as VdevResult}, }; use futures::{executor::block_on, prelude::*, TryFuture}; @@ -32,7 +33,10 @@ pub trait StoragePoolLayer: Clone + Send + Sync + 'static { type Metrics: Serialize; /// Constructs a new object using the given `Configuration`. - fn new(configuration: &Self::Configuration) -> StoragePoolResult; + fn new( + configuration: &Self::Configuration, + default_storage_class: u8, + ) -> StoragePoolResult; /// Reads `size` blocks from the given `offset`. fn read( @@ -96,6 +100,12 @@ pub trait StoragePoolLayer: Clone + Send + Sync + 'static { /// Return a fitting [StoragePreference] to the given [PreferredAccessType]. fn access_type_preference(&self, t: PreferredAccessType) -> StoragePreference; + + /// Get list of storage kinds divided by tier. + fn storage_kind_map(&self) -> [StorageKind; NUM_STORAGE_CLASSES]; + + /// Get default storage class. + fn default_storage_class(&self) -> u8; } mod disk_offset; diff --git a/betree/src/storage_pool/storage_preference.rs b/betree/src/storage_pool/storage_preference.rs index 78199f95d..be5fcb8a2 100644 --- a/betree/src/storage_pool/storage_preference.rs +++ b/betree/src/storage_pool/storage_preference.rs @@ -1,4 +1,3 @@ -use serde::{Deserialize, Serialize}; use speedy::{Readable, Writable}; use std::{ cmp, @@ -27,7 +26,22 @@ const SLOWEST: u8 = 3; /// /// This type is not an `Option`, because it saves one byte per value, and allows the /// implementation of convenience methods on itself. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Readable, Writable)] +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + Hash, + serde::Serialize, + serde::Deserialize, + Readable, + Writable, + rkyv::Archive, + rkyv::Serialize, + rkyv::Deserialize, +)] +#[archive(check_bytes)] #[repr(transparent)] pub struct StoragePreference(u8); impl StoragePreference { @@ -113,7 +127,10 @@ impl PartialOrd for StoragePreference { } } -#[derive(Debug, Serialize, Deserialize)] +#[derive( + Debug, serde::Serialize, serde::Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize, +)] +#[archive(check_bytes)] /// An atomic version of [StoragePreference], replacing a RwLock> by /// using the additional variant "Unknown" in place of None. pub struct AtomicStoragePreference(AtomicU8); @@ -206,8 +223,11 @@ impl Default for AtomicStoragePreference { /// automated migration policy, in contrast to the lower bound by /// [StoragePreference]. Acts as a neutral element when set to /// `None`. 
-#[derive(Debug, Serialize, Deserialize)] -pub(crate) struct AtomicSystemStoragePreference(AtomicU8); +#[derive( + Debug, serde::Serialize, serde::Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize, +)] +#[archive(check_bytes)] +pub struct AtomicSystemStoragePreference(AtomicU8); impl Clone for AtomicSystemStoragePreference { fn clone(&self) -> Self { diff --git a/betree/src/storage_pool/unit.rs b/betree/src/storage_pool/unit.rs index 13e7373e0..71c0236fd 100644 --- a/betree/src/storage_pool/unit.rs +++ b/betree/src/storage_pool/unit.rs @@ -6,6 +6,7 @@ use crate::{ bounded_future_queue::BoundedFutureQueue, buffer::Buf, checksum::Checksum, + tree::StorageKind, vdev::{self, Block, Dev, Error as VdevError, Vdev, VdevRead, VdevWrite}, PreferredAccessType, StoragePreference, }; @@ -31,6 +32,7 @@ pub(super) type WriteBackQueue = BoundedFutureQueue< struct StorageTier { devs: Box<[Dev]>, preferred_access_type: PreferredAccessType, + kind: StorageKind, } impl StorageTier { @@ -56,15 +58,17 @@ impl Default for StorageTier { Self { devs: Box::new([]), preferred_access_type: PreferredAccessType::Unknown, + kind: StorageKind::Hdd, } } } -impl From<(Box<[Dev]>, PreferredAccessType)> for StorageTier { - fn from(item: (Box<[Dev]>, PreferredAccessType)) -> Self { +impl From<(Box<[Dev]>, PreferredAccessType, StorageKind)> for StorageTier { + fn from(item: (Box<[Dev]>, PreferredAccessType, StorageKind)) -> Self { Self { devs: item.0, preferred_access_type: item.1, + kind: item.2, } } } @@ -74,6 +78,8 @@ struct Inner { _check: PhantomData>, write_back_queue: WriteBackQueue, pool: ThreadPool, + cfg: StoragePoolConfiguration, + default_storage_class: u8, } impl Inner { @@ -87,16 +93,18 @@ impl StoragePoolLayer for StoragePoolUnit { type Configuration = StoragePoolConfiguration; type Metrics = StoragePoolMetrics; - fn new(configuration: &Self::Configuration) -> StoragePoolResult { + fn new( + configuration: &Self::Configuration, + default_storage_class: u8, + ) -> StoragePoolResult { let tiers: [StorageTier; NUM_STORAGE_CLASSES] = { let mut vec: Vec = configuration .tiers .iter() .map(|tier_cfg| { - tier_cfg - .build() - .map(Vec::into_boxed_slice) - .map(|tier| (tier, tier_cfg.preferred_access_type).into()) + tier_cfg.build().map(Vec::into_boxed_slice).map(|tier| { + (tier, tier_cfg.preferred_access_type, tier_cfg.storage_kind).into() + }) }) .collect::, _>>()?; @@ -111,6 +119,8 @@ impl StoragePoolLayer for StoragePoolUnit { let queue_depth = configuration.queue_depth_factor as usize * devices_len; Ok(StoragePoolUnit { inner: Arc::new(Inner { + cfg: configuration.clone(), + default_storage_class, tiers, _check: PhantomData::default(), write_back_queue: BoundedFutureQueue::new(queue_depth), @@ -165,9 +175,6 @@ impl StoragePoolLayer for StoragePoolUnit { .by_offset(offset) .write(data, offset.block_offset()) .await; - - // TODO: what about multiple writes to same offset? 
- // NOTE: This is currently covered in the tests and fails as expected inner.write_back_queue.mark_completed(&offset).await; res })?; @@ -268,6 +275,18 @@ impl StoragePoolLayer for StoragePoolUnit { } StoragePreference::NONE } + + fn storage_kind_map(&self) -> [StorageKind; NUM_STORAGE_CLASSES] { + let mut map = [StorageKind::default(); NUM_STORAGE_CLASSES]; + for idx in 0..NUM_STORAGE_CLASSES { + map[idx] = self.inner.tiers[idx].kind; + } + map + } + + fn default_storage_class(&self) -> u8 { + self.inner.default_storage_class + } } #[derive(serde::Serialize)] diff --git a/betree/src/tree/default_message_action.rs b/betree/src/tree/default_message_action.rs index 58e43c576..41df3d3ed 100644 --- a/betree/src/tree/default_message_action.rs +++ b/betree/src/tree/default_message_action.rs @@ -53,7 +53,7 @@ impl MsgType { 0 => Self::OverwriteNone, 1 => Self::OverwriteSome, 2 => Self::Upsert, - _ => unreachable!(), + _ => panic!("discriminant was {} which is not possible", discriminant), } } } @@ -196,8 +196,12 @@ impl DefaultMessageAction { let mut n_upserts = 0; let mut data = msg_data - .as_ref() - .map(|b| CowBytes::from(&b[..])) + .take() + // NOTE: Check if we can extract a direct copy from a sliced cowbytes directly iff the starting offset is 0. This can happen on repeated insertions in objects for message sizes <128K + .map(|b| { + b.into_cow_bytes() + .unwrap_or_else(|b| CowBytes::from(&b[..])) + }) .unwrap_or_default(); for upsert in upserts { @@ -212,6 +216,7 @@ impl DefaultMessageAction { if data.len() <= offset_bytes as usize { data.fill_zeros_up_to(offset_bytes as usize); + // FIXME: This line takes alot of time in profiles. data.push_slice(new_data); } else { data.fill_zeros_up_to(end_offset); @@ -237,7 +242,7 @@ impl DefaultMessageAction { } } - if n_upserts > 8 { + if n_upserts > 1024 { log::warn!("Applied {} upserts", n_upserts); } *msg_data = Some(data.into()); diff --git a/betree/src/tree/errors.rs b/betree/src/tree/errors.rs index 79cc95417..6a07d0083 100644 --- a/betree/src/tree/errors.rs +++ b/betree/src/tree/errors.rs @@ -3,7 +3,7 @@ use thiserror::Error; #[derive(Error, Debug)] pub enum Error { - #[error("Storage operation could not be performed")] + #[error("Storage operation could not be performed {source}")] DmuError { #[from] source: crate::data_management::Error, diff --git a/betree/src/tree/imp/child_buffer.rs b/betree/src/tree/imp/child_buffer.rs deleted file mode 100644 index ff579f10d..000000000 --- a/betree/src/tree/imp/child_buffer.rs +++ /dev/null @@ -1,404 +0,0 @@ -//! Implementation of a message buffering node wrapper. -//! -//! Encapsulating common nodes like [super::internal::InternalNode] and -//! [super::leaf::LeafNode]. -use crate::{ - cow_bytes::{CowBytes, SlicedCowBytes}, - data_management::{HasStoragePreference, ObjectReference}, - size::{Size, StaticSize}, - storage_pool::AtomicSystemStoragePreference, - tree::{pivot_key::LocalPivotKey, KeyInfo, MessageAction, PivotKey}, - AtomicStoragePreference, StoragePreference, -}; -use parking_lot::RwLock; -use serde::{Deserialize, Serialize}; -use std::{ - borrow::Borrow, - collections::{btree_map::Entry, BTreeMap, Bound}, - mem::replace, -}; - -/// A buffer for messages that belong to a child of a tree node. 
-#[derive(Debug, Serialize, Deserialize)] -#[serde(bound(serialize = "N: Serialize", deserialize = "N: Deserialize<'de>"))] -pub(super) struct ChildBuffer { - pub(super) messages_preference: AtomicStoragePreference, - #[serde(skip)] - pub(super) system_storage_preference: AtomicSystemStoragePreference, - buffer_entries_size: usize, - pub(super) buffer: BTreeMap, - #[serde(with = "ser_np")] - pub(super) node_pointer: RwLock, -} - -impl Size for (KeyInfo, SlicedCowBytes) { - fn size(&self) -> usize { - let (_keyinfo, data) = self; - KeyInfo::static_size() + data.size() - } -} - -impl HasStoragePreference for ChildBuffer { - fn current_preference(&self) -> Option { - self.messages_preference - .as_option() - .map(|msg_pref| { - StoragePreference::choose_faster( - msg_pref, - self.node_pointer.read().correct_preference(), - ) - }) - .map(|p| self.system_storage_preference.weak_bound(&p)) - } - - fn recalculate(&self) -> StoragePreference { - let mut pref = StoragePreference::NONE; - - for (keyinfo, _v) in self.buffer.values() { - pref.upgrade(keyinfo.storage_preference) - } - - self.messages_preference.set(pref); - - // pref can't be lower than that of child nodes - StoragePreference::choose_faster(pref, self.node_pointer.read().correct_preference()) - } - - fn system_storage_preference(&self) -> StoragePreference { - self.system_storage_preference.borrow().into() - } - - fn set_system_storage_preference(&mut self, pref: StoragePreference) { - self.system_storage_preference.set(pref) - } -} - -impl ChildBuffer { - /// Access the pivot key of the underlying object reference and update it to - /// reflect a structural change in the tree. - pub fn update_pivot_key(&mut self, lpk: LocalPivotKey) { - let or = self.node_pointer.get_mut(); - let d_id = or.index().d_id(); - or.set_index(lpk.to_global(d_id)); - } - - /// Insert an arbitrary PivotKey into the `ObjectReference`. - /// - /// FIXME: This is best replaced with actual type exclusion. - pub fn complete_object_ref(&mut self, pk: PivotKey) { - self.node_pointer.get_mut().set_index(pk) - } -} - -mod ser_np { - //! Serialization utilities of a node pointer type. - use super::RwLock; - use serde::{Deserialize, Deserializer, Serialize, Serializer}; - - pub fn serialize(np: &RwLock, serializer: S) -> Result - where - N: Serialize, - S: Serializer, - { - np.read().serialize(serializer) - } - - pub fn deserialize<'de, N, D>(deserializer: D) -> Result, D::Error> - where - N: Deserialize<'de>, - D: Deserializer<'de>, - { - N::deserialize(deserializer).map(RwLock::new) - } -} - -impl Size for ChildBuffer { - fn size(&self) -> usize { - Self::static_size() + self.buffer_entries_size + N::static_size() - } - - fn actual_size(&self) -> Option { - Some( - Self::static_size() - + N::static_size() - + self - .buffer - .iter() - .map(|(key, msg)| key.size() + msg.size()) - .sum::(), - ) - } -} - -impl ChildBuffer { - pub fn static_size() -> usize { - 17 - } - - pub fn buffer_size(&self) -> usize { - self.buffer_entries_size - } - - /// Returns whether there is no message in this buffer for the given `key`. - pub fn is_empty(&self, key: &[u8]) -> bool { - !self.buffer.contains_key(key) - } - - pub fn get(&self, key: &[u8]) -> Option<&(KeyInfo, SlicedCowBytes)> { - self.buffer.get(key) - } - - pub fn apply_with_info(&mut self, key: &[u8], pref: StoragePreference) -> Option<()> { - self.buffer.get_mut(key).map(|(keyinfo, _bytes)| { - keyinfo.storage_preference = pref; - }) - } -} - -impl ChildBuffer { - /// Returns an iterator over all messages. 
- pub fn get_all_messages( - &self, - ) -> impl Iterator + '_ { - self.buffer.iter().map(|(key, msg)| (key, msg)) - } - - /// Takes the message buffer out this `ChildBuffer`, - /// leaving an empty one in its place. - pub fn take(&mut self) -> (BTreeMap, usize) { - self.messages_preference.invalidate(); - ( - std::mem::take(&mut self.buffer), - replace(&mut self.buffer_entries_size, 0), - ) - } - - pub fn append(&mut self, other: &mut Self) { - self.buffer.append(&mut other.buffer); - self.buffer_entries_size += other.buffer_entries_size; - self.messages_preference - .upgrade_atomic(&other.messages_preference); - } - - /// Splits this `ChildBuffer` at `pivot` - /// so that `self` contains all entries up to (and including) `pivot_key` - /// and the returned `Self` contains the other entries and `node_pointer`. - pub fn split_at(&mut self, pivot: &CowBytes, node_pointer: N) -> Self { - let (buffer, buffer_entries_size) = self.split_off(pivot); - ChildBuffer { - messages_preference: AtomicStoragePreference::unknown(), - buffer, - buffer_entries_size, - node_pointer: RwLock::new(node_pointer), - system_storage_preference: AtomicSystemStoragePreference::from(StoragePreference::NONE), - } - } - - fn split_off( - &mut self, - pivot: &CowBytes, - ) -> (BTreeMap, usize) { - // `split_off` puts the split-key into the right buffer. - let mut next_key = pivot.to_vec(); - next_key.push(0); - let right_buffer = self.buffer.split_off(&next_key[..]); - self.messages_preference.invalidate(); - - let right_entry_size = right_buffer - .iter() - .map(|(key, value)| key.size() + value.size()) - .sum(); - self.buffer_entries_size -= right_entry_size; - (right_buffer, right_entry_size) - } - - pub fn rebalance(&mut self, right_sibling: &mut Self, new_pivot_key: &CowBytes) { - self.append(right_sibling); - let (buffer, buffer_entries_size) = self.split_off(new_pivot_key); - right_sibling.buffer = buffer; - right_sibling.buffer_entries_size = buffer_entries_size; - } - - /// Inserts a message to this buffer for the given `key`. - pub fn insert( - &mut self, - key: Q, - keyinfo: KeyInfo, - msg: SlicedCowBytes, - msg_action: M, - ) -> isize - where - Q: Borrow<[u8]> + Into, - M: MessageAction, - { - let key = key.into(); - let key_size = key.size(); - - self.messages_preference.upgrade(keyinfo.storage_preference); - - match self.buffer.entry(key.clone()) { - Entry::Vacant(e) => { - let size_delta = key_size + msg.size() + keyinfo.size(); - e.insert((keyinfo, msg)); - self.buffer_entries_size += size_delta; - size_delta as isize - } - Entry::Occupied(mut e) => { - let lower = e.get_mut().clone(); - let (_, lower_msg) = lower; - let lower_size = lower_msg.size(); - let merged_msg = msg_action.merge(&key, msg, lower_msg); - let merged_msg_size = merged_msg.size(); - e.get_mut().1 = merged_msg; - self.buffer_entries_size -= lower_size; - self.buffer_entries_size += merged_msg_size; - merged_msg_size as isize - lower_size as isize - } - } - } - - /// Constructs a new, empty buffer. 
- pub fn new(node_pointer: N) -> Self { - ChildBuffer { - messages_preference: AtomicStoragePreference::known(StoragePreference::NONE), - buffer: BTreeMap::new(), - buffer_entries_size: 0, - node_pointer: RwLock::new(node_pointer), - system_storage_preference: AtomicSystemStoragePreference::from(StoragePreference::NONE), - } - } -} - -impl ChildBuffer { - pub fn range_delete(&mut self, start: &[u8], end: Option<&[u8]>) -> usize { - // Context: Previously we mentioned the usage of a drain filter here and - // linked to an existing issue of how it is missing from the standard - // library. - // - // Adding a drain filter here would make things easier from the code - // perspective, but with the generic predicate, we cannot utilize the - // nice property of the BTreeMap that data is ordered and the traversal - // of the tree can be nicely restrictred with a proper range. Due to - // this I changed the T0D0 placed here to this very explanation you are - // reading. - let mut size_delta = 0; - let range = ( - Bound::Included(start), - end.map_or(Bound::Unbounded, Bound::Excluded), - ); - let mut keys = Vec::new(); - for (key, msg) in self.buffer.range_mut::<[u8], _>(range) { - size_delta += key.size() + msg.size(); - keys.push(key.clone()); - } - for key in keys { - self.buffer.remove(&key); - } - self.buffer_entries_size -= size_delta; - self.messages_preference.invalidate(); - size_delta - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{arbitrary::GenExt, tree::default_message_action::DefaultMessageActionMsg}; - use bincode::serialized_size; - use quickcheck::{Arbitrary, Gen}; - use rand::Rng; - - impl Clone for ChildBuffer { - fn clone(&self) -> Self { - ChildBuffer { - messages_preference: self.messages_preference.clone(), - buffer_entries_size: self.buffer_entries_size, - buffer: self.buffer.clone(), - node_pointer: RwLock::new(self.node_pointer.read().clone()), - system_storage_preference: self.system_storage_preference.clone(), - } - } - } - - impl PartialEq for ChildBuffer { - fn eq(&self, other: &Self) -> bool { - self.buffer_entries_size == other.buffer_entries_size - && self.buffer == other.buffer - && *self.node_pointer.read() == *other.node_pointer.read() - } - } - - impl Arbitrary for ChildBuffer { - fn arbitrary(g: &mut Gen) -> Self { - let mut rng = g.rng(); - let entries_cnt = rng.gen_range(0..20); - let buffer: BTreeMap = (0..entries_cnt) - .map(|_| { - ( - CowBytes::arbitrary(g), - ( - KeyInfo::arbitrary(g), - DefaultMessageActionMsg::arbitrary(g).0, - ), - ) - }) - .collect(); - ChildBuffer { - messages_preference: AtomicStoragePreference::unknown(), - buffer_entries_size: buffer - .iter() - .map(|(key, value)| key.size() + value.size()) - .sum::(), - buffer, - node_pointer: RwLock::new(Arbitrary::arbitrary(g)), - system_storage_preference: AtomicSystemStoragePreference::from( - StoragePreference::NONE, - ), - } - } - } - - #[quickcheck] - fn check_serialize_size(child_buffer: ChildBuffer<()>) { - assert_eq!( - child_buffer.size(), - serialized_size(&child_buffer).unwrap() as usize - ); - - assert_eq!(Some(child_buffer.size()), child_buffer.actual_size()); - } - - #[quickcheck] - fn check_size_split_at(mut child_buffer: ChildBuffer<()>, pivot_key: CowBytes) { - let size_before = child_buffer.size(); - let sibling = child_buffer.split_at(&pivot_key, ()); - assert_eq!( - child_buffer.size(), - serialized_size(&child_buffer).unwrap() as usize - ); - assert_eq!(sibling.size(), serialized_size(&sibling).unwrap() as usize); - assert_eq!( - child_buffer.size() + 
sibling.buffer_entries_size, - size_before - ); - } - - #[quickcheck] - fn check_split_at(mut child_buffer: ChildBuffer<()>, pivot_key: CowBytes) { - let this = child_buffer.clone(); - let mut sibling = child_buffer.split_at(&pivot_key, ()); - assert!(child_buffer - .buffer - .iter() - .next_back() - .map_or(true, |(key, _value)| key.clone() <= pivot_key)); - assert!(sibling - .buffer - .iter() - .next() - .map_or(true, |(key, _value)| key.clone() > pivot_key)); - let (mut buffer, _) = child_buffer.take(); - buffer.append(&mut sibling.take().0); - assert_eq!(this.buffer, buffer); - } -} diff --git a/betree/src/tree/imp/derivate_ref.rs b/betree/src/tree/imp/derivate_ref.rs index eaa6d9de8..8b8281f6e 100644 --- a/betree/src/tree/imp/derivate_ref.rs +++ b/betree/src/tree/imp/derivate_ref.rs @@ -6,9 +6,9 @@ use std::{ ops::{Deref, DerefMut}, }; -use crate::cache::AddSize; +use crate::{cache::AddSize, size::Size}; -use super::internal::TakeChildBuffer; +use super::internal::copyless_internal::TakeChildBuffer; /// A reference allowing for a derivative of the original structure to be stored /// alongside the original. Helpful if a derivative of the original is dependent @@ -25,45 +25,63 @@ use super::internal::TakeChildBuffer; /// let owning_ref = OwningRef::new(o).map(|o| &o.some_transition()); /// // ^-- we can't a reference from a temporary value /// // Does compile 😸 -/// let derivate_ref = DerivateRef::try_new(o, |o| o.some_transition()) +/// let derivate_ref = DerivateRefNVM::try_new(o, |o| o.some_transition()) /// ``` -pub struct DerivateRef { +pub struct DerivateRefNVM { inner: U, owner: T, } -impl DerivateRef> { +impl DerivateRefNVM> { /// Unsafe conversions of a limited life-time reference in [TakeChildBuffer] - /// to a static one. This is only ever safe in the internal context of [DerivateRef]. + /// to a static one. This is only ever safe in the internal context of [DerivateRefNVM]. pub fn try_new(mut owner: T, f: F) -> Result where F: for<'a> FnOnce(&'a mut T::Target) -> Option>, { match unsafe { transmute(f(&mut owner)) } { None => Err(owner), - Some(inner) => Ok(DerivateRef { owner, inner }), + Some(inner) => Ok(DerivateRefNVM { owner, inner }), } } pub fn into_owner(self) -> T { self.owner } + + /// Call a function on the owned owner. + pub fn call(&self, f: F) -> X + where + F: FnOnce(&T) -> X, + { + f(&self.owner) + } } -impl AddSize for DerivateRef { +impl AddSize for DerivateRefNVM { fn add_size(&self, size_delta: isize) { self.owner.add_size(size_delta); } } -impl Deref for DerivateRef { +impl Size for DerivateRefNVM { + fn size(&self) -> usize { + self.owner.size() + } + + fn cache_size(&self) -> usize { + self.owner.cache_size() + } +} + +impl Deref for DerivateRefNVM { type Target = U; fn deref(&self) -> &U { &self.inner } } -impl DerefMut for DerivateRef { +impl DerefMut for DerivateRefNVM { fn deref_mut(&mut self) -> &mut U { &mut self.inner } diff --git a/betree/src/tree/imp/flush.rs b/betree/src/tree/imp/flush.rs index 671bb9161..bea92d790 100644 --- a/betree/src/tree/imp/flush.rs +++ b/betree/src/tree/imp/flush.rs @@ -1,13 +1,12 @@ //! Implementation of the tree-wide rebalancing and flushing logic. -//! +//g //! Calling [Tree::rebalance_tree] is not only possible with the root node but may be //! applied to a variety of nodes given that their parent node is correctly //! given. Use with caution. 
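The rebalancing loop below cycles through its numbered steps until the node fits; condensed to its core, each iteration picks one of three actions. A simplified sketch of that decision (plain size thresholds stand in for the `StorageMap` queries and `TakeChildBuffer` selection used in the real code):

```rust
// Condensed sketch of the per-iteration rebalance decision; not the crate's
// code, which consults StorageMap and DerivateRefNVM/TakeChildBuffer instead.
enum Action {
    Done,             // node fits, nothing to do
    FlushInto(usize), // flush the fattest child buffer downwards
    Split,            // no flushable buffer, split the node itself
}

fn next_action(
    node_size: usize,
    max_node_size: usize,
    largest_buffer: Option<(usize, usize)>, // (child index, buffer size)
    min_flush_size: usize,
) -> Action {
    if node_size <= max_node_size {
        return Action::Done;
    }
    match largest_buffer {
        Some((idx, buf)) if buf >= min_flush_size => Action::FlushInto(idx),
        _ => Action::Split,
    }
}

fn main() {
    assert!(matches!(next_action(10, 100, None, 8), Action::Done));
    assert!(matches!(next_action(200, 100, Some((3, 64)), 8), Action::FlushInto(3)));
    assert!(matches!(next_action(200, 100, Some((3, 4)), 8), Action::Split));
}
```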
use std::borrow::Borrow; use super::{ - child_buffer::ChildBuffer, derivate_ref::DerivateRef, internal::TakeChildBuffer, FillUpResult, - Inner, Node, Tree, + derivate_ref::DerivateRefNVM, internal::TakeChildBuffer, FillUpResult, Inner, Node, Tree, }; use crate::{ cache::AddSize, @@ -51,12 +50,10 @@ where pub(super) fn rebalance_tree( &self, mut node: X::CacheValueRefMut, - mut parent: Option< - DerivateRef>>, - >, + mut parent: Option>>, ) -> Result<(), Error> { loop { - if !node.is_too_large() { + if !self.storage_map.node_is_too_large(&node) { return Ok(()); } debug!( @@ -68,51 +65,58 @@ where node.actual_size() ); // 1. Select the largest child buffer which can be flushed. - let mut child_buffer = - match DerivateRef::try_new(node, |node| node.try_find_flush_candidate()) { - // 1.1. If there is none we have to split the node. - Err(_node) => match parent { - None => { - self.split_root_node(_node); - return Ok(()); - } - Some(ref mut parent) => { - let (next_node, size_delta) = self.split_node(_node, parent)?; - parent.add_size(size_delta); - node = next_node; - continue; - } - }, - // 1.2. If successful we flush in the following steps to this node. - Ok(selected_child_buffer) => selected_child_buffer, - }; - let mut child = self.get_mut_node(child_buffer.node_pointer_mut())?; + let mut child_buffer = match DerivateRefNVM::try_new(node, |node| { + node.try_find_flush_candidate(&self.storage_map) + }) { + // 1.1. If there is none we have to split the node. + Err(_node) => match parent { + None => { + self.split_root_node(_node); + return Ok(()); + } + Some(ref mut parent) => { + let (next_node, size_delta) = self.split_node(_node, parent)?; + node = next_node; + parent.add_size(size_delta); + continue; + } + }, + // 1.2. If successful we flush in the following steps to this node. + Ok(selected_child_buffer) => selected_child_buffer, + }; + + let mut child = self.get_mut_node(child_buffer.child_pointer_mut())?; + // 2. Iterate down to child if too large - if !child.is_leaf() && child.is_too_large() { + if !child.is_leaf() && self.storage_map.node_is_too_large(&child) { warn!("Aborting flush, child is too large already"); parent = Some(child_buffer); node = child; continue; } // 3. If child is internal, small and has not many children -> merge the children of node. - if child.has_too_low_fanout() { + if child.has_too_low_fanout() && !self.storage_map.node_is_too_large(&child) { let size_delta = { let mut m = child_buffer.prepare_merge(); let mut sibling = self.get_mut_node(m.sibling_node_pointer())?; - let is_right_sibling = m.is_right_sibling(); + let child_on_left = m.is_right_sibling(); let MergeChildResult { pivot_key, old_np, size_delta, } = m.merge_children(); - if is_right_sibling { + if child_on_left { let size_delta = child.merge(&mut sibling, pivot_key); child.add_size(size_delta); } else { let size_delta = sibling.merge(&mut child, pivot_key); child.add_size(size_delta); } - self.dml.remove(old_np); + drop(sibling); + drop(child); + for np in old_np { + self.dml.remove(np); + } size_delta }; child_buffer.add_size(size_delta); @@ -128,7 +132,7 @@ where child.add_size(size_delta_child); // 6. Check if minimal leaf size is fulfilled, otherwise merge again. 
- if child.is_too_small_leaf() { + if self.storage_map.leaf_is_too_small(&child) { let size_delta = { let mut m = child_buffer.prepare_merge(); let mut sibling = self.get_mut_node(m.sibling_node_pointer())?; @@ -141,14 +145,16 @@ where left = &mut sibling; right = &mut child; }; - match left.leaf_rebalance(right) { + match left.leaf_rebalance(right, &self.storage_map) { FillUpResult::Merged { size_delta } => { left.add_size(size_delta); right.add_size(-size_delta); let MergeChildResult { old_np, size_delta, .. } = m.merge_children(); - self.dml.remove(old_np); + for np in old_np { + self.dml.remove(np); + } size_delta } FillUpResult::Rebalanced { @@ -164,16 +170,16 @@ where child_buffer.add_size(size_delta); } // 7. If the child is too large, split until it is not. - while child.is_too_large_leaf() { + while self.storage_map.leaf_is_too_large(&mut child) { let (next_node, size_delta) = self.split_node(child, &mut child_buffer)?; child_buffer.add_size(size_delta); child = next_node; } // 8. After finishing all operations once, see if they have to be repeated. - if child_buffer.size() > super::MAX_INTERNAL_NODE_SIZE { + if child_buffer.call(|p| self.storage_map.node_is_too_large(&p)) { warn!("Node is still too large"); - if child.is_too_large() { + if self.storage_map.node_is_too_large(&child) { warn!("... but child, too"); } node = child_buffer.into_owner(); diff --git a/betree/src/tree/imp/internal.rs b/betree/src/tree/imp/internal.rs deleted file mode 100644 index 1c9dde1ab..000000000 --- a/betree/src/tree/imp/internal.rs +++ /dev/null @@ -1,921 +0,0 @@ -//! Implementation of the [InternalNode] node type. -use super::{ - child_buffer::ChildBuffer, - node::{PivotGetMutResult, PivotGetResult}, - PivotKey, -}; -use crate::{ - cow_bytes::{CowBytes, SlicedCowBytes}, - data_management::{HasStoragePreference, ObjectReference}, - database::DatasetId, - size::{Size, SizeMut, StaticSize}, - storage_pool::AtomicSystemStoragePreference, - tree::{pivot_key::LocalPivotKey, KeyInfo, MessageAction}, - AtomicStoragePreference, StoragePreference, -}; -use bincode::serialized_size; -use parking_lot::RwLock; -use serde::{Deserialize, Serialize}; -use std::{borrow::Borrow, collections::BTreeMap, mem::replace}; - -#[derive(Debug, Serialize, Deserialize)] -#[cfg_attr(test, derive(PartialEq))] -pub(super) struct InternalNode { - level: u32, - entries_size: usize, - #[serde(skip)] - system_storage_preference: AtomicSystemStoragePreference, - #[serde(skip)] - pref: AtomicStoragePreference, - pub(super) pivot: Vec, - children: Vec, -} - -// @tilpner: -// Previously, this literal was magically spread across the code below, and I've (apparently -// correctly) guessed it to be the fixed size of an empty InternalNode<_> when encoded with bincode. -// I've added a test below to verify this and to ensure any bincode-sided change is noticed. -// This is still wrong because: -// -// * usize is platform-dependent, 28 is not. Size will be impl'd incorrectly on 32b platforms -// * not just the top-level usize, Vec contains further address-sized fields, though bincode -// might special-case Vec encoding so that this doesn't matter -// * the bincode format may not have changed in a while, but that's not a guarantee -// -// I'm not going to fix them, because the proper fix would be to take bincode out of everything, -// and that's a lot of implementation and testing effort. You should though, if you find the time. -// @jwuensche: -// Added TODO to better find this in the future. 
-// Will definitely need to adjust this at some point, though this is not now. -// const TEST_BINCODE_FIXED_SIZE: usize = 28; -// -// UPDATE: -// We removed by now the fixed constant and determine the base size of an -// internal node with bincode provided methods based on an empty node created on -// compile-time. We might want to store this value for future access or even -// better determine the size on compile time directly, this requires -// `serialized_size` to be const which it could but its not on their task list -// yet. - -// NOTE: Waiting for OnceCell to be stabilized... -// https://doc.rust-lang.org/stable/std/cell/struct.OnceCell.html -static EMPTY_NODE: InternalNode<()> = InternalNode { - level: 0, - entries_size: 0, - system_storage_preference: AtomicSystemStoragePreference::none(), - pref: AtomicStoragePreference::unknown(), - pivot: vec![], - children: vec![], -}; - -#[inline] -fn internal_node_base_size() -> usize { - // NOTE: The overhead introduced by using `serialized_size` is negligible - // and only about 3ns, but we can use OnceCell once (🥁) it is available. - serialized_size(&EMPTY_NODE) - .expect("Known node layout could not be estimated. This is an error in bincode.") - // We know that this is valid as the maximum size in bytes is below u32 - as usize -} - -impl Size for InternalNode { - fn size(&self) -> usize { - internal_node_base_size() + self.entries_size - } - - fn actual_size(&self) -> Option { - Some( - internal_node_base_size() - + self.pivot.iter().map(Size::size).sum::() - + self - .children - .iter() - .map(|child| { - child - .checked_size() - .expect("Child doesn't impl actual_size") - }) - .sum::(), - ) - } -} - -impl HasStoragePreference for InternalNode { - fn current_preference(&self) -> Option { - self.pref - .as_option() - .map(|pref| self.system_storage_preference.weak_bound(&pref)) - } - - fn recalculate(&self) -> StoragePreference { - let mut pref = StoragePreference::NONE; - - for child in &self.children { - pref.upgrade(child.correct_preference()) - } - - self.pref.set(pref); - pref - } - - fn correct_preference(&self) -> StoragePreference { - self.system_storage_preference - .weak_bound(&self.recalculate()) - } - - fn system_storage_preference(&self) -> StoragePreference { - self.system_storage_preference.borrow().into() - } - - fn set_system_storage_preference(&mut self, pref: StoragePreference) { - self.system_storage_preference.set(pref); - } -} - -impl InternalNode { - pub fn new(left_child: T, right_child: T, pivot_key: CowBytes, level: u32) -> Self - where - T: Size, - { - InternalNode { - level, - entries_size: left_child.size() + right_child.size() + pivot_key.size(), - pivot: vec![pivot_key], - children: vec![left_child, right_child], - system_storage_preference: AtomicSystemStoragePreference::from(StoragePreference::NONE), - pref: AtomicStoragePreference::unknown(), - } - } - - /// Returns the number of children. - pub fn fanout(&self) -> usize { - self.children.len() - } - - /// Returns the level of this node. - pub fn level(&self) -> u32 { - self.level - } - - /// Returns the index of the child buffer - /// corresponding to the given `key`. 
- fn idx(&self, key: &[u8]) -> usize { - match self - .pivot - .binary_search_by(|pivot_key| pivot_key.as_ref().cmp(key)) - { - Ok(idx) | Err(idx) => idx, - } - } - - pub fn iter(&self) -> impl Iterator + '_ { - self.children.iter() - } - - pub fn iter_mut(&mut self) -> impl Iterator + '_ { - self.children.iter_mut() - } - - pub fn iter_with_bounds( - &self, - ) -> impl Iterator, &T, Option<&CowBytes>)> + '_ { - self.children.iter().enumerate().map(move |(idx, child)| { - let maybe_left = if idx == 0 { - None - } else { - self.pivot.get(idx - 1) - }; - - let maybe_right = self.pivot.get(idx); - - (maybe_left, child, maybe_right) - }) - } -} - -impl InternalNode> { - pub fn get(&self, key: &[u8]) -> (&RwLock, Option<(KeyInfo, SlicedCowBytes)>) { - let child = &self.children[self.idx(key)]; - - let msg = child.get(key).cloned(); - (&child.node_pointer, msg) - } - - pub fn pivot_get(&self, pk: &PivotKey) -> PivotGetResult { - // Exact pivot matches are required only - debug_assert!(!pk.is_root()); - let pivot = pk.bytes().unwrap(); - self.pivot - .iter() - .enumerate() - .find(|(_idx, p)| **p == pivot) - .map_or_else( - || { - // Continue the search to the next level - let child = &self.children[self.idx(&pivot)]; - PivotGetResult::NextNode(&child.node_pointer) - }, - |(idx, _)| { - // Fetch the correct child pointer - let child; - if pk.is_left() { - child = &self.children[idx]; - } else { - child = &self.children[idx + 1]; - } - PivotGetResult::Target(Some(&child.node_pointer)) - }, - ) - } - - pub fn pivot_get_mut(&mut self, pk: &PivotKey) -> PivotGetMutResult { - // Exact pivot matches are required only - debug_assert!(!pk.is_root()); - let pivot = pk.bytes().unwrap(); - let (id, is_target) = self - .pivot - .iter() - .enumerate() - .find(|(_idx, p)| **p == pivot) - .map_or_else( - || { - // Continue the search to the next level - (self.idx(&pivot), false) - }, - |(idx, _)| { - // Fetch the correct child pointer - (idx, true) - }, - ); - match (is_target, pk.is_left()) { - (true, true) => { - PivotGetMutResult::Target(Some(self.children[id].node_pointer.get_mut())) - } - (true, false) => { - PivotGetMutResult::Target(Some(self.children[id + 1].node_pointer.get_mut())) - } - (false, _) => PivotGetMutResult::NextNode(self.children[id].node_pointer.get_mut()), - } - } - - pub fn apply_with_info(&mut self, key: &[u8], pref: StoragePreference) -> &mut N { - let idx = self.idx(key); - let child = &mut self.children[idx]; - - child.apply_with_info(key, pref); - child.node_pointer.get_mut() - } - - pub fn get_range( - &self, - key: &[u8], - left_pivot_key: &mut Option, - right_pivot_key: &mut Option, - all_msgs: &mut BTreeMap>, - ) -> &RwLock { - let idx = self.idx(key); - if idx > 0 { - *left_pivot_key = Some(self.pivot[idx - 1].clone()); - } - if idx < self.pivot.len() { - *right_pivot_key = Some(self.pivot[idx].clone()); - } - let child = &self.children[idx]; - for (key, msg) in child.get_all_messages() { - all_msgs - .entry(key.clone()) - .or_insert_with(Vec::new) - .push(msg.clone()); - } - - &child.node_pointer - } - - pub fn get_next_node(&self, key: &[u8]) -> Option<&RwLock> { - let idx = self.idx(key) + 1; - self.children.get(idx).map(|child| &child.node_pointer) - } - - pub fn insert( - &mut self, - key: Q, - keyinfo: KeyInfo, - msg: SlicedCowBytes, - msg_action: M, - ) -> isize - where - Q: Borrow<[u8]> + Into, - M: MessageAction, - { - self.pref.invalidate(); - let idx = self.idx(key.borrow()); - let added_size = self.children[idx].insert(key, keyinfo, msg, msg_action); - - if 
added_size > 0 { - self.entries_size += added_size as usize; - } else { - self.entries_size -= -added_size as usize; - } - added_size - } - - pub fn insert_msg_buffer(&mut self, iter: I, msg_action: M) -> isize - where - I: IntoIterator, - M: MessageAction, - { - self.pref.invalidate(); - let mut added_size = 0; - let mut buf_storage_pref = StoragePreference::NONE; - - for (k, (keyinfo, v)) in iter.into_iter() { - let idx = self.idx(&k); - buf_storage_pref.upgrade(keyinfo.storage_preference); - added_size += self.children[idx].insert(k, keyinfo, v, &msg_action); - } - - if added_size > 0 { - self.entries_size += added_size as usize; - } else { - self.entries_size -= -added_size as usize; - } - added_size - } - - pub fn drain_children(&mut self) -> impl Iterator + '_ { - self.pref.invalidate(); - self.entries_size = 0; - self.children - .drain(..) - .map(|child| child.node_pointer.into_inner()) - } -} - -impl InternalNode> { - pub fn range_delete( - &mut self, - start: &[u8], - end: Option<&[u8]>, - dead: &mut Vec, - ) -> (usize, &mut N, Option<&mut N>) { - self.pref.invalidate(); - let size_before = self.entries_size; - let start_idx = self.idx(start); - let end_idx = end.map_or(self.children.len() - 1, |i| self.idx(i)); - if start_idx == end_idx { - let size_delta = self.children[start_idx].range_delete(start, end); - return ( - size_delta, - self.children[start_idx].node_pointer.get_mut(), - None, - ); - } - // Skip children that may overlap. - let dead_start_idx = start_idx + 1; - let dead_end_idx = end_idx - end.is_some() as usize; - if dead_start_idx <= dead_end_idx { - for pivot_key in self.pivot.drain(dead_start_idx..dead_end_idx) { - self.entries_size -= pivot_key.size(); - } - let entries_size = &mut self.entries_size; - dead.extend( - self.children - .drain(dead_start_idx..=dead_end_idx) - .map(|child| { - *entries_size -= child.size(); - child.node_pointer.into_inner() - }), - ); - } - - let (left_child, mut right_child) = { - let (left, right) = self.children.split_at_mut(start_idx + 1); - (&mut left[start_idx], end.map(move |_| &mut right[0])) - }; - self.entries_size -= left_child.range_delete(start, None); - if let Some(ref mut child) = right_child { - self.entries_size -= child.range_delete(start, end); - } - let size_delta = size_before - self.entries_size; - - ( - size_delta, - left_child.node_pointer.get_mut(), - right_child.map(|child| child.node_pointer.get_mut()), - ) - } -} - -impl InternalNode> { - pub fn split(&mut self) -> (Self, CowBytes, isize, LocalPivotKey) { - self.pref.invalidate(); - let split_off_idx = self.fanout() / 2; - let pivot = self.pivot.split_off(split_off_idx); - let pivot_key = self.pivot.pop().unwrap(); - let mut children = self.children.split_off(split_off_idx); - - if let (Some(new_left_outer), Some(new_left_pivot)) = (children.first_mut(), pivot.first()) - { - new_left_outer.update_pivot_key(LocalPivotKey::LeftOuter(new_left_pivot.clone())) - } - - let entries_size = pivot.iter().map(Size::size).sum::() - + children.iter_mut().map(SizeMut::size).sum::(); - - let size_delta = entries_size + pivot_key.size(); - self.entries_size -= size_delta; - - let right_sibling = InternalNode { - level: self.level, - entries_size, - pivot, - children, - // Copy the system storage preference of the other node as we cannot - // be sure which key was targeted by recorded accesses. 
- system_storage_preference: self.system_storage_preference.clone(), - pref: AtomicStoragePreference::unknown(), - }; - ( - right_sibling, - pivot_key.clone(), - -(size_delta as isize), - LocalPivotKey::Right(pivot_key), - ) - } - - pub fn merge(&mut self, right_sibling: &mut Self, old_pivot_key: CowBytes) -> isize { - self.pref.invalidate(); - let size_delta = right_sibling.entries_size + old_pivot_key.size(); - self.entries_size += size_delta; - self.pivot.push(old_pivot_key); - self.pivot.append(&mut right_sibling.pivot); - self.children.append(&mut right_sibling.children); - - size_delta as isize - } - - /// Translate any object ref in a `ChildBuffer` from `Incomplete` to `Unmodified` state. - pub fn complete_object_refs(mut self, d_id: DatasetId) -> Self { - // TODO: - let first_pk = match self.pivot.first() { - Some(p) => PivotKey::LeftOuter(p.clone(), d_id), - None => unreachable!( - "The store contains an empty InternalNode, this should never be the case." - ), - }; - for (id, pk) in [first_pk] - .into_iter() - .chain(self.pivot.iter().map(|p| PivotKey::Right(p.clone(), d_id))) - .enumerate() - { - // SAFETY: There must always be pivots + 1 many children, otherwise - // the state of the Internal Node is broken. - self.children[id].complete_object_ref(pk) - } - self - } -} - -impl InternalNode> -where - ChildBuffer: Size, -{ - pub fn try_walk(&mut self, key: &[u8]) -> Option>> { - let child_idx = self.idx(key); - if self.children[child_idx].is_empty(key) { - Some(TakeChildBuffer { - node: self, - child_idx, - }) - } else { - None - } - } - - pub fn try_find_flush_candidate( - &mut self, - min_flush_size: usize, - max_node_size: usize, - min_fanout: usize, - ) -> Option>> { - let child_idx = { - let size = self.size(); - let fanout = self.fanout(); - let (child_idx, child) = self - .children - .iter() - .enumerate() - .max_by_key(|&(_, child)| child.buffer_size()) - .unwrap(); - - debug!("Largest child's buffer size: {}", child.buffer_size()); - - if child.buffer_size() >= min_flush_size - && (size - child.buffer_size() <= max_node_size || fanout < 2 * min_fanout) - { - Some(child_idx) - } else { - None - } - }; - child_idx.map(move |child_idx| TakeChildBuffer { - node: self, - child_idx, - }) - } -} - -pub(super) struct TakeChildBuffer<'a, T: 'a> { - node: &'a mut InternalNode, - child_idx: usize, -} - -impl<'a, N: StaticSize + HasStoragePreference> TakeChildBuffer<'a, ChildBuffer> { - pub(super) fn split_child( - &mut self, - sibling_np: N, - pivot_key: CowBytes, - select_right: bool, - ) -> isize { - // split_at invalidates both involved children (old and new), but as the new child - // is added to self, the overall entries don't change, so this node doesn't need to be - // invalidated - - let sibling = self.node.children[self.child_idx].split_at(&pivot_key, sibling_np); - let size_delta = sibling.size() + pivot_key.size(); - self.node.children.insert(self.child_idx + 1, sibling); - self.node.pivot.insert(self.child_idx, pivot_key); - self.node.entries_size += size_delta; - if select_right { - self.child_idx += 1; - } - size_delta as isize - } -} - -impl<'a, T> TakeChildBuffer<'a, T> -where - InternalNode: Size, -{ - pub(super) fn size(&self) -> usize { - Size::size(&*self.node) - } - - pub(super) fn prepare_merge(&mut self) -> PrepareMergeChild { - if self.child_idx + 1 < self.node.children.len() { - PrepareMergeChild { - node: self.node, - pivot_key_idx: self.child_idx, - other_child_idx: self.child_idx + 1, - } - } else { - PrepareMergeChild { - node: self.node, - 
pivot_key_idx: self.child_idx - 1, - other_child_idx: self.child_idx - 1, - } - } - } -} - -pub(super) struct PrepareMergeChild<'a, T: 'a> { - node: &'a mut InternalNode, - pivot_key_idx: usize, - other_child_idx: usize, -} - -impl<'a, N> PrepareMergeChild<'a, ChildBuffer> { - pub(super) fn sibling_node_pointer(&mut self) -> &mut RwLock { - &mut self.node.children[self.other_child_idx].node_pointer - } - pub(super) fn is_right_sibling(&self) -> bool { - self.pivot_key_idx != self.other_child_idx - } -} - -pub(super) struct MergeChildResult { - pub(super) pivot_key: CowBytes, - pub(super) old_np: NP, - pub(super) size_delta: isize, -} - -impl<'a, N: Size + HasStoragePreference> PrepareMergeChild<'a, ChildBuffer> { - pub(super) fn merge_children(self) -> MergeChildResult { - let mut right_sibling = self.node.children.remove(self.pivot_key_idx + 1); - let pivot_key = self.node.pivot.remove(self.pivot_key_idx); - let size_delta = - pivot_key.size() + ChildBuffer::::static_size() + right_sibling.node_pointer.size(); - self.node.entries_size -= size_delta; - - let left_sibling = &mut self.node.children[self.pivot_key_idx]; - left_sibling.append(&mut right_sibling); - left_sibling - .messages_preference - .upgrade_atomic(&right_sibling.messages_preference); - - MergeChildResult { - pivot_key, - old_np: right_sibling.node_pointer.into_inner(), - size_delta: -(size_delta as isize), - } - } -} - -impl<'a, N: Size + HasStoragePreference> PrepareMergeChild<'a, ChildBuffer> { - fn get_children(&mut self) -> (&mut ChildBuffer, &mut ChildBuffer) { - let (left, right) = self.node.children[self.pivot_key_idx..].split_at_mut(1); - (&mut left[0], &mut right[0]) - } - - pub(super) fn rebalanced(&mut self, new_pivot_key: CowBytes) -> isize { - { - // Move messages around - let (left_child, right_child) = self.get_children(); - left_child.rebalance(right_child, &new_pivot_key); - } - - let mut size_delta = new_pivot_key.size() as isize; - let old_pivot_key = replace(&mut self.node.pivot[self.pivot_key_idx], new_pivot_key); - size_delta -= old_pivot_key.size() as isize; - - size_delta - } -} - -impl<'a, N: Size + HasStoragePreference> TakeChildBuffer<'a, ChildBuffer> { - pub fn node_pointer_mut(&mut self) -> &mut RwLock { - &mut self.node.children[self.child_idx].node_pointer - } - pub fn take_buffer(&mut self) -> (BTreeMap, isize) { - let (buffer, size_delta) = self.node.children[self.child_idx].take(); - self.node.entries_size -= size_delta; - (buffer, -(size_delta as isize)) - } -} - -#[cfg(test)] -mod tests { - - - use super::*; - use crate::{ - arbitrary::GenExt, - database::DatasetId, - tree::default_message_action::{DefaultMessageAction, DefaultMessageActionMsg}, - }; - use bincode::serialized_size; - - use quickcheck::{Arbitrary, Gen, TestResult}; - use rand::Rng; - use serde::Serialize; - - // Keys are not allowed to be empty. This is usually caught at the tree layer, but these are - // bypassing that check. There's probably a good way to do this, but we can also just throw - // away the empty keys until we find one that isn't empty. 
- #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] - struct Key(CowBytes); - impl Arbitrary for Key { - fn arbitrary(g: &mut Gen) -> Self { - loop { - let c = CowBytes::arbitrary(g); - if !c.is_empty() { - return Key(c); - } - } - } - } - - impl Clone for InternalNode { - fn clone(&self) -> Self { - InternalNode { - level: self.level, - entries_size: self.entries_size, - pivot: self.pivot.clone(), - children: self.children.to_vec(), - system_storage_preference: self.system_storage_preference.clone(), - pref: self.pref.clone(), - } - } - } - - impl Arbitrary for InternalNode { - fn arbitrary(g: &mut Gen) -> Self { - let mut rng = g.rng(); - let pivot_key_cnt = rng.gen_range(1..20); - let mut entries_size = 0; - - let mut pivot = Vec::with_capacity(pivot_key_cnt); - for _ in 0..pivot_key_cnt { - let pivot_key = CowBytes::arbitrary(g); - entries_size += pivot_key.size(); - pivot.push(pivot_key); - } - - let mut children = Vec::with_capacity(pivot_key_cnt + 1); - for _ in 0..pivot_key_cnt + 1 { - let child = T::arbitrary(g); - entries_size += child.size(); - children.push(child); - } - - InternalNode { - pivot, - children, - entries_size, - level: 1, - system_storage_preference: AtomicSystemStoragePreference::from( - StoragePreference::NONE, - ), - pref: AtomicStoragePreference::unknown(), - } - } - } - - fn check_size(node: &mut InternalNode) { - assert_eq!( - node.size() as u64, - serialized_size(node).unwrap(), - "predicted size does not match serialized size" - ); - } - - #[quickcheck] - fn check_serialize_size(mut node: InternalNode) { - check_size(&mut node); - } - - #[quickcheck] - fn check_idx(node: InternalNode<()>, key: Key) { - let key = key.0; - let idx = node.idx(&key); - - if let Some(upper_key) = node.pivot.get(idx) { - assert!(&key <= upper_key); - } - if idx > 0 { - let lower_key = &node.pivot[idx - 1]; - assert!(lower_key < &key); - } - } - - #[quickcheck] - fn check_size_insert_single( - mut node: InternalNode>, - key: Key, - keyinfo: KeyInfo, - msg: DefaultMessageActionMsg, - ) { - let size_before = node.size() as isize; - let added_size = node.insert(key.0, keyinfo, msg.0, DefaultMessageAction); - assert_eq!(size_before + added_size, node.size() as isize); - - check_size(&mut node); - } - - #[quickcheck] - fn check_size_insert_msg_buffer( - mut node: InternalNode>, - buffer: BTreeMap, - ) { - let size_before = node.size() as isize; - let added_size = node.insert_msg_buffer( - buffer - .into_iter() - .map(|(Key(key), (keyinfo, msg))| (key, (keyinfo, msg.0))), - DefaultMessageAction, - ); - assert_eq!( - size_before + added_size, - node.size() as isize, - "size delta mismatch" - ); - - check_size(&mut node); - } - - #[quickcheck] - fn check_insert_msg_buffer( - mut node: InternalNode>, - buffer: BTreeMap, - ) { - let mut node_twin = node.clone(); - let added_size = node.insert_msg_buffer( - buffer - .iter() - .map(|(Key(key), (keyinfo, msg))| (key.clone(), (keyinfo.clone(), msg.0.clone()))), - DefaultMessageAction, - ); - - let mut added_size_twin = 0; - for (Key(key), (keyinfo, msg)) in buffer { - let idx = node_twin.idx(&key); - added_size_twin += - node_twin.children[idx].insert(key, keyinfo, msg.0, DefaultMessageAction); - } - if added_size_twin > 0 { - node_twin.entries_size += added_size_twin as usize; - } else { - node_twin.entries_size -= -added_size_twin as usize; - } - - assert_eq!(node, node_twin); - assert_eq!(added_size, added_size_twin); - } - - static mut PK: Option = None; - - impl ObjectReference for () { - type ObjectPointer = (); - - fn 
get_unmodified(&self) -> Option<&Self::ObjectPointer> {
-            Some(&())
-        }
-
-        fn set_index(&mut self, _pk: PivotKey) {
-            // NO-OP
-        }
-
-        fn index(&self) -> &PivotKey {
-            unsafe {
-                if PK.is_none() {
-                    PK = Some(PivotKey::LeftOuter(
-                        CowBytes::from(vec![42u8]),
-                        DatasetId::default(),
-                    ));
-                }
-                PK.as_ref().unwrap()
-            }
-        }
-    }
-
-    #[quickcheck]
-    fn check_size_split(mut node: InternalNode<ChildBuffer<()>>) -> TestResult {
-        if node.fanout() < 2 {
-            return TestResult::discard();
-        }
-        let size_before = node.size();
-        let (mut right_sibling, _pivot, size_delta, _pivot_key) = node.split();
-        assert_eq!(size_before as isize + size_delta, node.size() as isize);
-        check_size(&mut node);
-        check_size(&mut right_sibling);
-
-        TestResult::passed()
-    }
-
-    #[quickcheck]
-    fn check_split(mut node: InternalNode<ChildBuffer<()>>) -> TestResult {
-        if node.fanout() < 4 {
-            return TestResult::discard();
-        }
-        let twin = node.clone();
-        let (mut right_sibling, pivot, _size_delta, _pivot_key) = node.split();
-
-        assert!(node.fanout() >= 2);
-        assert!(right_sibling.fanout() >= 2);
-
-        node.entries_size += pivot.size() + right_sibling.entries_size;
-        node.pivot.push(pivot);
-        node.pivot.append(&mut right_sibling.pivot);
-        node.children.append(&mut right_sibling.children);
-
-        assert_eq!(node, twin);
-
-        TestResult::passed()
-    }
-
-    #[quickcheck]
-    fn check_split_key(mut node: InternalNode<ChildBuffer<()>>) -> TestResult {
-        if node.fanout() < 4 {
-            return TestResult::discard();
-        }
-        let (right_sibling, pivot, _size_delta, pivot_key) = node.split();
-        assert!(node.fanout() >= 2);
-        assert!(right_sibling.fanout() >= 2);
-        assert_eq!(LocalPivotKey::Right(pivot), pivot_key);
-        TestResult::passed()
-    }
-
-    // #[test]
-    // fn check_constant() {
-    //     let node: InternalNode<ChildBuffer<()>> = InternalNode {
-    //         entries_size: 0,
-    //         level: 1,
-    //         children: vec![],
-    //         pivot: vec![],
-    //         system_storage_preference: AtomicSystemStoragePreference::from(StoragePreference::NONE),
-    //         pref: AtomicStoragePreference::unknown(),
-    //     };
-
-    //     assert_eq!(
-    //         serialized_size(&node).unwrap(),
-    //         TEST_BINCODE_FIXED_SIZE as u64,
-    //         "magic constants are wrong"
-    //     );
-    // }
-
-    // TODO tests
-    //    split
-    //    child split
-    //    flush buffer
-    //    get with max_msn
-}
diff --git a/betree/src/tree/imp/internal/copyless_internal.rs b/betree/src/tree/imp/internal/copyless_internal.rs
new file mode 100644
index 000000000..a617bf71d
--- /dev/null
+++ b/betree/src/tree/imp/internal/copyless_internal.rs
@@ -0,0 +1,1190 @@
+//! Implementation of the [CopylessInternalNode] node type.
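// In this node type each child is reached through a `ChildLink`, which pairs the
// child's message buffer with the pointer to the child itself, so a point query
// consults the buffer before descending. A rough sketch of that flow (`descend`
// and `fetch` are hypothetical stand-ins for the DMU plumbing, and a real lookup
// would still run the message action over a found message):
//
//     fn lookup<N: ObjectReference>(
//         node: &CopylessInternalNode<N>,
//         key: &[u8],
//     ) -> Option<SlicedCowBytes> {
//         let (ptr, msg) = node.get(key);
//         if let Some((_info, msg)) = msg {
//             return Some(msg); // answered from the in-node buffer
//         }
//         descend(fetch(ptr), key) // otherwise follow the child pointer
//     }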
+use crate::{ + buffer::Buf, + checksum::{Checksum, ChecksumError}, + data_management::IntegrityMode, + tree::imp::{ + node::{PivotGetMutResult, PivotGetResult}, + PivotKey, + }, +}; + +use super::{packed_child_buffer::PackedChildBuffer, take_child_buffer::MergeChildResult}; + +use crate::{ + cow_bytes::{CowBytes, SlicedCowBytes}, + data_management::{HasStoragePreference, ObjectReference}, + database::DatasetId, + size::{Size, StaticSize}, + storage_pool::AtomicSystemStoragePreference, + tree::{imp::MIN_FANOUT, pivot_key::LocalPivotKey, KeyInfo}, + AtomicStoragePreference, StoragePreference, +}; +use parking_lot::RwLock; +use std::{borrow::Borrow, collections::BTreeMap, mem::replace}; + +use super::serialize_nodepointer; +use serde::{Deserialize, Serialize}; + +pub(in crate::tree::imp) struct CopylessInternalNode { + // FIXME: This type can be used as zero-copy + pub meta_data: InternalNodeMetaData, + pub children: Vec>, +} + +/// A link to the next child, this contains a buffer for messages as well as a +/// pointer to the child. +#[derive(Serialize, Deserialize, Debug)] +#[serde(bound(serialize = "N: Serialize", deserialize = "N: Deserialize<'de>"))] +pub(in crate::tree::imp) struct ChildLink { + #[serde(skip)] + buffer: PackedChildBuffer, + #[serde(with = "serialize_nodepointer")] + ptr: RwLock, +} + +impl PartialEq for ChildLink { + fn eq(&self, other: &Self) -> bool { + // TODO: Needs buffer check? + &*self.ptr.read() == &*other.ptr.read() + } +} + +impl From for std::io::Error { + fn from(value: ChecksumError) -> Self { + std::io::Error::new(std::io::ErrorKind::InvalidData, value) + } +} + +impl ChildLink { + pub fn new(buffer: PackedChildBuffer, ptr: N) -> Self { + ChildLink { + buffer, + ptr: RwLock::new(ptr), + } + } + + pub fn buffer_mut(&mut self) -> &mut PackedChildBuffer { + &mut self.buffer + } + + pub fn buffer(&self) -> &PackedChildBuffer { + &self.buffer + } + + pub fn ptr_mut(&mut self) -> &mut RwLock { + &mut self.ptr + } + + pub fn ptr(&self) -> &RwLock { + &self.ptr + } +} + +impl std::fmt::Debug for CopylessInternalNode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.meta_data.fmt(f) + } +} + +#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(test, derive(PartialEq))] +pub(in crate::tree::imp) struct InternalNodeMetaData { + pub current_size: usize, + pub level: u32, + pub system_storage_preference: AtomicSystemStoragePreference, + pub pref: AtomicStoragePreference, + pub(in crate::tree::imp) pivot: Vec, + pub entries_sizes: Vec, + pub entries_prefs: Vec, +} + +impl InternalNodeMetaData { + fn invalidate(&mut self) { + self.pref.invalidate(); + self.current_size = self.recalc_size(); + } + + fn recalc_size(&self) -> usize { + std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::() + + self.pivot.iter().map(|p| p.size()).sum::() + + self.pivot.len() * std::mem::size_of::() + + self.pivot.len() * std::mem::size_of::() + + META_BINCODE_STATIC + } +} + +const INTERNAL_BINCODE_STATIC: usize = 4 + 8; +impl Size for CopylessInternalNode { + fn size(&self) -> usize { + // Layout + // ------ + // - LE u32 Metadata len + // - InternalNodeMetaData bytes + // - LEN u32 + // - [child PTR; LEN] + // - [checksum; LEN] + // - [child BUFFER; LEN] + + std::mem::size_of::() + + self.meta_data.size() + + std::mem::size_of::() + + self.children.len() * N::static_size() + + self.children.len() * INTERNAL_INTEGRITY_CHECKSUM_SIZE + + self.meta_data.entries_sizes.iter().sum::() + + 8 + } + + fn actual_size(&self) 
-> Option { + // FIXME: Actually cache the serialized size and track delta + Some(self.size()) + } + + fn cache_size(&self) -> usize { + std::mem::size_of::() + + self.meta_data.size() + + std::mem::size_of::() + + self.children.len() * N::static_size() + + self + .children + .iter() + .map(|c| c.buffer.cache_size()) + .sum::() + } +} + +const META_BINCODE_STATIC: usize = 33; +const INTERNAL_INTEGRITY_CHECKSUM_SIZE: usize = 8 + 8; +impl Size for InternalNodeMetaData { + fn size(&self) -> usize { + self.current_size + // std::mem::size_of::() + // + std::mem::size_of::() + // + std::mem::size_of::() + // + std::mem::size_of::() + // + self.pivot.iter().map(|p| p.size()).sum::() + // + self.pivot.len() * std::mem::size_of::() + // + self.pivot.len() * std::mem::size_of::() + // + META_BINCODE_STATIC + } + + fn actual_size(&self) -> Option { + None + } +} + +impl HasStoragePreference for CopylessInternalNode { + fn current_preference(&self) -> Option { + self.meta_data + .pref + .as_option() + .map(|pref| self.meta_data.system_storage_preference.weak_bound(&pref)) + } + + fn recalculate(&self) -> StoragePreference { + let mut pref = StoragePreference::NONE; + + for child in self.meta_data.entries_prefs.iter() { + pref.upgrade(*child) + } + + self.meta_data.pref.set(pref); + pref + } + + fn correct_preference(&self) -> StoragePreference { + let storagepref = self + .current_preference() + .unwrap_or_else(|| self.recalculate()); + self.meta_data + .system_storage_preference + .weak_bound(&storagepref) + } + + fn system_storage_preference(&self) -> StoragePreference { + self.meta_data.system_storage_preference.borrow().into() + } + + fn set_system_storage_preference(&mut self, pref: StoragePreference) { + self.meta_data.system_storage_preference.set(pref); + } +} + +pub struct InternalNodeLink { + pub ptr: N, + pub buffer: PackedChildBuffer, + pub buffer_size: usize, +} + +impl InternalNodeLink { + pub fn destruct(self) -> (N, PackedChildBuffer) { + (self.ptr, self.buffer) + } +} + +impl Into> for InternalNodeLink { + fn into(self) -> ChildLink { + ChildLink { + buffer: self.buffer, + ptr: RwLock::new(self.ptr), + } + } +} + +impl CopylessInternalNode { + pub fn new( + left_child: InternalNodeLink, + right_child: InternalNodeLink, + pivot_key: CowBytes, + level: u32, + ) -> Self + where + N: StaticSize, + { + CopylessInternalNode { + meta_data: InternalNodeMetaData { + current_size: std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::() + + pivot_key.size() + + 1 * std::mem::size_of::() + + 1 * std::mem::size_of::() + + META_BINCODE_STATIC, + level, + entries_sizes: vec![left_child.buffer_size, right_child.buffer_size], + pivot: vec![pivot_key], + system_storage_preference: AtomicSystemStoragePreference::from( + StoragePreference::NONE, + ), + pref: AtomicStoragePreference::unknown(), + entries_prefs: vec![StoragePreference::NONE, StoragePreference::NONE], + }, + children: vec![left_child.into(), right_child.into()], + } + } + + /// Returns the number of children. + pub fn fanout(&self) -> usize { + self.children.len() + } + + /// Returns the level of this node. + pub fn level(&self) -> u32 { + self.meta_data.level + } + + /// Returns the index of the child buffer + /// corresponding to the given `key`. 
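// A worked example of the pivot search below: with pivots ["b", "d"], both exact
// matches (`Ok`) and in-between keys (`Err`) map to a child index, so child 0
// serves keys <= "b", child 1 serves "b" < key <= "d", and child 2 everything
// greater:
//
//     key "a" -> Err(0) -> child 0      key "b" -> Ok(0)  -> child 0
//     key "c" -> Err(1) -> child 1      key "z" -> Err(2) -> child 2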
+ pub(in crate::tree::imp) fn idx(&self, key: &[u8]) -> usize { + match self + .meta_data + .pivot + .binary_search_by(|pivot_key| pivot_key.as_ref().cmp(key)) + { + Ok(idx) | Err(idx) => idx, + } + } + + pub fn iter(&self) -> impl Iterator> + where + N: ObjectReference, + { + self.children.iter() + } + + pub fn iter_mut(&mut self) -> impl Iterator> + where + N: ObjectReference, + { + self.children.iter_mut() + } + + pub fn iter_with_bounds( + &self, + ) -> impl Iterator, &ChildLink, Option<&CowBytes>)> + '_ + where + N: ObjectReference, + { + self.children.iter().enumerate().map(move |(idx, child)| { + let maybe_left = if idx == 0 { + None + } else { + self.meta_data.pivot.get(idx - 1) + }; + + let maybe_right = self.meta_data.pivot.get(idx); + + (maybe_left, child, maybe_right) + }) + } + + /// Serialize the object into a writer. + /// + /// Layout + /// ------ + /// + /// - LE u32 Metadata len + /// - InternalNodeMetaData bytes + /// - [child PTR; LEN] + /// - [checksum; LEN] + /// - [child BUFFER; LEN] + pub fn pack( + &self, + mut w: W, + csum_builder: F, + ) -> Result, std::io::Error> + where + N: serde::Serialize + StaticSize, + F: Fn(&[u8]) -> C, + C: Checksum, + { + use std::io::Write; + + let mut tmp = vec![]; + let bytes_meta_data_len = bincode::serialized_size(&self.meta_data) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + + tmp.write_all(&(bytes_meta_data_len as u32).to_le_bytes())?; + bincode::serialize_into(&mut tmp, &self.meta_data) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + + let bytes_child_len = bincode::serialized_size(&self.children) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + + tmp.write_all(&(bytes_child_len as u32).to_le_bytes())?; + bincode::serialize_into(&mut tmp, &self.children) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + + let mut tmp_buffers = vec![]; + + for (size, child) in self + .meta_data + .entries_sizes + .iter() + .zip(self.children.iter()) + { + assert_eq!(*size, child.buffer.size()); + } + + for child in self.children.iter() { + let integrity = child.buffer.pack(&mut tmp_buffers, &csum_builder)?; + assert_eq!( + bincode::serialized_size(&integrity).unwrap(), + INTERNAL_INTEGRITY_CHECKSUM_SIZE as u64 + ); + bincode::serialize_into(&mut tmp, &integrity) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + } + + let csum = csum_builder(&tmp); + w.write_all(&tmp)?; + w.write_all(&tmp_buffers)?; + Ok(IntegrityMode::Internal { + csum, + len: tmp.len() as u32, + }) + } + + /// Read object from a byte buffer and instantiate it. 
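// The byte stream consumed by `unpack` below mirrors what `pack` above produced;
// offsets as read below (sketch; the 4-byte node identifier is written by the
// caller and skipped here via `NODE_ID`):
//
//     [ node id                 : 4 bytes                          ]
//     [ metadata length         : u32 LE                           ]
//     [ InternalNodeMetaData    : bincode                          ]
//     [ children length         : u32 LE                           ]
//     [ child pointers          : bincode, buffers skipped         ]
//     [ per-child IntegrityMode : INTERNAL_INTEGRITY_CHECKSUM_SIZE ]
//     [ per-child PackedChildBuffer bytes                          ]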
+    pub fn unpack<C: Checksum>(buf: Buf, csum: IntegrityMode<C>) -> Result<Self, std::io::Error>
+    where
+        N: serde::de::DeserializeOwned + StaticSize,
+    {
+        let buf = buf.into_sliced_cow_bytes();
+        const NODE_ID: usize = 4;
+        let mut cursor = NODE_ID;
+
+        csum.checksum()
+            .unwrap()
+            .verify(&buf[cursor..cursor + csum.length().unwrap() as usize])?;
+
+        let len = u32::from_le_bytes(buf[cursor..cursor + 4].try_into().unwrap()) as usize;
+        cursor += 4;
+
+        let meta_data: InternalNodeMetaData = bincode::deserialize(&buf[cursor..cursor + len])
+            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
+            .unwrap();
+        cursor += len;
+
+        let ptrs_len = u32::from_le_bytes(buf[cursor..cursor + 4].try_into().unwrap()) as usize;
+        cursor += 4;
+
+        // NOTE: This section scales differently in time than the unpacking of
+        // the packed child buffers below, which is weird.
+        let mut ptrs: Vec<ChildLink<N>> = bincode::deserialize(&buf[cursor..cursor + ptrs_len])
+            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
+        cursor += ptrs_len;
+
+        let mut checksums: Vec<IntegrityMode<C>> = vec![];
+        for _ in ptrs.iter() {
+            checksums.push(
+                bincode::deserialize(&buf[cursor..cursor + INTERNAL_INTEGRITY_CHECKSUM_SIZE])
+                    .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?,
+            );
+            cursor += INTERNAL_INTEGRITY_CHECKSUM_SIZE;
+        }
+        for (idx, buffer_csum) in checksums.into_iter().enumerate() {
+            let sub = buf.clone().slice_from(cursor as u32);
+            let b: PackedChildBuffer = PackedChildBuffer::unpack(sub, buffer_csum)?;
+            cursor += b.size();
+            assert_eq!(meta_data.entries_sizes[idx], b.size());
+            let _ = std::mem::replace(&mut ptrs[idx].buffer, b);
+            assert_eq!(meta_data.entries_sizes[idx], ptrs[idx].buffer.size());
+        }
+
+        Ok(CopylessInternalNode {
+            meta_data,
+            children: ptrs,
+        })
+    }
+
+    pub fn after_insert_size_delta(&mut self, idx: usize, size_delta: isize) {
+        self.meta_data.entries_sizes[idx] = self.children[idx].buffer.size();
+
+        // assert!(
+        //     self.meta_data.entries_sizes[idx] < 8 * 1024 * 1024,
+        //     "child buffer got way too large: {:#?}",
+        //     std::backtrace::Backtrace::force_capture()
+        // );
+    }
+
+    pub(crate) fn has_too_high_fanout(&self, max_size: usize) -> bool {
+        self.meta_data.pivot.iter().map(|p| p.len()).sum::<usize>()
+            > (max_size as f32).powf(0.5).ceil() as usize
+    }
+}
+
+impl<N> CopylessInternalNode<N> {
+    pub fn get(&self, key: &[u8]) -> (&RwLock<N>, Option<(KeyInfo, SlicedCowBytes)>)
+    where
+        N: ObjectReference,
+    {
+        let child = &self.children[self.idx(key)];
+        (&child.ptr, child.buffer.get(key))
+    }
+
+    pub fn get_mut(&mut self, key: &[u8]) -> &mut ChildLink<N>
+    where
+        N: ObjectReference,
+    {
+        let idx = self.idx(key);
+        &mut self.children[idx]
+    }
+
+    pub fn pivot_get(&self, pk: &PivotKey) -> PivotGetResult<N>
+    where
+        N: ObjectReference,
+    {
+        // Only exact pivot matches are accepted here
+        debug_assert!(!pk.is_root());
+        let pivot = pk.bytes().unwrap();
+        self.meta_data
+            .pivot
+            .iter()
+            .enumerate()
+            .find(|(_idx, p)| **p == pivot)
+            .map_or_else(
+                || {
+                    // Continue the search to the next level
+                    PivotGetResult::NextNode(&self.children[self.idx(&pivot)].ptr)
+                },
+                |(idx, _)| {
+                    // Fetch the correct child pointer
+                    let child;
+                    if pk.is_left() {
+                        child = &self.children[idx].ptr;
+                    } else {
+                        child = &self.children[idx + 1].ptr;
+                    }
+                    PivotGetResult::Target(Some(child))
+                },
+            )
+    }
+
+    pub fn pivot_get_mut(&mut self, pk: &PivotKey) -> PivotGetMutResult<N>
+    where
+        N: ObjectReference,
+    {
+        // Only exact pivot matches are accepted here
+        debug_assert!(!pk.is_root());
+        let pivot = pk.bytes().unwrap();
+        let (id, is_target) = self
+            .meta_data
.pivot + .iter() + .enumerate() + .find(|(_idx, p)| **p == pivot) + .map_or_else( + || { + // Continue the search to the next level + (self.idx(&pivot), false) + }, + |(idx, _)| { + // Fetch the correct child pointer + (idx, true) + }, + ); + match (is_target, pk.is_left()) { + (true, true) => PivotGetMutResult::Target(Some(self.children[id].ptr.get_mut())), + (true, false) => PivotGetMutResult::Target(Some(self.children[id + 1].ptr.get_mut())), + (false, _) => PivotGetMutResult::NextNode(self.children[id].ptr.get_mut()), + } + } + + pub fn apply_with_info(&mut self, key: &[u8], pref: StoragePreference) -> &mut N + where + N: ObjectReference, + { + let idx = self.idx(key); + let child = self.children[idx].ptr.get_mut(); + self.meta_data.entries_prefs[idx].upgrade(pref); + + child + } + + pub fn get_range( + &self, + key: &[u8], + left_pivot_key: &mut Option, + right_pivot_key: &mut Option, + _all_msgs: &mut BTreeMap>, + ) -> &ChildLink { + let idx = self.idx(key); + if idx > 0 { + *left_pivot_key = Some(self.meta_data.pivot[idx - 1].clone()); + } + if idx < self.meta_data.pivot.len() { + *right_pivot_key = Some(self.meta_data.pivot[idx].clone()); + } + &self.children[idx] + } + + pub fn get_next_node(&self, key: &[u8]) -> Option<&ChildLink> { + let idx = self.idx(key) + 1; + self.children.get(idx) + } + + pub fn drain_children(&mut self) -> impl Iterator> + '_ + where + N: ObjectReference, + { + self.meta_data.invalidate(); + self.children.drain(..) + } +} + +impl Size for Vec { + fn size(&self) -> usize { + 8 + self.len() * N::static_size() + } +} + +impl CopylessInternalNode { + pub fn split(&mut self) -> (Self, CowBytes, isize, LocalPivotKey) { + let split_off_idx = self.fanout() / 2; + let pivot = self.meta_data.pivot.split_off(split_off_idx); + let pivot_key = self.meta_data.pivot.pop().unwrap(); + + let mut children = self.children.split_off(split_off_idx); + if let Some(first_child) = children.get_mut(0) { + let mut c_ptr = first_child.ptr.write(); + let ds_id = c_ptr.index().d_id(); + c_ptr.set_index(PivotKey::LeftOuter(pivot[0].clone(), ds_id)); + } + let entries_sizes = self.meta_data.entries_sizes.split_off(split_off_idx); + let entries_prefs = self.meta_data.entries_prefs.split_off(split_off_idx); + + let entries_size = entries_sizes.len() * std::mem::size_of::() + + entries_prefs.len() + + pivot.iter().map(|p| p.size()).sum::() + + children.len() * N::static_size() + + entries_sizes.iter().sum::(); + + let size_delta = entries_size + pivot_key.size(); + + let mut right_sibling = CopylessInternalNode { + meta_data: InternalNodeMetaData { + level: self.meta_data.level, + entries_sizes, + entries_prefs, + pivot, + // Copy the system storage preference of the other node as we cannot + // be sure which key was targeted by recorded accesses. 
+                system_storage_preference: self.meta_data.system_storage_preference.clone(),
+                pref: AtomicStoragePreference::unknown(),
+                current_size: 0,
+            },
+            children,
+        };
+        self.meta_data.invalidate();
+        right_sibling.meta_data.invalidate();
+
+        assert!(self.fanout() >= MIN_FANOUT);
+        assert!(right_sibling.fanout() >= MIN_FANOUT);
+        (
+            right_sibling,
+            pivot_key.clone(),
+            -(size_delta as isize),
+            LocalPivotKey::Right(pivot_key),
+        )
+    }
+
+    pub fn merge(&mut self, right_sibling: &mut Self, old_pivot_key: CowBytes) -> isize {
+        let old = self.size();
+        self.meta_data.pivot.push(old_pivot_key);
+        self.meta_data
+            .pivot
+            .append(&mut right_sibling.meta_data.pivot);
+        self.meta_data
+            .entries_prefs
+            .append(&mut right_sibling.meta_data.entries_prefs);
+        self.meta_data
+            .entries_sizes
+            .append(&mut right_sibling.meta_data.entries_sizes);
+        self.meta_data.invalidate();
+
+        self.children.append(&mut right_sibling.children);
+        let new = self.size();
+
+        old as isize - new as isize
+    }
+
+    /// Translate any object ref in a `ChildLink` from `Incomplete` to `Unmodified` state.
+    pub fn complete_object_refs(self, d_id: DatasetId) -> Self {
+        let first_pk = match self.meta_data.pivot.first() {
+            Some(p) => PivotKey::LeftOuter(p.clone(), d_id),
+            None => unreachable!(
+                "The store contains an empty InternalNode, this should never be the case."
+            ),
+        };
+        for (id, pk) in [first_pk]
+            .into_iter()
+            .chain(
+                self.meta_data
+                    .pivot
+                    .iter()
+                    .map(|p| PivotKey::Right(p.clone(), d_id)),
+            )
+            .enumerate()
+        {
+            // SAFETY: There must always be pivots + 1 many children, otherwise
+            // the state of the Internal Node is broken.
+            self.children[id].ptr.write().set_index(pk.clone());
+        }
+        self
+    }
+}
+
+impl<N> CopylessInternalNode<N>
+where
+    N: StaticSize,
+    N: ObjectReference,
+{
+    pub fn try_walk(&mut self, key: &[u8]) -> TakeChildBuffer<N> {
+        let child_idx = self.idx(key);
+
+        TakeChildBuffer {
+            node: self,
+            child_idx,
+        }
+    }
+
+    pub fn try_find_flush_candidate(
+        &mut self,
+        min_flush_size: usize,
+        max_node_size: usize,
+        min_fanout: usize,
+    ) -> Option<TakeChildBuffer<N>>
+    where
+        N: ObjectReference,
+    {
+        let child_idx = {
+            let (child_idx, child) = self
+                .meta_data
+                .entries_sizes
+                .iter()
+                .enumerate()
+                .max_by_key(|(_, v)| *v)
+                .unwrap();
+            assert_eq!(self.children[child_idx].buffer.size(), *child);
+
+            if *child >= min_flush_size
+                && ((self.size() - *child) <= max_node_size || self.fanout() < 2 * min_fanout)
+                && !self.has_too_high_fanout(max_node_size)
+            {
+                Some(child_idx)
+            } else if self.fanout() < 2 * min_fanout {
+                // NOTE: No further split is possible without violating the
+                // tree invariants, so do everything we can to avoid that here.
+ Some(child_idx) + } else { + None + } + }; + child_idx.map(move |child_idx| TakeChildBuffer { + node: self, + child_idx, + }) + } +} + +pub(in crate::tree::imp) struct TakeChildBuffer<'a, N: 'a + 'static> { + node: &'a mut CopylessInternalNode, + child_idx: usize, +} + +impl<'a, N: StaticSize> Size for TakeChildBuffer<'a, N> { + fn size(&self) -> usize { + self.node.size() + } + + fn cache_size(&self) -> usize { + self.node.cache_size() + } +} + +impl<'a, N: StaticSize + HasStoragePreference> TakeChildBuffer<'a, N> { + pub(in crate::tree::imp) fn split_child( + &mut self, + sibling_np: N, + pivot_key: CowBytes, + select_right: bool, + ) -> isize + where + N: ObjectReference, + { + // split_at invalidates both involved children (old and new), but as the new child + // is added to self, the overall entries don't change, so this node doesn't need to be + // invalidated + + let before = self.cache_size(); + let sibling = self.node.children[self.child_idx] + .buffer + .split_at(&pivot_key); + let sibling_size = sibling.size(); + // let size_delta = sibling_size + pivot_key.size(); + self.node.children.insert( + self.child_idx + 1, + ChildLink { + buffer: sibling, + ptr: RwLock::new(sibling_np), + }, + ); + self.node.meta_data.pivot.insert(self.child_idx, pivot_key); + self.node.meta_data.entries_sizes[self.child_idx] = + self.node.children[self.child_idx].buffer.size(); + self.node + .meta_data + .entries_sizes + .insert(self.child_idx + 1, sibling_size); + self.node.meta_data.entries_prefs.insert( + self.child_idx + 1, + self.node.meta_data.entries_prefs[self.child_idx], + ); + if select_right { + self.child_idx += 1; + } + + // NOTE: recalculate, can be improved + self.cache_size() as isize - (before as isize) + + // size_delta as isize + } + + pub fn take_buffer(&mut self) -> (BTreeMap, isize) { + let (map, size_delta) = self.node.children[self.child_idx].buffer.take(); + self.node + .after_insert_size_delta(self.child_idx, -(size_delta as isize)); + (map, -(size_delta as isize)) + } +} + +impl<'a, N> TakeChildBuffer<'a, N> +where + N: StaticSize, +{ + pub(in crate::tree::imp) fn size(&self) -> usize { + (&*self.node).size() + } + + pub(in crate::tree::imp) fn prepare_merge(&mut self) -> PrepareMergeChild { + assert!(self.node.fanout() >= 2); + let (pivot_key_idx, other_child_idx) = if self.child_idx + 1 < self.node.children.len() { + (self.child_idx, self.child_idx + 1) + } else { + (self.child_idx - 1, self.child_idx - 1) + }; + + PrepareMergeChild { + node: self.node, + pivot_key_idx, + other_child_idx, + } + } + + // pub(in crate::tree::imp) fn add_size(&mut self, size_delta: isize) { + // self.node + // .after_insert_size_delta(self.child_idx, size_delta); + // } +} + +pub(in crate::tree::imp) struct PrepareMergeChild<'a, N: 'a + 'static> { + node: &'a mut CopylessInternalNode, + pivot_key_idx: usize, + other_child_idx: usize, +} + +impl<'a, N> PrepareMergeChild<'a, N> { + pub(in crate::tree::imp) fn sibling_node_pointer(&mut self) -> &mut RwLock + where + N: ObjectReference, + { + &mut self.node.children[self.other_child_idx].ptr + } + pub(in crate::tree::imp) fn is_right_sibling(&self) -> bool { + self.pivot_key_idx != self.other_child_idx + } +} + +impl<'a, N> PrepareMergeChild<'a, N> +where + N: ObjectReference + HasStoragePreference, +{ + pub(in crate::tree::imp) fn merge_children( + self, + ) -> MergeChildResult>> { + let mut right_child_links = self.node.children.remove(self.pivot_key_idx + 1); + let pivot_key = self.node.meta_data.pivot.remove(self.pivot_key_idx); + self.node 
+ .meta_data + .entries_prefs + .remove(self.pivot_key_idx + 1); + self.node + .meta_data + .entries_sizes + .remove(self.pivot_key_idx + 1); + + let left_buffer = self.node.children[self.pivot_key_idx].buffer_mut(); + let mut right_buffer = right_child_links.buffer_mut(); + + let size_delta = pivot_key.size() + + N::static_size() * 2 + + std::mem::size_of::() + + std::mem::size_of::(); + left_buffer.append(&mut right_buffer); + self.node.meta_data.entries_sizes[self.pivot_key_idx] = left_buffer.size(); + self.node.meta_data.invalidate(); + + MergeChildResult { + pivot_key, + old_np: Box::new([right_child_links.ptr.into_inner()].into_iter()), + size_delta: -(size_delta as isize), + } + } +} + +impl<'a, N> PrepareMergeChild<'a, N> +where + N: ObjectReference + HasStoragePreference, +{ + pub(in crate::tree::imp) fn rebalanced(&mut self, new_pivot_key: CowBytes) -> isize { + { + let (left, right) = self.node.children[self.pivot_key_idx..].split_at_mut(1); + // Move messages around + let (left_child, right_child) = (&mut left[0].buffer, &mut right[0].buffer); + left_child.rebalance(right_child, &new_pivot_key); + self.node.meta_data.entries_sizes[self.pivot_key_idx] = left_child.size(); + self.node.meta_data.entries_sizes[self.pivot_key_idx + 1] = left_child.size(); + } + + let mut size_delta = new_pivot_key.size() as isize; + let old_pivot_key = replace( + &mut self.node.meta_data.pivot[self.pivot_key_idx], + new_pivot_key, + ); + size_delta -= old_pivot_key.size() as isize; + + size_delta + } +} + +impl<'a, N: Size + HasStoragePreference> TakeChildBuffer<'a, N> { + pub fn child_pointer_mut(&mut self) -> &mut RwLock + where + N: ObjectReference, + { + &mut self.node.children[self.child_idx].ptr + } + + pub fn buffer_mut(&mut self) -> &mut PackedChildBuffer + where + N: ObjectReference, + { + &mut self.node.children[self.child_idx].buffer + } + + pub fn buffer(&self) -> &PackedChildBuffer + where + N: ObjectReference, + { + &self.node.children[self.child_idx].buffer + } +} + +#[cfg(test)] +pub(crate) use tests::Key as TestKey; + +#[cfg(test)] +pub(super) mod tests { + + use std::io::Write; + + use super::*; + use crate::{ + arbitrary::GenExt, + buffer::BufWrite, + checksum::{Builder, GxHash, State, XxHash}, + database::DatasetId, + }; + + use quickcheck::{Arbitrary, Gen, TestResult}; + use rand::Rng; + impl ObjectReference for () { + type ObjectPointer = (); + + fn get_unmodified(&self) -> Option<&Self::ObjectPointer> { + Some(&()) + } + + fn set_index(&mut self, _pk: PivotKey) { + // NO-OP + } + + fn index(&self) -> &PivotKey { + unsafe { + if PK.is_none() { + PK = Some(PivotKey::LeftOuter( + CowBytes::from(vec![42u8]), + DatasetId::default(), + )); + } + PK.as_ref().unwrap() + } + } + } + + // Keys are not allowed to be empty. This is usually caught at the tree layer, but these are + // bypassing that check. There's probably a good way to do this, but we can also just throw + // away the empty keys until we find one that isn't empty. 
+ #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] + pub struct Key(pub CowBytes); + impl Arbitrary for Key { + fn arbitrary(g: &mut Gen) -> Self { + loop { + let c = CowBytes::arbitrary(g); + if !c.is_empty() { + return Key(c); + } + } + } + } + + impl Clone for ChildLink { + fn clone(&self) -> Self { + Self { + buffer: self.buffer.clone(), + ptr: self.ptr.read().clone().into(), + } + } + } + + impl Clone for CopylessInternalNode { + fn clone(&self) -> Self { + CopylessInternalNode { + meta_data: InternalNodeMetaData { + level: self.meta_data.level, + pivot: self.meta_data.pivot.clone(), + system_storage_preference: self.meta_data.system_storage_preference.clone(), + pref: self.meta_data.pref.clone(), + entries_prefs: self.meta_data.entries_prefs.clone(), + entries_sizes: self.meta_data.entries_sizes.clone(), + current_size: self.meta_data.current_size, + }, + children: self.children.clone(), + } + } + } + + impl Arbitrary for CopylessInternalNode { + fn arbitrary(g: &mut Gen) -> Self { + let mut rng = g.rng(); + let pivot_key_cnt = rng.gen_range(0..100); + + let mut pivot = Vec::with_capacity(pivot_key_cnt); + for _ in 0..pivot_key_cnt { + let pivot_key = { + let k = Key::arbitrary(g); + k.0 + }; + pivot.push(pivot_key); + } + pivot.sort(); + + let mut children: Vec> = Vec::with_capacity(pivot_key_cnt + 1); + for _ in 0..pivot_key_cnt + 1 { + let buffer = PackedChildBuffer::arbitrary(g); + children.push(ChildLink { + buffer, + ptr: RwLock::new(T::arbitrary(g)), + }); + } + + let mut node = CopylessInternalNode { + meta_data: InternalNodeMetaData { + pivot, + level: 1, + system_storage_preference: AtomicSystemStoragePreference::from( + StoragePreference::NONE, + ), + pref: AtomicStoragePreference::unknown(), + entries_prefs: vec![StoragePreference::NONE; pivot_key_cnt + 1], + entries_sizes: children.iter().map(|c| c.buffer.size()).collect::>(), + current_size: 0, + }, + children, + }; + node.meta_data.invalidate(); + node + } + } + + pub fn quick_csum(bytes: &[u8]) -> crate::checksum::GxHash { + let mut builder = GxHash::builder().build(); + builder.ingest(bytes); + builder.finish() + } + + fn serialized_size(node: &CopylessInternalNode) -> usize { + let mut buf = Vec::new(); + node.pack(&mut buf, quick_csum).unwrap(); + buf.len() + } + + fn check_size(node: &CopylessInternalNode) { + assert_eq!(node.size(), serialized_size(node)) + } + + #[quickcheck] + fn actual_size(node: CopylessInternalNode<()>) { + assert_eq!(node.size(), serialized_size(&node)) + } + + #[quickcheck] + fn idx(node: CopylessInternalNode<()>, key: Key) { + let key = key.0; + let idx = node.idx(&key); + + if let Some(upper_key) = node.meta_data.pivot.get(idx) { + assert!(&key <= upper_key); + } + if idx > 0 { + let lower_key = &node.meta_data.pivot[idx - 1]; + assert!(lower_key < &key); + } + } + + static mut PK: Option = None; + + #[quickcheck] + fn size_split(mut node: CopylessInternalNode<()>) -> TestResult { + if node.fanout() < 4 { + return TestResult::discard(); + } + let size_before = node.size(); + let (right_sibling, _, size_delta, _pivot_key) = node.split(); + // assert_eq!(size_before as isize + size_delta, node.size() as isize); + + check_size(&node); + check_size(&right_sibling); + + TestResult::passed() + } + + #[quickcheck] + fn split(mut node: CopylessInternalNode<()>) -> TestResult { + if node.fanout() < 4 { + return TestResult::discard(); + } + let twin = node.clone(); + let (mut right_sibling, pivot, _size_delta, _pivot_key) = node.split(); + + assert!(*node.meta_data.pivot.last().unwrap() <= 
pivot); + assert!(*right_sibling.meta_data.pivot.first().unwrap() > pivot); + assert!(node.fanout() >= 2); + assert!(right_sibling.fanout() >= 2); + + assert!(node.children.len() == node.meta_data.pivot.len() + 1); + assert!(right_sibling.children.len() == right_sibling.meta_data.pivot.len() + 1); + assert!((node.children.len() as isize - right_sibling.children.len() as isize).abs() <= 1); + + let _size_before = node.size(); + let _size_delta = node.merge(&mut right_sibling, pivot); + let _size_after = node.size(); + // assert_eq!(size_before as isize + size_delta, size_after as isize); + assert_eq!(node.size(), twin.size()); + + TestResult::passed() + } + + #[quickcheck] + fn split_key(mut node: CopylessInternalNode<()>) -> TestResult { + if node.fanout() < 4 { + return TestResult::discard(); + } + let (right_sibling, pivot, _size_delta, pivot_key) = node.split(); + assert!(node.fanout() >= 2); + assert!(right_sibling.fanout() >= 2); + assert_eq!(LocalPivotKey::Right(pivot), pivot_key); + TestResult::passed() + } + + #[quickcheck] + fn split_and_merge(mut node: CopylessInternalNode<()>) -> TestResult { + if node.fanout() < 4 { + return TestResult::discard(); + } + + let twin = node.clone(); + let (mut right_node, pivot, ..) = node.split(); + node.merge(&mut right_node, pivot); + check_size(&node); + check_size(&twin); + assert_eq!(node.meta_data, twin.meta_data); + assert_eq!(node.children, twin.children); + TestResult::passed() + } + + #[quickcheck] + fn serialize_then_deserialize(node: CopylessInternalNode<()>) { + println!("Start"); + let mut buf = BufWrite::with_capacity(crate::vdev::Block(1)); + println!("Start Prefix"); + buf.write_all(&[0; 4]).unwrap(); + println!("Start packing"); + let csum = node.pack(&mut buf, quick_csum).unwrap(); + println!("Done packing"); + let unpacked = CopylessInternalNode::<()>::unpack(buf.into_buf(), csum).unwrap(); + println!("Done unpacking"); + assert_eq!(unpacked.meta_data, node.meta_data); + println!("Checked meta data"); + assert_eq!(unpacked.children, node.children); + println!("Checked children"); + } + + // TODO tests + // flush buffer + // get with max_msn +} diff --git a/betree/src/tree/imp/internal/mod.rs b/betree/src/tree/imp/internal/mod.rs new file mode 100644 index 000000000..1a540230d --- /dev/null +++ b/betree/src/tree/imp/internal/mod.rs @@ -0,0 +1,7 @@ +pub(super) mod copyless_internal; +pub(super) mod packed_child_buffer; +pub(super) mod serialize_nodepointer; +pub(super) mod take_child_buffer; + +pub(super) use copyless_internal::TakeChildBuffer; +pub(super) use take_child_buffer::MergeChildResult; diff --git a/betree/src/tree/imp/internal/packed_child_buffer.rs b/betree/src/tree/imp/internal/packed_child_buffer.rs new file mode 100644 index 000000000..64c78437a --- /dev/null +++ b/betree/src/tree/imp/internal/packed_child_buffer.rs @@ -0,0 +1,1185 @@ +//! Implementation of a message buffering node wrapper. 
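// The buffer lives in one of two representations, mirrored by the `Map` enum
// below: a packed, checksummed byte image that is queried in place, and an
// unpacked `BTreeMap` for mutation. Every mutating call reports how the cache
// footprint changed; a sketch of how those `WithCacheSizeChange` results are
// meant to compose (values illustrative only):
//
//     let a = WithCacheSizeChange::new((), 8);  // e.g. a message was inserted
//     let b = WithCacheSizeChange::new((), -3); // e.g. an entry was overwritten
//     assert_eq!((a + b).take().1, 5);          // deltas accumulate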
+use crate::{
+    checksum::Checksum as ChecksumTrait,
+    cow_bytes::{CowBytes, SlicedCowBytes},
+    data_management::{HasStoragePreference, IntegrityMode},
+    database::Checksum,
+    size::{Size, StaticSize},
+    storage_pool::AtomicSystemStoragePreference,
+    tree::{imp::leaf::FillUpResult, pivot_key::LocalPivotKey, KeyInfo, MessageAction},
+    AtomicStoragePreference, StoragePreference,
+};
+use std::{
+    borrow::Borrow,
+    cmp::Ordering,
+    collections::{
+        btree_map::{self, Entry},
+        BTreeMap, Bound,
+    },
+    mem::replace,
+    ops::{Add, AddAssign},
+    ptr::slice_from_raw_parts,
+};
+
+trait CutSlice<T> {
+    fn cut(&self, pos: usize, len: usize) -> &[T];
+}
+
+impl<T> CutSlice<T> for [T] {
+    fn cut(&self, pos: usize, len: usize) -> &[T] {
+        &self[pos..pos + len]
+    }
+}
+
+/// Rich return type indicating that the cache size of the called object changed.
+pub(in crate::tree) struct WithCacheSizeChange<T> {
+    inner: T,
+    size_delta: isize,
+}
+
+impl From<isize> for WithCacheSizeChange<()> {
+    fn from(value: isize) -> Self {
+        Self {
+            size_delta: value,
+            inner: (),
+        }
+    }
+}
+
+impl Add for WithCacheSizeChange<()> {
+    type Output = WithCacheSizeChange<()>;
+
+    fn add(self, rhs: Self) -> Self::Output {
+        WithCacheSizeChange {
+            size_delta: self.size_delta + rhs.size_delta,
+            ..self
+        }
+    }
+}
+
+impl<T> AddAssign for WithCacheSizeChange<T> {
+    fn add_assign(&mut self, rhs: Self) {
+        self.size_delta += rhs.size_delta
+    }
+}
+
+impl<T> WithCacheSizeChange<T> {
+    pub fn new(inner: T, size_delta: isize) -> Self {
+        Self { inner, size_delta }
+    }
+
+    pub fn map<U, F>(self, mut f: F) -> WithCacheSizeChange<U>
+    where
+        F: FnMut(T) -> U,
+    {
+        WithCacheSizeChange {
+            inner: f(self.inner),
+            size_delta: self.size_delta,
+        }
+    }
+
+    pub fn map_with_size_change<U, F>(self, mut f: F) -> WithCacheSizeChange<U>
+    where
+        F: FnMut(T) -> WithCacheSizeChange<U>,
+    {
+        let other = f(self.inner);
+        WithCacheSizeChange {
+            inner: other.inner,
+            size_delta: self.size_delta + other.size_delta,
+        }
+    }
+
+    pub fn add_size(self, delta: isize) -> WithCacheSizeChange<T> {
+        WithCacheSizeChange {
+            size_delta: self.size_delta + delta,
+            ..self
+        }
+    }
+
+    pub fn zero() -> WithCacheSizeChange<()> {
+        WithCacheSizeChange {
+            inner: (),
+            size_delta: 0,
+        }
+    }
+
+    pub fn take(self) -> (T, isize) {
+        (self.inner, self.size_delta)
+    }
+}
+
+/// A buffer for messages that belong to a child of a tree node.
+#[derive(Debug)]
+pub(in crate::tree::imp) struct PackedChildBuffer {
+    pub(in crate::tree::imp) messages_preference: AtomicStoragePreference,
+    // This preference should always be set by the parent. Needs to be on fast
+    // memory or NVMe to be worth the additional queries.
+    pub(in crate::tree::imp) system_storage_preference: AtomicSystemStoragePreference,
+    pub(in crate::tree::imp) entries_size: usize,
+    pub(in crate::tree::imp) buffer: Map,
+
+    pub(in crate::tree::imp) is_leaf: bool,
+}
+
+impl Default for PackedChildBuffer {
+    fn default() -> Self {
+        PackedChildBuffer::new(false)
+    }
+}
+
+pub const BUFFER_STATIC_SIZE: usize = HEADER;
+const IS_LEAF_HEADER: usize = 1;
+const HEADER: usize = IS_LEAF_HEADER
+    + std::mem::size_of::()
+    + std::mem::size_of::()
+    + std::mem::size_of::();
+const KEY_IDX_SIZE: usize =
+    std::mem::size_of::<u32>() + std::mem::size_of::<u32>() + std::mem::size_of::<u8>();
+const PER_KEY_BYTES: usize = 16;
+
+#[derive(Debug)]
+pub(in crate::tree::imp) enum Map {
+    Packed {
+        entry_count: usize,
+        data: SlicedCowBytes,
+    },
+    Unpacked(BTreeMap<CowBytes, (KeyInfo, SlicedCowBytes)>),
+}
+
+#[repr(C)]
+pub struct KeyIdx {
+    pos: u32,
+    len: u32,
+    pref: u8,
+}
+
+impl KeyIdx {
+    pub fn unpack(buf: &[u8; 9]) -> KeyIdx {
+        KeyIdx {
+            pos: u32::from_le_bytes(buf[0..4].try_into().unwrap()),
+            len: u32::from_le_bytes(buf[4..8].try_into().unwrap()),
+            pref: u8::from_le_bytes(buf[8..9].try_into().unwrap()),
+        }
+    }
+}
+
+impl Map {
+    /// Fetch a mutable version of the internal btree map.
+    pub(in crate::tree::imp) fn unpacked(
+        &mut self,
+    ) -> WithCacheSizeChange<&mut BTreeMap<CowBytes, (KeyInfo, SlicedCowBytes)>> {
+        match self {
+            Map::Packed { entry_count, data } => {
+                // NOTE: Copy the data out beforehand to avoid sync-epoch shenanigans;
+                // this is necessary as we might rewrite the original memory region here.
+                let mut keys: Vec<CowBytes> = Vec::with_capacity(*entry_count);
+                let mut key_info = Vec::with_capacity(*entry_count);
+                let mut values_pos: Vec<(u32, u32, Checksum)> = Vec::with_capacity(*entry_count);
+
+                // Delta relative to the current in-cache size
+                let mut size_delta: isize = -2 * std::mem::size_of::<usize>() as isize;
+
+                for idx in 0..*entry_count {
+                    size_delta += KeyInfo::static_size() as isize;
+                    let off = HEADER + idx * KEY_IDX_SIZE;
+                    let kidx = KeyIdx::unpack(data.cut(off, 9).try_into().unwrap());
+                    key_info.push(KeyInfo {
+                        storage_preference: StoragePreference::from_u8(kidx.pref),
+                    });
+                    keys.push(CowBytes::from(
+                        data.cut(kidx.pos as usize, kidx.len as usize),
+                    ));
+                    size_delta += kidx.len as isize;
+
+                    let val_pos_off = kidx.pos as usize + kidx.len as usize;
+                    let val_pos = u32::from_le_bytes(data.cut(val_pos_off, 4).try_into().unwrap());
+                    let val_len =
+                        u32::from_le_bytes(data.cut(val_pos_off + 4, 4).try_into().unwrap());
+                    let val_csum: crate::database::Checksum = bincode::deserialize(data.cut(
+                        val_pos_off + 4 + 4,
+                        crate::database::Checksum::static_size(),
+                    ))
+                    .unwrap();
+                    values_pos.push((val_pos, val_len, val_csum));
+                    size_delta += val_len as isize;
+                }
+
+                *self = Map::Unpacked(BTreeMap::from_iter(keys.into_iter().zip(
+                    key_info.into_iter().zip(values_pos.into_iter().map(
+                        move |(pos, len, csum)| {
+                            // NOTE: Copies the data so it is not invalidated by
+                            // later rewrites... could be solved differently.
+                            let buf = CowBytes::from(&data[pos as usize..(pos + len) as usize])
+                                .slice_from(0);
+                            csum.verify(&buf).unwrap();
+                            buf
+                        },
+                    )),
+                )));
+
+                WithCacheSizeChange::new(
+                    match self {
+                        Map::Unpacked(ref mut map) => map,
+                        _ => unreachable!(),
+                    },
+                    size_delta,
+                )
+            }
+            Map::Unpacked(ref mut map) => WithCacheSizeChange::new(map, 0),
+        }
+    }
+
+    /// Assert an unpacked instance.
+    fn assert_unpacked(&self) -> &BTreeMap<CowBytes, (KeyInfo, SlicedCowBytes)> {
+        match self {
+            Map::Packed { ..
} => { + panic!("Tried to assert a packed ChildBuffer instance.") + } + Map::Unpacked(ref map) => map, + } + } + + /// + fn assert_packed(&self) -> &SlicedCowBytes { + match self { + Map::Packed { data, .. } => &data, + Map::Unpacked(_) => panic!("Tried to assert an unpacked ChildBuffer instance."), + } + } + + /// True if a proper btree map has been created for this instance. + fn is_unpacked(&self) -> bool { + match self { + Map::Packed { .. } => false, + Map::Unpacked(_) => true, + } + } + + /// Returns whether there is no message in this buffer for the given `key`. + pub fn is_empty(&self, key: &[u8]) -> bool { + match self { + Map::Packed { .. } => self.find(key).is_none(), + Map::Unpacked(btree) => !btree.contains_key(key), + } + } + + /// Return the number of bytes at the start of map that is contained within + /// the general checksum of the node. + pub fn len_bytes_contained_in_checksum(&self) -> usize { + match self { + Map::Packed { entry_count, data } => { + if *entry_count < 1 { + return HEADER; + } + let off = HEADER + entry_count.saturating_sub(1) * KEY_IDX_SIZE; + let kidx = KeyIdx::unpack(data.cut(off, 9).try_into().unwrap()); + kidx.pos as usize + + kidx.len as usize + + std::mem::size_of::() + + std::mem::size_of::() + + Checksum::static_size() + } + Map::Unpacked(_) => unreachable!("cannot get the number of bytes of unpacked maps"), + } + } + + /// Return the number of elements. + pub fn len(&self) -> usize { + match self { + Map::Packed { entry_count, .. } => *entry_count, + Map::Unpacked(btree) => btree.len(), + } + } + + pub fn get(&self, key: &[u8]) -> Option<(KeyInfo, SlicedCowBytes)> { + match self { + Map::Packed { data, .. } => self.find(key).map(|(pref, pos, len, csum)| { + let buf = unsafe { SlicedCowBytes::from_raw(data.as_ptr().add(pos), len) }; + // TODO: Pass on result + csum.verify(&buf).unwrap(); + ( + KeyInfo { + storage_preference: StoragePreference::from_u8(pref), + }, + buf.slice_from(0), + ) + }), + // TODO: This should be a cheap copy (a few bytes for the pref and + // the ptrs in slicedcowbytes) but please check this again. + Map::Unpacked(btree) => btree.get(key).cloned(), + } + } + + // Return the preference and location of the value within the boxed value. 
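// Layout assumed by the binary search in `find` below: the key index entries
// are fixed-width (KEY_IDX_SIZE bytes), so the probe at `mid` can be located
// without unpacking, and each key's bytes are followed directly by its value
// slot:
//
//     [ key bytes ; KeyIdx.len ]  at offset KeyIdx.pos
//     [ value pos : u32 LE ][ value len : u32 LE ][ value checksum ]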
+ fn find(&self, key: &[u8]) -> Option<(u8, usize, usize, Checksum)> { + match self { + Map::Packed { entry_count, data } => { + // Perform binary search + let mut left = 0 as isize; + let mut right = (*entry_count as isize) - 1; + loop { + if left > right { + break; + } + let mid = (left + right) / 2 + (left + right) % 2; + let kidx = KeyIdx::unpack( + data.cut(HEADER + (KEY_IDX_SIZE * mid as usize), KEY_IDX_SIZE) + .try_into() + .unwrap(), + ); + + let k = slice_from_raw_parts( + unsafe { data.as_ptr().add(kidx.pos as usize) }, + kidx.len as usize, + ); + + match key.cmp(unsafe { &*k }) { + Ordering::Less => { + right = mid as isize - 1; + } + Ordering::Equal => { + let val_pos_off = kidx.pos as usize + kidx.len as usize; + let val_pos = + u32::from_le_bytes(data.cut(val_pos_off, 4).try_into().unwrap()) + as usize; + let val_len = u32::from_le_bytes( + data.cut(val_pos_off + 4, 4).try_into().unwrap(), + ) as usize; + let val_csum: Checksum = bincode::deserialize( + data.cut(val_pos_off + 4 + 4, Checksum::static_size()), + ) + .unwrap(); + return Some((kidx.pref, val_pos, val_len, val_csum)); + } + Ordering::Greater => { + left = mid + 1; + } + } + } + None + } + Map::Unpacked(_) => unreachable!(), + } + } +} + +impl HasStoragePreference for PackedChildBuffer { + fn current_preference(&self) -> Option { + self.messages_preference + .as_option() + // .map(|msg_pref| { + // StoragePreference::choose_faster( + // msg_pref, + // self.node_pointer.read().correct_preference(), + // ) + // }) + .map(|p| self.system_storage_preference.weak_bound(&p)) + } + + fn recalculate(&self) -> StoragePreference { + let mut pref = StoragePreference::NONE; + + for (keyinfo, _v) in self.buffer.assert_unpacked().values() { + pref.upgrade(keyinfo.storage_preference) + } + + self.messages_preference.set(pref); + + // pref can't be lower than that of child nodes + StoragePreference::choose_faster( + pref, + StoragePreference::NONE, + // self.parent_preference + // .as_option() + // .unwrap_or(StoragePreference::NONE), + ) + } + + fn system_storage_preference(&self) -> StoragePreference { + self.system_storage_preference.borrow().into() + } + + fn set_system_storage_preference(&mut self, pref: StoragePreference) { + self.system_storage_preference.set(pref) + } +} + +impl Size for PackedChildBuffer { + fn size(&self) -> usize { + HEADER + self.entries_size + } + + fn actual_size(&self) -> Option { + Some(self.size()) + } + + fn cache_size(&self) -> usize { + match &self.buffer { + Map::Packed { data, .. } => { + HEADER + std::mem::size_of::() * 2 + data.cache_size() + } + Map::Unpacked(_) => self.size(), + } + } +} + +impl PackedChildBuffer { + pub fn buffer_size(&self) -> usize { + self.entries_size + } + + /// Returns whether there is no message in this buffer for the given `key`. 
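// A packed lookup via `get` above returns the value as a slice into the packed
// image after verifying its checksum, so reading one key never materialises the
// whole map. A hypothetical caller (sketch; `consume` is a stand-in):
//
//     if let Some((info, value)) = child_link.buffer().get(b"key1") {
//         let _ = info.storage_preference; // preference travels with the value
//         consume(value);                  // borrows the packed bytes
//     }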
+ pub fn is_empty(&self, key: &[u8]) -> bool { + self.buffer.is_empty(key) + } + + pub fn get(&self, key: &[u8]) -> Option<(KeyInfo, SlicedCowBytes)> { + self.buffer.get(key) + } + + pub fn apply_with_info( + &mut self, + key: &[u8], + pref: StoragePreference, + ) -> WithCacheSizeChange> { + self.messages_preference.invalidate(); + self.buffer.unpacked().map(|tree| { + tree.get_mut(key).map(|(keyinfo, _bytes)| { + keyinfo.storage_preference = pref; + keyinfo.clone() + }) + }) + } + + pub fn unpack_data(&mut self) -> WithCacheSizeChange<()> { + self.buffer.unpacked().map(|_| ()) + } + pub fn split( + &mut self, + min_size: usize, + max_size: usize, + ) -> WithCacheSizeChange<(PackedChildBuffer, CowBytes, LocalPivotKey)> { + assert!(self.size() > max_size); + assert!(self.buffer.len() > 2); + + self.buffer.unpacked().map_with_size_change(|buffer| { + let mut right_sibling = Self::new(self.is_leaf); + assert!(right_sibling.entries_size == 0); + + let mut sibling_size = 0; + let mut sibling_pref = StoragePreference::NONE; + let mut split_key = None; + for (k, (keyinfo, v)) in buffer.iter().rev() { + sibling_size += k.len() + v.len() + PER_KEY_BYTES + keyinfo.size(); + sibling_pref.upgrade(keyinfo.storage_preference); + + if sibling_size >= min_size { + split_key = Some(k.clone()); + break; + } + } + let split_key = split_key.unwrap(); + right_sibling.buffer = Map::Unpacked(buffer.split_off(&split_key)); + self.entries_size -= sibling_size; + right_sibling.entries_size = sibling_size; + right_sibling.messages_preference.set(sibling_pref); + + // have removed many keys from self, no longer certain about own pref, mark invalid + self.messages_preference.invalidate(); + + let pivot_key = buffer.iter().next_back().unwrap().0.clone(); + + WithCacheSizeChange::new( + ( + right_sibling, + pivot_key.clone(), + LocalPivotKey::Right(pivot_key), + ), + -(sibling_size as isize), + ) + }) + } + + pub(crate) fn insert_msg_buffer( + &mut self, + msg_buffer: I, + msg_action: M, + ) -> WithCacheSizeChange<()> + where + I: IntoIterator, + M: MessageAction, + { + let mut size_delta = WithCacheSizeChange::new((), 0); + for (key, (keyinfo, msg)) in msg_buffer { + size_delta += self.insert(key, keyinfo, msg, &msg_action); + } + size_delta + } +} + +pub struct PackedBufferIterator<'a> { + buffer: &'a SlicedCowBytes, + cur: usize, + entry_count: usize, + keys: Vec, +} + +impl<'a> Iterator for PackedBufferIterator<'a> { + type Item = (&'a [u8], (KeyInfo, SlicedCowBytes)); + + fn next(&mut self) -> Option { + if self.cur >= self.entry_count { + return None; + } + + let kpos = &self.keys[self.cur]; + + let vpos_off = (kpos.pos + kpos.len) as usize; + let vpos = u32::from_le_bytes(self.buffer.cut(vpos_off, 4).try_into().unwrap()); + let vlen = u32::from_le_bytes(self.buffer.cut(vpos_off + 4, 4).try_into().unwrap()); + let val = self.buffer.clone().subslice(vpos, vlen); + self.cur += 1; + Some(( + self.buffer.cut(kpos.pos as usize, kpos.len as usize), + ( + KeyInfo { + storage_preference: StoragePreference::from_u8(kpos.pref), + }, + val, + ), + )) + } +} + +pub enum Iter<'a> { + Packed(PackedBufferIterator<'a>), + Unpacked(btree_map::Iter<'a, CowBytes, (KeyInfo, SlicedCowBytes)>), +} + +impl<'a> Iter<'a> { + fn new(cbuf: &'a PackedChildBuffer) -> Self { + match cbuf.buffer { + Map::Packed { + entry_count, + ref data, + } => Iter::Packed(PackedBufferIterator { + keys: (0..entry_count) + .map(|idx| { + KeyIdx::unpack( + data.cut(HEADER + KEY_IDX_SIZE * idx, KEY_IDX_SIZE) + .try_into() + .unwrap(), + ) + }) + .collect(), + 
buffer: data, + cur: 0, + entry_count, + }), + Map::Unpacked(ref btree) => Iter::Unpacked(btree.iter()), + } + } +} + +impl<'a> Iterator for Iter<'a> { + type Item = (&'a [u8], (KeyInfo, SlicedCowBytes)); + + fn next(&mut self) -> Option { + match self { + Iter::Packed(i) => i.next(), + Iter::Unpacked(i) => i.next().map(|(a, b)| (&a[..], b.clone())), + } + } +} + +impl PackedChildBuffer { + /// Returns an iterator over all messages. + pub fn get_all_messages( + &self, + ) -> impl Iterator + '_ { + Iter::new(self) + } + + pub fn len(&self) -> usize { + self.buffer.len() + } + + /// Takes the message buffer out this `NVMChildBuffer`, + /// leaving an empty one in its place. + pub fn take(&mut self) -> (BTreeMap, usize) { + self.messages_preference.invalidate(); + ( + std::mem::take(&mut self.buffer.unpacked().inner), + replace(&mut self.entries_size, 0), + ) + } + + pub fn append(&mut self, other: &mut Self) -> WithCacheSizeChange<()> { + self.buffer.unpacked().map_with_size_change(|buffer| { + buffer.append(&mut other.buffer.unpacked().inner); + self.entries_size += other.entries_size; + self.messages_preference + .upgrade_atomic(&other.messages_preference); + (other.entries_size as isize).into() + }) + } + + /// Splits this `PackedChildBuffer` at `pivot` so that `self` contains all + /// entries up to (and including) `pivot_key` and the returned `Self` + /// contains the other entries. + pub fn split_at(&mut self, pivot: &CowBytes) -> Self { + let (buffer, buffer_entries_size) = self.split_off(pivot); + PackedChildBuffer { + messages_preference: AtomicStoragePreference::unknown(), + buffer: Map::Unpacked(buffer), + entries_size: buffer_entries_size, + system_storage_preference: AtomicSystemStoragePreference::from(StoragePreference::NONE), + is_leaf: self.is_leaf, + } + } + + fn split_off( + &mut self, + pivot: &CowBytes, + ) -> (BTreeMap, usize) { + // `split_off` puts the split-key into the right buffer. + let mut next_key = pivot.to_vec(); + next_key.push(0); + + assert!(self.buffer.is_unpacked()); + let right_buffer = self.buffer.unpacked().inner.split_off(&next_key[..]); + self.messages_preference.invalidate(); + + let right_entry_size = right_buffer + .iter() + .map(|(key, value)| { + key.size() + value.1.size() + value.0.size() + Checksum::static_size() + }) + .sum(); + self.entries_size -= right_entry_size; + (right_buffer, right_entry_size) + } + + pub fn rebalance(&mut self, right_sibling: &mut Self, new_pivot_key: &CowBytes) { + self.append(right_sibling); + let (buffer, buffer_entries_size) = self.split_off(new_pivot_key); + right_sibling.buffer = Map::Unpacked(buffer); + right_sibling.entries_size = buffer_entries_size; + } + + pub fn rebalance_size( + &mut self, + right_sibling: &mut Self, + min_size: usize, + max_size: usize, + ) -> FillUpResult { + let cache_change = self.append(right_sibling); + if self.size() <= max_size { + FillUpResult::Merged { + size_delta: cache_change.size_delta, + } + } else { + // First size_delta is from the merge operation where we split + let split = self.split(min_size, max_size); + let (sibling, pivot_key, _) = split.inner; + *right_sibling = sibling; + FillUpResult::Rebalanced { + pivot_key, + size_delta: cache_change.size_delta + split.size_delta, + } + } + } + + /// Inserts a message to this buffer for the given `key`. 
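+    ///
+    /// A minimal usage sketch (illustrative, not a doc-test; `msg` is assumed
+    /// to be a `SlicedCowBytes` produced by the caller's message action):
+    ///
+    /// ```ignore
+    /// let mut buf = PackedChildBuffer::new(false);
+    /// let size_change = buf.insert(
+    ///     CowBytes::from(&b"key"[..]),
+    ///     KeyInfo::default(),
+    ///     msg,
+    ///     crate::tree::DefaultMessageAction,
+    /// );
+    /// ```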
+    pub fn insert<Q, M>(
+        &mut self,
+        key: Q,
+        keyinfo: KeyInfo,
+        msg: SlicedCowBytes,
+        msg_action: M,
+    ) -> WithCacheSizeChange<()>
+    where
+        Q: Borrow<[u8]> + Into<CowBytes>,
+        M: MessageAction,
+    {
+        let key = key.into();
+        let key_size = key.size();
+
+        self.messages_preference.upgrade(keyinfo.storage_preference);
+
+        // grab cache size change and drop ref
+        let size_change = self.buffer.unpacked();
+
+        match size_change.inner.entry(key.clone()) {
+            Entry::Vacant(e) => {
+                // Resolve messages when the buffer is a leaf.
+                let size_delta = if self.is_leaf {
+                    let mut data = None;
+                    msg_action.apply_to_leaf(&key, msg, &mut data);
+                    if let Some(data) = data {
+                        let size =
+                            keyinfo.size() + data.size() + key_size + Checksum::static_size();
+                        e.insert((keyinfo, data));
+                        size
+                    } else {
+                        0
+                    }
+                } else {
+                    let size = key_size + msg.size() + keyinfo.size() + Checksum::static_size();
+                    e.insert((keyinfo, msg));
+                    size
+                };
+
+                self.entries_size += size_delta;
+                // assert_eq!(self.cache_size(), old_size + size_delta);
+                size_change.map_with_size_change(|_| (size_delta as isize).into())
+            }
+            Entry::Occupied(mut e) => {
+                let lower = e.get_mut();
+                // NOTE: We move values out of the entry temporarily and replace them
+                // with a bogus value which cannot be accessed in the meantime.
+                let lower_msg = unsafe {
+                    std::mem::replace(&mut lower.1, SlicedCowBytes::from_raw(std::ptr::null(), 0))
+                };
+                let lower_size = lower_msg.size();
+
+                let (merged, merged_size) = if self.is_leaf {
+                    let mut new = Some(lower_msg);
+                    msg_action.apply_to_leaf(&key, msg, &mut new);
+                    if let Some(data) = new {
+                        let new_size = data.size();
+                        (data, new_size)
+                    } else {
+                        let data = e.remove();
+                        return size_change.map_with_size_change(|_| {
+                            (-(key_size as isize + data.1.size() as isize + PER_KEY_BYTES as isize))
+                                .into()
+                        });
+                    }
+                } else {
+                    let merged_msg = msg_action.merge(&key, msg, lower_msg);
+                    let merged_msg_size = merged_msg.size();
+                    (merged_msg, merged_msg_size)
+                };
+                e.get_mut().1 = merged;
+
+                self.entries_size += merged_size;
+                self.entries_size -= lower_size;
+                // assert_eq!(self.cache_size(), old_size + merged_size - lower_size);
+                size_change
+                    .map_with_size_change(|_| (merged_size as isize - lower_size as isize).into())
+            }
+        }
+    }
+
+    /// Constructs a new, empty buffer.
+    pub fn new(is_leaf: bool) -> Self {
+        PackedChildBuffer {
+            messages_preference: AtomicStoragePreference::known(StoragePreference::NONE),
+            buffer: Map::Unpacked(BTreeMap::new()),
+            entries_size: 0,
+            system_storage_preference: AtomicSystemStoragePreference::from(StoragePreference::NONE),
+            is_leaf,
+        }
+    }
+
+    /// This method packs entries similar to the packed leaf as the two are
+    /// quite similar in their behavior.
+    ///
+    /// The packed stream is constructed as follows (all numbers in little
+    /// endian):
+    /// - u8: is leaf
+    /// - u32: len entries
+    /// - u32: entries_size
+    /// - u8: storage pref
+    /// - [
+    ///     u32: pos key,
+    ///     u32: len key,
+    ///     u8: pref key,
+    ///   ]
+    /// - [
+    ///     bytes: key,
+    ///     u32: pos val,
+    ///     u32: len val,
+    ///     Checksum: checksum,
+    ///   ]
+    /// - [
+    ///     bytes: val,
+    ///   ]
+    pub fn pack<W, F, C>(
+        &self,
+        mut w: W,
+        csum_builder: F,
+    ) -> Result<IntegrityMode<C>, std::io::Error>
+    where
+        W: std::io::Write,
+        F: Fn(&[u8]) -> C,
+        C: ChecksumTrait,
+    {
+        if !self.buffer.is_unpacked() {
+            // Copy the contents of the buffer to the new writer without unpacking.
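+            // Assumed invariant (sketch): a still-packed buffer has not been
+            // modified since `unpack`, so its bytes can be copied through
+            // verbatim; the checksum returned below only covers the
+            // header-and-index prefix, matching `len_bytes_contained_in_checksum`.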
+            w.write_all(&self.buffer.assert_packed()[..self.size()])?;
+            return Ok(IntegrityMode::Internal {
+                len: self.buffer.len_bytes_contained_in_checksum() as u32,
+                csum: csum_builder(
+                    &self.buffer.assert_packed()
+                        [..self.buffer.len_bytes_contained_in_checksum()],
+                ),
+            });
+        }
+
+        use std::io::Write;
+        let mut tmp = vec![];
+
+        if self.is_leaf {
+            tmp.write_all(&[1])?;
+        } else {
+            tmp.write_all(&[0])?;
+        }
+        tmp.write_all(&(self.buffer.len() as u32).to_le_bytes())?;
+        tmp.write_all(&(self.entries_size as u32).to_le_bytes())?;
+        tmp.write_all(
+            &self
+                .system_storage_preference
+                .strong_bound(&StoragePreference::NONE)
+                .as_u8()
+                .to_le_bytes(),
+        )?;
+
+        let mut free_after = HEADER + self.buffer.len() * KEY_IDX_SIZE;
+        for (key, (info, _)) in self.buffer.assert_unpacked().iter() {
+            let key_len = key.len();
+            tmp.write_all(&(free_after as u32).to_le_bytes())?;
+            tmp.write_all(&(key_len as u32).to_le_bytes())?;
+            tmp.write_all(&info.storage_preference.as_u8().to_le_bytes())?;
+            free_after += key_len
+                + std::mem::size_of::<u32>()
+                + std::mem::size_of::<u32>()
+                + Checksum::static_size()
+        }
+        for (key, (_, val)) in self.buffer.assert_unpacked().iter() {
+            tmp.write_all(&key)?;
+
+            let checksum = csum_builder(val);
+            // TODO: maybe size in unpacking this
+            tmp.write_all(&(free_after as u32).to_le_bytes())?;
+            tmp.write_all(&(val.len() as u32).to_le_bytes())?;
+            bincode::serialize_into(&mut tmp, &checksum).unwrap();
+            free_after += val.len();
+        }
+        let head_csum = csum_builder(&tmp);
+        w.write_all(&tmp)?;
+        for (_, (_, val)) in self.buffer.assert_unpacked().iter() {
+            w.write_all(&val)?;
+        }
+
+        Ok(IntegrityMode::Internal {
+            csum: head_csum,
+            len: tmp.len() as u32,
+        })
+    }
+
+    pub fn unpack<C>(buf: SlicedCowBytes, csum: IntegrityMode<C>) -> Result<Self, std::io::Error>
+    where
+        C: ChecksumTrait,
+    {
+        let is_leaf = buf[0] != 0;
+        let entry_count =
+            u32::from_le_bytes(buf[IS_LEAF_HEADER..IS_LEAF_HEADER + 4].try_into().unwrap())
+                as usize;
+        let entries_size = u32::from_le_bytes(
+            buf[IS_LEAF_HEADER + 4..IS_LEAF_HEADER + 4 + 4]
+                .try_into()
+                .unwrap(),
+        ) as usize;
+        // assert!(entries_size < 8 * 1024 * 1024, "size was {}", entries_size);
+        let pref = u8::from_le_bytes(
+            buf[IS_LEAF_HEADER + 8..IS_LEAF_HEADER + 9]
+                .try_into()
+                .unwrap(),
+        );
+        let buffer = Map::Packed {
+            entry_count,
+            data: buf.clone(),
+        };
+        csum.checksum()
+            .unwrap()
+            .verify(&buf[..csum.length().unwrap() as usize])
+            .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+        Ok(Self {
+            messages_preference: AtomicStoragePreference::known(StoragePreference::from_u8(pref)),
+            system_storage_preference: AtomicSystemStoragePreference::from(
+                StoragePreference::from_u8(pref),
+            ),
+            entries_size,
+            buffer,
+            is_leaf,
+        })
+    }
+}
+
+impl PackedChildBuffer {
+    pub fn range_delete(&mut self, start: &[u8], end: Option<&[u8]>) -> WithCacheSizeChange<()> {
+        // Context: Previously we mentioned the usage of a drain filter here
+        // and linked to an existing issue about its absence from the standard
+        // library.
+        //
+        // Adding a drain filter here would make things easier from the code
+        // perspective, but with the generic predicate, we cannot utilize the
+        // nice property of the BTreeMap that data is ordered and the traversal
+        // of the tree can be nicely restricted with a proper range. Due to
+        // this I changed the T0D0 placed here to this very explanation you are
+        // reading.
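+        //
+        // Bound semantics follow the usual range convention: `start` is
+        // included, `end` (if present) is excluded. E.g. for the keys
+        // {a, b, c}, range_delete(b, Some(c)) removes only b.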
+ let mut size_delta = 0; + let range = ( + Bound::Included(start), + end.map_or(Bound::Unbounded, Bound::Excluded), + ); + let mut keys = Vec::new(); + + let buffer = self.buffer.unpacked(); + + for (key, msg) in buffer.inner.range_mut::<[u8], _>(range) { + size_delta += key.size() + msg.1.size(); + keys.push(key.clone()); + } + for key in keys.into_iter() { + buffer.inner.remove(&key); + } + self.entries_size -= size_delta; + self.messages_preference.invalidate(); + (buffer.size_delta - (size_delta as isize)).into() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + arbitrary::GenExt, + tree::{ + default_message_action::DefaultMessageActionMsg, + imp::internal::copyless_internal::tests::quick_csum, + }, + }; + use quickcheck::{Arbitrary, Gen, TestResult}; + use rand::Rng; + + impl Clone for PackedChildBuffer { + fn clone(&self) -> Self { + PackedChildBuffer { + messages_preference: self.messages_preference.clone(), + entries_size: self.entries_size, + buffer: Map::Unpacked(self.buffer.assert_unpacked().clone()), + system_storage_preference: self.system_storage_preference.clone(), + is_leaf: self.is_leaf, + } + } + } + + impl PartialEq for PackedChildBuffer { + fn eq(&self, other: &Self) -> bool { + self.entries_size == other.entries_size + && self.buffer.assert_unpacked() == other.buffer.assert_unpacked() + } + } + + impl Arbitrary for KeyInfo { + fn arbitrary(g: &mut Gen) -> Self { + KeyInfo { + storage_preference: StoragePreference::from_u8( + g.rng().gen::() % StoragePreference::SLOWEST.as_u8(), + ), + } + } + } + + impl Arbitrary for PackedChildBuffer { + fn arbitrary(g: &mut Gen) -> Self { + let mut rng = g.rng(); + let entries_cnt = rng.gen_range(0..20); + let buffer: BTreeMap = (0..entries_cnt) + .map(|_| { + ( + super::super::copyless_internal::TestKey::arbitrary(g).0, + ( + KeyInfo::arbitrary(g), + DefaultMessageActionMsg::arbitrary(g).0, + ), + ) + }) + .collect(); + PackedChildBuffer { + messages_preference: AtomicStoragePreference::unknown(), + entries_size: buffer + .iter() + .map(|(key, value)| { + key.size() + value.0.size() + value.1.size() + Checksum::static_size() + }) + .sum::(), + buffer: Map::Unpacked(buffer), + system_storage_preference: AtomicSystemStoragePreference::from( + StoragePreference::NONE, + ), + is_leaf: false, + } + } + } + + fn check_size(child_buffer: &PackedChildBuffer) { + let mut buf = Vec::new(); + child_buffer + .pack( + &mut buf, + crate::tree::imp::internal::copyless_internal::tests::quick_csum, + ) + .unwrap(); + assert_eq!(buf.len(), child_buffer.size()) + } + + #[quickcheck] + fn actual_size(child_buffer: PackedChildBuffer) { + check_size(&child_buffer) + } + + #[quickcheck] + fn size_split_at(mut child_buffer: PackedChildBuffer, pivot_key: CowBytes) { + let sbl = child_buffer.split_at(&pivot_key); + check_size(&child_buffer); + assert!(child_buffer.checked_size().is_ok()); + check_size(&sbl); + assert!(sbl.checked_size().is_ok()); + } + + #[quickcheck] + fn split_at(mut child_buffer: PackedChildBuffer, pivot_key: CowBytes) { + let sbl = child_buffer.split_at(&pivot_key); + assert!(child_buffer + .buffer + .assert_unpacked() + .last_key_value() + .map(|(k, _)| *k <= pivot_key) + .unwrap_or(true)); + assert!(sbl + .buffer + .assert_unpacked() + .first_key_value() + .map(|(k, _)| *k > pivot_key) + .unwrap_or(true)); + } + + #[quickcheck] + fn append(mut child_buffer: PackedChildBuffer) -> TestResult { + if child_buffer.buffer.len() < 4 { + return TestResult::discard(); + } + let before_size = child_buffer.size(); + let pivot 
= child_buffer + .buffer + .assert_unpacked() + .iter() + .nth(3) + .unwrap() + .0 + .clone(); + + let mut other = child_buffer.split_at(&pivot); + child_buffer.append(&mut other); + + assert_eq!(before_size, child_buffer.size()); + + TestResult::passed() + } + + #[quickcheck] + fn unpack_equality(child_buffer: PackedChildBuffer) { + let mut buf = Vec::new(); + // buf.extend_from_slice(&[0u8; NODE_ID]); + let csum = child_buffer.pack(&mut buf, quick_csum).unwrap(); + + let mut other = PackedChildBuffer::unpack(CowBytes::from(buf).into(), csum).unwrap(); + other.buffer.unpacked(); + + for (key, (info, val)) in child_buffer.buffer.assert_unpacked() { + let res = other.get(key).unwrap(); + assert_eq!((&res.0, &res.1), (info, val)); + } + } + + #[quickcheck] + fn unpackless_access(child_buffer: PackedChildBuffer) { + let mut buf = Vec::new(); + // buf.extend_from_slice(&[0u8; NODE_ID]); + let csum = child_buffer.pack(&mut buf, quick_csum).unwrap(); + + let other = PackedChildBuffer::unpack(CowBytes::from(buf).into(), csum).unwrap(); + + for (key, (info, val)) in child_buffer.buffer.assert_unpacked() { + let res = other.get(key).unwrap(); + assert_eq!((&res.0, &res.1), (info, val)); + } + } + + #[quickcheck] + fn unpackless_iter(child_buffer: PackedChildBuffer) { + let mut buf = Vec::new(); + // buf.extend_from_slice(&[0u8; NODE_ID]); + let csum = child_buffer.pack(&mut buf, quick_csum).unwrap(); + + let other = PackedChildBuffer::unpack(CowBytes::from(buf).into(), csum).unwrap(); + + for (idx, (key, tup)) in child_buffer.get_all_messages().enumerate() { + let res = other.get_all_messages().nth(idx).unwrap(); + assert_eq!((key, tup), res); + } + } + + #[quickcheck] + fn serialize_deserialize_idempotent(child_buffer: PackedChildBuffer) { + let mut buf = Vec::new(); + // buf.extend_from_slice(&[0u8; NODE_ID]); + let csum = child_buffer.pack(&mut buf, quick_csum).unwrap(); + let mut other = PackedChildBuffer::unpack(CowBytes::from(buf).into(), csum).unwrap(); + other.buffer.unpacked(); + assert_eq!(other, child_buffer); + } + + #[quickcheck] + fn insert_internal( + mut child_buffer: PackedChildBuffer, + key: CowBytes, + info: KeyInfo, + msg: CowBytes, + ) { + check_size(&child_buffer); + child_buffer.insert( + key.clone(), + info.clone(), + msg.clone().into(), + crate::tree::DefaultMessageAction, + ); + check_size(&child_buffer); + } + + #[quickcheck] + fn insert_leaf( + mut child_buffer: PackedChildBuffer, + key: CowBytes, + info: KeyInfo, + mut msg: CowBytes, + ) -> quickcheck::TestResult { + child_buffer.is_leaf = true; + if msg.len() < 3 { + return TestResult::discard(); + } + msg[0] = 1; + check_size(&child_buffer); + child_buffer.insert( + key.clone(), + info.clone(), + msg.clone().into(), + crate::tree::DefaultMessageAction, + ); + check_size(&child_buffer); + TestResult::passed() + } +} diff --git a/betree/src/tree/imp/internal/serialize_nodepointer.rs b/betree/src/tree/imp/internal/serialize_nodepointer.rs new file mode 100644 index 000000000..8fee8b84e --- /dev/null +++ b/betree/src/tree/imp/internal/serialize_nodepointer.rs @@ -0,0 +1,19 @@ +//! Serialization utilities of a node pointer type. 
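+//!
+//! Intended for use with serde's `with` attribute on `RwLock`-wrapped node
+//! pointers. A sketch (the struct and field names here are hypothetical):
+//!
+//! ```ignore
+//! #[derive(serde::Serialize, serde::Deserialize)]
+//! struct Link<N> {
+//!     #[serde(with = "super::serialize_nodepointer")]
+//!     ptr: RwLock<N>,
+//! }
+//! ```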
+use crate::tree::imp::RwLock; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +pub fn serialize(np: &RwLock, serializer: S) -> Result +where + N: Serialize, + S: Serializer, +{ + np.read().serialize(serializer) +} + +pub fn deserialize<'de, N, D>(deserializer: D) -> Result, D::Error> +where + N: Deserialize<'de>, + D: Deserializer<'de>, +{ + N::deserialize(deserializer).map(RwLock::new) +} diff --git a/betree/src/tree/imp/internal/take_child_buffer.rs b/betree/src/tree/imp/internal/take_child_buffer.rs new file mode 100644 index 000000000..6a9a41deb --- /dev/null +++ b/betree/src/tree/imp/internal/take_child_buffer.rs @@ -0,0 +1,7 @@ +use crate::cow_bytes::CowBytes; + +pub(in crate::tree::imp) struct MergeChildResult { + pub(in crate::tree::imp) pivot_key: CowBytes, + pub(in crate::tree::imp) old_np: NP, + pub(in crate::tree::imp) size_delta: isize, +} diff --git a/betree/src/tree/imp/leaf.rs b/betree/src/tree/imp/leaf.rs deleted file mode 100644 index e003a3cb7..000000000 --- a/betree/src/tree/imp/leaf.rs +++ /dev/null @@ -1,521 +0,0 @@ -//! Implementation of the [LeafNode] node type. -use crate::{ - cow_bytes::{CowBytes, SlicedCowBytes}, - data_management::HasStoragePreference, - size::Size, - storage_pool::AtomicSystemStoragePreference, - tree::{imp::packed, pivot_key::LocalPivotKey, KeyInfo, MessageAction}, - AtomicStoragePreference, StoragePreference, -}; -use std::{borrow::Borrow, collections::BTreeMap, iter::FromIterator}; - -/// A leaf node of the tree holds pairs of keys values which are plain data. -#[derive(Debug, Clone)] -#[cfg_attr(test, derive(PartialEq))] -pub(super) struct LeafNode { - storage_preference: AtomicStoragePreference, - /// A storage preference assigned by the Migration Policy - system_storage_preference: AtomicSystemStoragePreference, - entries_size: usize, - entries: BTreeMap, -} - -/// Case-dependent outcome of a rebalance operation. -#[derive(Debug)] -pub(super) enum FillUpResult { - Rebalanced { - pivot_key: CowBytes, - size_delta: isize, - }, - Merged { - size_delta: isize, - }, -} - -impl Size for LeafNode { - fn size(&self) -> usize { - packed::HEADER_FIXED_LEN + self.entries_size - } - - fn actual_size(&self) -> Option { - Some( - packed::HEADER_FIXED_LEN - + self - .entries - .iter() - .map(|(key, (_keyinfo, value))| packed::ENTRY_LEN + key.len() + value.len()) - .sum::(), - ) - } -} - -impl HasStoragePreference for LeafNode { - fn current_preference(&self) -> Option { - self.storage_preference - .as_option() - .map(|pref| self.system_storage_preference.weak_bound(&pref)) - } - - fn recalculate(&self) -> StoragePreference { - let mut pref = StoragePreference::NONE; - - for (keyinfo, _v) in self.entries.values() { - pref.upgrade(keyinfo.storage_preference); - } - - self.storage_preference.set(pref); - self.system_storage_preference.weak_bound(&pref) - } - - fn system_storage_preference(&self) -> StoragePreference { - self.system_storage_preference.borrow().into() - } - - fn set_system_storage_preference(&mut self, pref: StoragePreference) { - self.system_storage_preference.set(pref) - } -} - -impl<'a> FromIterator<(&'a [u8], (KeyInfo, SlicedCowBytes))> for LeafNode { - fn from_iter(iter: T) -> Self - where - T: IntoIterator, - { - let mut storage_pref = StoragePreference::NONE; - let mut entries_size = 0; - - let mut entries = BTreeMap::new(); - let mut needs_second_pass = false; - - for (key, (keyinfo, value)) in iter.into_iter() { - // pref of overall node is highest pref from keys. 
- // We're already looking at every entry here, so finding the overall pref here - // avoids a full scan later. - storage_pref.upgrade(keyinfo.storage_preference); - entries_size += packed::ENTRY_LEN + key.len() + value.len(); - - let curr_storage_pref = keyinfo.storage_preference; - if let Some((ckeyinfo, cvalue)) = entries.insert(CowBytes::from(key), (keyinfo, value)) - { - // iterator has collisions, try to compensate - // - // this entry will no longer be part of the final map, subtract its size - entries_size -= packed::ENTRY_LEN + key.len() + cvalue.len(); - - // In case the old value increased the overall storage priority (faster), and the new - // value wouldn't have increased it as much, we might need to recalculate the - // proper preference in a second pass. - if ckeyinfo.storage_preference != curr_storage_pref { - needs_second_pass = true; - } - } - } - - if needs_second_pass { - storage_pref = StoragePreference::NONE; - for (keyinfo, _value) in entries.values() { - storage_pref.upgrade(keyinfo.storage_preference); - } - } - - LeafNode { - storage_preference: AtomicStoragePreference::known(storage_pref), - system_storage_preference: AtomicSystemStoragePreference::from(StoragePreference::NONE), - entries_size, - entries, - } - } -} - -impl LeafNode { - /// Constructs a new, empty `LeafNode`. - pub fn new() -> Self { - LeafNode { - storage_preference: AtomicStoragePreference::known(StoragePreference::NONE), - system_storage_preference: AtomicSystemStoragePreference::from(StoragePreference::NONE), - entries_size: 0, - entries: BTreeMap::new(), - } - } - - /// Returns the value for the given key. - pub fn get(&self, key: &[u8]) -> Option { - self.entries.get(key).map(|(_info, data)| data).cloned() - } - - pub(in crate::tree) fn get_with_info(&self, key: &[u8]) -> Option<(KeyInfo, SlicedCowBytes)> { - self.entries.get(key).cloned() - } - - pub(in crate::tree) fn entries(&self) -> &BTreeMap { - &self.entries - } - - pub(in crate::tree) fn entry_info(&mut self, key: &[u8]) -> Option<&mut KeyInfo> { - self.entries.get_mut(key).map(|e| &mut e.0) - } - - /// Split the node and transfer entries to a given other node `right_sibling`. - /// Use entries which are, when summed up in-order, above the `min_size` limit. - /// Returns new pivot key and size delta to the left sibling. 
- fn do_split_off( - &mut self, - right_sibling: &mut Self, - min_size: usize, - max_size: usize, - ) -> (CowBytes, isize) { - debug_assert!(self.size() > max_size); - debug_assert!(right_sibling.entries_size == 0); - - let mut sibling_size = 0; - let mut sibling_pref = StoragePreference::NONE; - let mut split_key = None; - for (k, (keyinfo, v)) in self.entries.iter().rev() { - sibling_size += packed::ENTRY_LEN + k.len() + v.len(); - sibling_pref.upgrade(keyinfo.storage_preference); - - if packed::HEADER_FIXED_LEN + sibling_size >= min_size { - split_key = Some(k.clone()); - break; - } - } - let split_key = split_key.unwrap(); - - right_sibling.entries = self.entries.split_off(&split_key); - self.entries_size -= sibling_size; - right_sibling.entries_size = sibling_size; - right_sibling.storage_preference.set(sibling_pref); - - // have removed many keys from self, no longer certain about own pref, mark invalid - self.storage_preference.invalidate(); - - let size_delta = -(sibling_size as isize); - - let pivot_key = self.entries.keys().next_back().cloned().unwrap(); - (pivot_key, size_delta) - } - - pub fn apply(&mut self, key: K, pref: StoragePreference) -> Option - where - K: Borrow<[u8]>, - { - self.storage_preference.invalidate(); - self.entries.get_mut(key.borrow()).map(|entry| { - entry.0.storage_preference = pref; - entry.0.clone() - }) - } - - /// Inserts a new message as leaf entry. - pub fn insert( - &mut self, - key: Q, - keyinfo: KeyInfo, - msg: SlicedCowBytes, - msg_action: M, - ) -> isize - where - Q: Borrow<[u8]> + Into, - M: MessageAction, - { - let size_before = self.entries_size as isize; - let key_size = key.borrow().len(); - let mut data = self.get(key.borrow()); - msg_action.apply_to_leaf(key.borrow(), msg, &mut data); - - if let Some(data) = data { - // Value was added or preserved by msg - self.entries_size += data.len(); - self.storage_preference.upgrade(keyinfo.storage_preference); - - if let Some((old_info, old_data)) = - self.entries.insert(key.into(), (keyinfo.clone(), data)) - { - // There was a previous value in entries, which was now replaced - self.entries_size -= old_data.len(); - - // if previous entry was stricter than new entry, invalidate - if old_info.storage_preference < keyinfo.storage_preference { - self.storage_preference.invalidate(); - } - } else { - // There was no previous value in entries - self.entries_size += packed::ENTRY_LEN; - self.entries_size += key_size; - } - } else if let Some((old_info, old_data)) = self.entries.remove(key.borrow()) { - // The value was removed by msg, this may be a downgrade opportunity. - // The preference of the removed entry can't be stricter than the current node - // preference, by invariant. That leaves "less strict" and "as strict" as the - // node preference: - // - // - less strict: - // If the preference of the removed entry is less strict than the current - // node preference, there must be another entry which is preventing a downgrade. - // - as strict: - // The removed entry _may_ have caused the original upgrade to this preference, - // we'll have to trigger a scan to find out. - if self.storage_preference.as_option() == Some(old_info.storage_preference) { - self.storage_preference.invalidate(); - } - - self.entries_size -= packed::ENTRY_LEN; - self.entries_size -= key_size; - self.entries_size -= old_data.len(); - } - self.entries_size as isize - size_before - } - - /// Inserts messages as leaf entries. 
- pub fn insert_msg_buffer(&mut self, msg_buffer: I, msg_action: M) -> isize - where - M: MessageAction, - I: IntoIterator, - { - let mut size_delta = 0; - for (key, (keyinfo, msg)) in msg_buffer { - size_delta += self.insert(key, keyinfo, msg, &msg_action); - } - size_delta - } - - /// Splits this `LeafNode` into to two leaf nodes. - /// Returns a new right sibling, the corresponding pivot key, and the size - /// delta of this node. - pub fn split( - &mut self, - min_size: usize, - max_size: usize, - ) -> (Self, CowBytes, isize, LocalPivotKey) { - // assert!(self.size() > S::MAX); - let mut right_sibling = LeafNode { - // During a split, preference can't be inherited because the new subset of entries - // might be a subset with a lower maximal preference. - storage_preference: AtomicStoragePreference::known(StoragePreference::NONE), - system_storage_preference: AtomicSystemStoragePreference::from(StoragePreference::NONE), - entries_size: 0, - entries: BTreeMap::new(), - }; - - // This adjusts sibling's size and pref according to its new entries - let (pivot_key, size_delta) = self.do_split_off(&mut right_sibling, min_size, max_size); - - ( - right_sibling, - pivot_key.clone(), - size_delta, - LocalPivotKey::Right(pivot_key), - ) - } - - /// Merge all entries from the *right* node into the *left* node. Returns - /// the size change, positive for the left node, negative for the right - /// node. - pub fn merge(&mut self, right_sibling: &mut Self) -> isize { - self.entries.append(&mut right_sibling.entries); - let size_delta = right_sibling.entries_size; - self.entries_size += right_sibling.entries_size; - - self.storage_preference - .upgrade_atomic(&right_sibling.storage_preference); - - // right_sibling is now empty, reset to defaults - right_sibling.entries_size = 0; - right_sibling - .storage_preference - .set(StoragePreference::NONE); - - size_delta as isize - } - - /// Rebalances `self` and `right_sibling`. Returns `Merged` - /// if all entries of `right_sibling` have been merged into this node. - /// Otherwise, returns a new pivot key. 
- pub fn rebalance( - &mut self, - right_sibling: &mut Self, - min_size: usize, - max_size: usize, - ) -> FillUpResult { - let size_delta = self.merge(right_sibling); - if self.size() <= max_size { - FillUpResult::Merged { size_delta } - } else { - // First size_delta is from the merge operation where we split - let (pivot_key, split_size_delta) = - self.do_split_off(right_sibling, min_size, max_size); - FillUpResult::Rebalanced { - pivot_key, - size_delta: size_delta + split_size_delta, - } - } - } - - /*pub fn range_delete(&mut self, start: &[u8], end: Option<&[u8]>) -> usize { - // https://github.com/rust-lang/rust/issues/42849 - let size_before = self.entries_size; - let range = ( - Bound::Included(start), - end.map_or(Bound::Unbounded, Bound::Excluded), - ); - let mut keys = Vec::new(); - for (key, (_keyinfo, value)) in self.entries.range_mut::<[u8], _>(range) { - self.entries_size -= key.len() + value.len(); - keys.push(key.clone()); - } - for key in keys { - self.entries.remove(&key); - } - size_before - self.entries_size - }*/ -} - -#[cfg(test)] -mod tests { - use super::{CowBytes, LeafNode, Size}; - use crate::{ - arbitrary::GenExt, - data_management::HasStoragePreference, - tree::{ - default_message_action::{DefaultMessageAction, DefaultMessageActionMsg}, - imp::packed::PackedMap, - KeyInfo, - }, - StoragePreference, - }; - use quickcheck::{Arbitrary, Gen, TestResult}; - use rand::Rng; - - impl Arbitrary for KeyInfo { - fn arbitrary(g: &mut Gen) -> Self { - let sp = g.rng().gen_range(0..=3); - KeyInfo { - storage_preference: StoragePreference::from_u8(sp), - } - } - } - - impl Arbitrary for LeafNode { - fn arbitrary(g: &mut Gen) -> Self { - let len = g.rng().gen_range(0..20); - let entries: Vec<_> = (0..len) - .map(|_| { - ( - CowBytes::arbitrary(g), - DefaultMessageActionMsg::arbitrary(g), - ) - }) - .map(|(k, v)| (k, v.0)) - .collect(); - - let node: LeafNode = entries - .iter() - .map(|(k, v)| (&k[..], (KeyInfo::arbitrary(g), v.clone()))) - .collect(); - node.recalculate(); - node - } - - fn shrink(&self) -> Box> { - let v: Vec<_> = self - .entries - .clone() - .into_iter() - .map(|(k, (info, v))| (k, (info, CowBytes::from(v.to_vec())))) - .collect(); - Box::new(v.shrink().map(|entries| { - entries - .iter() - .map(|(k, (info, v))| (&k[..], (info.clone(), v.clone().into()))) - .collect() - })) - } - } - - fn serialized_size(leaf_node: &LeafNode) -> usize { - let mut data = Vec::new(); - PackedMap::pack(leaf_node, &mut data).unwrap(); - data.len() - } - - #[quickcheck] - fn check_actual_size(leaf_node: LeafNode) { - assert_eq!(leaf_node.actual_size(), Some(serialized_size(&leaf_node))); - } - - #[quickcheck] - fn check_serialize_size(leaf_node: LeafNode) { - let size = leaf_node.size(); - let serialized = serialized_size(&leaf_node); - if size != serialized { - eprintln!( - "leaf {:?}, size {}, actual_size {:?}, serialized_size {}", - leaf_node, - size, - leaf_node.actual_size(), - serialized - ); - assert_eq!(size, serialized); - } - } - - #[quickcheck] - fn check_serialization(leaf_node: LeafNode) { - let mut data = Vec::new(); - PackedMap::pack(&leaf_node, &mut data).unwrap(); - let twin = PackedMap::new(data).unpack_leaf(); - - assert_eq!(leaf_node, twin); - } - - #[quickcheck] - fn check_size_insert( - mut leaf_node: LeafNode, - key: CowBytes, - key_info: KeyInfo, - msg: DefaultMessageActionMsg, - ) { - let size_before = leaf_node.size(); - let size_delta = leaf_node.insert(key, key_info, msg.0, DefaultMessageAction); - let size_after = leaf_node.size(); - 
assert_eq!((size_before as isize + size_delta) as usize, size_after); - assert_eq!({ serialized_size(&leaf_node) }, size_after); - } - - const MIN_LEAF_SIZE: usize = 512; - const MAX_LEAF_SIZE: usize = 2048; - - #[quickcheck] - fn check_size_split(mut leaf_node: LeafNode) -> TestResult { - let size_before = leaf_node.size(); - - if size_before <= MAX_LEAF_SIZE { - return TestResult::discard(); - } - - let (sibling, _, size_delta, _pivot_key) = leaf_node.split(MIN_LEAF_SIZE, MAX_LEAF_SIZE); - assert_eq!({ serialized_size(&leaf_node) }, leaf_node.size()); - assert_eq!({ serialized_size(&sibling) }, sibling.size()); - assert_eq!( - (size_before as isize + size_delta) as usize, - leaf_node.size() - ); - assert!(sibling.size() <= MAX_LEAF_SIZE); - assert!(sibling.size() >= MIN_LEAF_SIZE); - assert!(leaf_node.size() >= MIN_LEAF_SIZE); - TestResult::passed() - } - - #[quickcheck] - fn check_split_merge_idempotent(mut leaf_node: LeafNode) -> TestResult { - if leaf_node.size() <= MAX_LEAF_SIZE { - return TestResult::discard(); - } - let this = leaf_node.clone(); - let (mut sibling, ..) = leaf_node.split(MIN_LEAF_SIZE, MAX_LEAF_SIZE); - leaf_node.recalculate(); - leaf_node.merge(&mut sibling); - assert_eq!(this, leaf_node); - TestResult::passed() - } -} diff --git a/betree/src/tree/imp/leaf/mod.rs b/betree/src/tree/imp/leaf/mod.rs new file mode 100644 index 000000000..070696f8e --- /dev/null +++ b/betree/src/tree/imp/leaf/mod.rs @@ -0,0 +1,15 @@ +//! Various impl of a "leaf" type node. + +use crate::cow_bytes::CowBytes; + +/// Case-dependent outcome of a rebalance operation. +#[derive(Debug)] +pub(super) enum FillUpResult { + Rebalanced { + pivot_key: CowBytes, + size_delta: isize, + }, + Merged { + size_delta: isize, + }, +} diff --git a/betree/src/tree/imp/mod.rs b/betree/src/tree/imp/mod.rs index 31c98d546..030c4c525 100644 --- a/betree/src/tree/imp/mod.rs +++ b/betree/src/tree/imp/mod.rs @@ -1,12 +1,12 @@ //! Implementation of tree structures. use self::{ - derivate_ref::DerivateRef, + derivate_ref::DerivateRefNVM, node::{ApplyResult, GetResult, PivotGetMutResult, PivotGetResult}, }; use super::{ errors::*, layer::{ErasedTreeSync, TreeLayer}, - PivotKey, + PivotKey, StorageKind, }; use crate::{ cache::AddSize, @@ -15,6 +15,7 @@ use crate::{ database::DatasetId, range_validation::is_inclusive_non_empty, size::StaticSize, + storage_pool::{StoragePoolLayer, NUM_STORAGE_CLASSES}, tree::MessageAction, StoragePreference, }; @@ -23,13 +24,34 @@ use owning_ref::OwningRef; use parking_lot::{RwLock, RwLockWriteGuard}; use std::{borrow::Borrow, marker::PhantomData, mem, ops::RangeBounds}; +use internal::TakeChildBuffer; + /// Additional information for a single entry. Concerns meta information like /// the desired storage level of a key. 
-#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[derive( + Debug, + Clone, + PartialEq, + Eq, + serde::Serialize, + serde::Deserialize, + rkyv::Archive, + rkyv::Serialize, + rkyv::Deserialize, +)] +#[archive(check_bytes)] pub struct KeyInfo { storage_preference: StoragePreference, } +impl Default for KeyInfo { + fn default() -> Self { + Self { + storage_preference: StoragePreference::NONE, + } + } +} + impl StaticSize for KeyInfo { fn static_size() -> usize { mem::size_of::() @@ -51,11 +73,8 @@ impl KeyInfo { } } -pub(super) const MAX_INTERNAL_NODE_SIZE: usize = 4 * 1024 * 1024; const MIN_FLUSH_SIZE: usize = 256 * 1024; -const MIN_FANOUT: usize = 4; -const MIN_LEAF_NODE_SIZE: usize = 1024 * 1024; -const MAX_LEAF_NODE_SIZE: usize = MAX_INTERNAL_NODE_SIZE; +const MIN_FANOUT: usize = 2; pub(crate) const MAX_MESSAGE_SIZE: usize = 512 * 1024; /// The actual tree type. @@ -65,6 +84,23 @@ pub struct Tree>> { evict: bool, marker: PhantomData, storage_preference: StoragePreference, + /// A 1-to-1 map of each storage class to the desired data representation. + storage_map: StorageMap, +} + +#[derive(Clone, Debug)] +pub(crate) struct StorageMap { + map: [StorageKind; NUM_STORAGE_CLASSES], + default: StorageKind, +} + +impl StorageMap { + pub fn get(&self, pref: StoragePreference) -> StorageKind { + self.map + .get(pref.as_u8() as usize) + .cloned() + .unwrap_or(self.default) + } } impl>> Clone for Tree { @@ -75,6 +111,7 @@ impl>> Clone for Tre evict: self.evict, marker: PhantomData, storage_preference: self.storage_preference, + storage_map: self.storage_map.clone(), } } } @@ -154,6 +191,10 @@ where ) -> Self { Tree { inner: I::from(Inner::new(tree_id, root_node, msg_action)), + storage_map: StorageMap { + map: dml.spl().storage_kind_map(), + default: dml.spl().storage_kind_map()[dml.spl().default_storage_class() as usize], + }, dml, evict: true, marker: PhantomData, @@ -183,6 +224,10 @@ where ) -> Self { Tree { inner, + storage_map: StorageMap { + map: dml.spl().storage_kind_map(), + default: dml.spl().storage_kind_map()[dml.spl().default_storage_class() as usize], + }, dml, evict, marker: PhantomData, @@ -247,7 +292,6 @@ where &self, pivot: &PivotKey, ) -> Result, Error> { - let pivot = pivot.borrow(); let mut node = self.get_root_node()?; Ok(loop { let next_node = match node.pivot_get(pivot) { @@ -264,7 +308,6 @@ where &self, pivot: &PivotKey, ) -> Result, Error> { - let pivot = pivot.borrow(); let mut node = self.get_mut_root_node()?; Ok(loop { let next_node = match node.pivot_get_mut(pivot) { @@ -384,15 +427,25 @@ where }; node = next_node; }; - match data { - None => Ok(None), + None => { + let mut tmp = None; + let mut info = None; + for (keyinfo, msg) in msgs.into_iter().rev() { + info = Some(keyinfo); + self.msg_action().apply(key, &msg, &mut tmp); + } + drop(node); + if self.evict { + self.dml.evict()?; + } + Ok(tmp.map(|data| (info.unwrap(), data))) + } Some((info, data)) => { let mut tmp = Some(data); for (_keyinfo, msg) in msgs.into_iter().rev() { self.msg_action().apply(key, &msg, &mut tmp); } - drop(node); if self.evict { self.dml.evict()?; @@ -417,6 +470,11 @@ where let next_node = match node.apply_with_info(key, pref) { ApplyResult::NextNode(np) => self.get_mut_node_mut(np)?, ApplyResult::Leaf(info) => break info, + // ApplyResult::NextNode { child, buffer } => { + // let mut buffer = self.get_mut_node_mut(buffer)?; + // buffer.apply_with_info(key, pref); + // self.get_mut_node_mut(child)? 
+ // } }; node = next_node; }); @@ -448,6 +506,11 @@ where .map(|res| res.map(|(_info, data)| data)) } + // NOTE: Our specific type actually implements a somewhat optimized variant + // of the usual b-epsilon tree insertion, we iterate as far down as we can + // on "Modified" nodes which do not contain the modified key already. This way we ensure that: + // 1. Recombination of messages are minimized. + // 2. Expensive flush operations are delayed. (Structure changes) fn insert( &self, key: K, @@ -464,17 +527,22 @@ where let mut node = { let mut node = self.get_mut_root_node()?; loop { - match DerivateRef::try_new(node, |node| node.try_walk(key.borrow())) { + if self.storage_map.node_is_too_large(&mut node) { + break node; + } + match DerivateRefNVM::try_new(node, |node| node.try_walk(key.borrow())) { Ok(mut child_buffer) => { - if let Some(child) = self.try_get_mut_node(child_buffer.node_pointer_mut()) - { + let maybe_child = self.try_get_mut_node(child_buffer.child_pointer_mut()); + if let Some(child) = maybe_child { node = child; parent = Some(child_buffer); } else { break child_buffer.into_owner(); } } - Err(node) => break node, + Err(node) => { + break node; + } }; } }; @@ -493,11 +561,10 @@ where self.rebalance_tree(node, parent)?; // All non-root trees will start the eviction process. - // TODO: Is the eviction on root trees harmful? Evictions started by - // other trees will evict root nodes anyway. if self.evict { self.dml.evict()?; } + Ok(()) } @@ -550,17 +617,15 @@ where } } -mod child_buffer; mod derivate_ref; mod flush; mod internal; mod leaf; mod node; -mod packed; mod range; mod split; -pub use self::{ - node::{Node, NodeInfo}, - range::RangeIterator, -}; +#[cfg(feature = "internal-api")] +pub use self::node::NodeInfo; + +pub use self::{node::Node, range::RangeIterator}; diff --git a/betree/src/tree/imp/node.rs b/betree/src/tree/imp/node.rs index f22c82790..58310a2c5 100644 --- a/betree/src/tree/imp/node.rs +++ b/betree/src/tree/imp/node.rs @@ -1,23 +1,25 @@ //! Implementation of the generic node wrapper. use self::Inner::*; use super::{ - child_buffer::ChildBuffer, - internal::{InternalNode, TakeChildBuffer}, - leaf::LeafNode, - packed::PackedMap, - FillUpResult, KeyInfo, PivotKey, MAX_INTERNAL_NODE_SIZE, MAX_LEAF_NODE_SIZE, MIN_FANOUT, - MIN_FLUSH_SIZE, MIN_LEAF_NODE_SIZE, + internal::{ + copyless_internal::{ChildLink, CopylessInternalNode, InternalNodeLink}, + packed_child_buffer::PackedChildBuffer, + TakeChildBuffer, + }, + FillUpResult, KeyInfo, PivotKey, StorageMap, MIN_FANOUT, MIN_FLUSH_SIZE, }; use crate::{ + buffer::{self, Buf}, + checksum::{Builder, Checksum}, cow_bytes::{CowBytes, SlicedCowBytes}, - data_management::{Dml, HasStoragePreference, Object, ObjectReference}, + data_management::{ + Dml, HasStoragePreference, IntegrityMode, Object, ObjectReference, PreparePack, + }, database::DatasetId, size::{Size, SizeMut, StaticSize}, - storage_pool::DiskOffset, - tree::{pivot_key::LocalPivotKey, MessageAction}, + tree::{pivot_key::LocalPivotKey, MessageAction, StorageKind}, StoragePreference, }; -use bincode::{deserialize, serialize_into}; use parking_lot::RwLock; use std::{ borrow::Borrow, @@ -32,36 +34,126 @@ pub struct Node(Inner); #[derive(Debug)] pub(super) enum Inner { - PackedLeaf(PackedMap), - Leaf(LeafNode), - Internal(InternalNode>), + MemLeaf(PackedChildBuffer), + CopylessInternal(CopylessInternalNode), +} + +macro_rules! kib { + ($n:expr) => { + $n * 1024 + }; +} + +macro_rules! 
mib { + ($n:expr) => { + $n * 1024 * 1024 + }; +} + +// NOTE: This section is the main description of the properties of the chosen tree nodes. +// +// Essentially a mapping from node type and storage kind to min or max size is +// created. To be noted here is that the current representation of the leaf can +// change before it is actually written to the desired storage kind. So a block +// leaf might be changed to a memory leaf when written to memory. +impl StorageMap { + pub fn node_is_too_large(&self, node: &Node) -> bool { + self.max_size(node) + .map(|max_size| node.inner_size() > max_size || node.has_too_high_fanout(max_size)) + .unwrap_or(false) + } + + pub fn leaf_is_too_large( + &self, + node: &mut Node, + ) -> bool { + node.is_leaf() && self.node_is_too_large(node) + } + + pub fn leaf_is_too_small(&self, node: &Node) -> bool { + node.is_leaf() + && self + .min_size(node) + .map(|min_size| node.inner_size() < min_size) + .unwrap_or(false) + } + + pub fn min_size(&self, node: &Node) -> Option { + let pref = node.correct_preference(); + Some(match (&node.0, self.get(pref)) { + (CopylessInternal(_), _) => return None, + (_, StorageKind::Hdd) => mib!(1), + (_, StorageKind::Ssd) => kib!(512), + (_, StorageKind::Memory) => kib!(512), + }) + } + + pub fn max_size(&self, node: &Node) -> Option { + let pref = node.correct_preference(); + Some(match (&node.0, self.get(pref)) { + (_, StorageKind::Hdd) => mib!(4), + (_, StorageKind::Ssd) => mib!(2), + (_, StorageKind::Memory) => mib!(2), + }) + } +} + +trait ChildBufferIteratorTrait<'a, N> { + fn cb_iter_mut(&'a mut self) -> Box + 'a>; + fn cb_iter_ref(&'a self) -> Box + 'a>; + fn cb_iter(self) -> Box + 'a>; +} + +impl<'a> ChildBufferIteratorTrait<'a, Option> + for Vec> +{ + fn cb_iter_mut( + &'a mut self, + ) -> Box> + 'a> { + Box::new(self.iter_mut()) + } + + fn cb_iter_ref(&'a self) -> Box> + 'a> { + Box::new(self.iter()) + } + + fn cb_iter(self) -> Box> + 'a> { + Box::new(self.into_iter()) + } +} + +pub(super) enum ChildrenObjects<'a, N> { + ChildBuffer(Box + 'a>), + NVMChildBuffer(Box> + 'a>), +} + +#[derive(Debug)] +enum NodeInnerType { + CopylessLeaf = 1, + CopylessInternal, } +pub(super) const NODE_PREFIX_LEN: usize = std::mem::size_of::(); + impl HasStoragePreference for Node { fn current_preference(&self) -> Option { match self.0 { - PackedLeaf(_) => None, - Leaf(ref leaf) => leaf.current_preference(), - Internal(ref internal) => internal.current_preference(), + MemLeaf(ref nvmleaf) => nvmleaf.current_preference(), + CopylessInternal(ref nvminternal) => nvminternal.current_preference(), } } fn recalculate(&self) -> StoragePreference { match self.0 { - PackedLeaf(_) => { - unreachable!("packed leaves are never written back, have no preference") - } - Leaf(ref leaf) => leaf.recalculate(), - Internal(ref internal) => internal.recalculate(), + MemLeaf(ref nvmleaf) => nvmleaf.recalculate(), + CopylessInternal(ref nvminternal) => nvminternal.recalculate(), } } fn system_storage_preference(&self) -> StoragePreference { match self.0 { - // A packed leaf does not have a storage preference - PackedLeaf(_) => unreachable!("packed leaf preference cannot be determined"), - Leaf(ref leaf) => leaf.system_storage_preference(), - Internal(ref int) => int.system_storage_preference(), + MemLeaf(ref nvmleaf) => nvmleaf.system_storage_preference(), + CopylessInternal(ref nvminternal) => nvminternal.system_storage_preference(), } } @@ -71,46 +163,64 @@ impl HasStoragePreference for Node { // waiting for the next read operation for this leaf. 
self.ensure_unpacked(); match self.0 { - PackedLeaf(_) => unreachable!("packed leaves cannot have their preference updated"), - Leaf(ref mut leaf) => leaf.set_system_storage_preference(pref), - Internal(ref mut int) => int.set_system_storage_preference(pref), + MemLeaf(ref mut nvmleaf) => nvmleaf.set_system_storage_preference(pref), + CopylessInternal(ref mut nvminternal) => { + nvminternal.set_system_storage_preference(pref) + } } } } -impl Object for Node { - fn pack(&self, mut writer: W) -> Result<(), io::Error> { +impl Object for Node { + fn pack C, C: Checksum>( + &self, + mut writer: W, + _: PreparePack, + csum_builder: F, + ) -> Result, io::Error> { match self.0 { - PackedLeaf(ref map) => writer.write_all(map.inner()), - Leaf(ref leaf) => PackedMap::pack(leaf, writer), - Internal(ref internal) => { - writer.write_all(&[0xFFu8, 0xFF, 0xFF, 0xFF] as &[u8])?; - serialize_into(writer, internal) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) + MemLeaf(ref leaf) => { + writer.write_all((NodeInnerType::CopylessLeaf as u32).to_be_bytes().as_ref())?; + leaf.pack(writer, csum_builder) + } + CopylessInternal(ref cpl_internal) => { + writer.write_all( + (NodeInnerType::CopylessInternal as u32) + .to_be_bytes() + .as_ref(), + )?; + cpl_internal.pack(writer, csum_builder) } } } - fn unpack_at(_offset: DiskOffset, d_id: DatasetId, data: Box<[u8]>) -> Result { - if data[..4] == [0xFFu8, 0xFF, 0xFF, 0xFF] { - match deserialize::>(&data[4..]) { - Ok(internal) => Ok(Node(Internal(internal.complete_object_refs(d_id)))), - Err(e) => Err(io::Error::new(io::ErrorKind::InvalidData, e)), - } + fn unpack_at( + d_id: DatasetId, + data: Buf, + integrity_mode: IntegrityMode, + ) -> Result { + if data[0..4] == (NodeInnerType::CopylessInternal as u32).to_be_bytes() { + Ok(Node(CopylessInternal( + CopylessInternalNode::unpack(data, integrity_mode)?.complete_object_refs(d_id), + ))) + } else if data[0..4] == (NodeInnerType::CopylessLeaf as u32).to_be_bytes() { + Ok(Node(MemLeaf(PackedChildBuffer::unpack( + data.into_sliced_cow_bytes().slice_from(4), + integrity_mode, + )?))) } else { - // storage_preference is not preserved for packed leaves, - // because they will not be written back to disk until modified, - // and every modification requires them to be unpacked. - // The leaf contents are scanned cheaply during unpacking, which - // recalculates the correct storage_preference for the contained keys. - Ok(Node(PackedLeaf(PackedMap::new(data.into_vec())))) + panic!( + "Unkown bytes to unpack. [0..4]: {}", + u32::from_be_bytes(data[..4].try_into().unwrap()) + ); } } fn debug_info(&self) -> String { format!( - "{}: {:?}, {}, {:?}", + "{}: {}, {:?}, {}, {:?}", self.kind(), + self.level(), self.fanout(), self.size(), self.actual_size() @@ -128,50 +238,105 @@ impl Object for Node { } Ok(()) } + + fn prepare_pack( + &mut self, + _storage_kind: StorageKind, + _pivot_key: &PivotKey, + ) -> Result + where + R: ObjectReference, + { + // NOTE: Only necessary transitions are represented here, all others are no-op. Can be improved. + // self.0 = match ( + // std::mem::replace(&mut self.0, unsafe { std::mem::zeroed() }), + // storage_kind, + // ) { + // // (Internal(internal), StorageKind::Memory) | (Internal(internal), StorageKind::Ssd) => { + // // // Spawn new child buffers from one internal node. 
+ // // Inner::DisjointInternal(internal.to_disjoint_node(|new_cbuf| { + // // dmu.insert( + // // Node(Inner::ChildBuffer(new_cbuf)), + // // pivot_key.d_id(), + // // pivot_key.clone(), + // // ) + // // })) + // // } + // (CopylessInternal(_internal), StorageKind::Hdd) => { + // // Fetch children and pipe them into one node. + // unimplemented!(); + // // let mut cbufs = Vec::with_capacity(internal.children.len()); + // // Inner::Internal(InternalNode::from_disjoint_node(internal, cbufs)) + // } + // (Leaf(leaf), StorageKind::Memory) => Inner::MemLeaf(leaf.to_memory_leaf()), + // (MemLeaf(leaf), StorageKind::Ssd) | (MemLeaf(leaf), StorageKind::Hdd) => { + // Inner::Leaf(leaf.to_block_leaf()) + // } + // (default, _) => default, + // }; + Ok(PreparePack()) + } } impl Size for Node { fn size(&self) -> usize { match self.0 { - PackedLeaf(ref map) => map.size(), - Leaf(ref leaf) => leaf.size(), - Internal(ref internal) => 4 + internal.size(), + MemLeaf(ref nvmleaf) => 4 + nvmleaf.size(), + CopylessInternal(ref nvminternal) => 4 + nvminternal.size(), } } fn actual_size(&self) -> Option { match self.0 { - PackedLeaf(ref map) => map.actual_size(), - Leaf(ref leaf) => leaf.actual_size(), - Internal(ref internal) => internal.actual_size().map(|size| 4 + size), + MemLeaf(ref nvmleaf) => nvmleaf.actual_size().map(|size| 4 + size), + CopylessInternal(ref nvminternal) => nvminternal.actual_size().map(|size| 4 + size), + } + } + + fn cache_size(&self) -> usize { + match &self.0 { + MemLeaf(l) => l.cache_size(), + CopylessInternal(i) => i.cache_size(), } } } impl Node { - pub(super) fn try_walk(&mut self, key: &[u8]) -> Option>> { + pub(super) fn try_walk(&mut self, key: &[u8]) -> Option> + where + N: ObjectReference, + { match self.0 { - Leaf(_) | PackedLeaf(_) => None, - Internal(ref mut internal) => internal.try_walk(key), + MemLeaf(_) => None, + CopylessInternal(ref mut nvminternal) => Some(nvminternal.try_walk(key)), } } - pub(super) fn try_find_flush_candidate(&mut self) -> Option>> { + pub(super) fn try_find_flush_candidate( + &mut self, + storage_map: &StorageMap, + ) -> Option> + where + N: ObjectReference, + { + let max_size = storage_map.max_size(&self); match self.0 { - Leaf(_) | PackedLeaf(_) => None, - Internal(ref mut internal) => internal.try_find_flush_candidate( - MIN_FLUSH_SIZE, - MAX_INTERNAL_NODE_SIZE, - MIN_FANOUT, - ), + MemLeaf(_) => None, + CopylessInternal(ref mut nvminternal) => { + nvminternal.try_find_flush_candidate(MIN_FLUSH_SIZE, max_size.unwrap(), MIN_FANOUT) + } } } - pub(super) fn is_too_large(&self) -> bool { - match self.0 { - PackedLeaf(ref map) => map.size() > MAX_LEAF_NODE_SIZE, - Leaf(ref leaf) => leaf.size() > MAX_LEAF_NODE_SIZE, - Internal(ref internal) => internal.size() > MAX_INTERNAL_NODE_SIZE, + /// This method actually checks the size of the pivots compared to the + /// maximum size allowed. Pivots should always fill up less than B^epsilon + /// space. 
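+    ///
+    /// Illustrative reading of that bound (a sketch; the concrete check lives
+    /// in `CopylessInternalNode::has_too_high_fanout`): with a node budget of
+    /// `max_size` bytes and an epsilon around 0.5, a 4 MiB node should spend
+    /// only on the order of a few KiB on its pivot structure, leaving the
+    /// rest for message buffers.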
+ fn has_too_high_fanout(&self, max_size: usize) -> bool { + match &self.0 { + CopylessInternal(copyless_internal_node) => { + copyless_internal_node.has_too_high_fanout(max_size) + } + _ => false, } } } @@ -179,163 +344,194 @@ impl Node { impl Node { pub(super) fn kind(&self) -> &str { match self.0 { - PackedLeaf(_) => "packed leaf", - Leaf(_) => "leaf", - Internal(_) => "internal", + MemLeaf(_) => "nvmleaf", + CopylessInternal(_) => "nvminternal", } } - pub(super) fn fanout(&self) -> Option { + pub(super) fn fanout(&self) -> Option + where + N: ObjectReference, + { match self.0 { - Leaf(_) | PackedLeaf(_) => None, - Internal(ref internal) => Some(internal.fanout()), + MemLeaf(_) => None, + CopylessInternal(ref nvminternal) => Some(nvminternal.fanout()), } } fn ensure_unpacked(&mut self) -> isize { - let before = self.size(); - - let leaf = if let PackedLeaf(ref mut map) = self.0 { - map.unpack_leaf() - } else { - return 0; - }; - - self.0 = Leaf(leaf); - let after = self.size(); - after as isize - before as isize + let before = self.cache_size(); + + match &mut self.0 { + MemLeaf(mleaf) => { + let before = mleaf.cache_size(); + mleaf.unpack_data(); + let after = mleaf.cache_size(); + after as isize - before as isize + } + _ => 0, + } } fn take(&mut self) -> Self { replace(self, Self::empty_leaf()) } - pub(super) fn has_too_low_fanout(&self) -> bool { - match self.0 { - Leaf(_) | PackedLeaf(_) => false, - Internal(ref internal) => internal.fanout() < MIN_FANOUT, - } - } - - pub(super) fn is_too_small_leaf(&self) -> bool { + pub(super) fn has_too_low_fanout(&self) -> bool + where + N: ObjectReference, + { match self.0 { - PackedLeaf(ref map) => map.size() < MIN_LEAF_NODE_SIZE, - Leaf(ref leaf) => leaf.size() < MIN_LEAF_NODE_SIZE, - Internal(_) => false, + MemLeaf(_) => false, + CopylessInternal(ref nvminternal) => nvminternal.fanout() < MIN_FANOUT, } } - pub(super) fn is_too_large_leaf(&self) -> bool { + pub(super) fn is_leaf(&self) -> bool { match self.0 { - PackedLeaf(ref map) => map.size() > MAX_LEAF_NODE_SIZE, - Leaf(ref leaf) => leaf.size() > MAX_LEAF_NODE_SIZE, - Internal(_) => false, + MemLeaf(_) => true, + CopylessInternal(_) => false, } } - pub(super) fn is_leaf(&self) -> bool { + pub(super) fn is_disjoint(&self) -> bool { match self.0 { - Leaf(_) | PackedLeaf(_) => true, - Internal(_) => false, + MemLeaf(_) => false, + CopylessInternal(_) => true, } } pub(super) fn empty_leaf() -> Self { - Node(Leaf(LeafNode::new())) + Node(MemLeaf(PackedChildBuffer::new(true))) } pub(super) fn level(&self) -> u32 { match self.0 { - Leaf(_) | PackedLeaf(_) => 0, - Internal(ref internal) => internal.level(), + MemLeaf(_) => 0, + CopylessInternal(ref nvminternal) => nvminternal.level(), } } - pub(super) fn root_needs_merge(&self) -> bool { + pub(super) fn root_needs_merge(&self) -> bool + where + N: ObjectReference, + { match self.0 { - Leaf(_) | PackedLeaf(_) => false, - Internal(ref internal) => internal.fanout() == 1, + MemLeaf(_) => false, + CopylessInternal(ref nvminternal) => nvminternal.fanout() == 1, + } + } + + fn inner_size(&self) -> usize { + match &self.0 { + MemLeaf(m) => m.size(), + CopylessInternal(d) => d.size(), } } } impl Node { - pub(super) fn split_root_mut(&mut self, allocate_obj: F) -> isize + pub(super) fn split_root_mut(&mut self, storage_map: &StorageMap, allocate_obj: F) -> isize where F: Fn(Self, LocalPivotKey) -> N, { - let size_before = self.size(); + let can_be_copyless = match storage_map.get(self.correct_preference()) { + StorageKind::Memory => true, + _ => false, + 
+        };
+
+        let size_before = self.cache_size();
         self.ensure_unpacked();
-        // FIXME: Update this PivotKey, as the index of the node is changing due to the structural change.
         let mut left_sibling = self.take();
+
+        let min_size = storage_map.min_size(&left_sibling);
+        let max_size = storage_map.max_size(&left_sibling);
         let (right_sibling, pivot_key, cur_level) = match left_sibling.0 {
-            PackedLeaf(_) => unreachable!(),
-            Leaf(ref mut leaf) => {
-                let (right_sibling, pivot_key, _, _pk) =
-                    leaf.split(MIN_LEAF_NODE_SIZE, MAX_LEAF_NODE_SIZE);
-                (Node(Leaf(right_sibling)), pivot_key, 0)
+            MemLeaf(ref mut nvmleaf) => {
+                let (right_sibling, pivot_key, _pk) =
+                    nvmleaf.split(min_size.unwrap(), max_size.unwrap()).take().0;
+                (Node(MemLeaf(right_sibling)), pivot_key, 0)
             }
-            Internal(ref mut internal) => {
-                let (right_sibling, pivot_key, _, _pk) = internal.split();
-                (Node(Internal(right_sibling)), pivot_key, internal.level())
+            CopylessInternal(ref mut nvminternal) => {
+                let (right_sibling, pivot_key, _, _pk) = nvminternal.split();
+                (
+                    Node(CopylessInternal(right_sibling)),
+                    pivot_key,
+                    nvminternal.level(),
+                )
             }
         };
         debug!("Root split pivot key: {:?}", pivot_key);
-        *self = Node(Internal(InternalNode::new(
-            ChildBuffer::new(allocate_obj(
-                left_sibling,
-                LocalPivotKey::LeftOuter(pivot_key.clone()),
-            )),
-            ChildBuffer::new(allocate_obj(
-                right_sibling,
-                LocalPivotKey::Right(pivot_key.clone()),
-            )),
+
+        debug_assert!(!left_sibling.has_too_low_fanout());
+        debug_assert!(!right_sibling.has_too_low_fanout());
+
+        let left_child = allocate_obj(left_sibling, LocalPivotKey::LeftOuter(pivot_key.clone()));
+        let right_child = allocate_obj(right_sibling, LocalPivotKey::Right(pivot_key.clone()));
+
+        let left_buffer = PackedChildBuffer::new(false);
+        let right_buffer = PackedChildBuffer::new(false);
+
+        let left_link = InternalNodeLink {
+            buffer_size: left_buffer.size(),
+            buffer: left_buffer,
+            ptr: left_child,
+        };
+
+        let right_link = InternalNodeLink {
+            buffer_size: right_buffer.size(),
+            buffer: right_buffer,
+            ptr: right_child,
+        };
+        *self = Node(CopylessInternal(CopylessInternalNode::new(
+            left_link,
+            right_link,
             pivot_key,
             cur_level + 1,
         )));
-        let size_after = self.size();
+
+        let size_after = self.cache_size();
         size_after as isize - size_before as isize
     }
 }
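The root split above hinges on a small ownership maneuver: `take()` swaps an empty leaf into `&mut self` via `mem::replace`, so the old root can be split by value and a freshly built parent written back. A minimal standalone sketch of that take-then-rebuild pattern, with illustrative types rather than the crate's own:

```rust
use std::mem::replace;

#[derive(Debug)]
enum Node {
    Leaf(Vec<u8>),
    Internal { children: Vec<Node> },
}

impl Node {
    fn empty_leaf() -> Self {
        Node::Leaf(Vec::new())
    }

    // Mirrors Node::take(): leave a placeholder behind, hand out the old value.
    fn take(&mut self) -> Self {
        replace(self, Node::empty_leaf())
    }

    // Mirrors the shape of split_root_mut(): consume the old root by value,
    // then write a freshly built parent back through &mut self.
    fn split_root(&mut self) {
        let old_root = self.take();
        let sibling = Node::empty_leaf(); // a real split would move half the entries here
        *self = Node::Internal {
            children: vec![old_root, sibling],
        };
    }
}

fn main() {
    let mut root = Node::Leaf(b"payload".to_vec());
    root.split_root();
    println!("{root:?}");
}
```

The placeholder is cheap to construct, which keeps the swap free of unsafe code and leaves the tree in a valid (if empty) state at every step.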
-pub(super) enum GetResult<'a, N: 'a> {
+pub(super) enum GetResult<'a, N: 'a + 'static> {
     Data(Option<(KeyInfo, SlicedCowBytes)>),
     NextNode(&'a RwLock<N>),
 }

-pub(super) enum ApplyResult<'a, N: 'a> {
+pub(super) enum ApplyResult<'a, N: 'a + 'static> {
     Leaf(Option<KeyInfo>),
     NextNode(&'a mut N),
 }

-pub(super) enum PivotGetResult<'a, N: 'a> {
+pub(super) enum PivotGetResult<'a, N: 'a + 'static> {
     Target(Option<&'a RwLock<N>>),
     NextNode(&'a RwLock<N>),
 }

-pub(super) enum PivotGetMutResult<'a, N: 'a> {
+pub(super) enum PivotGetMutResult<'a, N: 'a + 'static> {
     Target(Option<&'a mut N>),
     NextNode(&'a mut N),
 }

-pub(super) enum GetRangeResult<'a, T, N: 'a> {
+/// Return type of range query fetching all children to the lowest nodes.
+pub(super) enum GetRangeResult<'a, T, N: 'a + 'static> {
     Data(T),
     NextNode {
         np: &'a RwLock<N>,
-        prefetch_option: Option<&'a RwLock<N>>,
+        prefetch_option_node: Option<&'a RwLock<N>>,
     },
 }

 impl<N> Node<N> {
-    pub(super) fn get(
-        &self,
-        key: &[u8],
-        msgs: &mut Vec<(KeyInfo, SlicedCowBytes)>,
-    ) -> GetResult<N> {
+    pub(super) fn get(&self, key: &[u8], msgs: &mut Vec<(KeyInfo, SlicedCowBytes)>) -> GetResult<N>
+    where
+        N: ObjectReference,
+    {
         match self.0 {
-            PackedLeaf(ref map) => GetResult::Data(map.get(key)),
-            Leaf(ref leaf) => GetResult::Data(leaf.get_with_info(key)),
-            Internal(ref internal) => {
-                let (child_np, msg) = internal.get(key);
+            MemLeaf(ref nvmleaf) => GetResult::Data(nvmleaf.get(key)),
+            CopylessInternal(ref nvminternal) => {
+                let (child_np, msg) = nvminternal.get(key);
                 if let Some(msg) = msg {
                     msgs.push(msg);
                 }
@@ -351,44 +547,58 @@ impl<N> Node<N> {
         right_pivot_key: &mut Option<CowBytes>,
         all_msgs: &mut BTreeMap<CowBytes, Vec<(KeyInfo, SlicedCowBytes)>>,
     ) -> GetRangeResult<Box<dyn Iterator<Item = (&'a [u8], (KeyInfo, SlicedCowBytes))> + 'a>, N>
+    where
+        N: ObjectReference,
     {
         match self.0 {
-            PackedLeaf(ref map) => GetRangeResult::Data(Box::new(map.get_all())),
-            Leaf(ref leaf) => GetRangeResult::Data(Box::new(
-                leaf.entries().iter().map(|(k, v)| (&k[..], v.clone())),
-            )),
-            Internal(ref internal) => {
-                let prefetch_option = if internal.level() == 1 {
-                    internal.get_next_node(key)
+            MemLeaf(ref nvmleaf) => GetRangeResult::Data(Box::new(nvmleaf.get_all_messages())),
+            CopylessInternal(ref nvminternal) => {
+                let prefetch_option = if nvminternal.level() == 1 && false {
+                    nvminternal.get_next_node(key)
                 } else {
                     None
                 };
-                let np = internal.get_range(key, left_pivot_key, right_pivot_key, all_msgs);
+
+                let cl = nvminternal.get_range(key, left_pivot_key, right_pivot_key, all_msgs);
+
+                for (key, msg) in cl.buffer().get_all_messages() {
+                    all_msgs
+                        .entry(CowBytes::from(key))
+                        .or_insert_with(Vec::new)
+                        .push(msg.clone());
+                }
+
                 GetRangeResult::NextNode {
-                    prefetch_option,
-                    np,
+                    np: cl.ptr(),
+                    prefetch_option_node: prefetch_option.map(|l| l.ptr()),
                 }
             }
         }
     }

-    pub(super) fn pivot_get(&self, pk: &PivotKey) -> Option<PivotGetResult<N>> {
+    pub(super) fn pivot_get(&self, pk: &PivotKey) -> Option<PivotGetResult<N>>
+    where
+        N: ObjectReference,
+    {
         if pk.is_root() {
             return Some(PivotGetResult::Target(None));
         }
         match self.0 {
-            PackedLeaf(_) | Leaf(_) => None,
-            Internal(ref internal) => Some(internal.pivot_get(pk)),
+            MemLeaf(_) => None,
+            CopylessInternal(ref nvminternal) => Some(nvminternal.pivot_get(pk)),
         }
     }

-    pub(super) fn pivot_get_mut(&mut self, pk: &PivotKey) -> Option<PivotGetMutResult<N>> {
+    pub(super) fn pivot_get_mut(&mut self, pk: &PivotKey) -> Option<PivotGetMutResult<N>>
+    where
+        N: ObjectReference,
+    {
         if pk.is_root() {
             return Some(PivotGetMutResult::Target(None));
         }
         match self.0 {
-            PackedLeaf(_) | Leaf(_) => None,
-            Internal(ref mut internal) => Some(internal.pivot_get_mut(pk)),
+            MemLeaf(_) => None,
+            CopylessInternal(ref mut nvminternal) => Some(nvminternal.pivot_get_mut(pk)),
         }
     }
 }
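While descending, the `CopylessInternal` arm of `get_range` above folds every message buffered on the traversed link into `all_msgs`, so the leaf data can be patched once the descent bottoms out. A hedged, self-contained sketch of that accumulation (names are illustrative, not the crate's API):

```rust
use std::collections::BTreeMap;

type Key = Vec<u8>;
type Msg = Vec<u8>;

// Fold the messages buffered at one traversed link into the running map,
// like the loop over cl.buffer().get_all_messages() above.
fn collect_level(all_msgs: &mut BTreeMap<Key, Vec<Msg>>, buffered: &[(Key, Msg)]) {
    for (key, msg) in buffered {
        all_msgs
            .entry(key.clone())
            .or_insert_with(Vec::new)
            .push(msg.clone());
    }
}

fn main() {
    let mut all_msgs: BTreeMap<Key, Vec<Msg>> = BTreeMap::new();
    // Buffers encountered while walking from the root towards a leaf:
    collect_level(&mut all_msgs, &[(b"a".to_vec(), b"upsert:1".to_vec())]);
    collect_level(&mut all_msgs, &[(b"a".to_vec(), b"upsert:2".to_vec())]);
    // Messages accumulate per key in descent order, root-most first.
    assert_eq!(all_msgs[&b"a".to_vec()].len(), 2);
}
```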
@@ -404,14 +614,23 @@ impl<N> Node<N> {
     where
         K: Borrow<[u8]> + Into<CowBytes>,
         M: MessageAction,
+        N: ObjectReference,
     {
         let size_delta = self.ensure_unpacked();
         let keyinfo = KeyInfo { storage_preference };
         size_delta
             + (match self.0 {
-                PackedLeaf(_) => unreachable!(),
-                Leaf(ref mut leaf) => leaf.insert(key, keyinfo, msg, msg_action),
-                Internal(ref mut internal) => internal.insert(key, keyinfo, msg, msg_action),
+                MemLeaf(ref mut nvmleaf) => nvmleaf.insert(key, keyinfo, msg, msg_action).take().1,
+                CopylessInternal(ref mut nvminternal) => {
+                    // This is a leftover from the version in which we
+                    // wrote back child buffers separately.
+                    let child_idx = nvminternal.idx(key.borrow());
+                    let link = nvminternal.get_mut(key.borrow());
+                    let buffer_node = link.buffer_mut();
+                    let size_delta = buffer_node.insert(key, keyinfo, msg, msg_action).take().1;
+                    nvminternal.after_insert_size_delta(child_idx, size_delta);
+                    size_delta
+                }
             })
     }

@@ -419,107 +638,164 @@ impl<N> Node<N> {
     where
         I: IntoIterator<Item = (CowBytes, (KeyInfo, SlicedCowBytes))>,
         M: MessageAction,
+        N: ObjectReference,
     {
         let size_delta = self.ensure_unpacked();
         size_delta
             + (match self.0 {
-                PackedLeaf(_) => unreachable!(),
-                Leaf(ref mut leaf) => leaf.insert_msg_buffer(msg_buffer, msg_action),
-                Internal(ref mut internal) => internal.insert_msg_buffer(msg_buffer, msg_action),
+                MemLeaf(ref mut nvmleaf) => {
+                    nvmleaf.insert_msg_buffer(msg_buffer, msg_action).take().1
+                }
+                CopylessInternal(ref mut nvminternal) => {
+                    // This is a leftover from the version in which we
+                    // wrote back child buffers separately.
+                    let mut size_delta = 0;
+                    for (k, (kinfo, v)) in msg_buffer {
+                        let idx = nvminternal.idx(&k);
+                        let link = nvminternal.get_mut(&k);
+                        let buffer_node = link.buffer_mut();
+                        let delta = buffer_node.insert(k, kinfo, v, msg_action.clone()).take().1;
+                        nvminternal.after_insert_size_delta(idx, delta);
+                        size_delta += delta;
+                    }
+                    size_delta
+                }
             })
     }

-    pub(super) fn apply_with_info(
-        &mut self,
-        key: &[u8],
-        pref: StoragePreference,
-    ) -> ApplyResult<N> {
+    pub(super) fn apply_with_info(&mut self, key: &[u8], pref: StoragePreference) -> ApplyResult<N>
+    where
+        N: ObjectReference,
+    {
         // FIXME: This is bad for performance, what we want to do here is modify
         // the preference in place determine the new preference and write the
         // PACKED leaf as is again. This violates the restriction that they may
         // never be written again, therefore we need a new interface preparing
         // packed leafs for this exact and only purpose.
-        self.ensure_unpacked();
+        //
+        // FIXME: When we unpack this the cache size changes, we need to update
+        // the cache entry.
+        let size_delta = self.ensure_unpacked();

        match self.0 {
             // FIXME: see above
-            PackedLeaf(_) => unreachable!(),
-            Leaf(ref mut leaf) => ApplyResult::Leaf(leaf.apply(key, pref)),
-            Internal(ref mut internal) => {
-                ApplyResult::NextNode(internal.apply_with_info(key, pref))
+            MemLeaf(ref mut nvmleaf) => {
+                ApplyResult::Leaf(nvmleaf.apply_with_info(key, pref).take().0)
+            }
+            CopylessInternal(ref mut nvminternal) => {
+                ApplyResult::NextNode(nvminternal.apply_with_info(key, pref))
             }
         }
     }
 }

 impl<N> Node<N> {
-    pub(super) fn child_pointer_iter_mut(&mut self) -> Option<impl Iterator<Item = &mut N> + '_> {
+    pub(super) fn child_pointer_iter_mut(&mut self) -> Option<Box<dyn Iterator<Item = &mut N> + '_>>
+    where
+        N: ObjectReference,
+    {
         match self.0 {
-            Leaf(_) | PackedLeaf(_) => None,
-            Internal(ref mut internal) => Some(
-                internal
+            MemLeaf(_) => None,
+            CopylessInternal(ref mut nvminternal) => Some(Box::new(
+                nvminternal
                     .iter_mut()
-                    .map(|child| child.node_pointer.get_mut()),
-            ),
+                    .map(|child| child.ptr_mut().get_mut()),
+            )),
+            // NOTE: This returns `None` as it is not necessarily harmful to write
+            // it back as no consistency constraints have to be met.
         }
     }
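Both insert paths above follow the same shape: locate the child slot with `idx()`, push the message into the buffer stored in the parent link, and patch the parent's cached size with the reported delta instead of rescanning. A simplified sketch of that bookkeeping pattern, using assumed stand-in types, not the crate's:

```rust
use std::collections::BTreeMap;

// Stand-ins for InternalNodeLink and PackedChildBuffer.
struct Link {
    buffer: BTreeMap<Vec<u8>, Vec<u8>>,
    buffer_size: isize,
}

struct Parent {
    pivots: Vec<Vec<u8>>, // one pivot between each pair of children
    links: Vec<Link>,
}

impl Parent {
    // Mirrors idx(): count the pivots strictly below the key.
    fn idx(&self, key: &[u8]) -> usize {
        self.pivots.iter().take_while(|p| p.as_slice() < key).count()
    }

    // Mirrors insert(): buffer the message at the child link, then adjust
    // the cached size by the delta (after_insert_size_delta) rather than
    // recomputing it from scratch.
    fn insert(&mut self, key: Vec<u8>, msg: Vec<u8>) -> isize {
        let idx = self.idx(&key);
        let link = &mut self.links[idx];
        let delta = (key.len() + msg.len()) as isize;
        link.buffer.insert(key, msg);
        link.buffer_size += delta;
        delta
    }
}

fn main() {
    let mut parent = Parent {
        pivots: vec![b"m".to_vec()],
        links: vec![
            Link { buffer: BTreeMap::new(), buffer_size: 0 },
            Link { buffer: BTreeMap::new(), buffer_size: 0 },
        ],
    };
    let delta = parent.insert(b"zebra".to_vec(), b"v1".to_vec());
    assert_eq!(parent.links[1].buffer_size, delta);
}
```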

-    pub(super) fn child_pointer_iter(&self) -> Option<impl Iterator<Item = &RwLock<N>> + '_> {
+    pub(super) fn child_pointer_iter(&self) -> Option<Box<dyn Iterator<Item = &RwLock<N>> + '_>>
+    where
+        N: ObjectReference,
+    {
         match self.0 {
-            Leaf(_) | PackedLeaf(_) => None,
-            Internal(ref internal) => Some(internal.iter().map(|child| &child.node_pointer)),
+            MemLeaf(_) => None,
+            CopylessInternal(ref nvminternal) => {
+                Some(Box::new(nvminternal.iter().map(|link| link.ptr())))
+            }
         }
     }

-    pub(super) fn drain_children(&mut self) -> Option<impl Iterator<Item = N> + '_> {
+    pub(super) fn drain_children(&mut self) -> Option<ChildrenObjects<'_, N>>
+    where
+        N: ObjectReference,
+    {
         match self.0 {
-            Leaf(_) | PackedLeaf(_) => None,
-            Internal(ref mut internal) => Some(internal.drain_children()),
+            MemLeaf(_) => None,
+            CopylessInternal(ref mut nvminternal) => Some(ChildrenObjects::NVMChildBuffer(
+                Box::new(nvminternal.drain_children()),
+            )),
         }
     }
 }

 impl<N> Node<N> {
-    pub(super) fn split(&mut self) -> (Self, CowBytes, isize, LocalPivotKey) {
+    pub(super) fn split(
+        &mut self,
+        storage_map: &StorageMap,
+    ) -> (Self, CowBytes, isize, LocalPivotKey) {
         self.ensure_unpacked();
+
+        let min_size = storage_map.min_size(self);
+        let max_size = storage_map.max_size(self);
         match self.0 {
-            PackedLeaf(_) => unreachable!(),
-            Leaf(ref mut leaf) => {
-                let (node, pivot_key, size_delta, pk) =
-                    leaf.split(MIN_LEAF_NODE_SIZE, MAX_LEAF_NODE_SIZE);
-                (Node(Leaf(node)), pivot_key, size_delta, pk)
+            MemLeaf(ref mut nvmleaf) => {
+                let ((node, pivot_key, pk), size_delta) =
+                    nvmleaf.split(min_size.unwrap(), max_size.unwrap()).take();
+                (Node(MemLeaf(node)), pivot_key, size_delta, pk)
             }
-            Internal(ref mut internal) => {
-                debug_assert!(
-                    internal.fanout() >= 2 * MIN_FANOUT,
+            CopylessInternal(ref mut nvminternal) => {
+                assert!(
+                    nvminternal.fanout() >= 2 * MIN_FANOUT,
                     "internal split failed due to low fanout: {}, size: {}, actual_size: {:?}",
-                    internal.fanout(),
-                    internal.size(),
-                    internal.actual_size()
+                    nvminternal.fanout(),
+                    nvminternal.size(),
+                    nvminternal.actual_size()
                 );
-                let (node, pivot_key, size_delta, pk) = internal.split();
-                (Node(Internal(node)), pivot_key, size_delta, pk)
+                let (node, pivot_key, size_delta, pk) = nvminternal.split();
+                (Node(CopylessInternal(node)), pivot_key, size_delta, pk)
             }
         }
     }

     pub(super) fn merge(&mut self, right_sibling: &mut Self, pivot_key: CowBytes) -> isize {
-        self.ensure_unpacked();
-        right_sibling.ensure_unpacked();
-
-        match (&mut self.0, &mut right_sibling.0) {
-            (&mut Leaf(ref mut left), &mut Leaf(ref mut right)) => left.merge(right),
-            (&mut Internal(ref mut left), &mut Internal(ref mut right)) => {
+        // FIXME: Propagate isize change completely
+        let d0 = self.ensure_unpacked();
+        let _ = right_sibling.ensure_unpacked();
+
+        d0 + match (&mut self.0, &mut right_sibling.0) {
+            (&mut MemLeaf(ref mut left), &mut MemLeaf(ref mut right)) => {
+                left.append(right).take().1
+            }
+            (&mut CopylessInternal(ref mut left), &mut CopylessInternal(ref mut right)) => {
                 left.merge(right, pivot_key)
             }
-            _ => unreachable!(),
+            _ => {
+                let bt = std::backtrace::Backtrace::force_capture();
+                println!("{}", bt);
+                println!(
+                    "Left is {} \n Right is {}",
+                    self.debug_info(),
+                    right_sibling.debug_info()
+                );
+                unreachable!()
+            }
         }
     }

-    pub(super) fn leaf_rebalance(&mut self, right_sibling: &mut Self) -> FillUpResult {
+    pub(super) fn leaf_rebalance(
+        &mut self,
+        right_sibling: &mut Self,
+        storage_map: &StorageMap,
+    ) -> FillUpResult {
         self.ensure_unpacked();
         right_sibling.ensure_unpacked();
+
+        let min_size = storage_map.min_size(self);
+        let max_size = storage_map.max_size(self);
         match (&mut self.0, &mut right_sibling.0) {
-            (&mut Leaf(ref mut left), &mut Leaf(ref mut right)) => {
-                left.rebalance(right, MIN_LEAF_NODE_SIZE, MAX_LEAF_NODE_SIZE)
+            (&mut MemLeaf(ref mut left), &mut MemLeaf(ref mut right)) => {
+                left.rebalance_size(right, min_size.unwrap(), max_size.unwrap())
             }
             _ => unreachable!(),
         }
     }
@@ -557,22 +833,19 @@ pub struct ChildInfo {

 #[derive(serde::Serialize)]
 #[serde(tag = "type", rename_all = "lowercase")]
+#[allow(missing_docs)]
 pub enum NodeInfo {
-    Internal {
+    Leaf {
         level: u32,
         storage: StoragePreference,
         system_storage: StoragePreference,
-        children: Vec<ChildInfo>,
+        entry_count: usize,
     },
-    Leaf {
+    Internal {
         level: u32,
         storage: StoragePreference,
         system_storage: StoragePreference,
-        entry_count: usize,
-    },
-    Packed {
-        entry_count: u32,
-        range: Vec<ByteString>,
+        children: Vec<ChildInfo>,
     },
 }

@@ -605,66 +878,58 @@ impl<N> Node<N> {
         N: ObjectReference,
     {
         match &self.0 {
-            Inner::Internal(int) => NodeInfo::Internal {
+            Inner::MemLeaf(ref nvmleaf) => NodeInfo::Leaf {
                 storage: self.correct_preference(),
                 system_storage: self.system_storage_preference(),
                 level: self.level(),
-                children: {
-                    int.iter_with_bounds()
-                        .map(|(maybe_left, child_buf, maybe_right)| {
-                            let (child, storage_preference, pivot_key) = {
-                                let mut np = child_buf.node_pointer.write();
-                                let pivot_key = np.index().clone();
-                                let storage_preference = np.correct_preference();
-                                let child = dml.get(&mut np).unwrap();
-                                (child, storage_preference, pivot_key)
-                            };
-
-                            let node_info = child.node_info(dml);
-                            drop(child);
-
-                            dml.evict().unwrap();
-
-                            ChildInfo {
-                                from: maybe_left.map(|cow| ByteString(cow.to_vec())),
-                                to: maybe_right.map(|cow| ByteString(cow.to_vec())),
-                                storage: storage_preference,
-                                pivot_key,
-                                child: node_info,
-                            }
-                        })
-                        .collect()
-                },
+                entry_count: nvmleaf.len(),
             },
-            Inner::Leaf(leaf) => NodeInfo::Leaf {
+            Inner::CopylessInternal(ref nvminternal) => NodeInfo::Internal {
                 storage: self.correct_preference(),
                 system_storage: self.system_storage_preference(),
                 level: self.level(),
-                entry_count: leaf.entries().len(),
-            },
-            Inner::PackedLeaf(packed) => {
-                let len = packed.entry_count();
-                NodeInfo::Packed {
-                    entry_count: len,
-                    range: if len == 0 {
-                        Vec::new()
-                    } else {
-                        [
-                            packed.get_full_by_index(0),
-                            packed.get_full_by_index(len - 1),
-                        ]
+                children: {
+                    let itr = nvminternal
+                        .children
                         .iter()
-                        .filter_map(|opt| {
-                            if let Some((key, _)) = opt {
-                                Some(ByteString(key.to_vec()))
-                            } else {
+                        .enumerate()
+                        .map(move |(idx, child)| {
+                            let maybe_left = if idx == 0 {
                                 None
-                            }
-                        })
-                        .collect()
-                    },
-                }
-            }
+                            } else {
+                                nvminternal.meta_data.pivot.get(idx - 1)
+                            };
+
+                            let maybe_right = nvminternal.meta_data.pivot.get(idx);
+
+                            (maybe_left, child, maybe_right)
+                        });
+
+                    itr.map(|(maybe_left, child_buf, maybe_right)| {
+                        let (child, storage_preference, pivot_key) = {
+                            let mut np = child_buf.ptr().write();
+                            let pivot_key = np.index().clone();
+                            let storage_preference = np.correct_preference();
+                            let child = dml.get(&mut np).unwrap();
+                            (child, storage_preference, pivot_key)
+                        };
+
+                        let node_info = child.node_info(dml);
+                        drop(child);
+
+                        dml.evict().unwrap();
+
+                        ChildInfo {
+                            from: maybe_left.map(|cow| ByteString(cow.to_vec())),
+                            to: maybe_right.map(|cow| ByteString(cow.to_vec())),
+                            storage: storage_preference,
+                            pivot_key,
+                            child: node_info,
+                        }
+                    })
+                    .collect()
+                },
+            },
         }
     }
 }
diff --git a/betree/src/tree/imp/packed.rs b/betree/src/tree/imp/packed.rs
deleted file mode 100644
index 3b8e955ae..000000000
--- a/betree/src/tree/imp/packed.rs
+++ /dev/null
@@ -1,300 +0,0 @@
-//! On-disk representation of a node.
-//!
-//! Can be used for read-only access to avoid deserialization.
-use super::leaf::LeafNode;
-use crate::{
-    cow_bytes::{CowBytes, SlicedCowBytes},
-    data_management::HasStoragePreference,
-    size::Size,
-    tree::KeyInfo,
-    StoragePreference,
-};
-use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
-use std::{
-    cmp,
-    io::{self, Write},
-    mem::size_of,
-};
-
-// account for trailing fake element
-pub(crate) const HEADER_FIXED_LEN: usize = HEADER_LEN + OFFSET_LEN;
-
-const HEADER_LEN: usize = size_of::<u32>() + size_of::<u8>();
-
-// Offsets are stored as 24-bit unsigned integers in little-endian order
-pub(crate) const OFFSET_LEN: usize = 3;
-// 2 offsets (u24) and a keyinfo (u8)
-pub(crate) const ENTRY_LEN: usize = 2 * OFFSET_LEN + 1;
-pub(crate) const ENTRY_KEY_OFFSET: usize = 0;
-pub(crate) const ENTRY_KEY_INFO_OFFSET: usize = ENTRY_KEY_OFFSET + OFFSET_LEN;
-pub(crate) const ENTRY_DATA_OFFSET: usize = ENTRY_KEY_INFO_OFFSET + 1;
-
-/// On-disk serialized leaf node. Simplified to a map contains 40 bytes of
-/// headers followed by data.
-///
-/// ```text
-/// Layout:
-/// entry_count: u32,
-/// system_pref: u8,
-/// entries: [Entry; entry_count],
-/// # necessary to compute the length of the last value
-/// data_end: Offset,
-/// data: [u8]
-///
-/// # These positions only mark the beginning of the key and value section in data,
-/// # the length is computed by subtracting two consecutive offsets.
-/// # This is possible because entries are written in the same order as their data
-/// # will have in `data`.
-/// Entry:
-///     key_pos: Offset,
-///     key_info: KeyInfo,
-///     data_pos: Offset
-///
-/// # 2^24 byte ~= 16.7MB, plenty at a max node size of 4MiB
-/// Offset:
-///     u24
-///
-/// KeyInfo:
-///     storage_preference: u8
-///
-/// ```
-#[derive(Debug)]
-pub(crate) struct PackedMap {
-    entry_count: u32,
-    system_preference: u8,
-    data: CowBytes,
-}
-
-/// New type for safe-handling of data offsets u32s.
-#[derive(Debug, Copy, Clone)]
-struct Offset(u32);
-
-fn prefix_size(entry_count: u32) -> usize {
-    HEADER_FIXED_LEN + ENTRY_LEN * entry_count as usize
-}
-
-impl PackedMap {
-    pub fn new(data: Vec<u8>) -> Self {
-        debug_assert!(data.len() >= 4);
-        let entry_count = LittleEndian::read_u32(&data[..4]);
-        let system_preference = data[4];
-
-        PackedMap {
-            data: data.into(),
-            entry_count,
-            system_preference,
-        }
-    }
-
-    fn read_offset(&self, byte_idx: usize) -> Offset {
-        Offset(LittleEndian::read_u24(
-            &self.data[byte_idx..byte_idx + OFFSET_LEN],
-        ))
-    }
-
-    // In the data segment, the value is always written directly after the key,
-    // so the key length can be calculated by subtraction.
-    fn key_pos(&self, idx: u32) -> (Offset, u32) {
-        debug_assert!(idx < self.entry_count);
-
-        let entry_pos = HEADER_LEN + idx as usize * ENTRY_LEN;
-
-        let key_offset = self.read_offset(entry_pos + ENTRY_KEY_OFFSET);
-        let data_offset = self.read_offset(entry_pos + ENTRY_DATA_OFFSET);
-        let key_len = data_offset.0 - key_offset.0;
-
-        (key_offset, key_len)
-    }
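The comments above describe the whole trick of this (now removed) packed layout: only positions are stored, and every length falls out of subtracting consecutive offsets, with one trailing fake offset closing the last value. A small standalone check of that arithmetic, mirroring `prefix_size`, `key_pos`, and `val_pos` with illustrative values:

```rust
// Standalone sketch of the PackedMap offset arithmetic; constants mirror
// the definitions in the removed file, the rest is illustrative.
const HEADER_LEN: usize = 4 + 1; // entry_count: u32, system_pref: u8
const OFFSET_LEN: usize = 3; // u24, little-endian
const ENTRY_LEN: usize = 2 * OFFSET_LEN + 1; // key offset, key info, data offset

fn prefix_size(entry_count: u32) -> usize {
    // HEADER_FIXED_LEN accounts for the trailing fake offset after the last entry.
    HEADER_LEN + OFFSET_LEN + ENTRY_LEN * entry_count as usize
}

fn main() {
    let entries: Vec<(&[u8], &[u8])> = vec![
        (b"foo".as_slice(), b"12345".as_slice()),
        (b"quux".as_slice(), b"6789".as_slice()),
    ];

    // Offsets are assigned in write order: key, then value, entry by entry.
    let mut offsets = Vec::new(); // (key_pos, data_pos) per entry
    let mut pos = prefix_size(entries.len() as u32);
    for &(key, value) in &entries {
        offsets.push((pos, pos + key.len()));
        pos += key.len() + value.len();
    }
    let data_end = pos; // written as the trailing fake key offset

    // Length recovery by subtraction, as in key_pos()/val_pos():
    for (idx, &(key, value)) in entries.iter().enumerate() {
        let (key_pos, data_pos) = offsets[idx];
        let next_key_pos = offsets.get(idx + 1).map(|o| o.0).unwrap_or(data_end);
        assert_eq!(data_pos - key_pos, key.len());
        assert_eq!(next_key_pos - data_pos, value.len());
    }
    println!("offsets consistent; data section ends at byte {data_end}");
}
```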
-    // In the data segment, the next key is usually written after the current value,
-    // so the value length can usually be calculated by subtraction.
-    fn val_pos(&self, idx: u32) -> (Offset, u32) {
-        debug_assert!(idx < self.entry_count);
-
-        let entry_pos = HEADER_LEN + idx as usize * ENTRY_LEN;
-        let data_offset = self.read_offset(entry_pos + ENTRY_DATA_OFFSET);
-
-        // this works even for the last entry, as a single offset is appended to the last full
-        // entry, and the key offset comes first, so the rest of that fake entry is not missed.
-        let next_entry_pos = entry_pos + ENTRY_LEN;
-        let next_key_offset = self.read_offset(next_entry_pos + ENTRY_KEY_OFFSET);
-
-        let data_len = next_key_offset.0 - data_offset.0;
-        (data_offset, data_len)
-    }
-
-    fn key_info(&self, idx: u32) -> KeyInfo {
-        debug_assert!(idx < self.entry_count);
-        let entry_pos = HEADER_LEN + idx as usize * ENTRY_LEN;
-
-        KeyInfo {
-            storage_preference: StoragePreference::from_u8(
-                self.data[entry_pos + ENTRY_KEY_INFO_OFFSET],
-            ),
-        }
-    }
-
-    fn get_slice(&self, (Offset(pos), len): (Offset, u32)) -> &[u8] {
-        &self.data[pos as usize..pos as usize + len as usize]
-    }
-
-    fn get_slice_cow(&self, (Offset(pos), len): (Offset, u32)) -> SlicedCowBytes {
-        self.data.clone().slice(pos, len)
-    }
-
-    // Adapted from std::slice::binary_search_by
-    fn binary_search(&self, key: &[u8]) -> Result<u32, u32> {
-        use cmp::Ordering::*;
-        let mut size = self.entry_count;
-        if size == 0 {
-            return Err(0);
-        }
-        let mut base = 0;
-        while size > 1 {
-            let half = size / 2;
-            let mid = base + half;
-            let cmp = self.get_slice(self.key_pos(mid)).cmp(key);
-            base = if cmp == Greater { base } else { mid };
-            size -= half;
-        }
-        let cmp = self.get_slice(self.key_pos(base)).cmp(key);
-        if cmp == Equal {
-            Ok(base)
-        } else {
-            Err(base + (cmp == Less) as u32)
-        }
-    }
-
-    pub fn get_by_index(&self, idx: u32) -> Option<(KeyInfo, SlicedCowBytes)> {
-        Some((self.key_info(idx), self.get_slice_cow(self.val_pos(idx))))
-    }
-
-    pub fn get_full_by_index(
-        &self,
-        idx: u32,
-    ) -> Option<(SlicedCowBytes, (KeyInfo, SlicedCowBytes))> {
-        Some((
-            self.get_slice_cow(self.key_pos(idx)),
-            (self.key_info(idx), self.get_slice_cow(self.val_pos(idx))),
-        ))
-    }
-
-    pub fn get(&self, key: &[u8]) -> Option<(KeyInfo, SlicedCowBytes)> {
-        let result = self.binary_search(key);
-        let idx = match result {
-            Err(_) => return None,
-            Ok(idx) => idx,
-        };
-
-        self.get_by_index(idx)
-    }
-
-    pub fn get_all(&self) -> impl Iterator<Item = (&[u8], (KeyInfo, SlicedCowBytes))> + '_ {
-        struct Iter<'a> {
-            packed: &'a PackedMap,
-            idx: u32,
-        }
-        impl<'a> Iterator for Iter<'a> {
-            type Item = (&'a [u8], (KeyInfo, SlicedCowBytes));
-
-            fn next(&mut self) -> Option<Self::Item> {
-                if self.idx < self.packed.entry_count {
-                    let ret = Some((
-                        self.packed.get_slice(self.packed.key_pos(self.idx)),
-                        (
-                            self.packed.key_info(self.idx),
-                            self.packed.get_slice_cow(self.packed.val_pos(self.idx)),
-                        ),
-                    ));
-                    self.idx += 1;
-                    ret
-                } else {
-                    None
-                }
-            }
-        }
-
-        Iter {
-            packed: self,
-            idx: 0,
-        }
-    }
-
-    pub(super) fn unpack_leaf(&self) -> LeafNode {
-        let mut leaf: LeafNode = self.get_all().collect();
-        // Restore system storage preference state
-        leaf.set_system_storage_preference(StoragePreference::from_u8(self.system_preference));
-        leaf
-    }
-
-    pub(super) fn pack<W: Write>(leaf: &LeafNode, mut writer: W) -> io::Result<()> {
-        let entries = leaf.entries();
-        let entries_cnt = entries.len() as u32;
-        writer.write_u32::<LittleEndian>(entries_cnt)?;
-        writer.write_u8(leaf.system_storage_preference().as_u8())?;
-
-        let mut pos = prefix_size(entries_cnt) as u32;
-        for (key, (keyinfo, value)) in entries {
-            writer.write_u24::<LittleEndian>(pos)?;
-            pos += key.len() as u32;
-
-            writer.write_u8(keyinfo.storage_preference.as_u8())?;
-
-            writer.write_u24::<LittleEndian>(pos)?;
-            pos += value.len() as u32;
-        }
-
-        writer.write_u24::<LittleEndian>(pos)?;
-
-        for (key, (_keyinfo, value)) in entries {
-            writer.write_all(key)?;
-            writer.write_all(value)?;
-        }
-        Ok(())
-    }
-
-    pub(super) fn inner(&self) -> &CowBytes {
-        &self.data
-    }
-
-    pub(super) fn entry_count(&self) -> u32 {
-        self.entry_count
-    }
-}
-
-impl Size for PackedMap {
-    fn size(&self) -> usize {
-        self.data.len()
-    }
-
-    fn actual_size(&self) -> Option<usize> {
-        Some(self.size())
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::{LeafNode, PackedMap};
-
-    #[quickcheck]
-    fn check_packed_contents(leaf: LeafNode) {
-        let mut v = Vec::new();
-        PackedMap::pack(&leaf, &mut v).unwrap();
-
-        let packed = PackedMap::new(v);
-
-        for (k, (ki, v)) in leaf.entries() {
-            let (pki, pv) = packed.get(k).unwrap();
-            assert_eq!(ki, &pki, "keyinfo mismatch");
-            assert_eq!(v, &pv, "value mismatch");
-        }
-
-        assert_eq!(
-            leaf.entries()
-                .iter()
-                .map(|(k, v)| (&k[..], v.clone()))
-                .collect::<Vec<_>>(),
-            packed.get_all().collect::<Vec<_>>()
-        );
-    }
-}
diff --git a/betree/src/tree/imp/range.rs b/betree/src/tree/imp/range.rs
index eed085cce..0a94f561a 100644
--- a/betree/src/tree/imp/range.rs
+++ b/betree/src/tree/imp/range.rs
@@ -37,7 +37,8 @@ pub struct RangeIterator>> {
     max_key: Option>,
     tree: Tree,
     finished: bool,
-    prefetch: Option<X::Prefetch>,
+    prefetch_node: Option<X::Prefetch>,
+    prefetch_buffer: Option<X::Prefetch>,
 }

 impl Iterator for RangeIterator
@@ -96,7 +97,8 @@ where
             tree,
             finished: false,
             buffer: VecDeque::new(),
-            prefetch: None,
+            prefetch_node: None,
+            prefetch_buffer: None,
         }
     }

@@ -106,7 +108,7 @@ where
             Bounded::Included(ref x) | Bounded::Excluded(ref x) => x,
         };
         self.tree
-            .leaf_range_query(min_key, &mut self.buffer, &mut self.prefetch)?
+            .leaf_range_query(min_key, &mut self.buffer, &mut self.prefetch_node)?
     };

     // Strip entries which are out of bounds from the buffer.
@@ -168,7 +170,7 @@ where
         &self,
         key: &[u8],
         data: &mut VecDeque<(CowBytes, (KeyInfo, SlicedCowBytes))>,
-        prefetch: &mut Option<X::Prefetch>,
+        prefetch_node: &mut Option<X::Prefetch>,
     ) -> Result<Option<CowBytes>, Error> {
         let result = {
             let mut left_pivot_key = None;
@@ -186,19 +188,22 @@ where
                 &mut messages,
             ) {
                 GetRangeResult::NextNode {
-                    prefetch_option,
                     np,
+                    prefetch_option_node,
                 } => {
-                    let previous_prefetch = if let Some(prefetch_np) = prefetch_option {
+                    let previous_prefetch_node = if let Some(prefetch_np) = prefetch_option_node
+                    {
                         let f = self.dml.prefetch(&prefetch_np.read())?;
-                        replace(prefetch, f)
+                        replace(prefetch_node, f)
                     } else {
-                        prefetch.take()
+                        prefetch_node.take()
                     };
-                    if let Some(previous_prefetch) = previous_prefetch {
-                        self.dml.finish_prefetch(previous_prefetch)?;
+
+                    if let Some(previous_prefetch) = previous_prefetch_node {
+                        self.dml.finish_prefetch(previous_prefetch)?
+                    } else {
+                        self.get_node(np)?
                     }
-                    self.get_node(np)?
                 }
                 GetRangeResult::Data(leaf_entries) => {
                     self.apply_messages(
diff --git a/betree/src/tree/imp/split.rs b/betree/src/tree/imp/split.rs
index c2a2d44a9..c69afd618 100644
--- a/betree/src/tree/imp/split.rs
+++ b/betree/src/tree/imp/split.rs
@@ -1,9 +1,10 @@
 //! Encapsulating logic for splitting of normal and root nodes.
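The split logic in this file relies on a strict bookkeeping contract, visible in the assertions below: every mutation returns a signed cache-size delta, and the caller may assume `before + delta == cache_size()` afterwards. A minimal sketch of that contract, with illustrative names rather than the crate's types:

```rust
// Toy node with explicit cache-size accounting; illustrative only.
struct Node {
    cached: usize,
}

impl Node {
    fn cache_size(&self) -> usize {
        self.cached
    }

    // Like split_root_mut() and friends: report the signed change instead
    // of letting callers re-measure the node.
    fn grow(&mut self, by: usize) -> isize {
        let before = self.cached as isize;
        self.cached += by;
        self.cached as isize - before
    }
}

fn main() {
    let mut node = Node { cached: 4096 };
    let before = node.cache_size();
    let size_delta = node.grow(1024);
    // The same invariant the split path asserts after a root split:
    assert!(before as isize + size_delta == node.cache_size() as isize);
}
```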
-use super::{child_buffer::ChildBuffer, internal::TakeChildBuffer, Inner, Node, Tree}; +use super::{Inner, Node, Tree}; use crate::{ cache::AddSize, data_management::{Dml, HasStoragePreference, ObjectReference}, size::Size, + tree::imp::internal::TakeChildBuffer, tree::{errors::*, MessageAction}, }; use std::borrow::Borrow; @@ -17,7 +18,7 @@ where { pub(super) fn split_root_node(&self, mut root_node: X::CacheValueRefMut) { self.dml.verify_cache(); - let before = root_node.size(); + let before = root_node.cache_size(); debug!( "Splitting root. {}, {:?}, {}, {:?}", root_node.kind(), @@ -25,7 +26,7 @@ where root_node.size(), root_node.actual_size() ); - let size_delta = root_node.split_root_mut(|node, pk| { + let size_delta = root_node.split_root_mut(&self.storage_map, |node, pk| { debug!( "Root split child: {}, {:?}, {}, {:?}", node.kind(), @@ -37,7 +38,7 @@ where .insert(node, self.tree_id(), pk.to_global(self.tree_id())) }); info!("Root split done. {}, {}", root_node.size(), size_delta); - debug_assert!(before as isize + size_delta == root_node.size() as isize); + assert!(before as isize + size_delta == root_node.cache_size() as isize); root_node.finish(size_delta); self.dml.verify_cache(); } @@ -45,12 +46,12 @@ where pub(super) fn split_node( &self, mut node: X::CacheValueRefMut, - parent: &mut TakeChildBuffer>, + parent: &mut TakeChildBuffer, ) -> Result<(X::CacheValueRefMut, isize), Error> { self.dml.verify_cache(); - let before = node.size(); - let (sibling, pivot_key, size_delta, lpk) = node.split(); + let before = node.cache_size(); + let (sibling, pivot_key, size_delta, lpk) = node.split(&self.storage_map); let pk = lpk.to_global(self.tree_id()); let select_right = sibling.size() > node.size(); debug!( diff --git a/betree/src/tree/message_action.rs b/betree/src/tree/message_action.rs index 52fde2afd..0a4812536 100644 --- a/betree/src/tree/message_action.rs +++ b/betree/src/tree/message_action.rs @@ -7,7 +7,7 @@ use crate::cow_bytes::SlicedCowBytes; use std::{fmt::Debug, ops::Deref}; /// Defines the action of a message. -pub trait MessageAction: Debug + Send + Sync { +pub trait MessageAction: Clone + Debug + Send + Sync { /// Applies the message `msg`. `data` holds the current data. fn apply(&self, key: &[u8], msg: &SlicedCowBytes, data: &mut Option); @@ -26,7 +26,7 @@ pub trait MessageAction: Debug + Send + Sync { ) -> SlicedCowBytes; } -impl MessageAction for T +impl MessageAction for T where T::Target: MessageAction, { diff --git a/betree/src/tree/mod.rs b/betree/src/tree/mod.rs index c1640ad6a..97c2406a5 100644 --- a/betree/src/tree/mod.rs +++ b/betree/src/tree/mod.rs @@ -3,11 +3,13 @@ mod default_message_action; mod errors; -mod imp; +pub mod imp; mod layer; mod message_action; mod pivot_key; +use serde::{Deserialize, Serialize}; + use crate::cow_bytes::{CowBytes, SlicedCowBytes}; pub use self::{ @@ -17,8 +19,26 @@ pub use self::{ message_action::MessageAction, }; +#[repr(C)] +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Serialize, Deserialize)] +/// Which node representation the tree should use. +pub enum StorageKind { + /// Conventional large nodes. HDD optimized. + Hdd = 0, + /// Partially fetched nodes. Memory only. + Memory, + /// Segmented nodes. For fast SSDs. 
+ Ssd, +} + +impl Default for StorageKind { + fn default() -> Self { + Self::Hdd + } +} + #[cfg(not(feature = "internal-api"))] -pub(crate) use self::{imp::NodeInfo, pivot_key::PivotKey}; +pub(crate) use self::pivot_key::PivotKey; #[cfg(feature = "internal-api")] pub use self::{imp::NodeInfo, pivot_key::PivotKey}; diff --git a/betree/src/tree/pivot_key.rs b/betree/src/tree/pivot_key.rs index 0df904973..2b5b0493b 100644 --- a/betree/src/tree/pivot_key.rs +++ b/betree/src/tree/pivot_key.rs @@ -30,8 +30,11 @@ use crate::{cow_bytes::CowBytes, database::DatasetId}; /// ``` #[derive(Hash, Clone, Debug, PartialEq, Eq, Serialize)] pub enum PivotKey { + /// Left most child of this node. Left of `.0`. LeftOuter(CowBytes, DatasetId), + /// Right child of `.0`. Right(CowBytes, DatasetId), + /// Root of the given tree. Root(DatasetId), } diff --git a/betree/src/vdev/block.rs b/betree/src/vdev/block.rs index c4dbcb63e..6a5db71ea 100644 --- a/betree/src/vdev/block.rs +++ b/betree/src/vdev/block.rs @@ -9,10 +9,31 @@ use std::{ /// A unit which represents a number of bytes which are a multiple of /// `BLOCK_SIZE`. -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +#[derive( + Debug, + Copy, + Clone, + PartialEq, + Eq, + PartialOrd, + Ord, + Hash, + Serialize, + Deserialize, + rkyv::Archive, + rkyv::Serialize, + rkyv::Deserialize, +)] +#[archive(check_bytes)] #[serde(transparent)] pub struct Block(pub T); +impl std::fmt::Display for Block { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_fmt(format_args!("Block({})", self.0)) + } +} + impl StaticSize for Block { fn static_size() -> usize { // Works for standard sizes diff --git a/betree/src/vdev/mem.rs b/betree/src/vdev/mem.rs index 7becbd683..e56f93365 100644 --- a/betree/src/vdev/mem.rs +++ b/betree/src/vdev/mem.rs @@ -2,14 +2,11 @@ use super::{ errors::*, AtomicStatistics, Block, Result, ScrubResult, Statistics, Vdev, VdevLeafRead, VdevLeafWrite, VdevRead, }; -use crate::{ - buffer::{Buf, BufWrite}, - checksum::Checksum, -}; +use crate::{buffer::Buf, checksum::Checksum}; use async_trait::async_trait; use parking_lot::RwLock; use std::{ - io::{self, Write}, + io, ops::{Deref, DerefMut}, sync::atomic::Ordering, }; @@ -60,8 +57,13 @@ impl Memory { match self.slice_blocks(size, offset) { Ok(slice) => { - let mut buf = BufWrite::with_capacity(size); - buf.write_all(&slice)?; + let buf = unsafe { + Buf::from_raw( + std::ptr::NonNull::new(slice.as_ptr() as *mut u8) + .expect("Pointer in Memory vdev was null."), + size, + ) + }; #[cfg(feature = "latency_metrics")] self.stats.read_op_latency.fetch_add( start @@ -71,7 +73,7 @@ impl Memory { .unwrap_or(u32::MAX as u64), Ordering::Relaxed, ); - Ok(buf.into_buf()) + Ok(buf) } Err(e) => { #[cfg(feature = "latency_metrics")] diff --git a/betree/src/vdev/parity1.rs b/betree/src/vdev/parity1.rs index 73b2639bb..37d326a61 100644 --- a/betree/src/vdev/parity1.rs +++ b/betree/src/vdev/parity1.rs @@ -167,7 +167,7 @@ impl Parity1 { } let (left, right) = buf.split_at(col_length); buf = right; - reads.push(disk.read_raw(left, disk_offset).into_future()); + reads.push_back(disk.read_raw(left, disk_offset).into_future()); } } let mut failed_idx = None; diff --git a/betree/src/vdev/pmemfile.rs b/betree/src/vdev/pmemfile.rs index ec8e578e0..31b554de2 100644 --- a/betree/src/vdev/pmemfile.rs +++ b/betree/src/vdev/pmemfile.rs @@ -6,14 +6,7 @@ use crate::{buffer::Buf, checksum::Checksum}; use async_trait::async_trait; use libc::{c_ulong, ioctl}; use pmdk; -use 
std::{ - fs, - io, - os::unix:: - io::AsRawFd, - - sync::atomic::Ordering, -}; +use std::{fs, io, os::unix::io::AsRawFd, sync::atomic::Ordering}; /// `LeafVdev` which is backed by NVM and uses `pmdk`. #[derive(Debug)] @@ -59,12 +52,26 @@ impl VdevRead for PMemFile { checksum: C, ) -> Result { self.stats.read.fetch_add(size.as_u64(), Ordering::Relaxed); - let buf = { - let mut buf = Buf::zeroed(size).into_full_mut(); - self.file.read(offset.to_bytes() as usize, buf.as_mut()); - buf.into_full_buf() + let buf = unsafe { + let slice = self + .file + .get_slice(offset.to_bytes() as usize, size.to_bytes() as usize)?; + // # SAFETY + // Since Bufs are read only anyways we ensure the safety of this + // step by re-packing this forced mutable pointer into one. + Buf::from_raw( + std::ptr::NonNull::new(slice.as_ptr() as *mut u8) + .expect("Pmem pointer was null when trying to read from offset."), + size, + ) }; + // let buf = { + // let mut buf = Buf::zeroed(size).into_full_mut(); + // self.file.read(offset.to_bytes() as usize, buf.as_mut()); + // buf.into_full_buf() + // }; + match checksum.verify(&buf).map_err(VdevError::from) { Ok(()) => Ok(buf), Err(e) => { @@ -92,10 +99,24 @@ impl VdevRead for PMemFile { async fn read_raw(&self, size: Block, offset: Block) -> Result> { self.stats.read.fetch_add(size.as_u64(), Ordering::Relaxed); - let mut buf = Buf::zeroed(size).into_full_mut(); + // let mut buf = Buf::zeroed(size).into_full_mut(); + + let buf = unsafe { + let slice = self + .file + .get_slice(offset.to_bytes() as usize, size.to_bytes() as usize)?; + // # SAFETY + // Since Bufs are read only anyways we ensure the safety of this + // step by re-packing this forced mutable pointer into one. + Buf::from_raw( + std::ptr::NonNull::new(slice.as_ptr() as *mut u8) + .expect("Pmem pointer was null when trying to read from offset."), + size, + ) + }; - self.file.read(offset.to_bytes() as usize, buf.as_mut()); - Ok(vec![buf.into_full_buf()]) + // self.file.read(offset.to_bytes() as usize, buf.as_mut()); + Ok(vec![buf]) } } diff --git a/betree/tests/src/configs.rs b/betree/tests/src/configs.rs index 7574047c7..0b77925c2 100644 --- a/betree/tests/src/configs.rs +++ b/betree/tests/src/configs.rs @@ -4,6 +4,7 @@ use betree_storage_stack::{ database::AccessMode, migration::{LfuConfig, LfuMode, MigrationConfig, MigrationPolicies}, storage_pool::{configuration::Vdev, LeafVdev, TierConfiguration}, + tree::StorageKind, DatabaseConfiguration, StoragePoolConfiguration, }; @@ -19,6 +20,7 @@ pub fn access_specific_config() -> DatabaseConfiguration { })], preferred_access_type: betree_storage_stack::PreferredAccessType::RandomReadWrite, + storage_kind: StorageKind::Ssd, }, TierConfiguration { top_level_vdevs: vec![Vdev::Leaf(LeafVdev::Memory { @@ -26,6 +28,7 @@ pub fn access_specific_config() -> DatabaseConfiguration { })], preferred_access_type: betree_storage_stack::PreferredAccessType::SequentialReadWrite, + storage_kind: StorageKind::Hdd, }, ], ..Default::default() diff --git a/betree/tests/src/lib.rs b/betree/tests/src/lib.rs index a3c235ef3..fcc2cb4f1 100644 --- a/betree/tests/src/lib.rs +++ b/betree/tests/src/lib.rs @@ -16,8 +16,10 @@ use betree_storage_stack::{ use std::{ env, io::{BufReader, Read}, + ops::RangeFull, sync::RwLockWriteGuard, }; +use util::random_db; use rand::{prelude::ThreadRng, Rng, SeedableRng}; use rand_xoshiro::Xoshiro256PlusPlus; @@ -25,13 +27,14 @@ use rand_xoshiro::Xoshiro256PlusPlus; use insta::assert_json_snapshot; use serde_json::json; -fn test_db(tiers: u32, mb_per_tier: u32) -> 
Database { +fn test_db(tiers: u32, mb_per_tier: u32, kind: StorageKind) -> Database { let tier_size = mb_per_tier as usize * 1024 * 1024; let cfg = DatabaseConfiguration { storage: StoragePoolConfiguration { tiers: (0..tiers) .map(|_| TierConfiguration { top_level_vdevs: vec![Vdev::Leaf(LeafVdev::Memory { mem: tier_size })], + storage_kind: kind, ..Default::default() }) .collect(), @@ -39,6 +42,7 @@ fn test_db(tiers: u32, mb_per_tier: u32) -> Database { }, compression: CompressionConfiguration::None, access_mode: AccessMode::AlwaysCreateNew, + cache_size: 32 * 1024 * 1024, ..Default::default() }; @@ -78,7 +82,7 @@ struct TestDriver { impl TestDriver { fn setup(test_name: &str, tiers: u32, mb_per_tier: u32) -> TestDriver { - let mut database = test_db(tiers, mb_per_tier); + let mut database = test_db(tiers, mb_per_tier, StorageKind::Hdd); TestDriver { name: String::from(test_name), @@ -181,6 +185,42 @@ impl TestDriver { } } +use betree_storage_stack::tree::StorageKind; + +#[rstest] +#[case(StorageKind::Memory)] +#[case(StorageKind::Hdd)] +fn insert_single_key(#[case] kind: StorageKind) { + let mut db = test_db(1, 512, kind); + let ds = db.open_or_create_dataset(b"foo").unwrap(); + + let key = &[42][..]; + let val = b"Hello World"; + ds.insert(key, val).unwrap(); + db.sync().unwrap(); + assert_eq!(&ds.get(key).unwrap().unwrap()[..], val); +} + +#[rstest] +#[case(StorageKind::Memory)] +#[case(StorageKind::Hdd)] +fn insert_random_keys(#[case] kind: StorageKind) { + let (db, ds, ks) = random_db(1, 1024, kind); + db.drop_cache().unwrap(); + for idx in 1..ks { + let k = (idx as u64).to_be_bytes(); + // println!("{:?} {}/{ks}", k.as_bytes(), idx); + assert_eq!(ds.get(&k[..]).unwrap().unwrap().len(), 1024); + } + // FIXME: Iterator is still broken... + for (idx, r) in ds.range::(..).unwrap().enumerate() { + let (key, val) = r.unwrap(); + let k = (idx as u64 + 1).to_be_bytes(); + assert_eq!(&k[..], &key[..]); + assert_eq!(val.len(), 1024); + } +} + #[test] fn insert_single() { let mut driver = TestDriver::setup("insert single", 1, 256); @@ -307,7 +347,7 @@ const TO_MEBIBYTE: usize = 1024 * 1024; // We repeat this test here to trigger this potential behavior fn write_flaky(tier_size_mb: u32, write_size_mb: usize) { for _ in 0..3 { - let mut db = test_db(1, tier_size_mb); + let mut db = test_db(1, tier_size_mb, StorageKind::Hdd); let os = db .open_named_object_store(b"test", StoragePreference::FASTEST) .expect("Oh no! Could not open object store"); @@ -376,7 +416,7 @@ fn write_full(#[case] tier_size_mb: u32, #[case] par_space: f32) { // on available storage space. fn write_overfull(#[case] tier_size_mb: u32, #[case] par_space: f32) { // env_logger::init(); - let mut db = test_db(1, tier_size_mb); + let mut db = test_db(1, tier_size_mb, Default::default()); let os = db .open_named_object_store(b"test", StoragePreference::FASTEST) .expect("Oh no! Could not open object store"); @@ -418,7 +458,7 @@ fn rng() -> ThreadRng { #[case::d(2048)] fn write_sequence(#[case] tier_size_mb: u32) { let mut rng = rand::thread_rng(); - let mut db = test_db(1, tier_size_mb); + let mut db = test_db(1, tier_size_mb, Default::default()); let os = db .open_named_object_store(b"test", StoragePreference::FASTEST) .expect("Oh no! 
Could not open object store"); @@ -446,7 +486,7 @@ use rand::prelude::SliceRandom; #[case::c(1024)] #[case::d(2048)] fn write_delete_sequence(#[case] tier_size_mb: u32, mut rng: ThreadRng) { - let mut db = test_db(1, tier_size_mb); + let mut db = test_db(1, tier_size_mb, Default::default()); let os = db .open_named_object_store(b"test", StoragePreference::FASTEST) .expect("Oh no! Could not open object store"); @@ -499,7 +539,7 @@ fn write_delete_sequence(#[case] tier_size_mb: u32, mut rng: ThreadRng) { // The size s_1 of the tier should be in relation to the buffer size s_2 // s_1 < 3*s_2 && s_1 > 2*s_2 fn write_delete_essential_size(#[case] tier_size_mb: u32, #[case] buf_size: usize) { - let mut db = test_db(1, tier_size_mb); + let mut db = test_db(1, tier_size_mb, Default::default()); let os = db .open_named_object_store(b"test", StoragePreference::FASTEST) .expect("Oh no! Could not open object store"); @@ -563,7 +603,7 @@ fn write_delete_essential_size(#[case] tier_size_mb: u32, #[case] buf_size: usiz // We should include some measure to handle these cases. // -> Space Accounting! fn overwrite_buffer(#[case] tier_size_mb: u32, #[case] buf_size: usize) { - let mut db = test_db(1, tier_size_mb); + let mut db = test_db(1, tier_size_mb, Default::default()); let os = db .open_named_object_store(b"test", StoragePreference::FASTEST) .expect("Oh no! Could not open object store"); @@ -601,7 +641,7 @@ fn overwrite_buffer(#[case] tier_size_mb: u32, #[case] buf_size: usize) { #[rstest] #[case::a(2048)] fn write_sequence_random_fill(#[case] tier_size_mb: u32, mut rng: ThreadRng) { - let mut db = test_db(1, tier_size_mb); + let mut db = test_db(1, tier_size_mb, Default::default()); let os = db .open_named_object_store(b"test", StoragePreference::FASTEST) .expect("Oh no! 
Could not open object store"); @@ -629,7 +669,7 @@ fn write_sequence_random_fill(#[case] tier_size_mb: u32, mut rng: ThreadRng) { #[rstest] #[case::a(32)] fn dataset_migrate_down(#[case] tier_size_mb: u32) { - let mut db = test_db(2, tier_size_mb); + let mut db = test_db(2, tier_size_mb, Default::default()); let ds = db.open_or_create_dataset(b"miniprod").unwrap(); let buf = vec![42u8; 512 * 1024]; let key = b"test".to_vec(); @@ -639,6 +679,10 @@ fn dataset_migrate_down(#[case] tier_size_mb: u32) { let space = db.free_space_tier(); assert!(space[0].free < space[1].free); ds.migrate(key, StoragePreference::FAST).unwrap(); + println!( + "{}", + serde_json::to_string_pretty(&ds.tree_dump().unwrap()).unwrap() + ); db.sync().unwrap(); let space = db.free_space_tier(); assert!(space[0].free > space[1].free); @@ -651,7 +695,7 @@ fn dataset_migrate_down(#[case] tier_size_mb: u32) { #[case::d(2048)] fn object_migrate_down(#[case] tier_size_mb: u32) { // env_logger::init(); - let mut db = test_db(2, tier_size_mb); + let mut db = test_db(2, tier_size_mb, Default::default()); let os = db .open_named_object_store(b"test", StoragePreference::FASTEST) .unwrap(); @@ -668,7 +712,7 @@ fn object_migrate_down(#[case] tier_size_mb: u32) { #[rstest] #[case::a(32)] fn dataset_migrate_up(#[case] tier_size_mb: u32) { - let mut db = test_db(2, tier_size_mb); + let mut db = test_db(2, tier_size_mb, Default::default()); let ds = db.open_or_create_dataset(b"miniprod").unwrap(); let buf = vec![42u8; 512 * 1024]; let key = b"test".to_vec(); @@ -690,7 +734,7 @@ fn dataset_migrate_up(#[case] tier_size_mb: u32) { #[case::d(2048)] fn object_migrate_up(#[case] tier_size_mb: u32) { // env_logger::init(); - let mut db = test_db(2, tier_size_mb); + let mut db = test_db(2, tier_size_mb, Default::default()); let os = db .open_named_object_store(b"test", StoragePreference::FAST) .unwrap(); @@ -733,7 +777,7 @@ fn object_migrate_invalid_size(#[case] tier_size_mb: u32, #[case] buffer_size: u #[case::c(512)] #[case::d(2048)] fn object_migrate_invalid_tier(#[case] tier_size_mb: u32) { - let mut db = test_db(2, tier_size_mb); + let mut db = test_db(2, tier_size_mb, Default::default()); let os = db .open_named_object_store(b"test", StoragePreference::FASTEST) .unwrap(); @@ -753,7 +797,7 @@ fn object_migrate_invalid_tier(#[case] tier_size_mb: u32) { #[case::d(2048)] // @jwuensche: This case should not raise any errors and should just allow silent dropping of the operation. 
fn object_migrate_nochange(#[case] tier_size_mb: u32) { - let mut db = test_db(2, tier_size_mb); + let mut db = test_db(2, tier_size_mb, Default::default()); let os = db .open_named_object_store(b"test", StoragePreference::FASTEST) .unwrap(); @@ -768,7 +812,7 @@ fn object_migrate_nochange(#[case] tier_size_mb: u32) { #[rstest] fn space_accounting_smoke() { // env_logger::init(); - let mut db = test_db(2, 64); + let mut db = test_db(2, 64, Default::default()); let before = db.free_space_tier(); let os = db .open_named_object_store(b"test", StoragePreference::FASTEST) diff --git a/betree/tests/src/object_store.rs b/betree/tests/src/object_store.rs index d37d22c18..007575a53 100644 --- a/betree/tests/src/object_store.rs +++ b/betree/tests/src/object_store.rs @@ -5,7 +5,7 @@ use super::{configs, test_db, TO_MEBIBYTE}; #[test] // Open and close the default object store and test if the objects are preserved fn default_object_store_object_persists() { - let mut db = test_db(2, 64); + let mut db = test_db(2, 64, Default::default()); let os = db.open_object_store().unwrap(); let obj = os.open_or_create_object(b"hewo").unwrap(); obj.write_at(&[1, 2, 3], 0).unwrap(); @@ -25,7 +25,7 @@ fn default_object_store_object_persists() { #[test] // Open and close the default object store and test if the objects are preserved fn object_store_object_persists() { - let mut db = test_db(2, 64); + let mut db = test_db(2, 64, Default::default()); let os = db .open_named_object_store(b"uwu", StoragePreference::NONE) .unwrap(); @@ -46,7 +46,7 @@ fn object_store_object_persists() { #[test] fn object_store_iter() { - let mut db = test_db(2, 64); + let mut db = test_db(2, 64, Default::default()); let os = db.open_object_store().unwrap(); db.close_object_store(os); let os = db @@ -65,7 +65,7 @@ fn object_store_iter() { #[test] fn object_store_object_iter() { - let mut db = test_db(2, 64); + let mut db = test_db(2, 64, Default::default()); let os = db.open_object_store().unwrap(); let _ = os.open_or_create_object(b"hewo").unwrap(); let _ = os.open_or_create_object(b"uwu").unwrap(); @@ -78,7 +78,7 @@ fn object_store_object_iter() { fn object_store_reinit_from_iterator() { // Test opening of multiple stores by their names. // Test if the default store name '0' gets skipped. 
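The long run of mechanical test changes here follows one pattern: `test_db` and `random_db` gained a `StorageKind` argument, and existing call sites pass `Default::default()` (i.e. `StorageKind::Hdd`) to keep their old behavior. A hedged sketch of the pattern, with a stand-in helper rather than the real test fixture:

```rust
// Stand-in for betree_storage_stack::tree::StorageKind and the test helper.
#[derive(Debug, Clone, Copy, Default)]
enum StorageKind {
    #[default]
    Hdd,
    Memory,
    Ssd,
}

fn test_db(tiers: u32, mb_per_tier: u32, kind: StorageKind) {
    println!("{tiers} tier(s), {mb_per_tier} MiB each, kind {kind:?}");
}

fn main() {
    test_db(2, 64, Default::default()); // migrated call site, old behavior
    test_db(1, 512, StorageKind::Memory); // new kind-specific test
    test_db(1, 512, StorageKind::Ssd);
}
```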
- let mut db = test_db(2, 64); + let mut db = test_db(2, 64, Default::default()); let os = db .open_named_object_store(b"foo", StoragePreference::NONE) .unwrap(); @@ -128,7 +128,7 @@ fn object_store_access_pattern() { #[test] fn object_store_reinit_from_id() { - let mut db = test_db(2, 64); + let mut db = test_db(2, 64, Default::default()); let os = db.open_object_store().unwrap(); db.close_object_store(os); let mut osl = db.iter_object_stores_pub().unwrap(); diff --git a/betree/tests/src/pivot_key.rs b/betree/tests/src/pivot_key.rs index 97c0ede6f..b5333d677 100644 --- a/betree/tests/src/pivot_key.rs +++ b/betree/tests/src/pivot_key.rs @@ -4,14 +4,14 @@ use rand::seq::IteratorRandom; #[test] fn structure_is_good() { - let (_db, ds) = util::random_db(2, 128); + let (_db, ds, _) = util::random_db(1, 256, Default::default()); let dmp = ds.tree_dump().unwrap(); internal_node_check(&dmp) } #[test] fn get() { - let (_db, ds) = util::random_db(2, 128); + let (db, ds, _) = util::random_db(1, 256, Default::default()); let dmp = ds.tree_dump().unwrap(); let pk = random_pivot_key(&dmp).unwrap(); let _node = ds.test_get_node_pivot(pk).unwrap().unwrap(); @@ -36,10 +36,15 @@ fn random_pivot_key(ni: &NodeInfo) -> Option<&PivotKey> { } fn internal_node_check(ni: &NodeInfo) { - if let NodeInfo::Internal { children, .. } = ni { + if let NodeInfo::Internal { + children, level, .. + } = ni + { for (idx, c_buf) in children.iter().enumerate() { assert!(!c_buf.pivot_key.is_root()); if idx == 0 { + dbg!(&c_buf.pivot_key); + dbg!(level); assert!(c_buf.pivot_key.is_left()); } else { assert!(c_buf.pivot_key.is_right()); diff --git a/betree/tests/src/snapshots/betree_tests__delete single__deleted something.snap b/betree/tests/src/snapshots/betree_tests__delete single__deleted something.snap index e025b18e1..e0ecd6539 100644 --- a/betree/tests/src/snapshots/betree_tests__delete single__deleted something.snap +++ b/betree/tests/src/snapshots/betree_tests__delete single__deleted something.snap @@ -1,6 +1,6 @@ --- source: betree/tests/src/lib.rs -expression: "json!({\n \"shape/data\" :\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\" : self.object_store.data_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query data keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather data keys\"), \"keys/meta\" :\n self.object_store.meta_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query meta keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather meta keys\")\n })" +expression: "json!({\n \"shape/data\":\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\": self.object_store.data_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query data keys\").map(|res|\n res.map(|(k, _v)| k)).collect::,\n _>>().expect(\"Failed to gather data keys\"), \"keys/meta\":\n self.object_store.meta_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query meta keys\").map(|res|\n res.map(|(k, _v)| k)).collect::,\n _>>().expect(\"Failed to gather meta keys\")\n})" --- { "keys/data": [ @@ -18,7 +18,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -49,7 +49,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - 
"storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -80,7 +80,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -111,7 +111,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -142,7 +142,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -173,7 +173,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -204,7 +204,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -235,7 +235,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -266,7 +266,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -297,7 +297,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -328,7 +328,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -359,7 +359,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -390,7 +390,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -421,7 +421,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -452,7 +452,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -483,7 +483,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -514,7 +514,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -545,7 +545,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -576,7 +576,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -607,7 
+607,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -638,7 +638,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -669,7 +669,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -700,7 +700,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -731,7 +731,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -762,7 +762,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -793,7 +793,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -824,7 +824,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -855,7 +855,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -886,7 +886,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -917,7 +917,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -948,7 +948,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -979,7 +979,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -1318,7 +1318,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t } ], "level": 1, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "internal" } diff --git a/betree/tests/src/snapshots/betree_tests__delete single__inserted something.snap b/betree/tests/src/snapshots/betree_tests__delete single__inserted something.snap index 4558c07c7..adddbb51f 100644 --- a/betree/tests/src/snapshots/betree_tests__delete single__inserted something.snap +++ b/betree/tests/src/snapshots/betree_tests__delete single__inserted something.snap @@ -1,6 +1,6 @@ --- source: betree/tests/src/lib.rs -expression: "json!({\n \"shape/data\" :\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\" : self.object_store.data_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query data keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: 
< Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather data keys\"), \"keys/meta\" :\n self.object_store.meta_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query meta keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather meta keys\")\n })"
+expression: "json!({\n \"shape/data\":\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\": self.object_store.data_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query data keys\").map(|res|\n res.map(|(k, _v)| k)).collect::<Result<Vec<_>,\n _>>().expect(\"Failed to gather data keys\"), \"keys/meta\":\n self.object_store.meta_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query meta keys\").map(|res|\n res.map(|(k, _v)| k)).collect::<Result<Vec<_>,\n _>>().expect(\"Failed to gather meta keys\")\n})" --- { "keys/data": [
@@ -14030,7 +14030,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -14061,7 +14061,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -14092,7 +14092,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -14123,7 +14123,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -14154,7 +14154,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -14185,7 +14185,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -14216,7 +14216,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -14247,7 +14247,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -14278,7 +14278,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -14309,7 +14309,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -14340,7 +14340,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -14371,7 +14371,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -14402,7 +14402,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 
254, "type": "leaf" }, @@ -14433,7 +14433,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14464,7 +14464,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14495,7 +14495,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14526,7 +14526,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14557,7 +14557,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14588,7 +14588,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14619,7 +14619,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14650,7 +14650,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14681,7 +14681,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14712,7 +14712,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14743,7 +14743,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14774,7 +14774,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14805,7 +14805,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14836,7 +14836,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14867,7 +14867,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14898,7 +14898,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14929,7 +14929,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": 
"leaf" }, @@ -14960,7 +14960,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -14991,7 +14991,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -15330,7 +15330,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t } ], "level": 1, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "internal" } diff --git a/betree/tests/src/snapshots/betree_tests__downgrade__fast pref.snap b/betree/tests/src/snapshots/betree_tests__downgrade__fast pref.snap index c9b94f95f..2ad7e76b7 100644 --- a/betree/tests/src/snapshots/betree_tests__downgrade__fast pref.snap +++ b/betree/tests/src/snapshots/betree_tests__downgrade__fast pref.snap @@ -1,6 +1,6 @@ --- source: betree/tests/src/lib.rs -expression: "json!({\n \"shape/data\" :\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\" : self.object_store.data_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query data keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather data keys\"), \"keys/meta\" :\n self.object_store.meta_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query meta keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather meta keys\")\n })" +expression: "json!({\n \"shape/data\":\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\": self.object_store.data_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query data keys\").map(|res|\n res.map(|(k, _v)| k)).collect::,\n _>>().expect(\"Failed to gather data keys\"), \"keys/meta\":\n self.object_store.meta_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query meta keys\").map(|res|\n res.map(|(k, _v)| k)).collect::,\n _>>().expect(\"Failed to gather meta keys\")\n})" --- { "keys/data": [ @@ -1872,7 +1872,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 1, + "storage": 0, "system_storage": 254, "type": "leaf" }, @@ -1896,14 +1896,14 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t 1 ] }, - "storage": 1, + "storage": 0, "to": "0000 0000 0000 0000 0000 0017" }, { "child": { "entry_count": 24, "level": 0, - "storage": 1, + "storage": 0, "system_storage": 254, "type": "leaf" }, @@ -1927,14 +1927,14 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t 1 ] }, - "storage": 1, + "storage": 0, "to": "0000 0000 0000 0000 0000 002F" }, { "child": { "entry_count": 24, "level": 0, - "storage": 1, + "storage": 0, "system_storage": 254, "type": "leaf" }, @@ -1958,14 +1958,14 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t 1 ] }, - "storage": 1, + "storage": 0, "to": "0000 0000 0000 0000 0000 0047" }, { "child": { "entry_count": 24, "level": 0, - "storage": 1, + "storage": 0, "system_storage": 254, "type": "leaf" }, @@ -1989,14 +1989,14 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t 1 ] }, - "storage": 1, + "storage": 0, "to": "0000 0000 0000 0000 0000 005F" }, { "child": { "entry_count": 24, "level": 0, - "storage": 1, + "storage": 0, "system_storage": 254, "type": "leaf" }, @@ -2020,7 +2020,7 @@ expression: 
"json!({\n \"shape/data\" :\n self.object_store.data_t 1 ] }, - "storage": 1, + "storage": 0, "to": "0000 0000 0000 0000 0000 0077" }, { @@ -2056,7 +2056,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t } ], "level": 1, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "internal" } diff --git a/betree/tests/src/snapshots/betree_tests__downgrade__fastest pref.snap b/betree/tests/src/snapshots/betree_tests__downgrade__fastest pref.snap index 685ff4e3e..50b93b895 100644 --- a/betree/tests/src/snapshots/betree_tests__downgrade__fastest pref.snap +++ b/betree/tests/src/snapshots/betree_tests__downgrade__fastest pref.snap @@ -1,6 +1,6 @@ --- source: betree/tests/src/lib.rs -expression: "json!({\n \"shape/data\" :\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\" : self.object_store.data_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query data keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather data keys\"), \"keys/meta\" :\n self.object_store.meta_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query meta keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather meta keys\")\n })" +expression: "json!({\n \"shape/data\":\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\": self.object_store.data_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query data keys\").map(|res|\n res.map(|(k, _v)| k)).collect::,\n _>>().expect(\"Failed to gather data keys\"), \"keys/meta\":\n self.object_store.meta_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query meta keys\").map(|res|\n res.map(|(k, _v)| k)).collect::,\n _>>().expect(\"Failed to gather meta keys\")\n})" --- { "keys/data": [ @@ -1927,7 +1927,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t } ], "level": 1, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "internal" } diff --git a/betree/tests/src/snapshots/betree_tests__insert single__deleted foo.snap b/betree/tests/src/snapshots/betree_tests__insert single__deleted foo.snap index 7fb266ec3..38f1006b7 100644 --- a/betree/tests/src/snapshots/betree_tests__insert single__deleted foo.snap +++ b/betree/tests/src/snapshots/betree_tests__insert single__deleted foo.snap @@ -1,6 +1,6 @@ --- source: betree/tests/src/lib.rs -expression: "json!({\n \"shape/data\" :\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\" : self.object_store.data_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query data keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather data keys\"), \"keys/meta\" :\n self.object_store.meta_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query meta keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather meta keys\")\n })" +expression: "json!({\n \"shape/data\":\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\": self.object_store.data_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query data keys\").map(|res|\n res.map(|(k, _v)| k)).collect::,\n _>>().expect(\"Failed to gather data keys\"), \"keys/meta\":\n self.object_store.meta_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query meta keys\").map(|res|\n res.map(|(k, _v)| 
k)).collect::<Result<Vec<_>,\n _>>().expect(\"Failed to gather meta keys\")\n})" --- { "keys/data": [
@@ -18,7 +18,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -357,7 +357,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t } ], "level": 1, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "internal" }
diff --git a/betree/tests/src/snapshots/betree_tests__insert single__inserted bar.snap b/betree/tests/src/snapshots/betree_tests__insert single__inserted bar.snap index 575c794ac..e6bbb85d0 100644 --- a/betree/tests/src/snapshots/betree_tests__insert single__inserted bar.snap +++ b/betree/tests/src/snapshots/betree_tests__insert single__inserted bar.snap @@ -1,6 +1,6 @@ --- source: betree/tests/src/lib.rs
-expression: "json!({\n \"shape/data\" :\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\" : self.object_store.data_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query data keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather data keys\"), \"keys/meta\" :\n self.object_store.meta_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query meta keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather meta keys\")\n })"
+expression: "json!({\n \"shape/data\":\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\": self.object_store.data_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query data keys\").map(|res|\n res.map(|(k, _v)| k)).collect::<Result<Vec<_>,\n _>>().expect(\"Failed to gather data keys\"), \"keys/meta\":\n self.object_store.meta_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query meta keys\").map(|res|\n res.map(|(k, _v)| k)).collect::<Result<Vec<_>,\n _>>().expect(\"Failed to gather meta keys\")\n})" --- { "keys/data": [
@@ -2656,7 +2656,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -2687,7 +2687,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -2718,7 +2718,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -2749,7 +2749,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -2780,7 +2780,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -2811,7 +2811,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -2842,7 +2842,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -2873,7 +2873,7 @@ expression: "json!({\n \"shape/data\" :\n 
self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" }, @@ -2964,7 +2964,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t }, { "child": { - "entry_count": 24, + "entry_count": 15, "level": 0, "storage": 0, "system_storage": 254, @@ -2991,7 +2991,38 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t ] }, "storage": 0, - "to": "0000 0000 0000 0001 0000 0017" + "to": "0000 0000 0000 0001 0000 000E" + }, + { + "child": { + "entry_count": 8, + "level": 0, + "storage": 0, + "system_storage": 254, + "type": "leaf" + }, + "from": "0000 0000 0000 0001 0000 000E", + "pivot_key": { + "Right": [ + [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 0, + 0, + 0, + 14 + ], + 1 + ] + }, + "storage": 0, + "to": "0000 0000 0000 0001 0000 0016" }, { "child": { @@ -3001,7 +3032,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "system_storage": 254, "type": "leaf" }, - "from": "0000 0000 0000 0001 0000 0017", + "from": "0000 0000 0000 0001 0000 0016", "pivot_key": { "Right": [ [ @@ -3016,13 +3047,13 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t 0, 0, 0, - 23 + 22 ], 1 ] }, "storage": 0, - "to": "0000 0000 0000 0001 0000 002F" + "to": "0000 0000 0000 0001 0000 002E" }, { "child": { @@ -3032,7 +3063,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "system_storage": 254, "type": "leaf" }, - "from": "0000 0000 0000 0001 0000 002F", + "from": "0000 0000 0000 0001 0000 002E", "pivot_key": { "Right": [ [ @@ -3047,13 +3078,13 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t 0, 0, 0, - 47 + 46 ], 1 ] }, "storage": 0, - "to": "0000 0000 0000 0001 0000 0047" + "to": "0000 0000 0000 0001 0000 0046" }, { "child": { @@ -3063,7 +3094,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "system_storage": 254, "type": "leaf" }, - "from": "0000 0000 0000 0001 0000 0047", + "from": "0000 0000 0000 0001 0000 0046", "pivot_key": { "Right": [ [ @@ -3078,13 +3109,13 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t 0, 0, 0, - 71 + 70 ], 1 ] }, "storage": 0, - "to": "0000 0000 0000 0001 0000 005F" + "to": "0000 0000 0000 0001 0000 005E" }, { "child": { @@ -3094,7 +3125,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "system_storage": 254, "type": "leaf" }, - "from": "0000 0000 0000 0001 0000 005F", + "from": "0000 0000 0000 0001 0000 005E", "pivot_key": { "Right": [ [ @@ -3109,13 +3140,13 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t 0, 0, 0, - 95 + 94 ], 1 ] }, "storage": 0, - "to": "0000 0000 0000 0001 0000 0077" + "to": "0000 0000 0000 0001 0000 0076" }, { "child": { @@ -3125,7 +3156,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "system_storage": 254, "type": "leaf" }, - "from": "0000 0000 0000 0001 0000 0077", + "from": "0000 0000 0000 0001 0000 0076", "pivot_key": { "Right": [ [ @@ -3140,13 +3171,13 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t 0, 0, 0, - 119 + 118 ], 1 ] }, "storage": 0, - "to": "0000 0000 0000 0001 0000 008F" + "to": "0000 0000 0000 0001 0000 008E" }, { "child": { @@ -3156,7 +3187,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "system_storage": 254, "type": "leaf" }, - "from": "0000 0000 0000 0001 0000 008F", + "from": "0000 0000 0000 0001 0000 008E", "pivot_key": { "Right": [ [ @@ -3171,23 +3202,23 @@ expression: "json!({\n \"shape/data\" :\n 
self.object_store.data_t 0, 0, 0, - 143 + 142 ], 1 ] }, "storage": 0, - "to": "0000 0000 0000 0001 0000 00A7" + "to": "0000 0000 0000 0001 0000 00A6" }, { "child": { - "entry_count": 21, + "entry_count": 22, "level": 0, "storage": 0, "system_storage": 254, "type": "leaf" }, - "from": "0000 0000 0000 0001 0000 00A7", + "from": "0000 0000 0000 0001 0000 00A6", "pivot_key": { "Right": [ [
@@ -3202,7 +3233,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t 0, 0, 0, - 167 + 166 ], 1 ]
@@ -3212,7 +3243,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t } ], "level": 1, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "internal" }
diff --git a/betree/tests/src/snapshots/betree_tests__insert single__inserted foo.snap b/betree/tests/src/snapshots/betree_tests__insert single__inserted foo.snap index 685ff4e3e..50b93b895 100644 --- a/betree/tests/src/snapshots/betree_tests__insert single__inserted foo.snap +++ b/betree/tests/src/snapshots/betree_tests__insert single__inserted foo.snap @@ -1,6 +1,6 @@ --- source: betree/tests/src/lib.rs
-expression: "json!({\n \"shape/data\" :\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\" : self.object_store.data_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query data keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather data keys\"), \"keys/meta\" :\n self.object_store.meta_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query meta keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather meta keys\")\n })"
+expression: "json!({\n \"shape/data\":\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\": self.object_store.data_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query data keys\").map(|res|\n res.map(|(k, _v)| k)).collect::<Result<Vec<_>,\n _>>().expect(\"Failed to gather data keys\"), \"keys/meta\":\n self.object_store.meta_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query meta keys\").map(|res|\n res.map(|(k, _v)| k)).collect::<Result<Vec<_>,\n _>>().expect(\"Failed to gather meta keys\")\n})" --- { "keys/data": [
@@ -1927,7 +1927,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t } ], "level": 1, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "internal" }
diff --git a/betree/tests/src/snapshots/betree_tests__insert single__rewrote foo, but larger.snap b/betree/tests/src/snapshots/betree_tests__insert single__rewrote foo, but larger.snap index fb1e5d976..6463ac4d1 100644 --- a/betree/tests/src/snapshots/betree_tests__insert single__rewrote foo, but larger.snap +++ b/betree/tests/src/snapshots/betree_tests__insert single__rewrote foo, but larger.snap @@ -1,6 +1,6 @@ --- source: betree/tests/src/lib.rs
-expression: "json!({\n \"shape/data\" :\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\" : self.object_store.data_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query data keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather data keys\"), \"keys/meta\" :\n self.object_store.meta_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query meta keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather meta keys\")\n })"
+expression: "json!({\n \"shape/data\":\n 
self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\": self.object_store.data_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query data keys\").map(|res|\n res.map(|(k, _v)| k)).collect::<Result<Vec<_>,\n _>>().expect(\"Failed to gather data keys\"), \"keys/meta\":\n self.object_store.meta_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query meta keys\").map(|res|\n res.map(|(k, _v)| k)).collect::<Result<Vec<_>,\n _>>().expect(\"Failed to gather meta keys\")\n})" --- { "keys/data": [
@@ -3524,7 +3524,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -3863,7 +3863,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t } ], "level": 1, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "internal" }
diff --git a/betree/tests/src/snapshots/betree_tests__sparse__sparse write 1.snap b/betree/tests/src/snapshots/betree_tests__sparse__sparse write 1.snap index f526b3677..57113093e 100644 --- a/betree/tests/src/snapshots/betree_tests__sparse__sparse write 1.snap +++ b/betree/tests/src/snapshots/betree_tests__sparse__sparse write 1.snap @@ -1,6 +1,6 @@ --- source: betree/tests/src/lib.rs
-expression: "json!({\n \"shape/data\" :\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\" : self.object_store.data_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query data keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather data keys\"), \"keys/meta\" :\n self.object_store.meta_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query meta keys\").map(| res |\n res.map(| (k, _v) | k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather meta keys\")\n })"
+expression: "json!({\n \"shape/data\":\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\": self.object_store.data_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query data keys\").map(|res|\n res.map(|(k, _v)| k)).collect::<Result<Vec<_>,\n _>>().expect(\"Failed to gather data keys\"), \"keys/meta\":\n self.object_store.meta_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query meta keys\").map(|res|\n res.map(|(k, _v)| k)).collect::<Result<Vec<_>,\n _>>().expect(\"Failed to gather meta keys\")\n})" --- { "keys/data": [
@@ -3101,7 +3101,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t } ], "level": 1, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "internal" }
diff --git a/betree/tests/src/snapshots/betree_tests__sparse__sparse write 2.snap b/betree/tests/src/snapshots/betree_tests__sparse__sparse write 2.snap index 413e8db67..cb6b01e40 100644 --- a/betree/tests/src/snapshots/betree_tests__sparse__sparse write 2.snap +++ b/betree/tests/src/snapshots/betree_tests__sparse__sparse write 2.snap @@ -1,6 +1,6 @@ --- source: betree/tests/src/lib.rs
-expression: "json!({\n \"shape/data\" :\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\" : self.object_store.data_tree().range :: < _, & [u8] >\n (..).expect(\"Failed to query data keys\").map(| res |\n res.map(| (k, _v) | 
k)).collect :: < Result < Vec < _ >, _ >>\n ().expect(\"Failed to gather meta keys\")\n })"
+expression: "json!({\n \"shape/data\":\n self.object_store.data_tree().tree_dump().expect(\"Failed to create data tree dump\"),\n \"keys/data\": self.object_store.data_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query data keys\").map(|res|\n res.map(|(k, _v)| k)).collect::<Result<Vec<_>,\n _>>().expect(\"Failed to gather data keys\"), \"keys/meta\":\n self.object_store.meta_tree().range::<_,\n &[u8]>(..).expect(\"Failed to query meta keys\").map(|res|\n res.map(|(k, _v)| k)).collect::<Result<Vec<_>,\n _>>().expect(\"Failed to gather meta keys\")\n})" --- { "keys/data": [
@@ -7024,7 +7024,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -7055,7 +7055,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -7086,7 +7086,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -7117,7 +7117,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -7148,7 +7148,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -7179,7 +7179,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -7210,7 +7210,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -7241,7 +7241,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -7272,7 +7272,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -7303,7 +7303,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 8, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -7334,7 +7334,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t "child": { "entry_count": 24, "level": 0, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "leaf" },
@@ -7704,7 +7704,7 @@ expression: "json!({\n \"shape/data\" :\n self.object_store.data_t } ], "level": 1, - "storage": 0, + "storage": 254, "system_storage": 254, "type": "internal" }
diff --git a/betree/tests/src/util.rs b/betree/tests/src/util.rs index a84a61a77..b9e5245ac 100644 --- a/betree/tests/src/util.rs +++ b/betree/tests/src/util.rs @@ -1,18 +1,26 @@ use super::test_db; -use betree_storage_stack::{Database, Dataset}; -use rand::RngCore; +use betree_storage_stack::{tree::StorageKind, Database, Dataset}; +use rand::{ + rngs::{StdRng, ThreadRng}, + seq::SliceRandom, + RngCore, +}; -pub fn random_db(tier: u32, mb_per_tier: 
u32) -> (Database, Dataset) { - let mut db = test_db(tier, mb_per_tier); +pub fn random_db(tier: u32, mb_per_tier: u32, kind: StorageKind) -> (Database, Dataset, u32) { + let mut db = test_db(tier, mb_per_tier, kind); let ds = db.open_or_create_dataset(b"hey").unwrap(); - let mut key = vec![0u8; 64]; - let mut val = vec![0u8; 4096]; - let mut rng = rand::thread_rng(); - for _ in 0..20000 { - rng.fill_bytes(&mut key); + let mut val = vec![0u8; 1024]; + let mut rng: StdRng = rand::SeedableRng::seed_from_u64(1337); + let ks = (tier as f32 * (mb_per_tier as u64 * 1024 * 1024) as f32 * 0.4) as u32 / 1086; + let mut foo: Vec<u64> = (1..ks as u64).collect(); + foo.shuffle(&mut rng); + for (it, idx) in foo.iter().enumerate() { rng.fill_bytes(&mut val); - ds.insert(key.clone(), val.as_slice()).unwrap(); + ds.insert(&idx.to_be_bytes()[..], val.as_slice()).unwrap(); + if it % 10000 == 0 { + db.sync().unwrap(); + } } db.sync().unwrap(); - (db, ds) + (db, ds, ks) }
diff --git a/fio-haura/.ci/haura.json b/fio-haura/.ci/haura.json index 67e8b2718..a54869cbc 100644 --- a/fio-haura/.ci/haura.json +++ b/fio-haura/.ci/haura.json @@ -3,7 +3,8 @@ "tiers": [ { "top_level_vdevs": [], - "preferred_access_type": "Unknown" + "preferred_access_type": "Unknown", + "storage_kind": "Ssd" } ], "queue_depth_factor": 20, @@ -31,5 +32,4 @@ "sync_interval_ms": null, "migration_policy": null, "metrics": null -} - +} \ No newline at end of file
diff --git a/fio-haura/bench_fio.sh b/fio-haura/bench_fio.sh new file mode 100755 index 000000000..26733b202 --- /dev/null +++ b/fio-haura/bench_fio.sh @@ -0,0 +1,42 @@ +#!/bin/env bash + +set -e + +# This script provides a structured way to run multiple fio benchmarks with +# varying parameters. It is intended to be modified to customize your benchmark +# runs. +export_options=(--group_reporting --output-format=json --output=output.json --write_bw_log=bench --write_lat_log=bench --write_hist_log=bench --write_iops_log=bench --log_hist_msec=100 --log_avg_msec=100 --directory=./.bench-fio-tmp-data) +root=$PWD + +# Below are possible configuration options. Add elements to run multiple +# benchmarks. +modes=(read randread write randwrite) +ioengines=("external:${root}/src/fio-engine-haura.o") +blocksizes=(4k 4m) +jobs=(1 2 3 4 5 6 7 8) +size_gb=1 +runtime=30s +extra_options=(--disrespect-fio-options) +id="results_ID" + +mkdir "$id" +pushd "$id" || exit + +for ioengine in "${ioengines[@]}"; do + for job in "${jobs[@]}"; do + for mode in "${modes[@]}"; do + for blocksize in "${blocksizes[@]}"; do + name="${mode}_$(echo "$ioengine" | awk -F'/' '{print $NF}')_${blocksize}_${job}" + mkdir "${name}" + pushd "${name}" || exit + size=$((size_gb * 1024 / job)) + mkdir .bench-fio-tmp-data + "${root}/fio-fio-3.33/fio" "--name=${name}" "--readwrite=${mode}" "--ioengine=${ioengine}" "--blocksize=${blocksize}" "--numjobs=${job}" "--runtime=${runtime}" "--size=${size}M" "${export_options[@]}" "${extra_options[@]}" + rm -rf .bench-fio-tmp-data + popd || exit + done + done + done +done + +popd || exit
diff --git a/fio-haura/flamegraph.html b/fio-haura/flamegraph.html new file mode 100644 index 000000000..16eb5b2ba --- /dev/null +++ b/fio-haura/flamegraph.html @@ -0,0 +1,30 @@ +<!DOCTYPE html> +<html> +<head> +<meta charset="utf-8"> +<title>Flame Graph</title> +</head>

+<body> +<div id="flamegraph"> +Loading Flame Graph... +</div> +</body> +</html>
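The jobfiles that follow drive this external engine. As a quick smoke test, one might run a single jobfile directly; the sketch below is a hypothetical invocation, not part of this patch, and it assumes the engine object has already been built at src/fio-engine-haura.o (the path the jobfiles reference) and that the betree configuration is passed through a BETREE_CONFIG environment variable pointing at the CI configuration:

```sh
# Hypothetical smoke test for the haura fio engine (paths and the
# configuration variable are assumptions about the local checkout).
cd fio-haura
export BETREE_CONFIG="$PWD/.ci/haura.json"  # betree storage configuration
fio jobfiles/rnd_rw_iops.fio                # jobfile sets ioengine=external:src/fio-engine-haura.o
```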
diff --git a/fio-haura/jobfiles/rnd_rw_iops.fio b/fio-haura/jobfiles/rnd_rw_iops.fio new file mode 100644 index 000000000..f4c3eb462 --- /dev/null +++ b/fio-haura/jobfiles/rnd_rw_iops.fio @@ -0,0 +1,10 @@ +[rnd-rw-iops] +rw=randrw +numjobs=1 +bs=32k +direct=1 +ioengine=external:src/fio-engine-haura.o +size=4g +runtime=10s +group_reporting +disrespect-fio-options
diff --git a/fio-haura/jobfiles/rnd_write_iops.fio b/fio-haura/jobfiles/rnd_write_iops.fio index 4d82581d8..b51dbc30a 100644 --- a/fio-haura/jobfiles/rnd_write_iops.fio +++ b/fio-haura/jobfiles/rnd_write_iops.fio @@ -1,9 +1,9 @@ [rnd-write-iops] rw=randwrite -numjobs=4 +numjobs=3 bs=4k direct=1 ioengine=external:src/fio-engine-haura.o -size=2g -io_size=1g -fsync=16384 +size=80g +io_size=2g +disrespect-fio-options
diff --git a/fio-haura/jobfiles/seq_write_bw.fio b/fio-haura/jobfiles/seq_write_bw.fio index cd7a528bf..657494df0 100644 --- a/fio-haura/jobfiles/seq_write_bw.fio +++ b/fio-haura/jobfiles/seq_write_bw.fio @@ -1,7 +1,9 @@ [seq-write-bw] rw=write -numjobs=4 +numjobs=1 bs=4m direct=1 ioengine=external:src/fio-engine-haura.o +disrespect-fio-options size=2g +fsync=16384
diff --git a/fio-haura/plots/bw_log.py b/fio-haura/plots/bw_log.py new file mode 100755 index 000000000..f75151cbe --- /dev/null +++ b/fio-haura/plots/bw_log.py @@ -0,0 +1,77 @@ +#!/bin/env python + +import numpy +import pandas +import matplotlib.pyplot as plt +import sys +import json +import glob + +def plot_bw_lat_log(path): + """ + Plot an amalgamation of different plots containing bandwidth, latency and IOPS + over time. This plots one line per job, although the lines remain unnamed in + the output. + """ + bws = [pandas.read_csv(res, names=['msec', 'value', 'data_dir', 'bs', 'prio']) for res in glob.glob(path + '/bench_bw.*')] + lats = [pandas.read_csv(res, names=['msec', 'value', 'data_dir', 'bs', 'prio']) for res in glob.glob(path + '/bench_lat.*')] + iopss = [pandas.read_csv(res, names=['msec', 'value', 'data_dir', 'bs', 'prio']) for res in glob.glob(path + '/bench_iops.*')] + + fig, axs = plt.subplots(3,1,figsize=(6,7)) + # plot in MiB/s + for bw in bws: + axs[0].plot(bw['msec'] / 1000, bw['value'] / 1024) + axs[0].set_title(f"{path} - Bandwidth [MiB/s]") + axs[0].set_yscale('log') + # plot in ns + for lat in lats: + axs[1].plot(lat['msec'] / 1000, lat['value'], label='Latency') + axs[1].set_title(f"{path} - Average Latency [ns]") + axs[1].set_yscale('log') + # plot in IOPS + for iops in iopss: + axs[2].plot(iops['msec'] / 1000, iops['value'], label='IOPS') + axs[2].set_title(f"{path} - IOPS [#]") + axs[2].set_xlabel('Runtime [s]') + axs[2].set_yscale('log') + fig.tight_layout() + fig.savefig(f'{path}/log.svg') + +def plot_lat_dist(path): + """ + Plot the latency distribution for completion latency (clat) from the fio + output. This works regardless of grouped or single-job reporting, although + grouped reporting improves readability. + + This method creates both read and write versions of this plot. 
+ """ + with open(path + '/output.json') as data: + js = json.load(data) + + def plot(mode): + fig, ax = plt.subplots(1,1) + total_jobs = len(js["jobs"]) + if "percentile" not in js["jobs"][0][mode]["clat_ns"].keys(): + return + for (idx, job) in enumerate(js["jobs"]): + bins = job[mode]["clat_ns"]["percentile"].keys() + vals = job[mode]["clat_ns"]["percentile"].values() + ax.bar(numpy.array(range(0,len(vals))) + 1/total_jobs * idx, vals, min(1/total_jobs, 0.8)) + ax.set_xticks(range(0,len(vals)), labels=[s[:5] for s in bins], rotation='vertical') + ax.set_xlabel("Percentile [%]") + ax.set_ylabel("Latency [ns]") + ax.set_yscale('log') + ax.set_title(f'{path} - {mode} Latency Percentiles') + fig.tight_layout() + fig.savefig(f'{path}/{mode}_latency.svg') + + plot("read") + plot("write") + +if len(sys.argv) < 2: + print("Usage:") + print(f" {sys.argv[0]} []") + +for res in sys.argv[1:]: + plot_bw_lat_log(res) + plot_lat_dist(res) diff --git a/fio-haura/src/fio-engine-haura.c b/fio-haura/src/fio-engine-haura.c index 99914b7e9..9c80c0bbd 100644 --- a/fio-haura/src/fio-engine-haura.c +++ b/fio-haura/src/fio-engine-haura.c @@ -44,6 +44,7 @@ struct fio_haura_options { int disrespect_fio_queue_depth; int disrespect_fio_direct; int disrespect_fio_options; + int haura_nvm; }; struct haura_data { @@ -110,6 +111,7 @@ static struct fio_option options[] = { static int bail(struct err_t *error) { betree_print_error(error); + printf("\n"); betree_free_err(error); return 1; } @@ -304,8 +306,11 @@ static int fio_haura_setup(struct thread_data *td) { /* Haura needs some additional space to provide extra data like object * pointers and metadata. This is more of a hack, but nonetheless. */ creat(td->files[idx]->file_name, 0644); - if (truncate(td->files[idx]->file_name, max(td->o.file_size_high, td->o.size) + (50 * 1024 * 1024))) { - fprintf(stderr,"Could not retruncate file to provide enough storage for Haura.\n"); + if (truncate(td->files[idx]->file_name, + max(td->o.file_size_high, td->o.size) + (50 * 1024 * 1024))) { + fprintf( + stderr, + "Could not retruncate file to provide enough storage for Haura.\n"); } } @@ -322,15 +327,63 @@ static int fio_haura_setup(struct thread_data *td) { return bail(error); } fio_haura_translate(td, cfg); - if ((global_data.db = betree_create_db(cfg, &error)) == NULL) { - return bail(error); - } - if ((global_data.obj_s = betree_create_object_store( - global_data.db, "fio", 3, pref, &error)) == NULL) { - return bail(error); + + int is_prefilled = 0; + /* + ** Checking for any pre-existing data we might be able to use. + */ + if ((global_data.db = betree_open_db(cfg, &error)) == NULL || + td_write(td)) { + error = NULL; + new_db: + if ((global_data.db = betree_create_db(cfg, &error)) == NULL) { + return bail(error); + } + if ((global_data.obj_s = betree_create_object_store( + global_data.db, "fio", 3, pref, &error)) == NULL) { + return bail(error); + } + } else { + /* + ** Check if object store exists and objects are valid otherwise open new + *db. 
+ */ + if ((global_data.obj_s = betree_create_object_store( + global_data.db, "fio", 3, pref, &error)) == NULL) { + betree_close_db(global_data.db); + global_data.db = NULL; + goto new_db; + } + + char init[2] = {1}; + + for (size_t idx = 0; idx < global_data.jobs; idx += 1) { + init[1] += 1; + + unsigned long long object_size = -1; + if ((object_size = betree_object_get_size(global_data.obj_s, init, 2, + &error)) == -1) { + betree_close_db(global_data.db); + global_data.db = NULL; + global_data.obj_s = NULL; + goto new_db; + } + + if (td->o.size > object_size) { + betree_close_db(global_data.db); + global_data.db = NULL; + global_data.obj_s = NULL; + goto new_db; + } + } + + // If we made it this far the data present is sufficient for the + // benchmark. Good job! + printf("haura: Reusing stored data from previous benchmark\n"); + is_prefilled = 1; } - char init[2] = {1}; + char init[2] = {1}; global_data.objs = malloc(sizeof(struct obj_t *) * global_data.jobs); // Create a private object for each thread for (size_t idx = 0; idx < global_data.jobs; idx += 1) { @@ -343,7 +396,7 @@ static int fio_haura_setup(struct thread_data *td) { /* Due to limitations in the fio initialization process we prepopulate the * objects here, which is suboptimal but the only place possible due to * the order of execution. */ - if (!td_write(td)) { + if (!td_write(td) && !is_prefilled) { unsigned long long block_size = td->o.bs[DDIR_WRITE]; unsigned long long max_io_size = td->o.size; void *buf = malloc(block_size); @@ -405,4 +458,5 @@ struct ioengine_ops ioengine = { .setup = fio_haura_setup, .options = options, .option_struct_size = sizeof(struct fio_haura_options), + .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL, }; diff --git a/julea-betree/Cargo.toml b/julea-betree/Cargo.toml index 0d339f779..27fd9eb05 100644 --- a/julea-betree/Cargo.toml +++ b/julea-betree/Cargo.toml @@ -3,7 +3,7 @@ name = "julea-betree" version = "0.1.0" authors = ["tilpner "] edition = "2018" -rust-version = "1.70" +rust-version = "1.82.0" [lib] name = "object_betree"
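Taken together, bench_fio.sh and plots/bw_log.py form a small sweep-and-plot pipeline around the engine changes above. The sketch below shows the intended flow under stated assumptions: it presumes a Python environment with numpy, pandas, and matplotlib installed, and the results_ID directory name simply follows the default id variable in bench_fio.sh:

```sh
# Hypothetical end-to-end benchmark run; adjust the id variable in
# bench_fio.sh before sweeping.
cd fio-haura
./bench_fio.sh                        # one results_ID/<name>/ directory per mode, blocksize, and job count
python plots/bw_log.py results_ID/*   # writes log.svg plus read/write latency percentile SVGs into each run directory
```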