diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91d544ff..73bdff78 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,7 @@ CPMAddPackage("gh:simdjson/simdjson#v3.9.2")
 find_package(RapidJSON REQUIRED)
 find_library(TCLAP tclap)
 
+option(GPUEVM_O2_DEBUG "Enable O2 debug for CUDA" ON)
 option(GPUEVM_DEBUG_INTERPRETER "Printing from device during interpreter runtime" OFF)
 option(GPUEVM_DEBUG_FUZZER "Print debug info from fuzzer during runtime" OFF)
 option(GPUEVM_DEBUG_NVTX "Printing nvtx context pushing/popping during runtime" OFF)
@@ -103,10 +104,13 @@ function(configure_target_gpuevm target)
         target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-g>)
         target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas -v>)
 
-        # TODO make this an option
-        target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-dopt=on>)
-        target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O2>)
-        target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas -O2>)
+        # Recall that we have this a suboption of the debug build type bc we couldn't figure out how to capture
+        # all the lineinfo in a -g Release build
+        if(GPUEVM_O2_DEBUG)
+            target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-dopt=on>)
+            target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O2>)
+            target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas -O2>)
+        endif()
 
         # And set the GPUEVM_DEBUG symbol
         target_compile_definitions(${target} PRIVATE GPUEVM_DEBUG)
diff --git a/src/common/fuzz_algos/CMakeLists.txt b/src/common/fuzz_algos/CMakeLists.txt
index 540dbfa0..7fb4157d 100644
--- a/src/common/fuzz_algos/CMakeLists.txt
+++ b/src/common/fuzz_algos/CMakeLists.txt
@@ -17,4 +17,4 @@ set_target_properties(evm_fuzz_algos PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
 target_include_directories(evm_fuzz_algos PUBLIC ${GPU_EVM_SOURCE_DIR}/src/common)
 
-target_link_libraries(evm_fuzz_algos PUBLIC evm_ingest ${CUDA_LIBRARIES} libcuda.so cudart)
\ No newline at end of file
+target_link_libraries(evm_fuzz_algos PUBLIC evm_ingest ${CUDA_LIBRARIES} libcuda.so cudart termcolor)
\ No newline at end of file
diff --git a/src/common/fuzz_algos/fuzz_check_invariants.cpp b/src/common/fuzz_algos/fuzz_check_invariants.cpp
index eb12efab..580fe76f 100644
--- a/src/common/fuzz_algos/fuzz_check_invariants.cpp
+++ b/src/common/fuzz_algos/fuzz_check_invariants.cpp
@@ -4,6 +4,7 @@
 
 #include <iomanip>
 #include <iostream>
+#include <termcolor/termcolor.hpp>
 
 __global__ void set_test_selectors_kernel(GpuArrayView<uint8_t> test_selectors_out, const __restrict__ uint8_t* invariant_selectors, const ExtFuzzFuncs fuzz_funcs, uint32_t n_tested_storages, uint32_t n_tot_runners) {
     auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -40,25 +41,25 @@ void set_test_selectors(GpuArrayView<uint8_t> test_selectors, const ExtFuzzFuncs
     safeCudaFree(selector_buf);
 }
 
-__global__ void set_storage_indices_kernel(GpuArrayView<size_t> storage_indices, uint32_t n_tested_storages, uint32_t n_test_threads, size_t index_offset) {
+__global__ void set_storage_indices_kernel(GpuArrayView<size_t> storage_indices, uint32_t n_tested_storages, uint32_t n_test_threads) {
     auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+    // For tid < n_tested_storages, we're already done with the cyclical assignment
+    if (tid < n_tested_storages) return;
     // All evms above n_test_threads just get storage index 0, since it's guaranteed to exist
     if (tid >= MAX_N_SEPARATE_CONTEXTS) return;
     if (tid >= n_test_threads) {
         storage_indices[tid] = 0;
     } else {
-        storage_indices[tid] = (tid % n_tested_storages) + index_offset;
+        storage_indices[tid] = storage_indices[tid % n_tested_storages];
     }
 }
 
-void set_storage_indices(GpuArrayView<size_t> storage_indices, uint32_t n_tested_storages, uint32_t n_test_threads, size_t index_offset) {
+void cyclic_repeat_storage_indices(GpuArrayView<size_t> storage_indices, uint32_t n_tested_storages, uint32_t n_test_threads) {
     auto [grid_size, block_size] = get_cuda_max_occupancy(storage_indices.size, (void*)set_storage_indices_kernel);
-    set_storage_indices_kernel<<<grid_size, block_size>>>(storage_indices, n_tested_storages, n_test_threads, index_offset);
+    set_storage_indices_kernel<<<grid_size, block_size>>>(storage_indices, n_tested_storages, n_test_threads);
     safeStreamSync();
 }
 
-// TODO store this in the storage corpus itself
-size_t first_unchecked_idx = 0;
 void fuzz_check_invariants(FuzzerState& fuzz_state) {
     gpuevm_nvtx_push("fuzz_check_invariants");
     // TODO: Complete the implementation and remove limitations
@@ -66,42 +67,38 @@ void fuzz_check_invariants(FuzzerState& fuzz_state) {
     // THE AMPERSAND IS NECESSARY WHEN USING AUTO; OTHERWISE YOU GET A BY-VALUE COPY!
     auto& api_ctx = fuzz_active_ctx();
 
-    assert(fuzz_state.fuzz_funcs.n_invariant_funcs > 0);
-    assert(fuzz_state.fuzz_funcs.n_invariant_funcs <= MAX_N_SEPARATE_CONTEXTS);
-    assert(api_ctx.n_runners == MAX_N_SEPARATE_CONTEXTS);
+    assert(fuzz_state.fuzz_funcs.n_invariant_funcs > 0 && "No invariants to check");
+    assert(fuzz_state.fuzz_funcs.n_invariant_funcs <= MAX_N_SEPARATE_CONTEXTS && "Too many invariants to check (more than one per thread)");
+    assert(api_ctx.n_runners == MAX_N_SEPARATE_CONTEXTS && "Wrong number of interpreter threads, must use one per cuda thread");
 
     // The actual number of storages we can test (since each storage needs to be tested against all invariants)
     auto n_tested_storages = MAX_N_SEPARATE_CONTEXTS / fuzz_state.fuzz_funcs.n_invariant_funcs;
     auto n_test_threads = n_tested_storages * fuzz_state.fuzz_funcs.n_invariant_funcs;
 
-    // TODO remove this requirement
-    // There must be enough storages actually recorded
-    assert(fuzz_state.storage_corpus.size >= n_tested_storages);
-
-    /* Plan for using/setting the checkedness on storages
-    1. Get the first storage index that is unchecked, (This should and can just be tracked in single counter, until we switch to having separate fuzzing and invariantchecking storage corpori)
-    2. Doublecheck that we actually have enough storages to run a full batch
-    3. Use the index as offset when we make the cyclic storage selection array
-    4. Mark the check storages from that index as checked
+    /* Storage selection for invariant checking
+     1. Check that there are n_tested_storages that are unchecked
+     2. Get the n_tested_storages first such indices
+     3. Use that array of indices to cyclically repeat
+     4. Mark those storages as checked
+     5. Move the storages into the ctx's gpu_map
      */
 
-    // 1. Get the first unchecked index
-    // TODO use cub::argindexinputiterator in the future?
-
-    // 2. Check that we have enough storages
-    assert(fuzz_state.storage_corpus.size > first_unchecked_idx + n_tested_storages);
+    // 1. Check that there are n_tested_storages that are unchecked
+    assert(fuzz_state.storage_corpus.num_unchecked_cases() >= n_tested_storages && "Not enough unchecked storages to check invariants on");
 
-    // 3. Use the unchecked_idx as (value)offset in storage selection array
+    // 2. Get the n_tested_storages first such indices
     GpuArray<size_t> storage_indices(MAX_N_SEPARATE_CONTEXTS);
-    set_storage_indices(storage_indices.view(), n_tested_storages, n_test_threads, first_unchecked_idx);
+    auto small_storage_indices_view = storage_indices.view();
+    // We need to let the get_unchecked_indices only put n_tested_storages into the array
+    small_storage_indices_view.size = n_tested_storages;
+    fuzz_state.storage_corpus.get_unchecked_indices(small_storage_indices_view, n_tested_storages);
+    // 3. Use that array of indices to cyclically repeat
+    cyclic_repeat_storage_indices(storage_indices.view(), n_tested_storages, n_test_threads);
 
     // 4. Set the checkedness of the storages to true
-    // TODO this should also probably be a member of the storage corpus
-    for (auto i = 0; i < n_tested_storages; i++) {
-        fuzz_state.storage_corpus.metadatas[i + first_unchecked_idx].checked_invariants = true;
-    }
+    fuzz_state.storage_corpus.mark_checked(small_storage_indices_view);
 
-    // Set the storages into the ctx
+    // 5. Set the storages into the ctx's gpu_map
     fuzz_state.storage_corpus.scatter_kvs(api_ctx.storage, storage_indices.view());
 
     // Now we group the selectors into contiguous groups to get maximal warp coherence
@@ -150,17 +147,17 @@ void fuzz_check_invariants(FuzzerState& fuzz_state) {
 
     CUDA_SAFE_CALL(cudaDeviceSynchronize());
 
+    size_t n_broken_invariants = fuzz_state.fuzz_funcs.n_invariant_funcs - std::count(invariant_revert_mask.data, invariant_revert_mask.data + fuzz_state.fuzz_funcs.n_invariant_funcs, 0);
+
     // If any reverts are new (i.e. set in this mask, but not previously set in the reverted_invariants_times), set to current time
     fuzz_time_diff revert_time = timing::time_diff(fuzz_state.start_time, timing::current_time());
     for (auto invariant_idx = 0; invariant_idx < fuzz_state.fuzz_funcs.n_invariant_funcs; invariant_idx++) {
         if (invariant_revert_mask[invariant_idx] && fuzz_state.reverted_invariants_times[invariant_idx] == 0) {
-            std::cout << "[Broke inv " << invariant_idx << "]";
+            std::cout << termcolor::red << "[Broke inv " << invariant_idx << "| tot broken: " << n_broken_invariants << "]" << termcolor::reset;
             fuzz_state.reverted_invariants_times[invariant_idx] = revert_time;
         }
     }
     // TODO check what the actual memory consistency guarantees are for stream syncs vs device syncs and manual (i.e. not via kernels that are in streams) access to unified memory
     CUDA_SAFE_CALL(cudaDeviceSynchronize());
-
-    first_unchecked_idx += n_tested_storages;
     gpuevm_nvtx_pop();
 }
\ No newline at end of file
diff --git a/src/common/fuzz_algos/fuzz_cull.cpp b/src/common/fuzz_algos/fuzz_cull.cpp
index fff235d2..1c2b4476 100644
--- a/src/common/fuzz_algos/fuzz_cull.cpp
+++ b/src/common/fuzz_algos/fuzz_cull.cpp
@@ -1,14 +1,108 @@
 #include "fuzz_cull.h"
 
+#include <fuzz_dss/gpu_array_utils.h>
+
+#include <iomanip>
+
 /*
-Culling the queue of fuzzing inputs.
+Culling the inputs/storages that have been accumulated.
+
+Calldata culling ideas:
 Initial implementation reconstructs all edge coverage found by every input in the queue, and then finds a subset (not necessarily minimal, this is subset cover which is NP hard) of queue entries
 that cover the same edges.
+
+Storage culling ideas:
+Remove all storages that have above M visits and below R ratio of votes/visits
+TODO think about some sort of probabalistic thing to cull
+TODO make it reach a target absolute # passing instead (histogram vote ratios, then cutoff)
 */
 
+namespace fuzz_culling {
+    // Thresholds for storage culling
+    constexpr int MIN_VISITS = 1024;
+    constexpr float MIN_VOTE_RATIO = 1 / 1024;
+
+    // From metadata array, set into allocated mask whether a storage passes (should not be culled)
+    __global__ void cull_storages_mark_kernel(const GpuArrayView<StorageCaseMetadata> metadata, GpuArrayView<bool> passes_mask, const GpuArrayView<size_t> storage_sizes, size_t max_storage_size) {
+        int tid = blockIdx.x * blockDim.x + threadIdx.x;
+        if (tid >= metadata.size) return;
+        // Note that if we haven't had MIN_VISITS, then we spare that storage
+        auto votes = metadata[tid].fuzz_case_metadata.votes;
+        auto visits = metadata[tid].fuzz_case_metadata.visits;
+        auto ratio = metadata[tid].fuzz_case_metadata.vote_ratio();
+        auto storage_size = storage_sizes[tid];
+        // TODO come up with good rules here
+        if (visits < MIN_VISITS) {
+            // Automatically spared
+            passes_mask[tid] = storage_size >= (max_storage_size - 1);
+        } else {
+            // Having already had MIN_VISITS changes, it better have generated good results, or it should be of max size
+            passes_mask[tid] = (ratio >= MIN_VOTE_RATIO);
+        }
+    }
+
+    size_t max_storage_size(const GpuArrayView<size_t>& kv_sizes) {
+        auto max_size_ptr = thrust::max_element(thrust::device, kv_sizes.data, kv_sizes.data + kv_sizes.size);
+        auto max_size = *max_size_ptr;
+        return max_size;
+    }
+
+    void cull_storages_mark(FuzzStorageBuffer& storage_corpus, GpuArrayView<bool> passes_mask) {
+        assert(storage_corpus.size <= passes_mask.size);
+        // We calculate the max storage kv size, and then don't allow any max-size storages to be culled
+        auto max_size = max_storage_size(storage_corpus.kv_sizes.view());
+
+        auto [grid_size, block_size] = get_cuda_max_occupancy(storage_corpus.metadatas.size(), (void*)cull_storages_mark_kernel);
+        cull_storages_mark_kernel<<<grid_size, block_size>>>(storage_corpus.metadatas.view(), passes_mask, storage_corpus.kv_sizes.view(), max_size);
+    }
+
+    // Given a pass mask, removes & compacts the storage corpus
+    // TODO think about whether ancestor handling should be done in a GC-like way instead
+    void cull_storages_filter(FuzzerState& fuzz_state, GpuArrayView<bool> passes_mask) {
+        // We get back the culled storages from the compaction
+
+        auto culled_storages = fuzz_state.storage_corpus.filter(passes_mask);
+        GpuArray<bool> ancestor_mask = fuzz_state.storage_corpus.get_ancestor_mask(culled_storages);
+        // NB ignoring return value here, we just want the ones that remain after keeping ancestors
+        culled_storages.filter(ancestor_mask);
+        // TODO make sure batch actually checks hashes)
+        fuzz_state.ancestor_corpus.push_batch(culled_storages);
+    }
+
+}
+
 void fuzz_cull(FuzzerState& fuzz_state) {
-    std::cout << "hello world I am culling!" << std::endl;
-#ifdef GPUEVM_DEBUG
-    std::cout << "Culling from corpus size " << fuzz_state.corpus.size() << std::endl;
-#endif
+    gpuevm_nvtx_push("fuzz_cull");
+
+    long long num_passes_before = fuzz_state.storage_corpus.size;
+
+    // TODO cache the various tempbuffers here
+    GpuArray<bool> passes_mask(fuzz_state.storage_corpus.size);
+
+    // We get back a mask that has true if the storage passed, and false if it should be culled (with no regard for ancestor-keeping)
+    fuzz_culling::cull_storages_mark(fuzz_state.storage_corpus, passes_mask.view());
+    safeStreamSync();
+    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+    // Don't allow removing all states, so check that enough are kept before actually culling
+    if (device_array_count<bool>(passes_mask, true) >= 32) {
+        // We take care of ancestors here
+        fuzz_culling::cull_storages_filter(fuzz_state, passes_mask.view());
+        safeStreamSync();
+
+        CUDA_SAFE_CALL(cudaDeviceSynchronize());
+        // Count and print some stats about the passes_mask
+        long long num_passes_after = device_array_count<bool>(passes_mask, true);
+        long long failed = num_passes_before - num_passes_after;
+        if (failed > 0) {
+            std::cout << "[Cull: " << num_passes_after << " pass, " << failed << " failed | ";
+            std::cout << "Rate " << std::fixed << std::setprecision(1) << 100 * failed / (float)num_passes_before << "%]";
+        } else {
+            std::cout << "P";
+        }
+    } else {
+        std::cout << "N";
+    }
+
+    gpuevm_nvtx_pop();
 }
diff --git a/src/common/fuzz_algos/fuzz_dedup.cpp b/src/common/fuzz_algos/fuzz_dedup.cpp
index 5ef1539e..a2f14c0d 100644
--- a/src/common/fuzz_algos/fuzz_dedup.cpp
+++ b/src/common/fuzz_algos/fuzz_dedup.cpp
@@ -106,6 +106,20 @@ namespace fuzz_dedup {
         return marked_inputs;
     }
 
+    __global__ void add_votes_to_storages_kernel(const GpuArrayView<bool> interesting_mask, const GpuArrayView<size_t> storage_indices, GpuArrayView<StorageCaseMetadata> metadata) {
+        int tid = blockIdx.x * blockDim.x + threadIdx.x;
+        if (tid >= interesting_mask.size) return;
+        if (interesting_mask.data[tid])
+            atomicAdd((unsigned long long*)&(metadata[storage_indices[tid]].fuzz_case_metadata.votes), (unsigned long long)1);
+    }
+
+    void add_votes_to_storages(const GpuArrayView<bool> interesting_mask, const GpuArrayView<size_t> storage_indices, GpuArrayView<StorageCaseMetadata> metadata) {
+        gpuevm_nvtx_push("add_votes_to_storages");
+        auto [grid_size, block_size] = get_cuda_max_occupancy(interesting_mask.size, (void*)add_votes_to_storages_kernel);
+        add_votes_to_storages_kernel<<<grid_size, block_size>>>(interesting_mask, storage_indices, metadata);
+        gpuevm_nvtx_pop();
+    }
+
     __global__ void get_revert_mask_kernel(uint8_t* const bytecode, const GpuArrayView<evm_tracer_t> tracers_in, GpuArrayView<bool> reverting_mask) {
         int tid = blockIdx.x * blockDim.x + threadIdx.x;
         if (tid >= tracers_in.size) return;
diff --git a/src/common/fuzz_algos/fuzz_dedup.h b/src/common/fuzz_algos/fuzz_dedup.h
index b32f9b18..bf1e7b26 100644
--- a/src/common/fuzz_algos/fuzz_dedup.h
+++ b/src/common/fuzz_algos/fuzz_dedup.h
@@ -14,6 +14,7 @@ namespace fuzz_dedup {
     FuzzCaseInputs filter_inputs_by_mask(const FuzzCaseInputs& run_inputs, const GpuArrayView<bool>& mask);
 
     std::pair<std::optional<GpuArrayView<bool>>, std::optional<FuzzCaseInputs>> get_reverting_data(uint8_t* const bytecode, evm_tracer_t* tracer_buf, const FuzzCaseInputs& run_inputs);
+    void add_votes_to_storages(const GpuArrayView<bool> interesting_mask, const GpuArrayView<size_t> storage_indices, GpuArrayView<StorageCaseMetadata> metadata);
     void get_revert_mask(uint8_t* const bytecode, const GpuArrayView<evm_tracer_t> tracers_in, GpuArrayView<bool> reverting_mask);
     void compute_revert_discard_mask(const GpuArrayView<bool> reverting_mask, const GpuArrayView<evm_tracer_t> tracers_in, GpuArrayView<bool> revert_discard_mask);
 
@@ -22,4 +23,4 @@ namespace fuzz_dedup {
     void inplace_bool_or_gpuarrays(const GpuArrayView<bool> a_inout, const GpuArrayView<bool> b_in);
     void inplace_bool_not_gpuarrays(const GpuArrayView<bool> a_inout);
     void inplace_bool_and_gpuarrays(const GpuArrayView<bool> a_inout, const GpuArrayView<bool> b_in);
-    }
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/src/common/fuzz_algos/fuzz_state.cpp b/src/common/fuzz_algos/fuzz_state.cpp
index 143b5277..fe14baa4 100644
--- a/src/common/fuzz_algos/fuzz_state.cpp
+++ b/src/common/fuzz_algos/fuzz_state.cpp
@@ -21,6 +21,7 @@ FuzzerState::FuzzerState(const ExtGpuEvmFuzzSetup& setup, const FuzzStorageCase&
                                                                                                     corpus(EvmFlatAbiTypes(fuzz_funcs.stateful_funcs[0].evm_flat_abi).calldata_size()),
                                                                                                     storage_corpus(initial_storage),
                                                                                                     storage_hashes_seen(max_corpus_size_pow_2 + 5),
+                                                                                                    ancestor_corpus(),
                                                                                                     storage_dep_map(max_corpus_size_pow_2 + 5),
                                                                                                     reverting_exit_datas_map(max_corpus_size_pow_2 + 5),
                                                                                                     reverting_corpus(EvmFlatAbiTypes(fuzz_funcs.stateful_funcs[0].evm_flat_abi).calldata_size()),
@@ -120,10 +121,12 @@ std::tuple<FuzzCaseInputs, std::optional<FuzzCaseInputs>, std::optional<FuzzStor
     gpuevm_nvtx_push("run_get_interesting");
 
     // Set the storage cases to use
-    storage_corpus.scatter_kvs(fuzz_active_ctx().storage, storage_indices);
+    // Also mark these storage scatters as being real visits in the metadata
+    storage_corpus.scatter_kvs(fuzz_active_ctx().storage, storage_indices, true);
 
     // Setup gpuevm with those calldatas
     fuzz_set_calldatas(inputs);
+
     // Put the current distmap into interpreter memory
     // This can do resizing under the hood
     dist_map.sync_to_ctx(fuzz_active_ctx());
@@ -182,7 +185,7 @@ std::tuple<FuzzCaseInputs, std::optional<FuzzCaseInputs>, std::optional<FuzzStor
     auto n_interesting_storages = device_array_count(GpuArrayView(scratch_storage_hashes_interesting_mask), true);
     gpuevm_nvtx_pop();
 
-    // Immediately collect the interesting storages
+    // Immediately collect the interesting storages (NB here interesting storage means that is has new storagehash, and is not discarded)
     std::optional<FuzzStorageCaseBatch> interesting_storages;
     if (n_interesting_storages > 0) {
         interesting_storages = FuzzStorageCaseBatch(run_ctx, scratch_storage_hashes_interesting_mask);
@@ -261,6 +264,11 @@ std::tuple<FuzzCaseInputs, std::optional<FuzzCaseInputs>, std::optional<FuzzStor
             return {fuzz_dedup::concat_fuzz_case_inputs(interesting_inputs, preserve_add_back_inputs), opt_reverting_inputs, interesting_storages};
         }
     }
+
+    // For all the interesting inputs, we want to add votes to the storages they started with
+    // So we use the general interestingness mask and add votes to those storages
+    fuzz_dedup::add_votes_to_storages(interesting_mask, input_storage_indices, storage_corpus.metadatas.view());
+
     gpuevm_nvtx_pop();
     return {interesting_inputs, opt_reverting_inputs, interesting_storages};
 }
diff --git a/src/common/fuzz_algos/fuzz_state.h b/src/common/fuzz_algos/fuzz_state.h
index fe3edf5d..1bc8dcae 100644
--- a/src/common/fuzz_algos/fuzz_state.h
+++ b/src/common/fuzz_algos/fuzz_state.h
@@ -37,6 +37,7 @@ struct FuzzerState {
 
     FuzzStorageBuffer storage_corpus;
     DeviceHashSet<storage_hash_t> storage_hashes_seen;
+    FuzzStorageBuffer ancestor_corpus;
     StoreDepMap storage_dep_map;
 
     DeviceHashSet<exit_data_t> reverting_exit_datas_map;
diff --git a/src/common/fuzz_dss/fuzz_case_queue.h b/src/common/fuzz_dss/fuzz_case_queue.h
index f0a6a6ee..46c12bc5 100644
--- a/src/common/fuzz_dss/fuzz_case_queue.h
+++ b/src/common/fuzz_dss/fuzz_case_queue.h
@@ -8,6 +8,7 @@ struct FuzzCaseInputs {
     uint32_t calldata_size;
     uint32_t n_cases;
     FuzzCaseInputs repeat_to_size(size_t new_size) const;
+
     static FuzzCaseInputs empty();
 };
 
diff --git a/src/common/fuzz_dss/fuzz_mutate.cpp b/src/common/fuzz_dss/fuzz_mutate.cpp
index 5b3ffad1..58568f0c 100644
--- a/src/common/fuzz_dss/fuzz_mutate.cpp
+++ b/src/common/fuzz_dss/fuzz_mutate.cpp
@@ -10,6 +10,7 @@
 #include <thrust/extrema.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/sort.h>
 
 #include <numeric>
 
@@ -17,36 +18,80 @@
 
 // --------- Storage selection kernels ----------
 
-__global__ void select_storage_cases_kernel(size_t corpus_size, GpuArrayView<size_t> out_indices, const GpuArrayView<size_t> max_indices, curandState_t* rng_states) {
+__global__ void select_storage_cases_kernel(size_t corpus_size, GpuArrayView<size_t> out_indices, const GpuArrayView<size_t> max_indices, const GpuArrayView<size_t> ratio_sorted_indices, curandState_t* rng_states) {
     auto out_inx = TID;
     if (out_inx >= out_indices.size) return;
+    double rand_val = curand_uniform(&rng_states[out_inx]);
+    const float thresholds[] = {0.10, 0.2};
+    const float highest_threshold = thresholds[sizeof(thresholds) / sizeof(float) - 1];
     // mix of random or max-sized
-    if (out_inx < double(out_indices.size) * 0.05) {
+    if (rand_val < thresholds[0]) {
         // Completely random
         out_indices[out_inx] = curand(&rng_states[out_inx]) % corpus_size;
-    } else {
+    } else if (rand_val < thresholds[1]) {
         // Select a random within the max-sized storages
-        out_indices[out_inx] = curand(&rng_states[out_inx]) % max_indices.size;
+        size_t selection_normalized_inx = curand_uniform(&rng_states[out_inx]) * max_indices.size;
+        selection_normalized_inx = min(max(selection_normalized_inx, (size_t)0), max_indices.size - 1);
+        out_indices[out_inx] = max_indices[selection_normalized_inx];
+    } else {
+        // Select a random within the sorted indices by vote ratio
+        double normalized_randval = ((rand_val - highest_threshold) / (1 - highest_threshold));
+        normalized_randval *= normalized_randval;
+        size_t selection_normalized_inx = ratio_sorted_indices.size * normalized_randval;
+        selection_normalized_inx = min(max(selection_normalized_inx, (size_t)0), ratio_sorted_indices.size - 1);
+        out_indices[out_inx] = ratio_sorted_indices[selection_normalized_inx];
     }
 }
 
 // --------- /Storage selection kernels ----------
 
 // --------- Storage selection precomputation logic ---------
+
+// Functor to compare (thrust comp op) vote ratios from metadata array pointer
+struct VoteRatioComparator {
+    const StorageCaseMetadata* metadata;
+
+    VoteRatioComparator(const StorageCaseMetadata* metadata)
+        : metadata(metadata) {}
+
+    __host__ __device__ bool operator()(size_t a, size_t b) const {
+        // Comparison order to make default thrust sort produce descending vote ratio
+        return metadata[a].fuzz_case_metadata.vote_ratio() < metadata[b].fuzz_case_metadata.vote_ratio();
+    }
+};
+
+// Array of indices into storage corpus that are sorted (decreasing) according to vote ratio
+// TODO move to storage buffer?
+GpuArray<size_t> FuzzMutator::compute_ratio_sorted_indices(const FuzzStorageBuffer& storage_corpus) {
+    // Index iterator -> fill gpuarray
+    auto indices = GpuArray<size_t>(storage_corpus.size);
+    thrust::sequence(thrust::device, indices.data, indices.data + indices.size);
+
+    // Thrust argsort on fuzzcase_metadata.vote_ratio()
+    auto comparator = VoteRatioComparator(storage_corpus.metadatas.data);
+    thrust::sort(thrust::device, indices.data, indices.data + indices.size, comparator);
+    return indices;
+}
+
+// TODO move this to storage buffer?
 GpuArray<size_t> FuzzMutator::compute_max_storage_indices(const FuzzStorageBuffer& storage_corpus) {
     // Compute the max value of size
     auto n_storages = storage_corpus.size;
-    auto max_size_ptr = thrust::max_element(thrust::device, (size_t*)storage_corpus.kv_sizes, (size_t*)storage_corpus.kv_sizes + n_storages);
+
+    // Raw ptr from the appendbuffer's internal gpuarray
+    auto sizes_it = storage_corpus.kv_sizes.data.data;
+
+    auto max_size_ptr = thrust::max_element(thrust::device, sizes_it, sizes_it + n_storages);
     auto max_size = *max_size_ptr;
 
-    auto n_eq_max = thrust::count(thrust::device, (size_t*)storage_corpus.kv_sizes, (size_t*)storage_corpus.kv_sizes + n_storages, max_size);
+    auto n_eq_max = thrust::count(thrust::device, sizes_it, sizes_it + n_storages, max_size);
 
     // Store the indices into a new buffer
     // TODO figure out a way to do this without allocating memory
     GpuArray<size_t> max_indices(n_eq_max);
     auto end_ptr = thrust::copy_if(thrust::device, thrust::make_counting_iterator<int>(0),
                                    thrust::make_counting_iterator<int>(n_storages),
-                                   (size_t*)storage_corpus.kv_sizes,
+                                   sizes_it,
                                    max_indices.data,
                                    thrust::placeholders::_1 == max_size);
     return max_indices;
@@ -64,14 +109,23 @@ GpuArray<size_t> FuzzMutator::compute_max_storage_indices(const FuzzStorageBuffe
 // Ideas for selection strategies:
 // (Strategy selection via weighted sampling, basically the same as calldata mutation)
 // TODO implement strategy selection via weighted sampling
-// 1. Completely random (implemented)
-// 2. Max-size only (random within those)
+// 1. Completely random (not very useful)
+// 2. Max-size only (random within those, implemented)
 // 3. Max-size *and* max values within those
 // 4. Size-weighted
 void FuzzMutator::select_storage_cases(const FuzzStorageBuffer& storage_corpus, GpuArrayView<size_t> indices) {
     auto max_indices = compute_max_storage_indices(storage_corpus);
-    select_storage_cases_kernel<<<run_conf.n_blocks, run_conf.n_threads_per_block>>>(storage_corpus.size, indices, max_indices, rng_states);
+    auto ratio_sorted_indices = compute_ratio_sorted_indices(storage_corpus);
+    select_storage_cases_kernel<<<run_conf.n_blocks, run_conf.n_threads_per_block>>>(storage_corpus.size, indices, max_indices, ratio_sorted_indices, rng_states);
     safeStreamSync();
+#ifdef GPUEVM_DEBUG_FUZZER
+    // Check each storage index that it's not larger than the corpus size
+    for (auto i = 0; i < indices.size; i++) {
+        if (indices[i] >= storage_corpus.size) {
+            std::cout << "Storage index error" << std::endl;
+        }
+    }
+#endif
 }
 
 // ---------- /Main storage selection dispatch ----------
diff --git a/src/common/fuzz_dss/fuzz_mutate.h b/src/common/fuzz_dss/fuzz_mutate.h
index d0dfcbac..b5aed1a8 100644
--- a/src/common/fuzz_dss/fuzz_mutate.h
+++ b/src/common/fuzz_dss/fuzz_mutate.h
@@ -40,6 +40,7 @@ class FuzzMutator {
 
     // ---------- Storage selection functions ----------
     GpuArray<size_t> compute_max_storage_indices(const FuzzStorageBuffer& storage_corpus);
+    GpuArray<size_t> compute_ratio_sorted_indices(const FuzzStorageBuffer& storage_corpus);
     // ----------/Storage selection functions ----------
 
     // ---------- Calldata mutation functions ----------
diff --git a/src/common/fuzz_dss/fuzz_storage.cpp b/src/common/fuzz_dss/fuzz_storage.cpp
index 7508f199..df8864d5 100644
--- a/src/common/fuzz_dss/fuzz_storage.cpp
+++ b/src/common/fuzz_dss/fuzz_storage.cpp
@@ -9,6 +9,7 @@
 #include <cub/iterator/transform_input_iterator.cuh>
 
 // -------- FuzzStorageCaseBatch implementation ---------
+// TODO this shouldn't necessarily be thread_local, we should have all the tempbufs in a tempbuf manager thingamabob
 static thread_local TempStorageBuffer marked_indices_tempbuf(sizeof(size_t) * MAX_N_SEPARATE_CONTEXTS);
 
 // Custom boolmask-to-indices function
@@ -201,6 +202,64 @@ void FuzzStorageCaseBatch::gather_kvs(const GpuevmCtx& ctx, size_t n_marked) {
     gpuevm_nvtx_pop();
 }
 
+FuzzStorageCaseBatch FuzzStorageCaseBatch::filter(const GpuArrayView<bool>& mask) {
+    // Similar to fuzzstoragebuffer::filter
+    auto n_culled = device_array_count(mask, false);
+
+    // If there's nothing to cull, we're done
+    if (n_culled == 0) {
+        return FuzzStorageCaseBatch(0,
+                                    GpuArray<uint256>(0),
+                                    GpuArray<uint256>(0),
+                                    GpuArray<size_t>(0),
+                                    GpuArray<storage_hash_t>(0),
+                                    GpuArray<storage_hash_t>(0),
+                                    GpuArray<uint8_t>(0),
+                                    GpuArray<size_t>(0),
+                                    GpuArray<StorageCaseMetadata>(0));
+    }
+
+    // Create expanded masks
+    GpuArray<bool> culled_kv_mask(keys.size);
+    device_array_rle_decode(kv_sizes.view(), mask, culled_kv_mask.view());
+    GpuArray<bool> culled_producing_calldata_mask(producing_calldatas.size);
+    device_array_rle_decode(producing_calldata_sizes.view(), mask, culled_producing_calldata_mask.view());
+    // Cull the variably-sized parts
+    auto [nonculled_keys, culled_keys] = device_array_partition(keys.view(), culled_kv_mask.view());
+    auto [nonculled_vals, culled_vals] = device_array_partition(vals.view(), culled_kv_mask.view());
+
+    auto [nonculled_producing_calldatas, culled_producing_calldatas] = device_array_partition(producing_calldatas.view(), culled_producing_calldata_mask.view());
+
+    auto [nonculled_kv_sizes, culled_kv_sizes] = device_array_partition(kv_sizes.view(), mask);
+    auto [nonculled_producing_calldata_sizes, culled_producing_calldata_sizes] = device_array_partition(producing_calldata_sizes.view(), mask);
+    auto [nonculled_hashes, culled_hashes] = device_array_partition(hashes.view(), mask);
+    auto [nonculled_parent_hashes, culled_parent_hashes] = device_array_partition(parent_hashes.view(), mask);
+
+    auto [nonculled_metadata, culled_metadata] = device_array_partition(metadata.view(), mask);
+
+    // Set the ones for the current object
+    n_cases = n_cases - n_culled;
+    keys = std::move(nonculled_keys);
+    vals = std::move(nonculled_vals);
+    kv_sizes = std::move(nonculled_kv_sizes);
+    hashes = std::move(nonculled_hashes);
+    parent_hashes = std::move(nonculled_parent_hashes);
+    producing_calldatas = std::move(nonculled_producing_calldatas);
+    producing_calldata_sizes = std::move(nonculled_producing_calldata_sizes);
+    metadata = std::move(nonculled_metadata);
+
+    return FuzzStorageCaseBatch(
+        n_culled,
+        culled_keys,
+        culled_vals,
+        culled_kv_sizes,
+        culled_hashes,
+        culled_parent_hashes,
+        culled_producing_calldatas,
+        culled_producing_calldata_sizes,
+        culled_metadata);
+}
+
 // -------- /FuzzStorageCaseBatch implementation ---------
 
 // -------- AppendBuffer implementation -----------
@@ -208,6 +267,9 @@ void FuzzStorageCaseBatch::gather_kvs(const GpuevmCtx& ctx, size_t n_marked) {
 template <typename T>
 AppendBuffer<T>::AppendBuffer(size_t cap) : cap(max(min_size, cap)), used_size(0), data(max(min_size, cap)) {}
 
+template <typename T>
+AppendBuffer<T>::AppendBuffer(const GpuArrayView<T>& arr) : cap(arr.size), used_size(arr.size), data(arr) {}
+
 template <typename T>
 size_t AppendBuffer<T>::capacity() const {
     return cap;
@@ -232,6 +294,33 @@ void AppendBuffer<T>::push(const T& val) {
     used_size++;
 }
 
+template <typename T>
+void AppendBuffer<T>::filter(const GpuArrayView<bool>& mask) {
+    assert(mask.size == size());
+    if (size() == 0) return;
+
+    // Since the array-filter-inout doesn't support overlapping in and out arrays, we must allocate ourselves to a new buffer
+    auto n_marked = device_array_count(mask, true);
+    GpuArray<T> new_data(n_marked);
+    device_array_filter_inout(data.view(), mask, new_data.view());
+    data = std::move(new_data);
+    used_size = n_marked;
+    cap = n_marked;
+    maybe_resize();
+}
+
+// Keeps only the true-marked, returns the false-marked (i.e. becomes the true-marked itself)
+template <typename T>
+AppendBuffer<T> AppendBuffer<T>::partition(const GpuArrayView<bool>& mask) {
+    if (size() == 0) return AppendBuffer<T>(0);
+    // NB We return a this.view, which uses the used data size as size, not the capacity as data.view() would
+    auto [true_buf, false_buf] = device_array_partition(this->view(), mask);
+    data = std::move(true_buf);
+    used_size = data.size;
+    cap = data.size;
+    return AppendBuffer<T>(false_buf);
+}
+
 template <typename T>
 T& AppendBuffer<T>::operator[](size_t idx) const {
     return data[idx];
@@ -300,6 +389,7 @@ template struct AppendBuffer<size_t>;
 template struct AppendBuffer<uint32_t>;
 template struct AppendBuffer<uint8_t>;
 template struct AppendBuffer<StorageCaseMetadata>;
+template struct AppendBuffer<bool>;
 
 // ---------- /Explicit instantiations for AppendBuffer --------
 
@@ -327,10 +417,80 @@ void FuzzStorageBuffer::push(const FuzzStorageCaseView& storage) {
         producing_calldata_start_inxs.push(producing_calldata_start_inxs[size - 1] + producing_calldata_sizes[size - 1]);
     }
     hash_to_idx[storage.hash] = size;
+
     size++;
     gpuevm_nvtx_pop();
 }
 
+FuzzStorageCaseBatch FuzzStorageBuffer::filter(const GpuArrayView<bool>& mask) {
+    // RLE-filtering: keys, vals
+    // Special care (non-appendbuffers and not rle): hash-to-idx, kv_start_inxs, producing_calldata_start_inxs, producing_calldatas
+    // Members to normal-filter:
+    // Normal:
+    //  - kv_sizes
+    //  - hashes
+    //  - parent_hashes
+    //  - producing_calldata_sizes
+    //  - metadatas
+
+    gpuevm_nvtx_push("FuzzStorageBuffer::filter");
+
+    // We have to produce this before we change the size of the kv sizes array
+    // To split the keys & vals that were culled from those who were not, we need to generate a mask that uses the kv sizes as runlengths and the normal mask as mask
+    GpuArray<bool> culled_kv_items_mask(keys.size());
+    device_array_rle_decode(kv_sizes.view(), mask, culled_kv_items_mask.view());
+    // Similarly for producing calldatas
+    GpuArray<bool> culled_producing_calldata_items_mask(producing_calldatas.size());
+    device_array_rle_decode(producing_calldata_sizes.view(), mask, culled_producing_calldata_items_mask.view());
+
+    // Normal simple culling
+    // Note that this changes our members inplace!
+    auto culled_kv_sizes = kv_sizes.partition(mask);
+    // We essentially have this value for free now, since it obvs applies to all the entries
+    auto n_culled = culled_kv_sizes.size();
+    auto n_nonculled = size - n_culled;
+
+    auto culled_hashes = hashes.partition(mask);
+    auto culled_parent_hashes = parent_hashes.partition(mask);
+    auto culled_producing_calldata_sizes = producing_calldata_sizes.partition(mask);
+    auto culled_metadatas = metadatas.partition(mask);
+
+    // For our own kv_start_inxs member, we need to resize it to the non-culled size
+    [[maybe_unused]] auto culled_kv_start_inxs = kv_start_inxs.partition(mask);
+    // Starts are a prefix sum of the sizes
+    thrust::exclusive_scan(thrust::device, kv_sizes.data.data, kv_sizes.data.data + n_nonculled, kv_start_inxs.data.data);
+
+    // Need to resize and update the calldata start indices
+    [[maybe_unused]] auto culled_producing_calldata_start_inxs = producing_calldata_start_inxs.partition(mask);
+    thrust::exclusive_scan(thrust::device, producing_calldata_sizes.data.data, producing_calldata_sizes.data.data + n_nonculled, producing_calldata_start_inxs.data.data);
+
+    // And now we filter keys, vals, and producing_calldatas
+    auto culled_keys = keys.partition(culled_kv_items_mask);
+    auto culled_vals = vals.partition(culled_kv_items_mask);
+    auto culled_producing_calldatas = producing_calldatas.partition(culled_producing_calldata_items_mask);
+
+    // And in the end we need to change the hash-to-idx mapping
+    hash_to_idx.clear();
+    // Fill the non-culled
+    for (size_t i = 0; i < n_nonculled; i++) {
+        hash_to_idx[hashes[i]] = i;
+    }
+
+    size = n_nonculled;
+
+    gpuevm_nvtx_pop();
+    // Return a storagecasebatch containing the culled items only, i.e. the buffers we've got back from all the .partition calls
+    return FuzzStorageCaseBatch(n_culled,
+                                culled_keys.data,
+                                culled_vals.data,
+                                culled_kv_sizes.data,
+                                culled_hashes.data,
+                                culled_parent_hashes.data,
+                                culled_producing_calldatas.data,
+                                culled_producing_calldata_sizes.data,
+                                culled_metadatas.data);
+}
+
 // NB This doesn't check if the hashes are already in the batch
 void FuzzStorageBuffer::push_batch(const FuzzStorageCaseBatch& batch) {
     gpuevm_nvtx_push("FuzzStorageBuffer::push_batch");
@@ -378,6 +538,18 @@ FuzzStorageCaseView FuzzStorageBuffer::operator[](size_t idx) const {
         metadatas[idx]);
 }
 
+// Gives back a mask of same size as the batch (i.e. number of storage cases in the batch), marking whether the case is a direct parent anything in the current storage
+GpuArray<bool> FuzzStorageBuffer::get_ancestor_mask(const FuzzStorageCaseBatch& batch) const {
+    // We are essentially doing a batchlookup of the hashes in batch, to the parent_hashes (as a set) of the current storage
+    // At most load factor 0.5 = 1/2
+    DeviceHashSet<storage_hash_t> parent_hashes_set(ceil_log2(size) + 1);
+    // Insert all the parent hashes
+    // TODO do the bulk functions of hashset/hashmap actually handle >MAX_N_CONTEXTS items at once operations correctly?
+    parent_hashes_set.insert_bulk(parent_hashes.data.view());
+    // Do the batch lookup
+    return parent_hashes_set.lookup_bulk(batch.hashes);
+}
+
 std::vector<FuzzStorageCaseView> FuzzStorageBuffer::get_storage_chain(storage_hash_t hash) const {
     gpuevm_nvtx_push("FuzzStorageBuffer::get_storage_chain");
     std::vector<FuzzStorageCaseView> chain;
@@ -399,12 +571,15 @@ void __global__ scatter_kvs_kernel(
     const GpuArrayView<size_t> kv_sizes,
     const GpuArrayView<storage_hash_t> hashes,
     const GpuArrayView<storage_hash_t> parent_hashes,
-    const GpuArrayView<size_t> kv_start_inxs) {
+    const GpuArrayView<size_t> kv_start_inxs,
+    GpuArrayView<StorageCaseMetadata> metadata,
+    bool count_visits) {
     // NB Very important that this TID is the same as is used in MAP_INX in gpu_map
     auto tid = TID;
-    assert(tid < MAP_N_CTXS);
+    assert(tid < MAP_N_CTXS && "Thread index out of bounds");
 
     const auto storage_inx = storage_indices[tid];
+    assert(storage_inx < kv_sizes.size && "Storage index out of bounds");
     const auto kv_size = kv_sizes[storage_inx];
     const auto kv_start_inx = kv_start_inxs[storage_inx];
     const auto parent_hash = parent_hashes[storage_inx];
@@ -421,15 +596,20 @@ void __global__ scatter_kvs_kernel(
         gpu_map.insert(key, val);
     }
 
+    if (count_visits) {
+        atomicAdd((unsigned long long*)&(metadata[storage_inx].fuzz_case_metadata.visits), (unsigned long long)1);
+    }
+
 #ifdef GPUEVM_DEBUG
     // We only need the hash for correctness checking, i.e. the hash after inserting everything should be the same as the one recorded in the buffer
     const auto hash = hashes[storage_inx];
-    assert(gpu_map.get_rolling_hash_ptr(tid)->get_internal_hash() == hash);
+    // TODO bring back
+    // assert(gpu_map.get_rolling_hash_ptr(tid)->get_internal_hash() == hash);
 #endif
 }
 
 template <size_t MAP_N_CTXS, size_t MAP_CAP>
-void FuzzStorageBuffer::scatter_kvs(GpuMap<MAP_N_CTXS, MAP_CAP>& gpu_map, const GpuArrayView<size_t>& storage_indices) {
+void FuzzStorageBuffer::scatter_kvs(GpuMap<MAP_N_CTXS, MAP_CAP>& gpu_map, const GpuArrayView<size_t>& storage_indices, bool count_visits) {
     gpuevm_nvtx_push("FuzzStorageBuffer::scatter_kvs");
     // Because of how we're implemented gpumap to be fast when running in the interpreter, if each thread does a single operation on storage
     // in a tid <-> evm_inx fashion, memory operations will be pretty coalesced and thus fast
@@ -447,15 +627,26 @@ void FuzzStorageBuffer::scatter_kvs(GpuMap<MAP_N_CTXS, MAP_CAP>& gpu_map, const
     gpu_map.fastSetZero();
     safeStreamSync();
 
+#ifdef GPUEVM_DEBUG_FUZZER
+    // Loop through the storage indices, and check whether they're out of bounds (longer than the kv_sizes array)
+    if (size != kv_sizes.size()) {
+        std::cout << "Size error" << std::endl;
+    }
+    for (size_t i = 0; i < storage_indices.size; i++) {
+        if (storage_indices[i] >= kv_sizes.size()) {
+            std::cout << "Storage index error" << std::endl;
+        }
+    }
+#endif
     // This does all the magic, but needs everything in the form of arrayviews
-    scatter_kvs_kernel<<<MAP_N_CTXS / BLOCKDIMX, BLOCKDIMX>>>(gpu_map, storage_indices, keys.view(), vals.view(), kv_sizes.view(), hashes.view(), parent_hashes.view(), kv_start_inxs.view());
+    scatter_kvs_kernel<<<MAP_N_CTXS / BLOCKDIMX, BLOCKDIMX>>>(gpu_map, storage_indices, keys.view(), vals.view(), kv_sizes.view(), hashes.view(), parent_hashes.view(), kv_start_inxs.view(), metadatas.view(), count_visits);
 
     safeStreamSync();
     gpuevm_nvtx_pop();
 }
 
 // Explicit instantiation of scatter_kvs for the default gpuevmctx
-template void FuzzStorageBuffer::scatter_kvs<MAX_N_SEPARATE_CONTEXTS, STORAGE_SLOTS_ELS>(GpuMap<MAX_N_SEPARATE_CONTEXTS, STORAGE_SLOTS_ELS>& gpu_map, const GpuArrayView<size_t>& storage_indices);
+template void FuzzStorageBuffer::scatter_kvs<MAX_N_SEPARATE_CONTEXTS, STORAGE_SLOTS_ELS>(GpuMap<MAX_N_SEPARATE_CONTEXTS, STORAGE_SLOTS_ELS>& gpu_map, const GpuArrayView<size_t>& storage_indices, bool count_visits);
 
 // Functor getting a (inverted and into arithmetic type) bool from StorageCaseMetadata
 struct GetUnchecked {
@@ -482,4 +673,33 @@ size_t FuzzStorageBuffer::num_unchecked_cases() const {
     return *cub_temp_buffer.inout_size_ptr;
 }
 
+void FuzzStorageBuffer::get_unchecked_indices(GpuArrayView<size_t> out_indices, size_t n_indices) const {
+    gpuevm_nvtx_push("FuzzStorageBuffer::get_unchecked_indices");
+    /** Writes the n_indices first indices that are marked as unchecked into the out_indices array
+     * Errors if there are less than n_indices that are unchecked, or if the passed view is not of appropriate size
+     */
+    // TODO make this gpu accelerated
+    assert(out_indices.size >= n_indices && "Output indices array is not of correct size");
+
+    size_t out_idx = 0;
+    for (size_t i = 0; i < size; i++) {
+        if (!metadatas[i].checked_invariants && out_idx < n_indices) {
+            out_indices[out_idx++] = i;
+        }
+    }
+    assert(out_idx == n_indices && "Not enough unchecked cases to get");
+    // Unified memory needs full system sync
+    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    gpuevm_nvtx_pop();
+}
+
+void FuzzStorageBuffer::mark_checked(const GpuArrayView<size_t>& indices) {
+    gpuevm_nvtx_push("FuzzStorageBuffer::mark_checked");
+    // TODO make this gpu accelerated
+    for (size_t i = 0; i < indices.size; i++) {
+        metadatas[indices[i]].checked_invariants = true;
+    }
+    gpuevm_nvtx_pop();
+}
+
 // ---------- /FuzzStorageBuffer implementation ----------
diff --git a/src/common/fuzz_dss/fuzz_storage.h b/src/common/fuzz_dss/fuzz_storage.h
index e3b7d345..48d9a2c7 100644
--- a/src/common/fuzz_dss/fuzz_storage.h
+++ b/src/common/fuzz_dss/fuzz_storage.h
@@ -79,6 +79,10 @@ struct FuzzStorageCaseBatch {
     // Dummy for development
     FuzzStorageCaseBatch() : n_cases(0), keys(nullptr, 0), kv_sizes(nullptr, 0), vals(nullptr, 0), hashes(nullptr, 0), parent_hashes(nullptr, 0), producing_calldatas(nullptr, 0), producing_calldata_sizes(nullptr, 0), metadata(nullptr, 0) {}
 
+    // Fully constructed from passed items
+    FuzzStorageCaseBatch(size_t n_cases, const GpuArrayView<uint256>& keys, const GpuArrayView<uint256>& vals, const GpuArrayView<size_t>& kv_sizes, const GpuArrayView<storage_hash_t>& hashes, const GpuArrayView<storage_hash_t>& parent_hashes, const GpuArrayView<uint8_t>& producing_calldatas, const GpuArrayView<size_t>& producing_calldata_sizes, const GpuArrayView<StorageCaseMetadata>& metadata)
+        : n_cases(n_cases), keys(keys), vals(vals), kv_sizes(kv_sizes), hashes(hashes), parent_hashes(parent_hashes), producing_calldatas(producing_calldatas), producing_calldata_sizes(producing_calldata_sizes), metadata(metadata) {}
+
     FuzzStorageCaseBatch(const GpuevmCtx& ctx, const GpuArrayView<bool>& mask);
     __host__ friend std::ostream& operator<<(std::ostream& os, const FuzzStorageCaseBatch& batch) {
         os << "FuzzStorageCaseBatch {\n";
@@ -103,6 +107,9 @@ struct FuzzStorageCaseBatch {
         return os;
     }
 
+    // Filters the batch based on the passed in mask, and returns a new batch with the culled-out elements
+    FuzzStorageCaseBatch filter(const GpuArrayView<bool>& mask);
+
     // We'd like the functions below to be private, but due to them using the nvcc --expt-extended-lambda flag with device lambdas they can't be.
     // private:
     // Gathers both hash and parenthash of each marked (true) in the mask
@@ -117,12 +124,22 @@ template <typename T>
 struct AppendBuffer {
     AppendBuffer() : AppendBuffer(min_size) {}
     AppendBuffer(size_t cap);
+    // Copies the data from the passed in array into the buffer
+    AppendBuffer(const GpuArrayView<T>& arr);
 
     size_t capacity() const;
     size_t size() const;
     void push(const GpuArrayView<T>& arr);
     void push(const T& val);
 
+    // Keeps (and keeps order) only the elements for which the mask is true
+    // almost always doesn't keep existing buffer, so it's not safe to keep views to the buffer when invoking this
+    void filter(const GpuArrayView<bool>& mask);
+
+    // Partitions the buffer into two, one with the true-masked elements, and one with the false-masked elements
+    // The false-masked elements are returned, the buffer becomes the true-marked ones inplace
+    AppendBuffer<T> partition(const GpuArrayView<bool>& mask);
+
     T& operator[](size_t idx) const;
 
     operator T*() const {
@@ -162,6 +179,16 @@ struct FuzzStorageBuffer {
         push(storage);
     }
 
+    // We can also initialize with all the members
+    FuzzStorageBuffer(const AppendBuffer<uint256>& keys, const AppendBuffer<uint256>& vals, const AppendBuffer<size_t>& kv_sizes,
+                      const AppendBuffer<storage_hash_t>& hashes, const AppendBuffer<storage_hash_t>& parent_hashes,
+                      const AppendBuffer<uint8_t>& producing_calldatas, const AppendBuffer<size_t>& producing_calldata_sizes,
+                      const AppendBuffer<StorageCaseMetadata>& metadatas,
+                      const AppendBuffer<size_t>& kv_start_inxs,
+                      const AppendBuffer<size_t>& producing_calldata_start_inxs,
+                      const std::unordered_map<storage_hash_t, size_t>& hash_to_idx)
+        : keys(keys), vals(vals), kv_sizes(kv_sizes), hashes(hashes), parent_hashes(parent_hashes), producing_calldatas(producing_calldatas), producing_calldata_sizes(producing_calldata_sizes), metadatas(metadatas), kv_start_inxs(kv_start_inxs), producing_calldata_start_inxs(producing_calldata_start_inxs), hash_to_idx(hash_to_idx) {}
+
     // Number of storage cases in the queue
     size_t size = 0;
 
@@ -169,22 +196,36 @@ struct FuzzStorageBuffer {
     void push(const FuzzStorageCaseView& storage);
 
     // NB This does not filter on if the hash is already in the buf!
-    // TODO we should filter here
+    // TODO we should filter here, in fact the seen hashes hashset should be a part of this struct
     void push_batch(const FuzzStorageCaseBatch& batch);
 
+    // Filters the storage buffer to only contain the elements for which the mask is true,
+    // returns a batch (not a buffer!) that contains the storage elements marked as non-true (i.e. this is basically an ordered in-place partition routine)
+    FuzzStorageCaseBatch filter(const GpuArrayView<bool>& mask);
+
     FuzzStorageCaseView operator[](size_t idx) const;
 
+    // Gives back a mask of same size as the batch (i.e. number of storage cases in the batch), marking whether the case is a direct parent anything in the current storage
+    GpuArray<bool> get_ancestor_mask(const FuzzStorageCaseBatch& batch) const;
+
     // NB The views returned from this point to internal (gpu) buffers, so will be invalid when this goes out of scope
     std::vector<FuzzStorageCaseView> get_storage_chain(storage_hash_t hash) const;
 
     // Given an array of indices of size MAP_N_CTXS, scatter the kvs of the storage cases at those indices into the gpumap
     // This basically does a bunch of parallel inserts, but in a as-fast-as-possible way
+    // The last argument marks whether we should count / increment the indicates storages 'visits' metadata field
     template <size_t MAP_N_CTXS, size_t MAP_CAP>
-    void scatter_kvs(GpuMap<MAP_N_CTXS, MAP_CAP>& gpu_map, const GpuArrayView<size_t>& storage_indices);
+    void scatter_kvs(GpuMap<MAP_N_CTXS, MAP_CAP>& gpu_map, const GpuArrayView<size_t>& storage_indices, bool count_visits = false);
 
     // The number of storage cases in the buffer that haven't been been checked for invariant violations
     size_t num_unchecked_cases() const;
 
+    // Writes the n_indices first indices that are marked as unchecked into the out_indices array
+    void get_unchecked_indices(GpuArrayView<size_t> out_indices, size_t n_indices) const;
+
+    // Sets the checked_invariants metadata field to true for the passed indices
+    void mark_checked(const GpuArrayView<size_t>& indices);
+
     // Most things below here are not supposed to be touched from the outside, but we do need to sometimes, so we're keeping them public
     // It's an invariant that keys.size == vals.size == Sum(i=0...n_entries) sizes[i]
     // Similarly for the producing_calldatas (since each calldata could be of different sizes)
diff --git a/src/common/fuzz_dss/gpu_array_utils.cpp b/src/common/fuzz_dss/gpu_array_utils.cpp
index f136a7e4..a54d797f 100644
--- a/src/common/fuzz_dss/gpu_array_utils.cpp
+++ b/src/common/fuzz_dss/gpu_array_utils.cpp
@@ -1,4 +1,5 @@
 #include <definitions/evm_defs.h>
+#include <fuzz_dss/fuzz_case_metadata.h>
 #include <fuzz_dss/fuzz_code_analysis.h>
 #include <fuzz_dss/fuzz_trace_dist.h>
 #include <fuzz_dss/gpu_array_utils.h>
@@ -6,7 +7,9 @@
 #include <gpu_dss/gpu_array.h>
 #include <thrust/count.h>
 #include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
+#include <thrust/gather.h>
 #include <thrust/sequence.h>
 
 #include <cstdint>
@@ -131,7 +134,7 @@ GpuArray<T> device_array_uniquify(const GpuArrayView<T>& in) {
 // Filters a gpuarray into an output array, based on the passed boolmask. Preserves order
 // Doesn't allocate any new memory, uses passed buffers and assumes output fits # marked elements
 template <typename T>
-void device_array_filter_inout(const GpuArrayView<T>& in, const GpuArrayView<bool>& mask, GpuArrayView<T>& out) {
+void device_array_filter_inout(const GpuArrayView<T>& in, const GpuArrayView<bool>& mask, const GpuArrayView<T>& out) {
     size_t temp_storage_bytes = 0;
 
     cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, (T*)in.data, (bool*)mask.data, (T*)out.data, cub_temp_buffer.inout_size_ptr, mask.size);
@@ -149,6 +152,44 @@ GpuArray<T> device_array_filter(const GpuArrayView<T>& in, const GpuArrayView<bo
     return out;
 }
 
+// Helper cub functors for the partition function
+template <bool do_invert>
+struct CubInvertMaskConversionOp {
+    __host__ __device__ __forceinline__ bool operator()(const bool& val) const {
+        if (do_invert) return !val;
+        return val;
+    }
+};
+
+// Partitions a gpuarray into true and false arrays, both passed in as views, based on a boolmask. Preserves order
+// Note that the out parameters are not const because we modify the pointer directly, and assume that the sizes are correctly set
+template <typename T>
+void device_array_partition_inout(const GpuArrayView<T>& in, const GpuArrayView<bool>& mask, const GpuArrayView<T>& out_true, const GpuArrayView<T>& out_false) {
+    assert(mask.size == in.size);
+    size_t temp_storage_bytes = 0;
+    // This is two filter calls, just unrolled into here
+    cub::TransformInputIterator<bool, CubInvertMaskConversionOp<false>, bool*> mask_uninverted(mask.data, CubInvertMaskConversionOp<false>());
+    cub::DeviceSelect::Flagged((void*)nullptr, temp_storage_bytes, (T*)in.data, mask_uninverted, (T*)out_true.data, cub_temp_buffer.inout_size_ptr, mask.size);
+    cub_temp_buffer.ensure(temp_storage_bytes);
+    cub::DeviceSelect::Flagged((void*)cub_temp_buffer, temp_storage_bytes, (T*)in.data, mask_uninverted, (T*)out_true.data, cub_temp_buffer.inout_size_ptr, mask.size);
+
+    // And the false-select
+    auto mask_inverted = cub::TransformInputIterator<bool, CubInvertMaskConversionOp<true>, bool*>(mask.data, CubInvertMaskConversionOp<true>());
+    cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, (T*)in.data, mask_inverted, (T*)out_false.data, cub_temp_buffer.inout_size_ptr, mask.size);
+    cub_temp_buffer.ensure(temp_storage_bytes);
+    cub::DeviceSelect::Flagged((void*)cub_temp_buffer, temp_storage_bytes, (T*)in.data, mask_inverted, (T*)out_false.data, cub_temp_buffer.inout_size_ptr, mask.size);
+}
+
+template <typename T>
+std::pair<GpuArray<T>, GpuArray<T>> device_array_partition(const GpuArrayView<T>& in, const GpuArrayView<bool>& mask) {
+    assert(mask.size == in.size);
+    auto n_true = device_array_count<bool>(mask, true);
+    GpuArray<T> out_true(n_true);
+    GpuArray<T> out_false(mask.size - n_true);
+    device_array_partition_inout(in, mask, out_true.view(), out_false.view());
+    return std::make_pair(out_true, out_false);
+}
+
 // Transforms a array of indices into a boolmask, of a given size (that must be larger than the largest index in the index array)
 // Takes in a pre-allocated bool array, sets all values to zero before marking indices!
 void device_array_indices_to_boolmask_inout(const GpuArrayView<size_t>& indices, GpuArrayView<bool>& mask) {
@@ -172,6 +213,56 @@ std::vector<T> device_array_to_host(const GpuArrayView<T>& a) {
     return result;
 }
 
+// Modifies the output array in-place, and requires the passed output view to be correctly sized
+// View is consted because we act on the underlying array, not the view data
+template <typename T>
+void device_array_rle_decode(const GpuArrayView<size_t>& run_lengths_view,
+                             const GpuArrayView<T>& run_values_view,
+                             const GpuArrayView<T>& output_view) {
+    typedef long long difference_type;
+    size_t* run_lengths_start = run_lengths_view.data;
+    size_t* run_lengths_end = run_lengths_view.data + run_lengths_view.size;
+    T* run_values_start = run_values_view.data;
+    T* output = const_cast<T*>(output_view.data);
+
+    difference_type input_size = run_lengths_view.size;
+    // Note that the reduction operator is + by default and the unit is 0 by default a sum,
+    // so this is just summing the runlengths to get the size of the decoded buffer
+    difference_type output_size = thrust::reduce(run_lengths_start, run_lengths_end);
+    assert(output_view.size == output_size && "output_view must be of size output_size");
+    if (output_size == 0) {
+        // Well at this point we're done, nothing to decode
+        return;
+    }
+
+    // scan the counts to obtain output offsets for each input element
+    thrust::device_vector<difference_type> output_offsets(input_size, 0);
+    thrust::exclusive_scan(run_lengths_start, run_lengths_end, output_offsets.begin());
+
+    // scatter the nonzero counts into their corresponding output positions
+    thrust::device_vector<difference_type> output_indices(output_size, 0);
+    auto scatter_input_it = thrust::counting_iterator<difference_type>(0);
+    auto scatter_pred = [] __device__(difference_type x) { return x != 0; };
+    thrust::scatter_if(thrust::device, scatter_input_it,
+                       scatter_input_it + input_size,
+                       output_offsets.data(),
+                       run_lengths_start,
+                       output_indices.data(),
+                       scatter_pred);
+
+    // compute max-scan over the output indices, filling in the holes
+    thrust::inclusive_scan(thrust::device, output_indices.data(),
+                           output_indices.data() + output_size,
+                           output_indices.data(),
+                           thrust::maximum<difference_type>());
+
+    // gather input values according to index array (output = run_values_start[output_indices])
+    thrust::gather(output_indices.begin(),
+                   output_indices.end(),
+                   thrust::device_pointer_cast(run_values_start),
+                   thrust::device_pointer_cast(output));
+}
+
 // -------- Explicit template instantiations of utility functions for gpuarrays -----
 
 template std::vector<uint32_t> device_array_to_host(const GpuArrayView<uint32_t>&);
@@ -182,5 +273,23 @@ template std::vector<uint256> device_array_to_host(const GpuArrayView<uint256>&)
 template size_t device_array_count(const GpuArrayView<uint32_t>&, const uint32_t&);
 template size_t device_array_count(const GpuArrayView<bool>&, const bool&);
 template GpuArray<uint32_t> device_array_uniquify(const GpuArrayView<uint32_t>& in);
-template void device_array_filter_inout<evm_tracer_t>(GpuArrayView<evm_tracer_t> const&, GpuArrayView<bool> const&, GpuArrayView<evm_tracer_t>&);
-template void device_array_filter_inout<size_t>(GpuArrayView<size_t> const&, GpuArrayView<bool> const&, GpuArrayView<size_t>&);
+template void device_array_filter_inout<bool>(GpuArrayView<bool> const&, GpuArrayView<bool> const&, const GpuArrayView<bool>&);
+template void device_array_filter_inout<uint8_t>(GpuArrayView<uint8_t> const&, GpuArrayView<bool> const&, const GpuArrayView<uint8_t>&);
+template void device_array_filter_inout<uint32_t>(GpuArrayView<uint32_t> const&, GpuArrayView<bool> const&, const GpuArrayView<uint32_t>&);
+template void device_array_filter_inout<uint64_t>(GpuArrayView<uint64_t> const&, GpuArrayView<bool> const&, const GpuArrayView<uint64_t>&);
+template void device_array_filter_inout<evm_tracer_t>(GpuArrayView<evm_tracer_t> const&, GpuArrayView<bool> const&, const GpuArrayView<evm_tracer_t>&);
+template void device_array_filter_inout<uint256>(GpuArrayView<uint256> const&, GpuArrayView<bool> const&, const GpuArrayView<uint256>&);
+template std::pair<GpuArray<bool>, GpuArray<bool>> device_array_partition<bool>(const GpuArrayView<bool>&, const GpuArrayView<bool>&);
+template std::pair<GpuArray<uint8_t>, GpuArray<uint8_t>> device_array_partition<uint8_t>(const GpuArrayView<uint8_t>&, const GpuArrayView<bool>&);
+template std::pair<GpuArray<uint32_t>, GpuArray<uint32_t>> device_array_partition<uint32_t>(const GpuArrayView<uint32_t>&, const GpuArrayView<bool>&);
+template std::pair<GpuArray<uint64_t>, GpuArray<uint64_t>> device_array_partition<uint64_t>(const GpuArrayView<uint64_t>&, const GpuArrayView<bool>&);
+template std::pair<GpuArray<uint256>, GpuArray<uint256>> device_array_partition<uint256>(const GpuArrayView<uint256>&, const GpuArrayView<bool>&);
+template void device_array_rle_decode(const GpuArrayView<size_t>& run_lengths_view,
+                                      const GpuArrayView<bool>& run_values_view,
+                                      const GpuArrayView<bool>& output_view);
+template void device_array_rle_decode(const GpuArrayView<size_t>& run_lengths_view,
+                                      const GpuArrayView<uint256>& run_values_view,
+                                      const GpuArrayView<uint256>& output_view);
+// This is why we pull in metadata defs
+template std::pair<GpuArray<StorageCaseMetadata>, GpuArray<StorageCaseMetadata>> device_array_partition<StorageCaseMetadata>(const GpuArrayView<StorageCaseMetadata>&, const GpuArrayView<bool>&);
+template void device_array_filter_inout<StorageCaseMetadata>(GpuArrayView<StorageCaseMetadata> const&, GpuArrayView<bool> const&, const GpuArrayView<StorageCaseMetadata>&);
\ No newline at end of file
diff --git a/src/common/fuzz_dss/gpu_array_utils.h b/src/common/fuzz_dss/gpu_array_utils.h
index 2c82d2a5..3af4bffa 100644
--- a/src/common/fuzz_dss/gpu_array_utils.h
+++ b/src/common/fuzz_dss/gpu_array_utils.h
@@ -2,6 +2,8 @@
 
 #include <gpu_dss/gpu_array.h>
 
+#include <utility>
+
 // -------- Declaration of templated utilities for gpuarrays --------
 
 struct DeviceLessThan {
@@ -28,11 +30,15 @@ GpuArray<T> device_array_unique_from_runs(const GpuArrayView<T>& in);
 
 // Filters a gpuarray into an output array, based on the passed boolmask. Preserves order
 template <typename T>
-void device_array_filter_inout(const GpuArrayView<T>& in, const GpuArrayView<bool>& mask, GpuArrayView<T>& out);
+GpuArray<T> device_array_filter(const GpuArrayView<T>& in, const GpuArrayView<bool>& mask);
 
-// Filters a gpuarray into an output array, based on the passed boolmask. Preserves order
+// Partitions a gpuarray into true and false arrays, both passed in as views, based on a boolmask. Assumes caller has checked that out views fit the output. Preserves order
 template <typename T>
-GpuArray<T> device_array_filter(const GpuArrayView<T>& in, const GpuArrayView<bool>& mask);
+void device_array_partition_inout(const GpuArrayView<T>& in, const GpuArrayView<bool>& mask, const GpuArrayView<T>& out_true, const GpuArrayView<T>& out_false);
+
+// Partitions a gpuarray into true and false arrays, giving back pair of (marked_true, marked_false) that it allocates. Preserves order
+template <typename T>
+std::pair<GpuArray<T>, GpuArray<T>> device_array_partition(const GpuArrayView<T>& in, const GpuArrayView<bool>& mask);
 
 // Counts number of occurences (doesn't deduplicate etc) in the array
 template <typename T>
@@ -52,10 +58,6 @@ GpuArray<T> device_array_unique_from_runs(const GpuArrayView<T>& in);
 template <typename T>
 GpuArray<T> device_array_uniquify(const GpuArrayView<T>& in);
 
-// Filters a gpuarray into an output array, based on the passed boolmask. Preserves order
-template <typename T>
-void device_array_filter_inout(const GpuArrayView<T>& in, const GpuArrayView<bool>& mask, GpuArrayView<T>& out);
-
 // Filters a gpuarray into an output array, based on the passed boolmask. Preserves order
 template <typename T>
 GpuArray<T> device_array_filter(const GpuArrayView<T>& in, const GpuArrayView<bool>& mask);
@@ -83,7 +85,7 @@ GpuArray<T> device_array_uniquify(const GpuArrayView<T>& in);
 // Filters a gpuarray into an output array, based on the passed boolmask. Preserves order
 // Doesn't allocate any new memory, uses passed buffers and assumes output fits # marked elements
 template <typename T>
-void device_array_filter_inout(const GpuArrayView<T>& in, const GpuArrayView<bool>& mask, GpuArrayView<T>& out);
+void device_array_filter_inout(const GpuArrayView<T>& in, const GpuArrayView<bool>& mask, const GpuArrayView<T>& out);
 
 // Filters a gpuarray into an output array, based on the passed boolmask. Preserves order
 template <typename T>
@@ -101,4 +103,12 @@ void device_array_indices_to_boolmask_inout(const GpuArrayView<size_t>& indices,
 template <typename T>
 std::vector<T> device_array_to_host(const GpuArrayView<T>& a);
 
+// Does the opposite of RLE
+// From https://github.com/NVIDIA/thrust/blob/master/examples/expand.cu
+// Modifies the output array in-place, and requires the passed output view to be correctly sized
+template <typename T>
+void device_array_rle_decode(const GpuArrayView<size_t>& run_lengths,
+                             const GpuArrayView<T>& run_values,
+                             const GpuArrayView<T>& output);
+
 // -------- /Declaration of templated utilities for gpuarrays --------
diff --git a/src/common/fuzzer/fuzzer.cpp b/src/common/fuzzer/fuzzer.cpp
index 5589ed7d..452f46ff 100644
--- a/src/common/fuzzer/fuzzer.cpp
+++ b/src/common/fuzzer/fuzzer.cpp
@@ -3,6 +3,7 @@
 #include <common/fuzz_algos/fuzz_evolve.h>
 #include <common/fuzz_algos/fuzz_gpuevm_harness.h>
 #include <common/fuzz_dss/fuzz_case_queue.h>
+#include <common/fuzz_dss/gpu_array_utils.h>
 #include <common/fuzzer/fuzzer.h>
 #include <common/gpu_dss/gpu_array.h>
 #include <common/utils/utils.h>
@@ -59,70 +60,69 @@ ExtGpuEvmFuzzRets run_fuzzer(const ExtGpuEvmFuzzSetup& setup, const FuzzStorageC
             fuzz_check_invariants(fuzz_state);
         }
 
-        if (fuzz_state.corpus.size() > fuzz_state.max_corpus_size) {
-#ifdef GPUEVM_DEBUG_FUZZER
-            std::cout << "Culling corpus" << std::endl;
-#endif
-            fuzz_cull(fuzz_state);
-        } else {
+        // TODO think about good thresholds/triggers for this
+        if ((fuzz_state.storage_corpus.size * fuzz_state.corpus.size()) > (MAX_N_SEPARATE_CONTEXTS * 100)) {
+            if (run_inx % 40 == 0) {
+                fuzz_cull(fuzz_state);
+            }
+        }
 #ifdef GPUEVM_DEBUG_FUZZER
-            std::cout << "run_inx: " << run_inx << ", corpus size: " << fuzz_state.corpus.size() << " and " << fuzz_state.reverting_corpus.size() << " reverting cases" << std::endl;
-            auto pre_map = fuzz_state.dist_map.to_host_map();
+        std::cout << "run_inx: " << run_inx << ", corpus size: " << fuzz_state.corpus.size() << " and " << fuzz_state.reverting_corpus.size() << " reverting cases" << std::endl;
+        auto pre_map = fuzz_state.dist_map.to_host_map();
 #endif
-            auto pre_coverage = fuzz_active_ctx().rti_maps.coverage_map.tot_coverage();
-            auto [add_to_corpus, opt_reverting_corpus, opt_new_storages] = fuzz_evolve(fuzz_state);
+        auto pre_coverage = fuzz_active_ctx().rti_maps.coverage_map.tot_coverage();
+        auto [add_to_corpus, opt_reverting_corpus, opt_new_storages] = fuzz_evolve(fuzz_state);
+        gpuevm_nvtx_pop();
+        gpuevm_nvtx_push("calldata_push_to_corpus");
+        fuzz_state.corpus.pushn(add_to_corpus);
+        gpuevm_nvtx_pop();
+
+        if (opt_reverting_corpus.has_value()) {
+            gpuevm_nvtx_push("reverting_corpus_push");
+            fuzz_state.reverting_corpus.pushn(opt_reverting_corpus.value());
             gpuevm_nvtx_pop();
-            gpuevm_nvtx_push("calldata_push_to_corpus");
-            fuzz_state.corpus.pushn(add_to_corpus);
+            std::cout << "Found new reverting cases, added " << opt_reverting_corpus.value().n_cases << " cases to reverting-corpus" << std::endl;
+        }
+        if (opt_new_storages.has_value()) {
+            gpuevm_nvtx_push("new_storages_push");
+            auto new_storages = opt_new_storages.value();
+            fuzz_state.storage_corpus.push_batch(new_storages);
             gpuevm_nvtx_pop();
-
-            if (opt_reverting_corpus.has_value()) {
-                gpuevm_nvtx_push("reverting_corpus_push");
-                fuzz_state.reverting_corpus.pushn(opt_reverting_corpus.value());
-                gpuevm_nvtx_pop();
-                std::cout << "Found new reverting cases, added " << opt_reverting_corpus.value().n_cases << " cases to reverting-corpus" << std::endl;
-            }
-            if (opt_new_storages.has_value()) {
-                gpuevm_nvtx_push("new_storages_push");
-                auto new_storages = opt_new_storages.value();
-                fuzz_state.storage_corpus.push_batch(new_storages);
-                gpuevm_nvtx_pop();
 #ifdef GPUEVM_DEBUG_FUZZER
-                std::cout << "Found new interesting storages, added " << new_storages.n_cases << " storages to storage-corpus" << std::endl;
-                std::cout << "New storage batch: " << new_storages << std::endl;
+            std::cout << "Found new interesting storages, added " << new_storages.n_cases << " storages to storage-corpus" << std::endl;
+            std::cout << "New storage batch: " << new_storages << std::endl;
 #endif
-            }
+        }
 
-            auto post_coverage = fuzz_active_ctx().rti_maps.coverage_map.tot_coverage();
-            if (post_coverage > pre_coverage) {
-                std::cout << "[Coverage increased from " << pre_coverage << " to " << post_coverage << " (+" << post_coverage - pre_coverage << ") in run " << run_inx << "]" << std::flush;
-            }
+        auto post_coverage = fuzz_active_ctx().rti_maps.coverage_map.tot_coverage();
+        if (post_coverage > pre_coverage) {
+            std::cout << "[Coverage increased from " << pre_coverage << " to " << post_coverage << " (+" << post_coverage - pre_coverage << ") in run " << run_inx << "]" << std::flush;
+        }
 #if GPUEVM_DEBUG_FUZZER
-            auto post_map = fuzz_state.dist_map.to_host_map();
-            // std::cout << "Current distmap: " << fuzz_state.dist_map << std::endl;
-            std::unordered_map<distmap_key_t, distmap_val_t> diff_map;
-            std::unordered_map<distmap_key_t, distmap_val_t> prev_to_diff_map;
-            for (auto& kv : post_map) {
-                if (pre_map.find(kv.first) == pre_map.end()) {
-                    diff_map[kv.first] = kv.second;
-                    prev_to_diff_map[kv.first] = distmap_val_t();
-                } else if (pre_map[kv.first] != kv.second) {
-                    diff_map[kv.first] = kv.second;
-                    prev_to_diff_map[kv.first] = pre_map[kv.first];
-                }
+        auto post_map = fuzz_state.dist_map.to_host_map();
+        // std::cout << "Current distmap: " << fuzz_state.dist_map << std::endl;
+        std::unordered_map<distmap_key_t, distmap_val_t> diff_map;
+        std::unordered_map<distmap_key_t, distmap_val_t> prev_to_diff_map;
+        for (auto& kv : post_map) {
+            if (pre_map.find(kv.first) == pre_map.end()) {
+                diff_map[kv.first] = kv.second;
+                prev_to_diff_map[kv.first] = distmap_val_t();
+            } else if (pre_map[kv.first] != kv.second) {
+                diff_map[kv.first] = kv.second;
+                prev_to_diff_map[kv.first] = pre_map[kv.first];
             }
-            if (diff_map.size() > 0) {
-                auto seen_covhashes_vec = dev_hashset_to_vec(fuzz_state.covhashes_seen);
-                std::cout << "Current (" << seen_covhashes_vec.size() << " unique) seen covhashes: " << seen_covhashes_vec << std::endl;
-                std::cout << "Current distmap size (in pairs): " << fuzz_state.dist_map.size() << std::endl;
-                std::cout << "Dist map diff size: " << diff_map.size() << std::endl;
-                for (auto& kv : diff_map) {
-                    std::cout << "Instrhash: " << std::hex << std::setfill('0') << std::setw(8) << kv.first << std::dec << ", branchdist: " << kv.second << ", prev branchdist: " << prev_to_diff_map[kv.first] << std::endl;
-                }
+        }
+        if (diff_map.size() > 0) {
+            auto seen_covhashes_vec = dev_hashset_to_vec(fuzz_state.covhashes_seen);
+            std::cout << "Current (" << seen_covhashes_vec.size() << " unique) seen covhashes: " << seen_covhashes_vec << std::endl;
+            std::cout << "Current distmap size (in pairs): " << fuzz_state.dist_map.size() << std::endl;
+            std::cout << "Dist map diff size: " << diff_map.size() << std::endl;
+            for (auto& kv : diff_map) {
+                std::cout << "Instrhash: " << std::hex << std::setfill('0') << std::setw(8) << kv.first << std::dec << ", branchdist: " << kv.second << ", prev branchdist: " << prev_to_diff_map[kv.first] << std::endl;
             }
+        }
 
 #endif
-        }
 
         if (run_inx < fuzz_state.n_fuzz_par_runs - 1) {
             std::cout << "-";
diff --git a/src/common/gpu_dss/gpu_hashmap.h b/src/common/gpu_dss/gpu_hashmap.h
index 93e275ed..844fee99 100644
--- a/src/common/gpu_dss/gpu_hashmap.h
+++ b/src/common/gpu_dss/gpu_hashmap.h
@@ -173,13 +173,6 @@ class GpuMap {
     MAP_HOSTDEVFUNC void insert(const uint256& key, const uint256& val) {
         MAP_ASSERT(n_fill[MAP_INX(0)] < MAP_CAP);
 
-        // debug printing
-        // printf("INSERT | Inserting key: ");
-        // for (int i = 0; i < 4; i++) {
-        //    printf("%016lx", ((uint64_t*)&key)[i]);
-        //}
-        // printf("\n");
-
         hash_t place = find(key);
         hash_t pos = place & MAP_EMPTY_MASK;
 
diff --git a/src/common/ingest/evm_runtime_ctx.cpp b/src/common/ingest/evm_runtime_ctx.cpp
index e11bd8f0..5d3812f2 100644
--- a/src/common/ingest/evm_runtime_ctx.cpp
+++ b/src/common/ingest/evm_runtime_ctx.cpp
@@ -154,19 +154,3 @@ void GpuevmCtx::dealloc() {
     calleraddresses.dealloc();
     CUDA_SAFE_CALL(cudaDeviceSynchronize());
 }
-
-void GpuevmCtx::printReturnValues() {
-    // Execution is done here, fetch outputs or integrate into a larger context/fuzzer/MEV searcher or w/e we want.
-    for (int evm_inx = 0; evm_inx < n_runners; ++evm_inx) {
-        uint32_t return_offset = return_offsets_buf[evm_inx];
-        uint32_t return_size = return_sizes_buf[evm_inx];
-        uint8_t *return_data = &memory_buf[evm_inx * MEM_SIZE_BYTES] + return_offset;
-        // Print it as hex, using std::cout std::hex and std::setw / std::setfill
-        std::cout << "EVM #" << evm_inx << " returned " << return_size << " bytes: " << std::endl;
-        for (uint32_t i = 0; i < return_size; ++i) {
-            std::cout << std::hex << std::setw(2) << std::setfill('0') << (int)return_data[i];
-        }
-        std::cout << std::dec << std::endl;
-        break;  // TODO remove and actually do printing of a single evm or something, triggered from fuzzoutput printing
-    }
-}
\ No newline at end of file
diff --git a/src/common/ingest/evm_runtime_ctx.h b/src/common/ingest/evm_runtime_ctx.h
index 6bc0151f..cddec0a5 100644
--- a/src/common/ingest/evm_runtime_ctx.h
+++ b/src/common/ingest/evm_runtime_ctx.h
@@ -43,5 +43,4 @@ class GpuevmCtx {
     void dealloc_calldatas();
 
     void dealloc();
-    void printReturnValues();
 };
diff --git a/src/common/utils/utils.cpp b/src/common/utils/utils.cpp
index d1c3021f..a8a1fb4c 100644
--- a/src/common/utils/utils.cpp
+++ b/src/common/utils/utils.cpp
@@ -81,8 +81,24 @@ template std::ostream &operator<<(std::ostream &os, const std::unordered_map<std
 
 // -------- Implementation for the timing utils ---------
 namespace timing {
-    fuzz_time_point current_time() {
+    // Provides nanosecond resolution timing wrt an arbitrary epoch
+    // NOTE THAT DEVICE AND HOST EPOCHS ARE NOT NECESSARILY SYNCHRONIZED
+    // TODO provide such synced epochs
+    __device__ uint64_t device_nano_time() {
+        uint64_t mclk;
+        asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(mclk));
+        return mclk;
+    }
+    __host__ __device__ fuzz_time_point current_time() {
+#ifdef __CUDA_ARCH__
+        // Cursed casts
+        long long nano_time = device_nano_time();
+        std::chrono::nanoseconds time = std::chrono::nanoseconds(nano_time);
+        std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::nanoseconds> time_point(time);
+        return time_point;
+#else
         return std::chrono::high_resolution_clock::now();
+#endif
     }
 
     fuzz_time_diff time_diff(fuzz_time_point start, fuzz_time_point end) {
diff --git a/src/common/utils/utils.h b/src/common/utils/utils.h
index bc89fffb..3b1c9ab8 100644
--- a/src/common/utils/utils.h
+++ b/src/common/utils/utils.h
@@ -21,7 +21,8 @@ typedef uint64_t fuzz_time_diff;
 
 namespace timing {
     // ------- Timing for within-app uses ---------
-    fuzz_time_point current_time();
+    __device__ uint64_t device_nano_time();
+    __host__ __device__ fuzz_time_point current_time();
     fuzz_time_diff time_diff(fuzz_time_point start, fuzz_time_point end);
 
     // Appropriately uses SI prefixes to print