diff --git a/CMakeLists.txt b/CMakeLists.txt index 91d544ff..73bdff78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ CPMAddPackage("gh:simdjson/simdjson#v3.9.2") find_package(RapidJSON REQUIRED) find_library(TCLAP tclap) +option(GPUEVM_O2_DEBUG "Enable O2 debug for CUDA" ON) option(GPUEVM_DEBUG_INTERPRETER "Printing from device during interpreter runtime" OFF) option(GPUEVM_DEBUG_FUZZER "Print debug info from fuzzer during runtime" OFF) option(GPUEVM_DEBUG_NVTX "Printing nvtx context pushing/popping during runtime" OFF) @@ -103,10 +104,13 @@ function(configure_target_gpuevm target) target_compile_options(${target} PRIVATE $<$:-g>) target_compile_options(${target} PRIVATE $<$:-Xptxas -v>) - # TODO make this an option - target_compile_options(${target} PRIVATE $<$:-dopt=on>) - target_compile_options(${target} PRIVATE $<$:-O2>) - target_compile_options(${target} PRIVATE $<$:-Xptxas -O2>) + # Recall that we have this a suboption of the debug build type bc we couldn't figure out how to capture + # all the lineinfo in a -g Release build + if(GPUEVM_O2_DEBUG) + target_compile_options(${target} PRIVATE $<$:-dopt=on>) + target_compile_options(${target} PRIVATE $<$:-O2>) + target_compile_options(${target} PRIVATE $<$:-Xptxas -O2>) + endif() # And set the GPUEVM_DEBUG symbol target_compile_definitions(${target} PRIVATE GPUEVM_DEBUG) diff --git a/src/common/fuzz_algos/CMakeLists.txt b/src/common/fuzz_algos/CMakeLists.txt index 540dbfa0..7fb4157d 100644 --- a/src/common/fuzz_algos/CMakeLists.txt +++ b/src/common/fuzz_algos/CMakeLists.txt @@ -17,4 +17,4 @@ set_target_properties(evm_fuzz_algos PROPERTIES CUDA_SEPARABLE_COMPILATION ON) target_include_directories(evm_fuzz_algos PUBLIC ${GPU_EVM_SOURCE_DIR}/src/common) -target_link_libraries(evm_fuzz_algos PUBLIC evm_ingest ${CUDA_LIBRARIES} libcuda.so cudart) \ No newline at end of file +target_link_libraries(evm_fuzz_algos PUBLIC evm_ingest ${CUDA_LIBRARIES} libcuda.so cudart termcolor) \ No newline at end of file diff --git a/src/common/fuzz_algos/fuzz_check_invariants.cpp b/src/common/fuzz_algos/fuzz_check_invariants.cpp index eb12efab..580fe76f 100644 --- a/src/common/fuzz_algos/fuzz_check_invariants.cpp +++ b/src/common/fuzz_algos/fuzz_check_invariants.cpp @@ -4,6 +4,7 @@ #include #include +#include __global__ void set_test_selectors_kernel(GpuArrayView test_selectors_out, const __restrict__ uint8_t* invariant_selectors, const ExtFuzzFuncs fuzz_funcs, uint32_t n_tested_storages, uint32_t n_tot_runners) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -40,25 +41,25 @@ void set_test_selectors(GpuArrayView test_selectors, const ExtFuzzFuncs safeCudaFree(selector_buf); } -__global__ void set_storage_indices_kernel(GpuArrayView storage_indices, uint32_t n_tested_storages, uint32_t n_test_threads, size_t index_offset) { +__global__ void set_storage_indices_kernel(GpuArrayView storage_indices, uint32_t n_tested_storages, uint32_t n_test_threads) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; + // For tid < n_tested_storages, we're already done with the cyclical assignment + if (tid < n_tested_storages) return; // All evms above n_test_threads just get storage index 0, since it's guaranteed to exist if (tid >= MAX_N_SEPARATE_CONTEXTS) return; if (tid >= n_test_threads) { storage_indices[tid] = 0; } else { - storage_indices[tid] = (tid % n_tested_storages) + index_offset; + storage_indices[tid] = storage_indices[tid % n_tested_storages]; } } -void set_storage_indices(GpuArrayView storage_indices, uint32_t n_tested_storages, uint32_t n_test_threads, size_t index_offset) { +void cyclic_repeat_storage_indices(GpuArrayView storage_indices, uint32_t n_tested_storages, uint32_t n_test_threads) { auto [grid_size, block_size] = get_cuda_max_occupancy(storage_indices.size, (void*)set_storage_indices_kernel); - set_storage_indices_kernel<<>>(storage_indices, n_tested_storages, n_test_threads, index_offset); + set_storage_indices_kernel<<>>(storage_indices, n_tested_storages, n_test_threads); safeStreamSync(); } -// TODO store this in the storage corpus itself -size_t first_unchecked_idx = 0; void fuzz_check_invariants(FuzzerState& fuzz_state) { gpuevm_nvtx_push("fuzz_check_invariants"); // TODO: Complete the implementation and remove limitations @@ -66,42 +67,38 @@ void fuzz_check_invariants(FuzzerState& fuzz_state) { // THE AMPERSAND IS NECESSARY WHEN USING AUTO; OTHERWISE YOU GET A BY-VALUE COPY! auto& api_ctx = fuzz_active_ctx(); - assert(fuzz_state.fuzz_funcs.n_invariant_funcs > 0); - assert(fuzz_state.fuzz_funcs.n_invariant_funcs <= MAX_N_SEPARATE_CONTEXTS); - assert(api_ctx.n_runners == MAX_N_SEPARATE_CONTEXTS); + assert(fuzz_state.fuzz_funcs.n_invariant_funcs > 0 && "No invariants to check"); + assert(fuzz_state.fuzz_funcs.n_invariant_funcs <= MAX_N_SEPARATE_CONTEXTS && "Too many invariants to check (more than one per thread)"); + assert(api_ctx.n_runners == MAX_N_SEPARATE_CONTEXTS && "Wrong number of interpreter threads, must use one per cuda thread"); // The actual number of storages we can test (since each storage needs to be tested against all invariants) auto n_tested_storages = MAX_N_SEPARATE_CONTEXTS / fuzz_state.fuzz_funcs.n_invariant_funcs; auto n_test_threads = n_tested_storages * fuzz_state.fuzz_funcs.n_invariant_funcs; - // TODO remove this requirement - // There must be enough storages actually recorded - assert(fuzz_state.storage_corpus.size >= n_tested_storages); - - /* Plan for using/setting the checkedness on storages - 1. Get the first storage index that is unchecked, (This should and can just be tracked in single counter, until we switch to having separate fuzzing and invariantchecking storage corpori) - 2. Doublecheck that we actually have enough storages to run a full batch - 3. Use the index as offset when we make the cyclic storage selection array - 4. Mark the check storages from that index as checked + /* Storage selection for invariant checking + 1. Check that there are n_tested_storages that are unchecked + 2. Get the n_tested_storages first such indices + 3. Use that array of indices to cyclically repeat + 4. Mark those storages as checked + 5. Move the storages into the ctx's gpu_map */ - // 1. Get the first unchecked index - // TODO use cub::argindexinputiterator in the future? - - // 2. Check that we have enough storages - assert(fuzz_state.storage_corpus.size > first_unchecked_idx + n_tested_storages); + // 1. Check that there are n_tested_storages that are unchecked + assert(fuzz_state.storage_corpus.num_unchecked_cases() >= n_tested_storages && "Not enough unchecked storages to check invariants on"); - // 3. Use the unchecked_idx as (value)offset in storage selection array + // 2. Get the n_tested_storages first such indices GpuArray storage_indices(MAX_N_SEPARATE_CONTEXTS); - set_storage_indices(storage_indices.view(), n_tested_storages, n_test_threads, first_unchecked_idx); + auto small_storage_indices_view = storage_indices.view(); + // We need to let the get_unchecked_indices only put n_tested_storages into the array + small_storage_indices_view.size = n_tested_storages; + fuzz_state.storage_corpus.get_unchecked_indices(small_storage_indices_view, n_tested_storages); + // 3. Use that array of indices to cyclically repeat + cyclic_repeat_storage_indices(storage_indices.view(), n_tested_storages, n_test_threads); // 4. Set the checkedness of the storages to true - // TODO this should also probably be a member of the storage corpus - for (auto i = 0; i < n_tested_storages; i++) { - fuzz_state.storage_corpus.metadatas[i + first_unchecked_idx].checked_invariants = true; - } + fuzz_state.storage_corpus.mark_checked(small_storage_indices_view); - // Set the storages into the ctx + // 5. Set the storages into the ctx's gpu_map fuzz_state.storage_corpus.scatter_kvs(api_ctx.storage, storage_indices.view()); // Now we group the selectors into contiguous groups to get maximal warp coherence @@ -150,17 +147,17 @@ void fuzz_check_invariants(FuzzerState& fuzz_state) { CUDA_SAFE_CALL(cudaDeviceSynchronize()); + size_t n_broken_invariants = fuzz_state.fuzz_funcs.n_invariant_funcs - std::count(invariant_revert_mask.data, invariant_revert_mask.data + fuzz_state.fuzz_funcs.n_invariant_funcs, 0); + // If any reverts are new (i.e. set in this mask, but not previously set in the reverted_invariants_times), set to current time fuzz_time_diff revert_time = timing::time_diff(fuzz_state.start_time, timing::current_time()); for (auto invariant_idx = 0; invariant_idx < fuzz_state.fuzz_funcs.n_invariant_funcs; invariant_idx++) { if (invariant_revert_mask[invariant_idx] && fuzz_state.reverted_invariants_times[invariant_idx] == 0) { - std::cout << "[Broke inv " << invariant_idx << "]"; + std::cout << termcolor::red << "[Broke inv " << invariant_idx << "| tot broken: " << n_broken_invariants << "]" << termcolor::reset; fuzz_state.reverted_invariants_times[invariant_idx] = revert_time; } } // TODO check what the actual memory consistency guarantees are for stream syncs vs device syncs and manual (i.e. not via kernels that are in streams) access to unified memory CUDA_SAFE_CALL(cudaDeviceSynchronize()); - - first_unchecked_idx += n_tested_storages; gpuevm_nvtx_pop(); } \ No newline at end of file diff --git a/src/common/fuzz_algos/fuzz_cull.cpp b/src/common/fuzz_algos/fuzz_cull.cpp index fff235d2..1c2b4476 100644 --- a/src/common/fuzz_algos/fuzz_cull.cpp +++ b/src/common/fuzz_algos/fuzz_cull.cpp @@ -1,14 +1,108 @@ #include "fuzz_cull.h" +#include + +#include + /* -Culling the queue of fuzzing inputs. +Culling the inputs/storages that have been accumulated. + +Calldata culling ideas: Initial implementation reconstructs all edge coverage found by every input in the queue, and then finds a subset (not necessarily minimal, this is subset cover which is NP hard) of queue entries that cover the same edges. + +Storage culling ideas: +Remove all storages that have above M visits and below R ratio of votes/visits +TODO think about some sort of probabalistic thing to cull +TODO make it reach a target absolute # passing instead (histogram vote ratios, then cutoff) */ +namespace fuzz_culling { + // Thresholds for storage culling + constexpr int MIN_VISITS = 1024; + constexpr float MIN_VOTE_RATIO = 1 / 1024; + + // From metadata array, set into allocated mask whether a storage passes (should not be culled) + __global__ void cull_storages_mark_kernel(const GpuArrayView metadata, GpuArrayView passes_mask, const GpuArrayView storage_sizes, size_t max_storage_size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= metadata.size) return; + // Note that if we haven't had MIN_VISITS, then we spare that storage + auto votes = metadata[tid].fuzz_case_metadata.votes; + auto visits = metadata[tid].fuzz_case_metadata.visits; + auto ratio = metadata[tid].fuzz_case_metadata.vote_ratio(); + auto storage_size = storage_sizes[tid]; + // TODO come up with good rules here + if (visits < MIN_VISITS) { + // Automatically spared + passes_mask[tid] = storage_size >= (max_storage_size - 1); + } else { + // Having already had MIN_VISITS changes, it better have generated good results, or it should be of max size + passes_mask[tid] = (ratio >= MIN_VOTE_RATIO); + } + } + + size_t max_storage_size(const GpuArrayView& kv_sizes) { + auto max_size_ptr = thrust::max_element(thrust::device, kv_sizes.data, kv_sizes.data + kv_sizes.size); + auto max_size = *max_size_ptr; + return max_size; + } + + void cull_storages_mark(FuzzStorageBuffer& storage_corpus, GpuArrayView passes_mask) { + assert(storage_corpus.size <= passes_mask.size); + // We calculate the max storage kv size, and then don't allow any max-size storages to be culled + auto max_size = max_storage_size(storage_corpus.kv_sizes.view()); + + auto [grid_size, block_size] = get_cuda_max_occupancy(storage_corpus.metadatas.size(), (void*)cull_storages_mark_kernel); + cull_storages_mark_kernel<<>>(storage_corpus.metadatas.view(), passes_mask, storage_corpus.kv_sizes.view(), max_size); + } + + // Given a pass mask, removes & compacts the storage corpus + // TODO think about whether ancestor handling should be done in a GC-like way instead + void cull_storages_filter(FuzzerState& fuzz_state, GpuArrayView passes_mask) { + // We get back the culled storages from the compaction + + auto culled_storages = fuzz_state.storage_corpus.filter(passes_mask); + GpuArray ancestor_mask = fuzz_state.storage_corpus.get_ancestor_mask(culled_storages); + // NB ignoring return value here, we just want the ones that remain after keeping ancestors + culled_storages.filter(ancestor_mask); + // TODO make sure batch actually checks hashes) + fuzz_state.ancestor_corpus.push_batch(culled_storages); + } + +} + void fuzz_cull(FuzzerState& fuzz_state) { - std::cout << "hello world I am culling!" << std::endl; -#ifdef GPUEVM_DEBUG - std::cout << "Culling from corpus size " << fuzz_state.corpus.size() << std::endl; -#endif + gpuevm_nvtx_push("fuzz_cull"); + + long long num_passes_before = fuzz_state.storage_corpus.size; + + // TODO cache the various tempbuffers here + GpuArray passes_mask(fuzz_state.storage_corpus.size); + + // We get back a mask that has true if the storage passed, and false if it should be culled (with no regard for ancestor-keeping) + fuzz_culling::cull_storages_mark(fuzz_state.storage_corpus, passes_mask.view()); + safeStreamSync(); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + // Don't allow removing all states, so check that enough are kept before actually culling + if (device_array_count(passes_mask, true) >= 32) { + // We take care of ancestors here + fuzz_culling::cull_storages_filter(fuzz_state, passes_mask.view()); + safeStreamSync(); + + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + // Count and print some stats about the passes_mask + long long num_passes_after = device_array_count(passes_mask, true); + long long failed = num_passes_before - num_passes_after; + if (failed > 0) { + std::cout << "[Cull: " << num_passes_after << " pass, " << failed << " failed | "; + std::cout << "Rate " << std::fixed << std::setprecision(1) << 100 * failed / (float)num_passes_before << "%]"; + } else { + std::cout << "P"; + } + } else { + std::cout << "N"; + } + + gpuevm_nvtx_pop(); } diff --git a/src/common/fuzz_algos/fuzz_dedup.cpp b/src/common/fuzz_algos/fuzz_dedup.cpp index 5ef1539e..a2f14c0d 100644 --- a/src/common/fuzz_algos/fuzz_dedup.cpp +++ b/src/common/fuzz_algos/fuzz_dedup.cpp @@ -106,6 +106,20 @@ namespace fuzz_dedup { return marked_inputs; } + __global__ void add_votes_to_storages_kernel(const GpuArrayView interesting_mask, const GpuArrayView storage_indices, GpuArrayView metadata) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= interesting_mask.size) return; + if (interesting_mask.data[tid]) + atomicAdd((unsigned long long*)&(metadata[storage_indices[tid]].fuzz_case_metadata.votes), (unsigned long long)1); + } + + void add_votes_to_storages(const GpuArrayView interesting_mask, const GpuArrayView storage_indices, GpuArrayView metadata) { + gpuevm_nvtx_push("add_votes_to_storages"); + auto [grid_size, block_size] = get_cuda_max_occupancy(interesting_mask.size, (void*)add_votes_to_storages_kernel); + add_votes_to_storages_kernel<<>>(interesting_mask, storage_indices, metadata); + gpuevm_nvtx_pop(); + } + __global__ void get_revert_mask_kernel(uint8_t* const bytecode, const GpuArrayView tracers_in, GpuArrayView reverting_mask) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid >= tracers_in.size) return; diff --git a/src/common/fuzz_algos/fuzz_dedup.h b/src/common/fuzz_algos/fuzz_dedup.h index b32f9b18..bf1e7b26 100644 --- a/src/common/fuzz_algos/fuzz_dedup.h +++ b/src/common/fuzz_algos/fuzz_dedup.h @@ -14,6 +14,7 @@ namespace fuzz_dedup { FuzzCaseInputs filter_inputs_by_mask(const FuzzCaseInputs& run_inputs, const GpuArrayView& mask); std::pair>, std::optional> get_reverting_data(uint8_t* const bytecode, evm_tracer_t* tracer_buf, const FuzzCaseInputs& run_inputs); + void add_votes_to_storages(const GpuArrayView interesting_mask, const GpuArrayView storage_indices, GpuArrayView metadata); void get_revert_mask(uint8_t* const bytecode, const GpuArrayView tracers_in, GpuArrayView reverting_mask); void compute_revert_discard_mask(const GpuArrayView reverting_mask, const GpuArrayView tracers_in, GpuArrayView revert_discard_mask); @@ -22,4 +23,4 @@ namespace fuzz_dedup { void inplace_bool_or_gpuarrays(const GpuArrayView a_inout, const GpuArrayView b_in); void inplace_bool_not_gpuarrays(const GpuArrayView a_inout); void inplace_bool_and_gpuarrays(const GpuArrayView a_inout, const GpuArrayView b_in); - } \ No newline at end of file +} \ No newline at end of file diff --git a/src/common/fuzz_algos/fuzz_state.cpp b/src/common/fuzz_algos/fuzz_state.cpp index 143b5277..fe14baa4 100644 --- a/src/common/fuzz_algos/fuzz_state.cpp +++ b/src/common/fuzz_algos/fuzz_state.cpp @@ -21,6 +21,7 @@ FuzzerState::FuzzerState(const ExtGpuEvmFuzzSetup& setup, const FuzzStorageCase& corpus(EvmFlatAbiTypes(fuzz_funcs.stateful_funcs[0].evm_flat_abi).calldata_size()), storage_corpus(initial_storage), storage_hashes_seen(max_corpus_size_pow_2 + 5), + ancestor_corpus(), storage_dep_map(max_corpus_size_pow_2 + 5), reverting_exit_datas_map(max_corpus_size_pow_2 + 5), reverting_corpus(EvmFlatAbiTypes(fuzz_funcs.stateful_funcs[0].evm_flat_abi).calldata_size()), @@ -120,10 +121,12 @@ std::tuple, std::optional, std::optional interesting_storages; if (n_interesting_storages > 0) { interesting_storages = FuzzStorageCaseBatch(run_ctx, scratch_storage_hashes_interesting_mask); @@ -261,6 +264,11 @@ std::tuple, std::optional storage_hashes_seen; + FuzzStorageBuffer ancestor_corpus; StoreDepMap storage_dep_map; DeviceHashSet reverting_exit_datas_map; diff --git a/src/common/fuzz_dss/fuzz_case_queue.h b/src/common/fuzz_dss/fuzz_case_queue.h index f0a6a6ee..46c12bc5 100644 --- a/src/common/fuzz_dss/fuzz_case_queue.h +++ b/src/common/fuzz_dss/fuzz_case_queue.h @@ -8,6 +8,7 @@ struct FuzzCaseInputs { uint32_t calldata_size; uint32_t n_cases; FuzzCaseInputs repeat_to_size(size_t new_size) const; + static FuzzCaseInputs empty(); }; diff --git a/src/common/fuzz_dss/fuzz_mutate.cpp b/src/common/fuzz_dss/fuzz_mutate.cpp index 5b3ffad1..58568f0c 100644 --- a/src/common/fuzz_dss/fuzz_mutate.cpp +++ b/src/common/fuzz_dss/fuzz_mutate.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -17,36 +18,80 @@ // --------- Storage selection kernels ---------- -__global__ void select_storage_cases_kernel(size_t corpus_size, GpuArrayView out_indices, const GpuArrayView max_indices, curandState_t* rng_states) { +__global__ void select_storage_cases_kernel(size_t corpus_size, GpuArrayView out_indices, const GpuArrayView max_indices, const GpuArrayView ratio_sorted_indices, curandState_t* rng_states) { auto out_inx = TID; if (out_inx >= out_indices.size) return; + double rand_val = curand_uniform(&rng_states[out_inx]); + const float thresholds[] = {0.10, 0.2}; + const float highest_threshold = thresholds[sizeof(thresholds) / sizeof(float) - 1]; // mix of random or max-sized - if (out_inx < double(out_indices.size) * 0.05) { + if (rand_val < thresholds[0]) { // Completely random out_indices[out_inx] = curand(&rng_states[out_inx]) % corpus_size; - } else { + } else if (rand_val < thresholds[1]) { // Select a random within the max-sized storages - out_indices[out_inx] = curand(&rng_states[out_inx]) % max_indices.size; + size_t selection_normalized_inx = curand_uniform(&rng_states[out_inx]) * max_indices.size; + selection_normalized_inx = min(max(selection_normalized_inx, (size_t)0), max_indices.size - 1); + out_indices[out_inx] = max_indices[selection_normalized_inx]; + } else { + // Select a random within the sorted indices by vote ratio + double normalized_randval = ((rand_val - highest_threshold) / (1 - highest_threshold)); + normalized_randval *= normalized_randval; + size_t selection_normalized_inx = ratio_sorted_indices.size * normalized_randval; + selection_normalized_inx = min(max(selection_normalized_inx, (size_t)0), ratio_sorted_indices.size - 1); + out_indices[out_inx] = ratio_sorted_indices[selection_normalized_inx]; } } // --------- /Storage selection kernels ---------- // --------- Storage selection precomputation logic --------- + +// Functor to compare (thrust comp op) vote ratios from metadata array pointer +struct VoteRatioComparator { + const StorageCaseMetadata* metadata; + + VoteRatioComparator(const StorageCaseMetadata* metadata) + : metadata(metadata) {} + + __host__ __device__ bool operator()(size_t a, size_t b) const { + // Comparison order to make default thrust sort produce descending vote ratio + return metadata[a].fuzz_case_metadata.vote_ratio() < metadata[b].fuzz_case_metadata.vote_ratio(); + } +}; + +// Array of indices into storage corpus that are sorted (decreasing) according to vote ratio +// TODO move to storage buffer? +GpuArray FuzzMutator::compute_ratio_sorted_indices(const FuzzStorageBuffer& storage_corpus) { + // Index iterator -> fill gpuarray + auto indices = GpuArray(storage_corpus.size); + thrust::sequence(thrust::device, indices.data, indices.data + indices.size); + + // Thrust argsort on fuzzcase_metadata.vote_ratio() + auto comparator = VoteRatioComparator(storage_corpus.metadatas.data); + thrust::sort(thrust::device, indices.data, indices.data + indices.size, comparator); + return indices; +} + +// TODO move this to storage buffer? GpuArray FuzzMutator::compute_max_storage_indices(const FuzzStorageBuffer& storage_corpus) { // Compute the max value of size auto n_storages = storage_corpus.size; - auto max_size_ptr = thrust::max_element(thrust::device, (size_t*)storage_corpus.kv_sizes, (size_t*)storage_corpus.kv_sizes + n_storages); + + // Raw ptr from the appendbuffer's internal gpuarray + auto sizes_it = storage_corpus.kv_sizes.data.data; + + auto max_size_ptr = thrust::max_element(thrust::device, sizes_it, sizes_it + n_storages); auto max_size = *max_size_ptr; - auto n_eq_max = thrust::count(thrust::device, (size_t*)storage_corpus.kv_sizes, (size_t*)storage_corpus.kv_sizes + n_storages, max_size); + auto n_eq_max = thrust::count(thrust::device, sizes_it, sizes_it + n_storages, max_size); // Store the indices into a new buffer // TODO figure out a way to do this without allocating memory GpuArray max_indices(n_eq_max); auto end_ptr = thrust::copy_if(thrust::device, thrust::make_counting_iterator(0), thrust::make_counting_iterator(n_storages), - (size_t*)storage_corpus.kv_sizes, + sizes_it, max_indices.data, thrust::placeholders::_1 == max_size); return max_indices; @@ -64,14 +109,23 @@ GpuArray FuzzMutator::compute_max_storage_indices(const FuzzStorageBuffe // Ideas for selection strategies: // (Strategy selection via weighted sampling, basically the same as calldata mutation) // TODO implement strategy selection via weighted sampling -// 1. Completely random (implemented) -// 2. Max-size only (random within those) +// 1. Completely random (not very useful) +// 2. Max-size only (random within those, implemented) // 3. Max-size *and* max values within those // 4. Size-weighted void FuzzMutator::select_storage_cases(const FuzzStorageBuffer& storage_corpus, GpuArrayView indices) { auto max_indices = compute_max_storage_indices(storage_corpus); - select_storage_cases_kernel<<>>(storage_corpus.size, indices, max_indices, rng_states); + auto ratio_sorted_indices = compute_ratio_sorted_indices(storage_corpus); + select_storage_cases_kernel<<>>(storage_corpus.size, indices, max_indices, ratio_sorted_indices, rng_states); safeStreamSync(); +#ifdef GPUEVM_DEBUG_FUZZER + // Check each storage index that it's not larger than the corpus size + for (auto i = 0; i < indices.size; i++) { + if (indices[i] >= storage_corpus.size) { + std::cout << "Storage index error" << std::endl; + } + } +#endif } // ---------- /Main storage selection dispatch ---------- diff --git a/src/common/fuzz_dss/fuzz_mutate.h b/src/common/fuzz_dss/fuzz_mutate.h index d0dfcbac..b5aed1a8 100644 --- a/src/common/fuzz_dss/fuzz_mutate.h +++ b/src/common/fuzz_dss/fuzz_mutate.h @@ -40,6 +40,7 @@ class FuzzMutator { // ---------- Storage selection functions ---------- GpuArray compute_max_storage_indices(const FuzzStorageBuffer& storage_corpus); + GpuArray compute_ratio_sorted_indices(const FuzzStorageBuffer& storage_corpus); // ----------/Storage selection functions ---------- // ---------- Calldata mutation functions ---------- diff --git a/src/common/fuzz_dss/fuzz_storage.cpp b/src/common/fuzz_dss/fuzz_storage.cpp index 7508f199..df8864d5 100644 --- a/src/common/fuzz_dss/fuzz_storage.cpp +++ b/src/common/fuzz_dss/fuzz_storage.cpp @@ -9,6 +9,7 @@ #include // -------- FuzzStorageCaseBatch implementation --------- +// TODO this shouldn't necessarily be thread_local, we should have all the tempbufs in a tempbuf manager thingamabob static thread_local TempStorageBuffer marked_indices_tempbuf(sizeof(size_t) * MAX_N_SEPARATE_CONTEXTS); // Custom boolmask-to-indices function @@ -201,6 +202,64 @@ void FuzzStorageCaseBatch::gather_kvs(const GpuevmCtx& ctx, size_t n_marked) { gpuevm_nvtx_pop(); } +FuzzStorageCaseBatch FuzzStorageCaseBatch::filter(const GpuArrayView& mask) { + // Similar to fuzzstoragebuffer::filter + auto n_culled = device_array_count(mask, false); + + // If there's nothing to cull, we're done + if (n_culled == 0) { + return FuzzStorageCaseBatch(0, + GpuArray(0), + GpuArray(0), + GpuArray(0), + GpuArray(0), + GpuArray(0), + GpuArray(0), + GpuArray(0), + GpuArray(0)); + } + + // Create expanded masks + GpuArray culled_kv_mask(keys.size); + device_array_rle_decode(kv_sizes.view(), mask, culled_kv_mask.view()); + GpuArray culled_producing_calldata_mask(producing_calldatas.size); + device_array_rle_decode(producing_calldata_sizes.view(), mask, culled_producing_calldata_mask.view()); + // Cull the variably-sized parts + auto [nonculled_keys, culled_keys] = device_array_partition(keys.view(), culled_kv_mask.view()); + auto [nonculled_vals, culled_vals] = device_array_partition(vals.view(), culled_kv_mask.view()); + + auto [nonculled_producing_calldatas, culled_producing_calldatas] = device_array_partition(producing_calldatas.view(), culled_producing_calldata_mask.view()); + + auto [nonculled_kv_sizes, culled_kv_sizes] = device_array_partition(kv_sizes.view(), mask); + auto [nonculled_producing_calldata_sizes, culled_producing_calldata_sizes] = device_array_partition(producing_calldata_sizes.view(), mask); + auto [nonculled_hashes, culled_hashes] = device_array_partition(hashes.view(), mask); + auto [nonculled_parent_hashes, culled_parent_hashes] = device_array_partition(parent_hashes.view(), mask); + + auto [nonculled_metadata, culled_metadata] = device_array_partition(metadata.view(), mask); + + // Set the ones for the current object + n_cases = n_cases - n_culled; + keys = std::move(nonculled_keys); + vals = std::move(nonculled_vals); + kv_sizes = std::move(nonculled_kv_sizes); + hashes = std::move(nonculled_hashes); + parent_hashes = std::move(nonculled_parent_hashes); + producing_calldatas = std::move(nonculled_producing_calldatas); + producing_calldata_sizes = std::move(nonculled_producing_calldata_sizes); + metadata = std::move(nonculled_metadata); + + return FuzzStorageCaseBatch( + n_culled, + culled_keys, + culled_vals, + culled_kv_sizes, + culled_hashes, + culled_parent_hashes, + culled_producing_calldatas, + culled_producing_calldata_sizes, + culled_metadata); +} + // -------- /FuzzStorageCaseBatch implementation --------- // -------- AppendBuffer implementation ----------- @@ -208,6 +267,9 @@ void FuzzStorageCaseBatch::gather_kvs(const GpuevmCtx& ctx, size_t n_marked) { template AppendBuffer::AppendBuffer(size_t cap) : cap(max(min_size, cap)), used_size(0), data(max(min_size, cap)) {} +template +AppendBuffer::AppendBuffer(const GpuArrayView& arr) : cap(arr.size), used_size(arr.size), data(arr) {} + template size_t AppendBuffer::capacity() const { return cap; @@ -232,6 +294,33 @@ void AppendBuffer::push(const T& val) { used_size++; } +template +void AppendBuffer::filter(const GpuArrayView& mask) { + assert(mask.size == size()); + if (size() == 0) return; + + // Since the array-filter-inout doesn't support overlapping in and out arrays, we must allocate ourselves to a new buffer + auto n_marked = device_array_count(mask, true); + GpuArray new_data(n_marked); + device_array_filter_inout(data.view(), mask, new_data.view()); + data = std::move(new_data); + used_size = n_marked; + cap = n_marked; + maybe_resize(); +} + +// Keeps only the true-marked, returns the false-marked (i.e. becomes the true-marked itself) +template +AppendBuffer AppendBuffer::partition(const GpuArrayView& mask) { + if (size() == 0) return AppendBuffer(0); + // NB We return a this.view, which uses the used data size as size, not the capacity as data.view() would + auto [true_buf, false_buf] = device_array_partition(this->view(), mask); + data = std::move(true_buf); + used_size = data.size; + cap = data.size; + return AppendBuffer(false_buf); +} + template T& AppendBuffer::operator[](size_t idx) const { return data[idx]; @@ -300,6 +389,7 @@ template struct AppendBuffer; template struct AppendBuffer; template struct AppendBuffer; template struct AppendBuffer; +template struct AppendBuffer; // ---------- /Explicit instantiations for AppendBuffer -------- @@ -327,10 +417,80 @@ void FuzzStorageBuffer::push(const FuzzStorageCaseView& storage) { producing_calldata_start_inxs.push(producing_calldata_start_inxs[size - 1] + producing_calldata_sizes[size - 1]); } hash_to_idx[storage.hash] = size; + size++; gpuevm_nvtx_pop(); } +FuzzStorageCaseBatch FuzzStorageBuffer::filter(const GpuArrayView& mask) { + // RLE-filtering: keys, vals + // Special care (non-appendbuffers and not rle): hash-to-idx, kv_start_inxs, producing_calldata_start_inxs, producing_calldatas + // Members to normal-filter: + // Normal: + // - kv_sizes + // - hashes + // - parent_hashes + // - producing_calldata_sizes + // - metadatas + + gpuevm_nvtx_push("FuzzStorageBuffer::filter"); + + // We have to produce this before we change the size of the kv sizes array + // To split the keys & vals that were culled from those who were not, we need to generate a mask that uses the kv sizes as runlengths and the normal mask as mask + GpuArray culled_kv_items_mask(keys.size()); + device_array_rle_decode(kv_sizes.view(), mask, culled_kv_items_mask.view()); + // Similarly for producing calldatas + GpuArray culled_producing_calldata_items_mask(producing_calldatas.size()); + device_array_rle_decode(producing_calldata_sizes.view(), mask, culled_producing_calldata_items_mask.view()); + + // Normal simple culling + // Note that this changes our members inplace! + auto culled_kv_sizes = kv_sizes.partition(mask); + // We essentially have this value for free now, since it obvs applies to all the entries + auto n_culled = culled_kv_sizes.size(); + auto n_nonculled = size - n_culled; + + auto culled_hashes = hashes.partition(mask); + auto culled_parent_hashes = parent_hashes.partition(mask); + auto culled_producing_calldata_sizes = producing_calldata_sizes.partition(mask); + auto culled_metadatas = metadatas.partition(mask); + + // For our own kv_start_inxs member, we need to resize it to the non-culled size + [[maybe_unused]] auto culled_kv_start_inxs = kv_start_inxs.partition(mask); + // Starts are a prefix sum of the sizes + thrust::exclusive_scan(thrust::device, kv_sizes.data.data, kv_sizes.data.data + n_nonculled, kv_start_inxs.data.data); + + // Need to resize and update the calldata start indices + [[maybe_unused]] auto culled_producing_calldata_start_inxs = producing_calldata_start_inxs.partition(mask); + thrust::exclusive_scan(thrust::device, producing_calldata_sizes.data.data, producing_calldata_sizes.data.data + n_nonculled, producing_calldata_start_inxs.data.data); + + // And now we filter keys, vals, and producing_calldatas + auto culled_keys = keys.partition(culled_kv_items_mask); + auto culled_vals = vals.partition(culled_kv_items_mask); + auto culled_producing_calldatas = producing_calldatas.partition(culled_producing_calldata_items_mask); + + // And in the end we need to change the hash-to-idx mapping + hash_to_idx.clear(); + // Fill the non-culled + for (size_t i = 0; i < n_nonculled; i++) { + hash_to_idx[hashes[i]] = i; + } + + size = n_nonculled; + + gpuevm_nvtx_pop(); + // Return a storagecasebatch containing the culled items only, i.e. the buffers we've got back from all the .partition calls + return FuzzStorageCaseBatch(n_culled, + culled_keys.data, + culled_vals.data, + culled_kv_sizes.data, + culled_hashes.data, + culled_parent_hashes.data, + culled_producing_calldatas.data, + culled_producing_calldata_sizes.data, + culled_metadatas.data); +} + // NB This doesn't check if the hashes are already in the batch void FuzzStorageBuffer::push_batch(const FuzzStorageCaseBatch& batch) { gpuevm_nvtx_push("FuzzStorageBuffer::push_batch"); @@ -378,6 +538,18 @@ FuzzStorageCaseView FuzzStorageBuffer::operator[](size_t idx) const { metadatas[idx]); } +// Gives back a mask of same size as the batch (i.e. number of storage cases in the batch), marking whether the case is a direct parent anything in the current storage +GpuArray FuzzStorageBuffer::get_ancestor_mask(const FuzzStorageCaseBatch& batch) const { + // We are essentially doing a batchlookup of the hashes in batch, to the parent_hashes (as a set) of the current storage + // At most load factor 0.5 = 1/2 + DeviceHashSet parent_hashes_set(ceil_log2(size) + 1); + // Insert all the parent hashes + // TODO do the bulk functions of hashset/hashmap actually handle >MAX_N_CONTEXTS items at once operations correctly? + parent_hashes_set.insert_bulk(parent_hashes.data.view()); + // Do the batch lookup + return parent_hashes_set.lookup_bulk(batch.hashes); +} + std::vector FuzzStorageBuffer::get_storage_chain(storage_hash_t hash) const { gpuevm_nvtx_push("FuzzStorageBuffer::get_storage_chain"); std::vector chain; @@ -399,12 +571,15 @@ void __global__ scatter_kvs_kernel( const GpuArrayView kv_sizes, const GpuArrayView hashes, const GpuArrayView parent_hashes, - const GpuArrayView kv_start_inxs) { + const GpuArrayView kv_start_inxs, + GpuArrayView metadata, + bool count_visits) { // NB Very important that this TID is the same as is used in MAP_INX in gpu_map auto tid = TID; - assert(tid < MAP_N_CTXS); + assert(tid < MAP_N_CTXS && "Thread index out of bounds"); const auto storage_inx = storage_indices[tid]; + assert(storage_inx < kv_sizes.size && "Storage index out of bounds"); const auto kv_size = kv_sizes[storage_inx]; const auto kv_start_inx = kv_start_inxs[storage_inx]; const auto parent_hash = parent_hashes[storage_inx]; @@ -421,15 +596,20 @@ void __global__ scatter_kvs_kernel( gpu_map.insert(key, val); } + if (count_visits) { + atomicAdd((unsigned long long*)&(metadata[storage_inx].fuzz_case_metadata.visits), (unsigned long long)1); + } + #ifdef GPUEVM_DEBUG // We only need the hash for correctness checking, i.e. the hash after inserting everything should be the same as the one recorded in the buffer const auto hash = hashes[storage_inx]; - assert(gpu_map.get_rolling_hash_ptr(tid)->get_internal_hash() == hash); + // TODO bring back + // assert(gpu_map.get_rolling_hash_ptr(tid)->get_internal_hash() == hash); #endif } template -void FuzzStorageBuffer::scatter_kvs(GpuMap& gpu_map, const GpuArrayView& storage_indices) { +void FuzzStorageBuffer::scatter_kvs(GpuMap& gpu_map, const GpuArrayView& storage_indices, bool count_visits) { gpuevm_nvtx_push("FuzzStorageBuffer::scatter_kvs"); // Because of how we're implemented gpumap to be fast when running in the interpreter, if each thread does a single operation on storage // in a tid <-> evm_inx fashion, memory operations will be pretty coalesced and thus fast @@ -447,15 +627,26 @@ void FuzzStorageBuffer::scatter_kvs(GpuMap& gpu_map, const gpu_map.fastSetZero(); safeStreamSync(); +#ifdef GPUEVM_DEBUG_FUZZER + // Loop through the storage indices, and check whether they're out of bounds (longer than the kv_sizes array) + if (size != kv_sizes.size()) { + std::cout << "Size error" << std::endl; + } + for (size_t i = 0; i < storage_indices.size; i++) { + if (storage_indices[i] >= kv_sizes.size()) { + std::cout << "Storage index error" << std::endl; + } + } +#endif // This does all the magic, but needs everything in the form of arrayviews - scatter_kvs_kernel<<>>(gpu_map, storage_indices, keys.view(), vals.view(), kv_sizes.view(), hashes.view(), parent_hashes.view(), kv_start_inxs.view()); + scatter_kvs_kernel<<>>(gpu_map, storage_indices, keys.view(), vals.view(), kv_sizes.view(), hashes.view(), parent_hashes.view(), kv_start_inxs.view(), metadatas.view(), count_visits); safeStreamSync(); gpuevm_nvtx_pop(); } // Explicit instantiation of scatter_kvs for the default gpuevmctx -template void FuzzStorageBuffer::scatter_kvs(GpuMap& gpu_map, const GpuArrayView& storage_indices); +template void FuzzStorageBuffer::scatter_kvs(GpuMap& gpu_map, const GpuArrayView& storage_indices, bool count_visits); // Functor getting a (inverted and into arithmetic type) bool from StorageCaseMetadata struct GetUnchecked { @@ -482,4 +673,33 @@ size_t FuzzStorageBuffer::num_unchecked_cases() const { return *cub_temp_buffer.inout_size_ptr; } +void FuzzStorageBuffer::get_unchecked_indices(GpuArrayView out_indices, size_t n_indices) const { + gpuevm_nvtx_push("FuzzStorageBuffer::get_unchecked_indices"); + /** Writes the n_indices first indices that are marked as unchecked into the out_indices array + * Errors if there are less than n_indices that are unchecked, or if the passed view is not of appropriate size + */ + // TODO make this gpu accelerated + assert(out_indices.size >= n_indices && "Output indices array is not of correct size"); + + size_t out_idx = 0; + for (size_t i = 0; i < size; i++) { + if (!metadatas[i].checked_invariants && out_idx < n_indices) { + out_indices[out_idx++] = i; + } + } + assert(out_idx == n_indices && "Not enough unchecked cases to get"); + // Unified memory needs full system sync + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + gpuevm_nvtx_pop(); +} + +void FuzzStorageBuffer::mark_checked(const GpuArrayView& indices) { + gpuevm_nvtx_push("FuzzStorageBuffer::mark_checked"); + // TODO make this gpu accelerated + for (size_t i = 0; i < indices.size; i++) { + metadatas[indices[i]].checked_invariants = true; + } + gpuevm_nvtx_pop(); +} + // ---------- /FuzzStorageBuffer implementation ---------- diff --git a/src/common/fuzz_dss/fuzz_storage.h b/src/common/fuzz_dss/fuzz_storage.h index e3b7d345..48d9a2c7 100644 --- a/src/common/fuzz_dss/fuzz_storage.h +++ b/src/common/fuzz_dss/fuzz_storage.h @@ -79,6 +79,10 @@ struct FuzzStorageCaseBatch { // Dummy for development FuzzStorageCaseBatch() : n_cases(0), keys(nullptr, 0), kv_sizes(nullptr, 0), vals(nullptr, 0), hashes(nullptr, 0), parent_hashes(nullptr, 0), producing_calldatas(nullptr, 0), producing_calldata_sizes(nullptr, 0), metadata(nullptr, 0) {} + // Fully constructed from passed items + FuzzStorageCaseBatch(size_t n_cases, const GpuArrayView& keys, const GpuArrayView& vals, const GpuArrayView& kv_sizes, const GpuArrayView& hashes, const GpuArrayView& parent_hashes, const GpuArrayView& producing_calldatas, const GpuArrayView& producing_calldata_sizes, const GpuArrayView& metadata) + : n_cases(n_cases), keys(keys), vals(vals), kv_sizes(kv_sizes), hashes(hashes), parent_hashes(parent_hashes), producing_calldatas(producing_calldatas), producing_calldata_sizes(producing_calldata_sizes), metadata(metadata) {} + FuzzStorageCaseBatch(const GpuevmCtx& ctx, const GpuArrayView& mask); __host__ friend std::ostream& operator<<(std::ostream& os, const FuzzStorageCaseBatch& batch) { os << "FuzzStorageCaseBatch {\n"; @@ -103,6 +107,9 @@ struct FuzzStorageCaseBatch { return os; } + // Filters the batch based on the passed in mask, and returns a new batch with the culled-out elements + FuzzStorageCaseBatch filter(const GpuArrayView& mask); + // We'd like the functions below to be private, but due to them using the nvcc --expt-extended-lambda flag with device lambdas they can't be. // private: // Gathers both hash and parenthash of each marked (true) in the mask @@ -117,12 +124,22 @@ template struct AppendBuffer { AppendBuffer() : AppendBuffer(min_size) {} AppendBuffer(size_t cap); + // Copies the data from the passed in array into the buffer + AppendBuffer(const GpuArrayView& arr); size_t capacity() const; size_t size() const; void push(const GpuArrayView& arr); void push(const T& val); + // Keeps (and keeps order) only the elements for which the mask is true + // almost always doesn't keep existing buffer, so it's not safe to keep views to the buffer when invoking this + void filter(const GpuArrayView& mask); + + // Partitions the buffer into two, one with the true-masked elements, and one with the false-masked elements + // The false-masked elements are returned, the buffer becomes the true-marked ones inplace + AppendBuffer partition(const GpuArrayView& mask); + T& operator[](size_t idx) const; operator T*() const { @@ -162,6 +179,16 @@ struct FuzzStorageBuffer { push(storage); } + // We can also initialize with all the members + FuzzStorageBuffer(const AppendBuffer& keys, const AppendBuffer& vals, const AppendBuffer& kv_sizes, + const AppendBuffer& hashes, const AppendBuffer& parent_hashes, + const AppendBuffer& producing_calldatas, const AppendBuffer& producing_calldata_sizes, + const AppendBuffer& metadatas, + const AppendBuffer& kv_start_inxs, + const AppendBuffer& producing_calldata_start_inxs, + const std::unordered_map& hash_to_idx) + : keys(keys), vals(vals), kv_sizes(kv_sizes), hashes(hashes), parent_hashes(parent_hashes), producing_calldatas(producing_calldatas), producing_calldata_sizes(producing_calldata_sizes), metadatas(metadatas), kv_start_inxs(kv_start_inxs), producing_calldata_start_inxs(producing_calldata_start_inxs), hash_to_idx(hash_to_idx) {} + // Number of storage cases in the queue size_t size = 0; @@ -169,22 +196,36 @@ struct FuzzStorageBuffer { void push(const FuzzStorageCaseView& storage); // NB This does not filter on if the hash is already in the buf! - // TODO we should filter here + // TODO we should filter here, in fact the seen hashes hashset should be a part of this struct void push_batch(const FuzzStorageCaseBatch& batch); + // Filters the storage buffer to only contain the elements for which the mask is true, + // returns a batch (not a buffer!) that contains the storage elements marked as non-true (i.e. this is basically an ordered in-place partition routine) + FuzzStorageCaseBatch filter(const GpuArrayView& mask); + FuzzStorageCaseView operator[](size_t idx) const; + // Gives back a mask of same size as the batch (i.e. number of storage cases in the batch), marking whether the case is a direct parent anything in the current storage + GpuArray get_ancestor_mask(const FuzzStorageCaseBatch& batch) const; + // NB The views returned from this point to internal (gpu) buffers, so will be invalid when this goes out of scope std::vector get_storage_chain(storage_hash_t hash) const; // Given an array of indices of size MAP_N_CTXS, scatter the kvs of the storage cases at those indices into the gpumap // This basically does a bunch of parallel inserts, but in a as-fast-as-possible way + // The last argument marks whether we should count / increment the indicates storages 'visits' metadata field template - void scatter_kvs(GpuMap& gpu_map, const GpuArrayView& storage_indices); + void scatter_kvs(GpuMap& gpu_map, const GpuArrayView& storage_indices, bool count_visits = false); // The number of storage cases in the buffer that haven't been been checked for invariant violations size_t num_unchecked_cases() const; + // Writes the n_indices first indices that are marked as unchecked into the out_indices array + void get_unchecked_indices(GpuArrayView out_indices, size_t n_indices) const; + + // Sets the checked_invariants metadata field to true for the passed indices + void mark_checked(const GpuArrayView& indices); + // Most things below here are not supposed to be touched from the outside, but we do need to sometimes, so we're keeping them public // It's an invariant that keys.size == vals.size == Sum(i=0...n_entries) sizes[i] // Similarly for the producing_calldatas (since each calldata could be of different sizes) diff --git a/src/common/fuzz_dss/gpu_array_utils.cpp b/src/common/fuzz_dss/gpu_array_utils.cpp index f136a7e4..a54d797f 100644 --- a/src/common/fuzz_dss/gpu_array_utils.cpp +++ b/src/common/fuzz_dss/gpu_array_utils.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -6,7 +7,9 @@ #include #include #include +#include #include +#include #include #include @@ -131,7 +134,7 @@ GpuArray device_array_uniquify(const GpuArrayView& in) { // Filters a gpuarray into an output array, based on the passed boolmask. Preserves order // Doesn't allocate any new memory, uses passed buffers and assumes output fits # marked elements template -void device_array_filter_inout(const GpuArrayView& in, const GpuArrayView& mask, GpuArrayView& out) { +void device_array_filter_inout(const GpuArrayView& in, const GpuArrayView& mask, const GpuArrayView& out) { size_t temp_storage_bytes = 0; cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, (T*)in.data, (bool*)mask.data, (T*)out.data, cub_temp_buffer.inout_size_ptr, mask.size); @@ -149,6 +152,44 @@ GpuArray device_array_filter(const GpuArrayView& in, const GpuArrayView +struct CubInvertMaskConversionOp { + __host__ __device__ __forceinline__ bool operator()(const bool& val) const { + if (do_invert) return !val; + return val; + } +}; + +// Partitions a gpuarray into true and false arrays, both passed in as views, based on a boolmask. Preserves order +// Note that the out parameters are not const because we modify the pointer directly, and assume that the sizes are correctly set +template +void device_array_partition_inout(const GpuArrayView& in, const GpuArrayView& mask, const GpuArrayView& out_true, const GpuArrayView& out_false) { + assert(mask.size == in.size); + size_t temp_storage_bytes = 0; + // This is two filter calls, just unrolled into here + cub::TransformInputIterator, bool*> mask_uninverted(mask.data, CubInvertMaskConversionOp()); + cub::DeviceSelect::Flagged((void*)nullptr, temp_storage_bytes, (T*)in.data, mask_uninverted, (T*)out_true.data, cub_temp_buffer.inout_size_ptr, mask.size); + cub_temp_buffer.ensure(temp_storage_bytes); + cub::DeviceSelect::Flagged((void*)cub_temp_buffer, temp_storage_bytes, (T*)in.data, mask_uninverted, (T*)out_true.data, cub_temp_buffer.inout_size_ptr, mask.size); + + // And the false-select + auto mask_inverted = cub::TransformInputIterator, bool*>(mask.data, CubInvertMaskConversionOp()); + cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, (T*)in.data, mask_inverted, (T*)out_false.data, cub_temp_buffer.inout_size_ptr, mask.size); + cub_temp_buffer.ensure(temp_storage_bytes); + cub::DeviceSelect::Flagged((void*)cub_temp_buffer, temp_storage_bytes, (T*)in.data, mask_inverted, (T*)out_false.data, cub_temp_buffer.inout_size_ptr, mask.size); +} + +template +std::pair, GpuArray> device_array_partition(const GpuArrayView& in, const GpuArrayView& mask) { + assert(mask.size == in.size); + auto n_true = device_array_count(mask, true); + GpuArray out_true(n_true); + GpuArray out_false(mask.size - n_true); + device_array_partition_inout(in, mask, out_true.view(), out_false.view()); + return std::make_pair(out_true, out_false); +} + // Transforms a array of indices into a boolmask, of a given size (that must be larger than the largest index in the index array) // Takes in a pre-allocated bool array, sets all values to zero before marking indices! void device_array_indices_to_boolmask_inout(const GpuArrayView& indices, GpuArrayView& mask) { @@ -172,6 +213,56 @@ std::vector device_array_to_host(const GpuArrayView& a) { return result; } +// Modifies the output array in-place, and requires the passed output view to be correctly sized +// View is consted because we act on the underlying array, not the view data +template +void device_array_rle_decode(const GpuArrayView& run_lengths_view, + const GpuArrayView& run_values_view, + const GpuArrayView& output_view) { + typedef long long difference_type; + size_t* run_lengths_start = run_lengths_view.data; + size_t* run_lengths_end = run_lengths_view.data + run_lengths_view.size; + T* run_values_start = run_values_view.data; + T* output = const_cast(output_view.data); + + difference_type input_size = run_lengths_view.size; + // Note that the reduction operator is + by default and the unit is 0 by default a sum, + // so this is just summing the runlengths to get the size of the decoded buffer + difference_type output_size = thrust::reduce(run_lengths_start, run_lengths_end); + assert(output_view.size == output_size && "output_view must be of size output_size"); + if (output_size == 0) { + // Well at this point we're done, nothing to decode + return; + } + + // scan the counts to obtain output offsets for each input element + thrust::device_vector output_offsets(input_size, 0); + thrust::exclusive_scan(run_lengths_start, run_lengths_end, output_offsets.begin()); + + // scatter the nonzero counts into their corresponding output positions + thrust::device_vector output_indices(output_size, 0); + auto scatter_input_it = thrust::counting_iterator(0); + auto scatter_pred = [] __device__(difference_type x) { return x != 0; }; + thrust::scatter_if(thrust::device, scatter_input_it, + scatter_input_it + input_size, + output_offsets.data(), + run_lengths_start, + output_indices.data(), + scatter_pred); + + // compute max-scan over the output indices, filling in the holes + thrust::inclusive_scan(thrust::device, output_indices.data(), + output_indices.data() + output_size, + output_indices.data(), + thrust::maximum()); + + // gather input values according to index array (output = run_values_start[output_indices]) + thrust::gather(output_indices.begin(), + output_indices.end(), + thrust::device_pointer_cast(run_values_start), + thrust::device_pointer_cast(output)); +} + // -------- Explicit template instantiations of utility functions for gpuarrays ----- template std::vector device_array_to_host(const GpuArrayView&); @@ -182,5 +273,23 @@ template std::vector device_array_to_host(const GpuArrayView&) template size_t device_array_count(const GpuArrayView&, const uint32_t&); template size_t device_array_count(const GpuArrayView&, const bool&); template GpuArray device_array_uniquify(const GpuArrayView& in); -template void device_array_filter_inout(GpuArrayView const&, GpuArrayView const&, GpuArrayView&); -template void device_array_filter_inout(GpuArrayView const&, GpuArrayView const&, GpuArrayView&); +template void device_array_filter_inout(GpuArrayView const&, GpuArrayView const&, const GpuArrayView&); +template void device_array_filter_inout(GpuArrayView const&, GpuArrayView const&, const GpuArrayView&); +template void device_array_filter_inout(GpuArrayView const&, GpuArrayView const&, const GpuArrayView&); +template void device_array_filter_inout(GpuArrayView const&, GpuArrayView const&, const GpuArrayView&); +template void device_array_filter_inout(GpuArrayView const&, GpuArrayView const&, const GpuArrayView&); +template void device_array_filter_inout(GpuArrayView const&, GpuArrayView const&, const GpuArrayView&); +template std::pair, GpuArray> device_array_partition(const GpuArrayView&, const GpuArrayView&); +template std::pair, GpuArray> device_array_partition(const GpuArrayView&, const GpuArrayView&); +template std::pair, GpuArray> device_array_partition(const GpuArrayView&, const GpuArrayView&); +template std::pair, GpuArray> device_array_partition(const GpuArrayView&, const GpuArrayView&); +template std::pair, GpuArray> device_array_partition(const GpuArrayView&, const GpuArrayView&); +template void device_array_rle_decode(const GpuArrayView& run_lengths_view, + const GpuArrayView& run_values_view, + const GpuArrayView& output_view); +template void device_array_rle_decode(const GpuArrayView& run_lengths_view, + const GpuArrayView& run_values_view, + const GpuArrayView& output_view); +// This is why we pull in metadata defs +template std::pair, GpuArray> device_array_partition(const GpuArrayView&, const GpuArrayView&); +template void device_array_filter_inout(GpuArrayView const&, GpuArrayView const&, const GpuArrayView&); \ No newline at end of file diff --git a/src/common/fuzz_dss/gpu_array_utils.h b/src/common/fuzz_dss/gpu_array_utils.h index 2c82d2a5..3af4bffa 100644 --- a/src/common/fuzz_dss/gpu_array_utils.h +++ b/src/common/fuzz_dss/gpu_array_utils.h @@ -2,6 +2,8 @@ #include +#include + // -------- Declaration of templated utilities for gpuarrays -------- struct DeviceLessThan { @@ -28,11 +30,15 @@ GpuArray device_array_unique_from_runs(const GpuArrayView& in); // Filters a gpuarray into an output array, based on the passed boolmask. Preserves order template -void device_array_filter_inout(const GpuArrayView& in, const GpuArrayView& mask, GpuArrayView& out); +GpuArray device_array_filter(const GpuArrayView& in, const GpuArrayView& mask); -// Filters a gpuarray into an output array, based on the passed boolmask. Preserves order +// Partitions a gpuarray into true and false arrays, both passed in as views, based on a boolmask. Assumes caller has checked that out views fit the output. Preserves order template -GpuArray device_array_filter(const GpuArrayView& in, const GpuArrayView& mask); +void device_array_partition_inout(const GpuArrayView& in, const GpuArrayView& mask, const GpuArrayView& out_true, const GpuArrayView& out_false); + +// Partitions a gpuarray into true and false arrays, giving back pair of (marked_true, marked_false) that it allocates. Preserves order +template +std::pair, GpuArray> device_array_partition(const GpuArrayView& in, const GpuArrayView& mask); // Counts number of occurences (doesn't deduplicate etc) in the array template @@ -52,10 +58,6 @@ GpuArray device_array_unique_from_runs(const GpuArrayView& in); template GpuArray device_array_uniquify(const GpuArrayView& in); -// Filters a gpuarray into an output array, based on the passed boolmask. Preserves order -template -void device_array_filter_inout(const GpuArrayView& in, const GpuArrayView& mask, GpuArrayView& out); - // Filters a gpuarray into an output array, based on the passed boolmask. Preserves order template GpuArray device_array_filter(const GpuArrayView& in, const GpuArrayView& mask); @@ -83,7 +85,7 @@ GpuArray device_array_uniquify(const GpuArrayView& in); // Filters a gpuarray into an output array, based on the passed boolmask. Preserves order // Doesn't allocate any new memory, uses passed buffers and assumes output fits # marked elements template -void device_array_filter_inout(const GpuArrayView& in, const GpuArrayView& mask, GpuArrayView& out); +void device_array_filter_inout(const GpuArrayView& in, const GpuArrayView& mask, const GpuArrayView& out); // Filters a gpuarray into an output array, based on the passed boolmask. Preserves order template @@ -101,4 +103,12 @@ void device_array_indices_to_boolmask_inout(const GpuArrayView& indices, template std::vector device_array_to_host(const GpuArrayView& a); +// Does the opposite of RLE +// From https://github.com/NVIDIA/thrust/blob/master/examples/expand.cu +// Modifies the output array in-place, and requires the passed output view to be correctly sized +template +void device_array_rle_decode(const GpuArrayView& run_lengths, + const GpuArrayView& run_values, + const GpuArrayView& output); + // -------- /Declaration of templated utilities for gpuarrays -------- diff --git a/src/common/fuzzer/fuzzer.cpp b/src/common/fuzzer/fuzzer.cpp index 5589ed7d..452f46ff 100644 --- a/src/common/fuzzer/fuzzer.cpp +++ b/src/common/fuzzer/fuzzer.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -59,70 +60,69 @@ ExtGpuEvmFuzzRets run_fuzzer(const ExtGpuEvmFuzzSetup& setup, const FuzzStorageC fuzz_check_invariants(fuzz_state); } - if (fuzz_state.corpus.size() > fuzz_state.max_corpus_size) { -#ifdef GPUEVM_DEBUG_FUZZER - std::cout << "Culling corpus" << std::endl; -#endif - fuzz_cull(fuzz_state); - } else { + // TODO think about good thresholds/triggers for this + if ((fuzz_state.storage_corpus.size * fuzz_state.corpus.size()) > (MAX_N_SEPARATE_CONTEXTS * 100)) { + if (run_inx % 40 == 0) { + fuzz_cull(fuzz_state); + } + } #ifdef GPUEVM_DEBUG_FUZZER - std::cout << "run_inx: " << run_inx << ", corpus size: " << fuzz_state.corpus.size() << " and " << fuzz_state.reverting_corpus.size() << " reverting cases" << std::endl; - auto pre_map = fuzz_state.dist_map.to_host_map(); + std::cout << "run_inx: " << run_inx << ", corpus size: " << fuzz_state.corpus.size() << " and " << fuzz_state.reverting_corpus.size() << " reverting cases" << std::endl; + auto pre_map = fuzz_state.dist_map.to_host_map(); #endif - auto pre_coverage = fuzz_active_ctx().rti_maps.coverage_map.tot_coverage(); - auto [add_to_corpus, opt_reverting_corpus, opt_new_storages] = fuzz_evolve(fuzz_state); + auto pre_coverage = fuzz_active_ctx().rti_maps.coverage_map.tot_coverage(); + auto [add_to_corpus, opt_reverting_corpus, opt_new_storages] = fuzz_evolve(fuzz_state); + gpuevm_nvtx_pop(); + gpuevm_nvtx_push("calldata_push_to_corpus"); + fuzz_state.corpus.pushn(add_to_corpus); + gpuevm_nvtx_pop(); + + if (opt_reverting_corpus.has_value()) { + gpuevm_nvtx_push("reverting_corpus_push"); + fuzz_state.reverting_corpus.pushn(opt_reverting_corpus.value()); gpuevm_nvtx_pop(); - gpuevm_nvtx_push("calldata_push_to_corpus"); - fuzz_state.corpus.pushn(add_to_corpus); + std::cout << "Found new reverting cases, added " << opt_reverting_corpus.value().n_cases << " cases to reverting-corpus" << std::endl; + } + if (opt_new_storages.has_value()) { + gpuevm_nvtx_push("new_storages_push"); + auto new_storages = opt_new_storages.value(); + fuzz_state.storage_corpus.push_batch(new_storages); gpuevm_nvtx_pop(); - - if (opt_reverting_corpus.has_value()) { - gpuevm_nvtx_push("reverting_corpus_push"); - fuzz_state.reverting_corpus.pushn(opt_reverting_corpus.value()); - gpuevm_nvtx_pop(); - std::cout << "Found new reverting cases, added " << opt_reverting_corpus.value().n_cases << " cases to reverting-corpus" << std::endl; - } - if (opt_new_storages.has_value()) { - gpuevm_nvtx_push("new_storages_push"); - auto new_storages = opt_new_storages.value(); - fuzz_state.storage_corpus.push_batch(new_storages); - gpuevm_nvtx_pop(); #ifdef GPUEVM_DEBUG_FUZZER - std::cout << "Found new interesting storages, added " << new_storages.n_cases << " storages to storage-corpus" << std::endl; - std::cout << "New storage batch: " << new_storages << std::endl; + std::cout << "Found new interesting storages, added " << new_storages.n_cases << " storages to storage-corpus" << std::endl; + std::cout << "New storage batch: " << new_storages << std::endl; #endif - } + } - auto post_coverage = fuzz_active_ctx().rti_maps.coverage_map.tot_coverage(); - if (post_coverage > pre_coverage) { - std::cout << "[Coverage increased from " << pre_coverage << " to " << post_coverage << " (+" << post_coverage - pre_coverage << ") in run " << run_inx << "]" << std::flush; - } + auto post_coverage = fuzz_active_ctx().rti_maps.coverage_map.tot_coverage(); + if (post_coverage > pre_coverage) { + std::cout << "[Coverage increased from " << pre_coverage << " to " << post_coverage << " (+" << post_coverage - pre_coverage << ") in run " << run_inx << "]" << std::flush; + } #if GPUEVM_DEBUG_FUZZER - auto post_map = fuzz_state.dist_map.to_host_map(); - // std::cout << "Current distmap: " << fuzz_state.dist_map << std::endl; - std::unordered_map diff_map; - std::unordered_map prev_to_diff_map; - for (auto& kv : post_map) { - if (pre_map.find(kv.first) == pre_map.end()) { - diff_map[kv.first] = kv.second; - prev_to_diff_map[kv.first] = distmap_val_t(); - } else if (pre_map[kv.first] != kv.second) { - diff_map[kv.first] = kv.second; - prev_to_diff_map[kv.first] = pre_map[kv.first]; - } + auto post_map = fuzz_state.dist_map.to_host_map(); + // std::cout << "Current distmap: " << fuzz_state.dist_map << std::endl; + std::unordered_map diff_map; + std::unordered_map prev_to_diff_map; + for (auto& kv : post_map) { + if (pre_map.find(kv.first) == pre_map.end()) { + diff_map[kv.first] = kv.second; + prev_to_diff_map[kv.first] = distmap_val_t(); + } else if (pre_map[kv.first] != kv.second) { + diff_map[kv.first] = kv.second; + prev_to_diff_map[kv.first] = pre_map[kv.first]; } - if (diff_map.size() > 0) { - auto seen_covhashes_vec = dev_hashset_to_vec(fuzz_state.covhashes_seen); - std::cout << "Current (" << seen_covhashes_vec.size() << " unique) seen covhashes: " << seen_covhashes_vec << std::endl; - std::cout << "Current distmap size (in pairs): " << fuzz_state.dist_map.size() << std::endl; - std::cout << "Dist map diff size: " << diff_map.size() << std::endl; - for (auto& kv : diff_map) { - std::cout << "Instrhash: " << std::hex << std::setfill('0') << std::setw(8) << kv.first << std::dec << ", branchdist: " << kv.second << ", prev branchdist: " << prev_to_diff_map[kv.first] << std::endl; - } + } + if (diff_map.size() > 0) { + auto seen_covhashes_vec = dev_hashset_to_vec(fuzz_state.covhashes_seen); + std::cout << "Current (" << seen_covhashes_vec.size() << " unique) seen covhashes: " << seen_covhashes_vec << std::endl; + std::cout << "Current distmap size (in pairs): " << fuzz_state.dist_map.size() << std::endl; + std::cout << "Dist map diff size: " << diff_map.size() << std::endl; + for (auto& kv : diff_map) { + std::cout << "Instrhash: " << std::hex << std::setfill('0') << std::setw(8) << kv.first << std::dec << ", branchdist: " << kv.second << ", prev branchdist: " << prev_to_diff_map[kv.first] << std::endl; } + } #endif - } if (run_inx < fuzz_state.n_fuzz_par_runs - 1) { std::cout << "-"; diff --git a/src/common/gpu_dss/gpu_hashmap.h b/src/common/gpu_dss/gpu_hashmap.h index 93e275ed..844fee99 100644 --- a/src/common/gpu_dss/gpu_hashmap.h +++ b/src/common/gpu_dss/gpu_hashmap.h @@ -173,13 +173,6 @@ class GpuMap { MAP_HOSTDEVFUNC void insert(const uint256& key, const uint256& val) { MAP_ASSERT(n_fill[MAP_INX(0)] < MAP_CAP); - // debug printing - // printf("INSERT | Inserting key: "); - // for (int i = 0; i < 4; i++) { - // printf("%016lx", ((uint64_t*)&key)[i]); - //} - // printf("\n"); - hash_t place = find(key); hash_t pos = place & MAP_EMPTY_MASK; diff --git a/src/common/ingest/evm_runtime_ctx.cpp b/src/common/ingest/evm_runtime_ctx.cpp index e11bd8f0..5d3812f2 100644 --- a/src/common/ingest/evm_runtime_ctx.cpp +++ b/src/common/ingest/evm_runtime_ctx.cpp @@ -154,19 +154,3 @@ void GpuevmCtx::dealloc() { calleraddresses.dealloc(); CUDA_SAFE_CALL(cudaDeviceSynchronize()); } - -void GpuevmCtx::printReturnValues() { - // Execution is done here, fetch outputs or integrate into a larger context/fuzzer/MEV searcher or w/e we want. - for (int evm_inx = 0; evm_inx < n_runners; ++evm_inx) { - uint32_t return_offset = return_offsets_buf[evm_inx]; - uint32_t return_size = return_sizes_buf[evm_inx]; - uint8_t *return_data = &memory_buf[evm_inx * MEM_SIZE_BYTES] + return_offset; - // Print it as hex, using std::cout std::hex and std::setw / std::setfill - std::cout << "EVM #" << evm_inx << " returned " << return_size << " bytes: " << std::endl; - for (uint32_t i = 0; i < return_size; ++i) { - std::cout << std::hex << std::setw(2) << std::setfill('0') << (int)return_data[i]; - } - std::cout << std::dec << std::endl; - break; // TODO remove and actually do printing of a single evm or something, triggered from fuzzoutput printing - } -} \ No newline at end of file diff --git a/src/common/ingest/evm_runtime_ctx.h b/src/common/ingest/evm_runtime_ctx.h index 6bc0151f..cddec0a5 100644 --- a/src/common/ingest/evm_runtime_ctx.h +++ b/src/common/ingest/evm_runtime_ctx.h @@ -43,5 +43,4 @@ class GpuevmCtx { void dealloc_calldatas(); void dealloc(); - void printReturnValues(); }; diff --git a/src/common/utils/utils.cpp b/src/common/utils/utils.cpp index d1c3021f..a8a1fb4c 100644 --- a/src/common/utils/utils.cpp +++ b/src/common/utils/utils.cpp @@ -81,8 +81,24 @@ template std::ostream &operator<<(std::ostream &os, const std::unordered_map time_point(time); + return time_point; +#else return std::chrono::high_resolution_clock::now(); +#endif } fuzz_time_diff time_diff(fuzz_time_point start, fuzz_time_point end) { diff --git a/src/common/utils/utils.h b/src/common/utils/utils.h index bc89fffb..3b1c9ab8 100644 --- a/src/common/utils/utils.h +++ b/src/common/utils/utils.h @@ -21,7 +21,8 @@ typedef uint64_t fuzz_time_diff; namespace timing { // ------- Timing for within-app uses --------- - fuzz_time_point current_time(); + __device__ uint64_t device_nano_time(); + __host__ __device__ fuzz_time_point current_time(); fuzz_time_diff time_diff(fuzz_time_point start, fuzz_time_point end); // Appropriately uses SI prefixes to print