diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a60f7acf2ab..e1b4aa4d3e8 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -710,6 +710,7 @@ add_library(
   src/sort/stable_sort.cu
   src/sort/top_k.cu
   src/stream_compaction/apply_boolean_mask.cu
+  src/stream_compaction/approx_distinct_count.cu
   src/stream_compaction/distinct.cu
   src/stream_compaction/distinct_count.cu
   src/stream_compaction/distinct_helpers.cu
diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp
index 20ab96f4d46..3a5981ea8ad 100644
--- a/cpp/include/cudf/detail/stream_compaction.hpp
+++ b/cpp/include/cudf/detail/stream_compaction.hpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -118,5 +118,25 @@ cudf::size_type distinct_count(table_view const& input,
                                null_equality nulls_equal,
                                rmm::cuda_stream_view stream);
 
+/**
+ * @copydoc cudf::approx_distinct_count(column_view const&, int, null_policy, nan_policy,
+ * rmm::cuda_stream_view)
+ */
+cudf::size_type approx_distinct_count(column_view const& input,
+                                      int precision,
+                                      null_policy null_handling,
+                                      nan_policy nan_handling,
+                                      rmm::cuda_stream_view stream);
+
+/**
+ * @copydoc cudf::approx_distinct_count(table_view const&, int, null_policy, nan_policy,
+ * rmm::cuda_stream_view)
+ */
+cudf::size_type approx_distinct_count(table_view const& input,
+                                      int precision,
+                                      null_policy null_handling,
+                                      nan_policy nan_handling,
+                                      rmm::cuda_stream_view stream);
+
 } // namespace detail
 } // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp
index d9c460af3de..5bae627be93 100644
--- a/cpp/include/cudf/stream_compaction.hpp
+++ b/cpp/include/cudf/stream_compaction.hpp
@@ -415,6 +415,49 @@ cudf::size_type distinct_count(table_view const& input,
                                null_equality nulls_equal    = null_equality::EQUAL,
                                rmm::cuda_stream_view stream = cudf::get_default_stream());
 
+/**
+ * @brief Approximate count of distinct elements in the column_view using HyperLogLog.
+ *
+ * Uses the HyperLogLog++ algorithm to provide a fast approximation of the number of distinct
+ * elements in a column. All NaN values are treated as equal, and all null values are treated as
+ * equal.
+ *
+ * @param input The column_view whose distinct elements will be approximately counted
+ * @param precision The precision parameter for HyperLogLog; values outside 4-18 are clamped to
+ * that range. Higher precision gives better accuracy but uses more memory. Default is 12.
+ * @param null_handling `INCLUDE` or `EXCLUDE` null values (default: `EXCLUDE`)
+ * @param nan_handling `NAN_IS_VALID` or `NAN_IS_NULL` (default: `NAN_IS_NULL`)
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ *
+ * @return Approximate number of distinct elements in the column
+ */
+cudf::size_type approx_distinct_count(column_view const& input,
+                                      int precision                = 12,
+                                      null_policy null_handling    = null_policy::EXCLUDE,
+                                      nan_policy nan_handling      = nan_policy::NAN_IS_NULL,
+                                      rmm::cuda_stream_view stream = cudf::get_default_stream());
+
+/**
+ * @brief Approximate count of distinct rows in a table using HyperLogLog.
+ *
+ * Uses the HyperLogLog++ algorithm to provide a fast approximation of the number of distinct
+ * rows in a table. All NaN values are treated as equal, and all null values are treated as equal.
+ *
+ * @param input Table whose distinct rows will be approximately counted
+ * @param precision The precision parameter for HyperLogLog; values outside 4-18 are clamped to
+ * that range. Higher precision gives better accuracy but uses more memory. Default is 12.
+ * @param null_handling `INCLUDE` or `EXCLUDE` rows with nulls (default: `EXCLUDE`)
+ * @param nan_handling `NAN_IS_VALID` or `NAN_IS_NULL` (default: `NAN_IS_NULL`)
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ *
+ * @return Approximate number of distinct rows in the table
+ */
+cudf::size_type approx_distinct_count(table_view const& input,
+                                      int precision                = 12,
+                                      null_policy null_handling    = null_policy::EXCLUDE,
+                                      nan_policy nan_handling      = nan_policy::NAN_IS_NULL,
+                                      rmm::cuda_stream_view stream = cudf::get_default_stream());
+
 /**
  * @brief Creates a new column by applying a filter function against every
  * element of the input columns.
diff --git a/cpp/src/stream_compaction/approx_distinct_count.cu b/cpp/src/stream_compaction/approx_distinct_count.cu
new file mode 100644
index 00000000000..21ae1d0fece
--- /dev/null
+++ b/cpp/src/stream_compaction/approx_distinct_count.cu
@@ -0,0 +1,175 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "stream_compaction_common.cuh"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief Shared implementation behind the column and table overloads of approx_distinct_count.
+ *
+ * Inputs with fewer than 10000 rows that request `NAN_IS_NULL` or `EXCLUDE` handling are
+ * answered with the exact cudf::distinct_count. All other inputs are hashed row by row and the
+ * hashes are added to a HyperLogLog sketch, whose estimate is returned.
+ *
+ * @param input Table whose distinct rows are counted
+ * @param precision HyperLogLog precision; values outside [4, 18] are clamped
+ * @param null_handling With `EXCLUDE`, only hashes of rows passing `row_validity` over the
+ * bitmask_or of the input are added to the sketch
+ * @param nan_handling Read only when selecting and running the exact-count path
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Exact count on the small-input path, otherwise the HyperLogLog estimate
+ */
+cudf::size_type approx_distinct_count_impl(table_view const& input,
+                                           int precision,
+                                           null_policy null_handling,
+                                           nan_policy nan_handling,
+                                           rmm::cuda_stream_view stream)
+{
+  auto const num_rows = input.num_rows();
+  if (num_rows == 0) { return 0; }
+
+  // Small inputs that ask for NaN-as-null or null-excluding semantics are counted exactly.
+  // This runs before any hashing so the exact path does no sketch work, and `stream` is passed
+  // along so the exact count is ordered on the caller's stream rather than the default stream.
+  if (nan_handling == nan_policy::NAN_IS_NULL || null_handling == null_policy::EXCLUDE) {
+    if (num_rows < 10000) {
+      if (input.num_columns() == 1) {
+        return cudf::distinct_count(input.column(0), null_handling, nan_handling, stream);
+      } else {
+        // NOTE(review): this table overload takes neither policy, so `EXCLUDE` and
+        // `nan_handling` have no effect on multi-column inputs here — confirm intended.
+        return cudf::distinct_count(input, cudf::null_equality::EQUAL, stream);
+      }
+    }
+  }
+
+  // Clamp precision to valid range for HyperLogLog
+  precision = std::max(4, std::min(18, precision));
+
+  auto const has_nulls = nullate::DYNAMIC{cudf::has_nested_nulls(input)};
+  auto const preprocessed_input =
+    cudf::detail::row::hash::preprocessed_table::create(input, stream);
+  auto const row_hasher = cudf::detail::row::hash::row_hasher(preprocessed_input);
+  auto const hash_key   = row_hasher.device_hasher(has_nulls);
+
+  auto hll = cuco::hyperloglog,
+                               rmm::mr::polymorphic_allocator>{
+    cuco::sketch_size_kb{static_cast(4 * (1ull << precision) / 1024.0)},
+    cuco::xxhash_64{},
+    rmm::mr::polymorphic_allocator{},
+    cuda::stream_ref{stream.value()}};
+
+  auto const iter = thrust::counting_iterator(0);
+
+  // Hash every row once; the sketch is fed these per-row hashes below.
+  rmm::device_uvector hash_values(num_rows, stream);
+  thrust::transform(
+    rmm::exec_policy_nosync(stream), iter, iter + num_rows, hash_values.begin(), hash_key);
+
+  // Under EXCLUDE, keep only the hashes of rows that pass `row_validity` on the OR-ed bitmask.
+  if (null_handling == null_policy::EXCLUDE && has_nulls) {
+    auto const [row_bitmask, null_count] =
+      cudf::detail::bitmask_or(input, stream, cudf::get_current_device_resource_ref());
+
+    if (null_count > 0) {
+      row_validity pred{static_cast(row_bitmask.data())};
+      auto counting_iter = thrust::counting_iterator(0);
+
+      rmm::device_uvector filtered_hashes(num_rows - null_count, stream);
+      auto end_iter = thrust::copy_if(rmm::exec_policy(stream),
+                                      hash_values.begin(),
+                                      hash_values.end(),
+                                      counting_iter,
+                                      filtered_hashes.begin(),
+                                      pred);
+
+      auto actual_count = std::distance(filtered_hashes.begin(), end_iter);
+      if (actual_count > 0) {
+        hll.add(filtered_hashes.begin(),
+                filtered_hashes.begin() + actual_count,
+                cuda::stream_ref{stream.value()});
+      }
+      return static_cast(hll.estimate(cuda::stream_ref{stream.value()}));
+    }
+  }
+
+  // No rows to drop: feed every row hash to the sketch.
+  hll.add(hash_values.begin(), hash_values.end(), cuda::stream_ref{stream.value()});
+  return static_cast(hll.estimate(cuda::stream_ref{stream.value()}));
+}
+
+cudf::size_type approx_distinct_count(table_view const& input,
+                                      int precision,
+                                      null_policy null_handling,
+                                      nan_policy nan_handling,
+                                      rmm::cuda_stream_view stream)
+{
+  return approx_distinct_count_impl(input, precision, null_handling, nan_handling, stream);
+}
+
+cudf::size_type approx_distinct_count(column_view const& input,
+                                      int precision,
+                                      null_policy null_handling,
+                                      nan_policy nan_handling,
+                                      rmm::cuda_stream_view stream)
+{
+  // Convert column to single-column table and use unified implementation
+  cudf::table_view single_col_table({input});
+  return approx_distinct_count_impl(
+    single_col_table, precision, null_handling, nan_handling, stream);
+}
+
+}  // namespace detail
+
+// Public API implementations
+cudf::size_type approx_distinct_count(column_view const& input,
+                                      int precision,
+                                      null_policy null_handling,
+                                      nan_policy nan_handling,
+                                      rmm::cuda_stream_view stream)
+{
+  CUDF_FUNC_RANGE();
+  return detail::approx_distinct_count(input, precision, null_handling, nan_handling, stream);
+}
+
+cudf::size_type approx_distinct_count(table_view const& input,
+                                      int precision,
+                                      null_policy null_handling,
+                                      nan_policy nan_handling,
+                                      rmm::cuda_stream_view stream)
+{
+  CUDF_FUNC_RANGE();
+  return detail::approx_distinct_count(input, precision, null_handling, nan_handling, stream);
+}
+
+}  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index c71d73ad014..5751b4a5973 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -507,6 +507,7 @@ ConfigureTest(
 ConfigureTest(
   STREAM_COMPACTION_TEST
   stream_compaction/apply_boolean_mask_tests.cpp
+  stream_compaction/approx_distinct_count_tests.cpp
   stream_compaction/distinct_count_tests.cpp
   stream_compaction/distinct_tests.cpp
   stream_compaction/drop_nans_tests.cpp
diff --git a/cpp/tests/stream_compaction/approx_distinct_count_tests.cpp b/cpp/tests/stream_compaction/approx_distinct_count_tests.cpp
new file mode 100644
index 00000000000..492adfa7401
--- /dev/null
+++ b/cpp/tests/stream_compaction/approx_distinct_count_tests.cpp
@@ -0,0 +1,247 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include
+#include
+
+#include
+#include
+
+using cudf::nan_policy;
+using cudf::null_policy;
+
+constexpr int32_t XXX{70};  // Mark for null elements
+
+// True when approx_count is within 20% of exact_count; exact counts of 0 and 1 are special-cased
+bool is_reasonable_approximation(cudf::size_type approx_count, cudf::size_type exact_count)
+{
+  if (exact_count == 0) return approx_count == 0;  // Nothing to count: require exactly 0
+  if (exact_count == 1) return approx_count <= 2;  // Very small counts can vary
+  double error = std::abs(static_cast(approx_count) - static_cast(exact_count)) /
+                 static_cast(exact_count);  // Relative error against the exact count
+  return error <= 0.2;  // 20% tolerance for simplicity
+}
+
+struct ApproxDistinctCount : public cudf::test::BaseFixture {};  // Fixture shared by all tests
+
+TEST_F(ApproxDistinctCount, BasicFunctionality)
+{
+  cudf::test::fixed_width_column_wrapper input_col{1, 3, 3, 4, 1, 8, 2, 4, 10, 8};
+
+  auto const approx_count = cudf::approx_distinct_count(input_col);  // All defaults
+  auto const exact_count =
+    cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID);
+
+  EXPECT_TRUE(is_reasonable_approximation(approx_count, exact_count))
+    << "Exact: " << exact_count << ", Approx: " << approx_count;
+}
+
+TEST_F(ApproxDistinctCount, TableBasic)
+{
+  cudf::test::fixed_width_column_wrapper col1{1, 2, 3, 1, 2};
+  cudf::test::fixed_width_column_wrapper col2{1, 1, 2, 1, 2};
+  cudf::table_view input_table({col1, col2});
+
+  auto const approx_count = cudf::approx_distinct_count(input_table);  // All defaults
+  auto const exact_count  = cudf::distinct_count(input_table, cudf::null_equality::EQUAL);
+
+  EXPECT_TRUE(is_reasonable_approximation(approx_count, exact_count))
+    << "Exact: " << exact_count << ", Approx: " << approx_count;
+}
+
+TEST_F(ApproxDistinctCount, WithNull)
+{
+  cudf::test::fixed_width_column_wrapper input_col{{1, 3, 3, XXX, 1, 8, 2},
+                                                   {1, 1, 1, 0, 1, 1, 1}};
+
+  auto const approx_count =
+    cudf::approx_distinct_count(input_col, 12, null_policy::INCLUDE, nan_policy::NAN_IS_VALID);
+  auto const exact_count =
+    cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID);
+
+  EXPECT_TRUE(is_reasonable_approximation(approx_count, exact_count))
+    << "Exact: " << exact_count << ", Approx: " << approx_count;
+}
+
+TEST_F(ApproxDistinctCount, IgnoreNull)
+{
+  cudf::test::fixed_width_column_wrapper input_col{{1, 3, 3, XXX, 1, 8, 2},
+                                                   {1, 1, 1, 0, 1, 1, 1}};
+
+  auto const approx_count =
+    cudf::approx_distinct_count(input_col, 12, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID);
+  auto const exact_count =
+    cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID);
+
+  EXPECT_TRUE(is_reasonable_approximation(approx_count, exact_count))
+    << "Exact: " << exact_count << ", Approx: " << approx_count;
+}
+
+TEST_F(ApproxDistinctCount, BothAPIs)
+{
+  cudf::test::fixed_width_column_wrapper input_col{{1, 3, 3, XXX, 1, 8, 2},
+                                                   {1, 1, 1, 0, 1, 1, 1}};
+
+  // NOTE(review): same input and same calls as WithNull above; only the column overload runs
+  auto const approx_count =
+    cudf::approx_distinct_count(input_col, 12, null_policy::INCLUDE, nan_policy::NAN_IS_VALID);
+  auto const exact_count =
+    cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID);
+
+  EXPECT_TRUE(is_reasonable_approximation(approx_count, exact_count))
+    << "Exact: " << exact_count << ", Approx: " << approx_count;
+}
+
+TEST_F(ApproxDistinctCount, EmptyColumn)
+{
+  cudf::test::fixed_width_column_wrapper input_col{};
+
+  auto const result = cudf::approx_distinct_count(input_col);
+  EXPECT_EQ(0, result);  // An empty column must report exactly zero
+}
+
+TEST_F(ApproxDistinctCount, StringColumn)
+{
+  cudf::test::strings_column_wrapper input_col{"a", "b", "a", "c", "b"};
+
+  auto const approx_count = cudf::approx_distinct_count(input_col);  // All defaults
+  auto const exact_count =
+    cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID);
+
+  EXPECT_TRUE(is_reasonable_approximation(approx_count, exact_count))
+    << "Exact: " << exact_count << ", Approx: " << approx_count;
+}
+
+TEST_F(ApproxDistinctCount, DifferentPrecisions)
+{
+  cudf::test::fixed_width_column_wrapper input_col{1, 2, 3, 4, 5, 1, 2, 3};
+
+  // One precision below the documented 4-18 range plus two in-range values (no upper-bound case)
+  auto const result_low  = cudf::approx_distinct_count(input_col, 2);   // Should clamp to 4
+  auto const result_mid  = cudf::approx_distinct_count(input_col, 12);  // Default precision
+  auto const result_high =
+    cudf::approx_distinct_count(input_col, 10);  // NOTE(review): "high" but 10 < default 12
+
+  // All should give reasonable results for this small dataset
+  EXPECT_GT(result_low, 0);
+  EXPECT_GT(result_mid, 0);
+  EXPECT_GT(result_high, 0);
+  EXPECT_LE(result_low, 10);   // Should be reasonable
+  EXPECT_LE(result_mid, 10);   // Should be reasonable
+  EXPECT_LE(result_high, 10);  // Should be reasonable
+}
+
+// ===== COMPREHENSIVE NULL/NaN PARAMETER TESTING =====
+
+TEST_F(ApproxDistinctCount, NullEqualityUnequal)
+{
+  cudf::test::fixed_width_column_wrapper input_col{{1, XXX, 3, XXX, 1}, {1, 0, 1, 0, 1}};
+
+  // The API takes no null_equality argument; this only checks for a reasonable approximation
+  // NOTE(review): body is identical to NullEqualityEqual below — consider merging
+  auto const approx_count =
+    cudf::approx_distinct_count(input_col, 12, null_policy::INCLUDE, nan_policy::NAN_IS_VALID);
+  auto const exact_count =
+    cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID);
+
+  EXPECT_TRUE(is_reasonable_approximation(approx_count, exact_count))
+    << "Exact: " << exact_count << ", Approx: " << approx_count;
+}
+
+TEST_F(ApproxDistinctCount, NullEqualityEqual)
+{
+  cudf::test::fixed_width_column_wrapper input_col{{1, XXX, 3, XXX, 1}, {1, 0, 1, 0, 1}};
+
+  // With simplified API, all nulls are treated as equal
+  auto const approx_count =
+    cudf::approx_distinct_count(input_col, 12, null_policy::INCLUDE, nan_policy::NAN_IS_VALID);
+  auto const exact_count =
+    cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID);
+
+  EXPECT_TRUE(is_reasonable_approximation(approx_count, exact_count))
+    << "Exact: " << exact_count << ", Approx: " << approx_count;
+}
+
+TEST_F(ApproxDistinctCount, NaNHandling)
+{
+  cudf::test::fixed_width_column_wrapper input_col{1.0f,
+                                                   std::numeric_limits::quiet_NaN(),
+                                                   3.0f,
+                                                   std::numeric_limits::quiet_NaN(),
+                                                   1.0f};
+
+  // Test NaN as null with EXCLUDE policy
+  auto const approx_exclude =
+    cudf::approx_distinct_count(input_col, 12, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL);
+  auto const exact_exclude =
+    cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL);
+
+  EXPECT_TRUE(is_reasonable_approximation(approx_exclude, exact_exclude))
+    << "Exclude - Exact: " << exact_exclude << ", Approx: " << approx_exclude;
+
+  // Test NaN as null with INCLUDE policy
+  auto const approx_include =
+    cudf::approx_distinct_count(input_col, 12, null_policy::INCLUDE, nan_policy::NAN_IS_NULL);
+  auto const exact_include =
+    cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL);
+
+  EXPECT_TRUE(is_reasonable_approximation(approx_include, exact_include))
+    << "Include - Exact: " << exact_include << ", Approx: " << approx_include;
+}
+
+TEST_F(ApproxDistinctCount, NaNEqualityUnequal)
+{
+  cudf::test::fixed_width_column_wrapper input_col{1.0f,
+                                                   std::numeric_limits::quiet_NaN(),
+                                                   3.0f,
+                                                   std::numeric_limits::quiet_NaN(),
+                                                   1.0f};
+
+  // The API takes no nan_equality argument; NaNs are kept as valid values in both calls
+  auto const approx_count =
+    cudf::approx_distinct_count(input_col, 12, null_policy::INCLUDE, nan_policy::NAN_IS_VALID);
+  auto const exact_count =
+    cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID);
+
+  EXPECT_TRUE(is_reasonable_approximation(approx_count, exact_count))
+    << "Exact: " << exact_count << ", Approx: " << approx_count;
+}
+
+TEST_F(ApproxDistinctCount, TableNullHandling)
+{
+  cudf::test::fixed_width_column_wrapper col1{{1, XXX, 3, 1}, {1, 0, 1, 1}};
+  cudf::test::fixed_width_column_wrapper col2{{1, 2, XXX, 1}, {1, 1, 0, 1}};
+  cudf::table_view input_table({col1, col2});
+
+  // Two-column table with a null in each column, compared against the exact count (nulls equal)
+  auto const approx_count =
+    cudf::approx_distinct_count(input_table, 12, null_policy::INCLUDE, nan_policy::NAN_IS_VALID);
+  auto const exact_count = cudf::distinct_count(input_table, cudf::null_equality::EQUAL);
+
+  EXPECT_TRUE(is_reasonable_approximation(approx_count, exact_count))
+    << "Exact: " << exact_count << ", Approx: " << approx_count;
+}
+
+TEST_F(ApproxDistinctCount, CombinedNullNaNHandling)
+{
+  // Create float column with both nulls and NaNs
+  std::vector values{1.0f,
+                     std::numeric_limits::quiet_NaN(),
+                     0.0f,
+                     3.0f,
+                     std::numeric_limits::quiet_NaN()};
+  std::vector validity{true, true, false, true, true};  // Third element is null
+  cudf::test::fixed_width_column_wrapper input_col(
+    values.begin(), values.end(), validity.begin());
+
+  // Test combination of null and NaN handling
+  auto const approx_count =
+    cudf::approx_distinct_count(input_col, 12, null_policy::INCLUDE, nan_policy::NAN_IS_NULL);
+  auto const exact_count =
+    cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL);
+
+  EXPECT_TRUE(is_reasonable_approximation(approx_count, exact_count))
+    << "Exact: " << exact_count << ", Approx: " << approx_count;
+}