From 3451cb875da4c99bb12a1c4540e7661aa299b603 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 20 Feb 2025 02:36:00 +0000 Subject: [PATCH] Add dictionary filtering basics --- .../experimental/dictionary_page_filter.cu | 25 ++++++++++++++ .../experimental/hybrid_scan_helpers.cpp | 34 +++++++++++-------- .../experimental/hybrid_scan_helpers.hpp | 26 ++++++++++++-- 3 files changed, 67 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu index f94dea6500b..5022a4c8cd1 100644 --- a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu +++ b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu @@ -106,4 +106,29 @@ dictionary_literals_and_operators_collector::get_literals_and_operators() && return {std::move(_literals), std::move(_operators)}; } +std::optional>> +aggregate_reader_metadata::apply_dictionary_filter( + cudf::host_span dictionaries, + host_span const> input_row_group_indices, + host_span const> literals, + host_span const> operators, + size_type total_row_groups, + host_span output_dtypes, + host_span dictionary_col_schemas, + std::reference_wrapper filter, + rmm::cuda_stream_view stream) const +{ + return {}; /* Placeholder: yields an empty optional, so the caller's value_or() keeps all input row groups */ +} + +std::vector aggregate_reader_metadata::materialize_dictionaries( + cudf::host_span dictionary_page_data, + host_span const> input_row_group_indices, + host_span output_dtypes, + host_span dictionary_col_schemas, + rmm::cuda_stream_view stream) const +{ + return {}; /* Placeholder: returns an empty vector; no dictionaries are materialized yet */ +} + } // namespace cudf::experimental::io::parquet::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index 7127eb6b608..edddf45d6b7 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -405,11 +405,11 @@ 
aggregate_reader_metadata::filter_row_groups_with_dictionary_pages( }); // Number of surviving row groups after applying stats filter - // auto const total_row_groups = std::accumulate( - // row_group_indices.begin(), - // row_group_indices.end(), - // size_type{0}, - // [](auto& sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); }); + auto const total_row_groups = std::accumulate( + row_group_indices.begin(), + row_group_indices.end(), + size_type{0}, + [](auto& sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); }); // Collect literals and operators for dictionary page filtering for each input table column auto const [literals, operators] = @@ -430,18 +430,22 @@ aggregate_reader_metadata::filter_row_groups_with_dictionary_pages( if (dictionary_col_schemas.empty()) { return all_row_group_indices; } // TODO: Decode dictionary pages and filter row groups based on dictionary pages - // auto const dictionary_filtered_row_groups = apply_dictionary_filter(dictionary_page_data, - // row_group_indices, - // literals, - // operators, - // total_row_groups, - // output_dtypes, - // dictionary_col_schemas, - // filter.value(), - // stream); + auto dictionaries = materialize_dictionaries( + dictionary_page_data, row_group_indices, output_dtypes, dictionary_col_schemas, stream); + + // TODO: Probe the dictionaries to get surviving row groups + auto const dictionary_filtered_row_groups = apply_dictionary_filter(dictionaries, + row_group_indices, + literals, + operators, + total_row_groups, + output_dtypes, + dictionary_col_schemas, + filter.value(), + stream); // return dictionary_filtered_row_groups.value_or(all_row_group_indices); - return all_row_group_indices; + return dictionary_filtered_row_groups.value_or(all_row_group_indices); } std::vector> aggregate_reader_metadata::filter_row_groups_with_bloom_filters( diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp 
b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index 94c67507732..39d0d78e83c 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -47,14 +47,34 @@ struct metadata : private cudf::io::parquet::detail::metadata { class aggregate_reader_metadata : public cudf::io::parquet::detail::aggregate_reader_metadata { private: + + /** + * @brief Materializes column chunk dictionary pages into `cuco::static_set`s + * @note Not implemented yet; the current definition returns an empty vector + * + * @param dictionary_page_data Dictionary page data device buffers for each input row group + * @param input_row_group_indices Lists of input row groups, one per source + * @param output_dtypes Datatypes of output columns + * @param dictionary_col_schemas Schema indices of dictionary columns only + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A flattened list of `cuco::static_set_ref` device buffers for each predicate column + * across row groups + */ + [[nodiscard]] std::vector materialize_dictionaries( + cudf::host_span dictionary_page_data, + host_span const> input_row_group_indices, + host_span output_dtypes, + host_span dictionary_col_schemas, + rmm::cuda_stream_view stream) const; + /** * @brief Filters the row groups using dictionary pages * - * @param dictionary_page_data Dictionary page data device buffers for each input row group + * @param dictionaries `cuco::static_set_ref` device buffers for column chunk dictionary * @param input_row_group_indices Lists of input row groups, one per source * @param literals Lists of literals, one per input column * @param operators Lists of operators, one per input column - * @param dictionary_operators * @param total_row_groups Total number of row groups in `input_row_group_indices` * @param output_dtypes Datatypes of output columns * @param dictionary_col_schemas schema indices of dictionary 
columns only @@ -64,7 +84,7 @@ class aggregate_reader_metadata : public cudf::io::parquet::detail::aggregate_re * @return A pair of filtered row group indices if any is filtered. */ [[nodiscard]] std::optional>> apply_dictionary_filter( - std::vector& dictionary_page_data, + cudf::host_span dictionaries, host_span const> input_row_group_indices, host_span const> literals, host_span const> operators,