Skip to content

Commit

Permalink
Add dictionary filtering basics
Browse files Browse the repository at this point in the history
  • Loading branch information
mhaseeb123 committed Feb 20, 2025
1 parent 242df4f commit 3451cb8
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 18 deletions.
25 changes: 25 additions & 0 deletions cpp/src/io/parquet/experimental/dictionary_page_filter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -106,4 +106,29 @@ dictionary_literals_and_operators_collector::get_literals_and_operators() &&
return {std::move(_literals), std::move(_operators)};
}

// Placeholder for dictionary-based row group filtering. None of the arguments are inspected
// yet; an empty optional is always returned.
std::optional<std::vector<std::vector<size_type>>>
aggregate_reader_metadata::apply_dictionary_filter(
  cudf::host_span<rmm::device_buffer> dictionaries,
  host_span<std::vector<size_type> const> input_row_group_indices,
  host_span<std::vector<ast::literal*> const> literals,
  host_span<std::vector<ast::ast_operator> const> operators,
  size_type total_row_groups,
  host_span<data_type const> output_dtypes,
  host_span<int const> dictionary_col_schemas,
  std::reference_wrapper<ast::expression const> filter,
  rmm::cuda_stream_view stream) const
{
  // No filtering is performed yet: hand back a disengaged optional
  return std::nullopt;
}

// Placeholder for dictionary materialization. None of the arguments are inspected yet; an
// empty list of device buffers is always returned.
std::vector<rmm::device_buffer> aggregate_reader_metadata::materialize_dictionaries(
  cudf::host_span<rmm::device_buffer> dictionary_page_data,
  host_span<std::vector<size_type> const> input_row_group_indices,
  host_span<data_type const> output_dtypes,
  host_span<int const> dictionary_col_schemas,
  rmm::cuda_stream_view stream) const
{
  // Nothing is materialized yet: return an empty vector
  return std::vector<rmm::device_buffer>{};
}

} // namespace cudf::experimental::io::parquet::detail
34 changes: 19 additions & 15 deletions cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,11 +405,11 @@ aggregate_reader_metadata::filter_row_groups_with_dictionary_pages(
});

// Number of surviving row groups after applying stats filter
// auto const total_row_groups = std::accumulate(
// row_group_indices.begin(),
// row_group_indices.end(),
// size_type{0},
// [](auto& sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); });
auto const total_row_groups = std::accumulate(
row_group_indices.begin(),
row_group_indices.end(),
size_type{0},
[](auto& sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); });

// Collect literals and operators for dictionary page filtering for each input table column
auto const [literals, operators] =
Expand All @@ -430,18 +430,22 @@ aggregate_reader_metadata::filter_row_groups_with_dictionary_pages(
if (dictionary_col_schemas.empty()) { return all_row_group_indices; }

// TODO: Decode dictionary pages and filter row groups based on dictionary pages
// auto const dictionary_filtered_row_groups = apply_dictionary_filter(dictionary_page_data,
// row_group_indices,
// literals,
// operators,
// total_row_groups,
// output_dtypes,
// dictionary_col_schemas,
// filter.value(),
// stream);
auto dictionaries = materialize_dictionaries(
dictionary_page_data, row_group_indices, output_dtypes, dictionary_col_schemas, stream);

// TODO: Probe the dictionaries to get surviving row groups
auto const dictionary_filtered_row_groups = apply_dictionary_filter(dictionaries,
row_group_indices,
literals,
operators,
total_row_groups,
output_dtypes,
dictionary_col_schemas,
filter.value(),
stream);

// return dictionary_filtered_row_groups.value_or(all_row_group_indices);
return all_row_group_indices;
return dictionary_filtered_row_groups.value_or(all_row_group_indices);
}

std::vector<std::vector<size_type>> aggregate_reader_metadata::filter_row_groups_with_bloom_filters(
Expand Down
26 changes: 23 additions & 3 deletions cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,34 @@ struct metadata : private cudf::io::parquet::detail::metadata {

class aggregate_reader_metadata : public cudf::io::parquet::detail::aggregate_reader_metadata {
private:

/**
* @brief Materializes column chunk dictionary pages into `cuco::static_set`s
*
* @param dictionary_page_data Dictionary page data device buffers for each input row group
* @param input_row_group_indices Lists of input row groups, one per source
* @param output_dtypes Datatypes of output columns
* @param dictionary_col_schemas schema indices of dictionary columns only
* @param stream CUDA stream used for device memory operations and kernel launches
*
* @return A flattened list of `cuco::static_set_ref` device buffers for each predicate column
* across row groups
*/
[[nodiscard]] std::vector<rmm::device_buffer> materialize_dictionaries(
cudf::host_span<rmm::device_buffer> dictionary_page_data,
host_span<std::vector<size_type> const> input_row_group_indices,
host_span<data_type const> output_dtypes,
host_span<int const> dictionary_col_schemas,
rmm::cuda_stream_view stream) const;

/**
* @brief Filters the row groups using dictionary pages
*
* @param dictionary_page_data Dictionary page data device buffers for each input row group
* @param dictionaries `cuco::static_set_ref` device buffers for column chunk dictionary
* @param input_row_group_indices Lists of input row groups, one per source
* @param literals Lists of literals, one per input column
* @param operators Lists of operators, one per input column
* @param dictionary_operators
* @param total_row_groups Total number of row groups in `input_row_group_indices`
* @param output_dtypes Datatypes of output columns
* @param dictionary_col_schemas schema indices of dictionary columns only
Expand All @@ -64,7 +84,7 @@ class aggregate_reader_metadata : public cudf::io::parquet::detail::aggregate_re
* @return Filtered row group indices if any row group is filtered, otherwise an empty optional
*/
[[nodiscard]] std::optional<std::vector<std::vector<size_type>>> apply_dictionary_filter(
std::vector<rmm::device_buffer>& dictionary_page_data,
cudf::host_span<rmm::device_buffer> dictionaries,
host_span<std::vector<size_type> const> input_row_group_indices,
host_span<std::vector<ast::literal*> const> literals,
host_span<std::vector<ast::ast_operator> const> operators,
Expand Down

0 comments on commit 3451cb8

Please sign in to comment.