diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0282282b5f3..8952f11dd23 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -528,6 +528,7 @@ add_library( src/io/parquet/reader_impl_chunking.cu src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu + src/io/parquet/stats_filter_helpers.cpp src/io/parquet/writer_impl.cu src/io/parquet/writer_impl_helpers.cpp src/io/parquet/decode_fixed.cu diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index e1d7dbb03b3..c2d1495a5d9 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ #include "reader_impl_helpers.hpp" +#include "stats_filter_helpers.hpp" #include #include @@ -21,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -34,7 +34,6 @@ #include #include -#include #include #include #include @@ -42,89 +41,25 @@ namespace cudf::io::parquet::detail { namespace { + /** - * @brief Converts statistics in column chunks to 2 device columns - min, max values. + * @brief Converts column chunk statistics to 2 device columns - min, max values. + * + * Each column's number of rows equals the total number of row groups. * */ -struct stats_caster { +struct row_group_stats_caster : public stats_caster_base { size_type total_row_groups; std::vector const& per_file_metadata; host_span const> row_group_indices; - template - static ToType targetType(FromType const value) - { - if constexpr (cudf::is_timestamp()) { - return static_cast( - typename ToType::duration{static_cast(value)}); - } else if constexpr (std::is_same_v) { - return ToType{nullptr, 0}; - } else { - return static_cast(value); - } - } - - // uses storage type as T - template () or cudf::is_nested())> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) - { - CUDF_FAIL("unsupported type for stats casting"); - } - - template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) - { - CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); - return targetType(*reinterpret_cast(stats_val)); - } - - // integral but not boolean, and fixed_point, and chrono. - template () and !cudf::is_boolean()) or - cudf::is_fixed_point() or cudf::is_chrono())> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) - { - switch (type) { - case INT32: return targetType(*reinterpret_cast(stats_val)); - case INT64: return targetType(*reinterpret_cast(stats_val)); - case INT96: // Deprecated in parquet specification - return targetType(static_cast<__int128_t>(reinterpret_cast(stats_val)[0]) - << 32 | - reinterpret_cast(stats_val)[2]); - case BYTE_ARRAY: [[fallthrough]]; - case FIXED_LEN_BYTE_ARRAY: - if (stats_size == sizeof(T)) { - // if type size == length of stats_val. then typecast and return. - if constexpr (cudf::is_chrono()) { - return targetType(*reinterpret_cast(stats_val)); - } else { - return targetType(*reinterpret_cast(stats_val)); - } - } - // unsupported type - default: CUDF_FAIL("Invalid type and stats combination"); - } - } - - template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) - { - switch (type) { - case FLOAT: return targetType(*reinterpret_cast(stats_val)); - case DOUBLE: return targetType(*reinterpret_cast(stats_val)); - default: CUDF_FAIL("Invalid type and stats combination"); - } - } - - template )> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + row_group_stats_caster(size_type total_row_groups, + std::vector const& per_file_metadata, + host_span const> row_group_indices) + : total_row_groups{total_row_groups}, + per_file_metadata{per_file_metadata}, + row_group_indices{row_group_indices} { - switch (type) { - case BYTE_ARRAY: [[fallthrough]]; - case FIXED_LEN_BYTE_ARRAY: - return string_view(reinterpret_cast(stats_val), stats_size); - default: CUDF_FAIL("Invalid type and stats combination"); - } } // Creates device columns from column statistics (min, max) @@ -139,82 +74,8 @@ struct stats_caster { if constexpr (cudf::is_compound() && !std::is_same_v) { CUDF_FAIL("Compound types do not have statistics"); } else { - // Local struct to hold host columns - struct host_column { - // using thrust::host_vector because std::vector uses bitmap instead of byte per bool. - cudf::detail::host_vector val; - std::vector null_mask; - cudf::size_type null_count = 0; - host_column(size_type total_row_groups, rmm::cuda_stream_view stream) - : val{cudf::detail::make_host_vector(total_row_groups, stream)}, - null_mask( - cudf::util::div_rounding_up_safe( - cudf::bitmask_allocation_size_bytes(total_row_groups), sizeof(bitmask_type)), - ~bitmask_type{0}) - { - } - - void set_index(size_type index, - std::optional> const& binary_value, - Type const type) - { - if (binary_value.has_value()) { - val[index] = convert(binary_value.value().data(), binary_value.value().size(), type); - } - if (not binary_value.has_value()) { - clear_bit_unsafe(null_mask.data(), index); - null_count++; - } - } - - static auto make_strings_children(host_span host_strings, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - auto const total_char_count = std::accumulate( - host_strings.begin(), host_strings.end(), 0, [](auto sum, auto const& str) { - return sum + str.size_bytes(); - }); - auto chars = cudf::detail::make_empty_host_vector(total_char_count, stream); - auto offsets = - cudf::detail::make_empty_host_vector(host_strings.size() + 1, stream); - offsets.push_back(0); - for (auto const& str : host_strings) { - auto tmp = - str.empty() ? std::string_view{} : std::string_view(str.data(), str.size_bytes()); - chars.insert(chars.end(), std::cbegin(tmp), std::cend(tmp)); - offsets.push_back(offsets.back() + tmp.length()); - } - auto d_chars = cudf::detail::make_device_uvector_async(chars, stream, mr); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, stream, mr); - return std::tuple{std::move(d_chars), std::move(d_offsets)}; - } - - auto to_device(cudf::data_type dtype, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - if constexpr (std::is_same_v) { - auto [d_chars, d_offsets] = make_strings_children(val, stream, mr); - return cudf::make_strings_column( - val.size(), - std::make_unique(std::move(d_offsets), rmm::device_buffer{}, 0), - d_chars.release(), - null_count, - rmm::device_buffer{ - null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}); - } - return std::make_unique( - dtype, - val.size(), - cudf::detail::make_device_uvector_async(val, stream, mr).release(), - rmm::device_buffer{ - null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}, - null_count); - } - }; // local struct host_column - host_column min(total_row_groups, stream); - host_column max(total_row_groups, stream); + host_column min(total_row_groups, stream); + host_column max(total_row_groups, stream); size_type stats_idx = 0; for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { for (auto const rg_idx : row_group_indices[src_idx]) { @@ -248,146 +109,6 @@ struct stats_caster { } }; -/** - * @brief Converts AST expression to StatsAST for comparing with column statistics - * This is used in row group filtering based on predicate. - * statistics min value of a column is referenced by column_index*2 - * statistics max value of a column is referenced by column_index*2+1 - * - */ -class stats_expression_converter : public ast::detail::expression_transformer { - public: - stats_expression_converter(ast::expression const& expr, size_type const& num_columns) - : _num_columns{num_columns} - { - expr.accept(*this); - } - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::literal const& ) - */ - std::reference_wrapper visit(ast::literal const& expr) override - { - return expr; - } - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::column_reference const& ) - */ - std::reference_wrapper visit(ast::column_reference const& expr) override - { - CUDF_EXPECTS(expr.get_table_source() == ast::table_reference::LEFT, - "Statistics AST supports only left table"); - CUDF_EXPECTS(expr.get_column_index() < _num_columns, - "Column index cannot be more than number of columns in the table"); - return expr; - } - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::column_name_reference const& ) - */ - std::reference_wrapper visit( - ast::column_name_reference const& expr) override - { - CUDF_FAIL("Column name reference is not supported in statistics AST"); - } - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::operation const& ) - */ - std::reference_wrapper visit(ast::operation const& expr) override - { - using cudf::ast::ast_operator; - auto const operands = expr.get_operands(); - auto const op = expr.get_operator(); - - if (auto* v = dynamic_cast(&operands[0].get())) { - // First operand should be column reference, second should be literal. - CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, - "Only binary operations are supported on column reference"); - CUDF_EXPECTS(dynamic_cast(&operands[1].get()) != nullptr, - "Second operand of binary operation with column reference must be a literal"); - v->accept(*this); - // Push literal into the ast::tree - auto const& literal = - _stats_expr.push(*dynamic_cast(&operands[1].get())); - auto const col_index = v->get_column_index(); - switch (op) { - /* transform to stats conditions. op(col, literal) - col1 == val --> vmin <= val && vmax >= val - col1 != val --> !(vmin == val && vmax == val) - col1 > val --> vmax > val - col1 < val --> vmin < val - col1 >= val --> vmax >= val - col1 <= val --> vmin <= val - */ - case ast_operator::EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{ - ast::ast_operator::LOGICAL_AND, - _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), - _stats_expr.push(ast::operation{ast_operator::LESS_EQUAL, vmin, literal})}); - break; - } - case ast_operator::NOT_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{ - ast_operator::LOGICAL_OR, - _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), - _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmax, literal})}); - break; - } - case ast_operator::LESS: [[fallthrough]]; - case ast_operator::LESS_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - _stats_expr.push(ast::operation{op, vmin, literal}); - break; - } - case ast_operator::GREATER: [[fallthrough]]; - case ast_operator::GREATER_EQUAL: { - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{op, vmax, literal}); - break; - } - default: CUDF_FAIL("Unsupported operation in Statistics AST"); - }; - } else { - auto new_operands = visit_operands(operands); - if (cudf::ast::detail::ast_operator_arity(op) == 2) { - _stats_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); - } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { - _stats_expr.push(ast::operation{op, new_operands.front()}); - } - } - return _stats_expr.back(); - } - - /** - * @brief Returns the AST to apply on Column chunk statistics. - * - * @return AST operation expression - */ - [[nodiscard]] std::reference_wrapper get_stats_expr() const - { - return _stats_expr.back(); - } - - private: - std::vector> visit_operands( - cudf::host_span const> operands) - { - std::vector> transformed_operands; - for (auto const& operand : operands) { - auto const new_operand = operand.get().accept(*this); - transformed_operands.push_back(new_operand); - } - return transformed_operands; - } - ast::tree _stats_expr; - size_type _num_columns; -}; } // namespace std::optional>> aggregate_reader_metadata::apply_stats_filters( @@ -404,7 +125,7 @@ std::optional>> aggregate_reader_metadata::ap // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1] // For each column, it contains #sources * #column_chunks_per_src rows. std::vector> columns; - stats_caster const stats_col{ + row_group_stats_caster const stats_col{ static_cast(total_row_groups), per_file_metadata, input_row_group_indices}; for (size_t col_idx = 0; col_idx < output_dtypes.size(); col_idx++) { auto const schema_idx = output_column_schemas[col_idx]; diff --git a/cpp/src/io/parquet/stats_filter_helpers.cpp b/cpp/src/io/parquet/stats_filter_helpers.cpp new file mode 100644 index 00000000000..ef022b418bf --- /dev/null +++ b/cpp/src/io/parquet/stats_filter_helpers.cpp @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stats_filter_helpers.hpp" + +#include "io/parquet/parquet_common.hpp" + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +stats_expression_converter::stats_expression_converter(ast::expression const& expr, + size_type num_columns) + : _num_columns{num_columns} +{ + expr.accept(*this); +} + +std::reference_wrapper stats_expression_converter::visit( + ast::literal const& expr) +{ + return expr; +} + +std::reference_wrapper stats_expression_converter::visit( + ast::column_reference const& expr) +{ + CUDF_EXPECTS(expr.get_table_source() == ast::table_reference::LEFT, + "Statistics AST supports only left table"); + CUDF_EXPECTS(expr.get_column_index() < _num_columns, + "Column index cannot be more than number of columns in the table"); + return expr; +} + +std::reference_wrapper stats_expression_converter::visit( + ast::column_name_reference const& expr) +{ + CUDF_FAIL("Column name reference is not supported in statistics AST"); +} + +std::reference_wrapper stats_expression_converter::visit( + ast::operation const& expr) +{ + using cudf::ast::ast_operator; + auto const operands = expr.get_operands(); + auto const op = expr.get_operator(); + + if (auto* v = dynamic_cast(&operands[0].get())) { + // First operand should be column reference, second should be literal. + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, + "Only binary operations are supported on column reference"); + CUDF_EXPECTS(dynamic_cast(&operands[1].get()) != nullptr, + "Second operand of binary operation with column reference must be a literal"); + v->accept(*this); + // Push literal into the ast::tree + auto const& literal = _stats_expr.push(*dynamic_cast(&operands[1].get())); + auto const col_index = v->get_column_index(); + switch (op) { + /* transform to stats conditions. op(col, literal) + col1 == val --> vmin <= val && vmax >= val + col1 != val --> !(vmin == val && vmax == val) + col1 > val --> vmax > val + col1 < val --> vmin < val + col1 >= val --> vmax >= val + col1 <= val --> vmin <= val + */ + case ast_operator::EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{ + ast::ast_operator::LOGICAL_AND, + _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), + _stats_expr.push(ast::operation{ast_operator::LESS_EQUAL, vmin, literal})}); + break; + } + case ast_operator::NOT_EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push( + ast::operation{ast_operator::LOGICAL_OR, + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmax, literal})}); + break; + } + case ast_operator::LESS: [[fallthrough]]; + case ast_operator::LESS_EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + _stats_expr.push(ast::operation{op, vmin, literal}); + break; + } + case ast_operator::GREATER: [[fallthrough]]; + case ast_operator::GREATER_EQUAL: { + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{op, vmax, literal}); + break; + } + default: CUDF_FAIL("Unsupported operation in Statistics AST"); + }; + } else { + auto new_operands = visit_operands(operands); + if (cudf::ast::detail::ast_operator_arity(op) == 2) { + _stats_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); + } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { + _stats_expr.push(ast::operation{op, new_operands.front()}); + } + } + return _stats_expr.back(); +} + +std::reference_wrapper stats_expression_converter::get_stats_expr() const +{ + return _stats_expr.back(); +} + +std::vector> +stats_expression_converter::visit_operands( + cudf::host_span const> operands) +{ + std::vector> transformed_operands; + std::transform(operands.begin(), + operands.end(), + std::back_inserter(transformed_operands), + [t = this](auto& operand) { return operand.get().accept(*t); }); + + return transformed_operands; +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/stats_filter_helpers.hpp b/cpp/src/io/parquet/stats_filter_helpers.hpp new file mode 100644 index 00000000000..63b57f46539 --- /dev/null +++ b/cpp/src/io/parquet/stats_filter_helpers.hpp @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "io/parquet/parquet_common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::io::parquet::detail { + +/** + * @brief Base utilities for converting and casting stats values + * + * Derived classes handle row group or page-level statistics as needed. + * + */ +class stats_caster_base { + protected: + template + static inline ToType targetType(FromType const value) + { + if constexpr (cudf::is_timestamp()) { + return static_cast( + typename ToType::duration{static_cast(value)}); + } else if constexpr (std::is_same_v) { + return ToType{nullptr, 0}; + } else { + return static_cast(value); + } + } + + // uses storage type as T + template () or cudf::is_nested())> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + CUDF_FAIL("unsupported type for stats casting"); + } + + template ())> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + } + + // integral but not boolean, and fixed_point, and chrono. + template () and !cudf::is_boolean()) or + cudf::is_fixed_point() or cudf::is_chrono())> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + switch (type) { + case INT32: + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + case INT64: + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + case INT96: // Deprecated in parquet specification + return stats_caster_base::targetType( + static_cast<__int128_t>(reinterpret_cast(stats_val)[0]) << 32 | + reinterpret_cast(stats_val)[2]); + case BYTE_ARRAY: [[fallthrough]]; + case FIXED_LEN_BYTE_ARRAY: + if (stats_size == sizeof(T)) { + // if type size == length of stats_val. then typecast and return. + if constexpr (cudf::is_chrono()) { + return stats_caster_base::targetType( + *reinterpret_cast(stats_val)); + } else { + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + } + } + // unsupported type + default: CUDF_FAIL("Invalid type and stats combination"); + } + } + + template ())> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + switch (type) { + case FLOAT: + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + case DOUBLE: + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + default: CUDF_FAIL("Invalid type and stats combination"); + } + } + + template )> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + switch (type) { + case BYTE_ARRAY: [[fallthrough]]; + case FIXED_LEN_BYTE_ARRAY: + return string_view(reinterpret_cast(stats_val), stats_size); + default: CUDF_FAIL("Invalid type and stats combination"); + } + } + + /** + * @brief Local struct to hold host columns during stats based filtering + * + * @tparam T Type of the column + */ + template + struct host_column { + // using thrust::host_vector because std::vector uses bitmap instead of byte per bool. + cudf::detail::host_vector val; + std::vector null_mask; + cudf::size_type null_count = 0; + + host_column(size_type total_row_groups, rmm::cuda_stream_view stream) + : val{cudf::detail::make_host_vector(total_row_groups, stream)}, + null_mask(cudf::util::div_rounding_up_safe( + cudf::bitmask_allocation_size_bytes(total_row_groups), sizeof(bitmask_type)), + ~bitmask_type{0}) + { + } + + void inline set_index(size_type index, + std::optional> const& binary_value, + Type const type) + { + if (binary_value.has_value()) { + val[index] = stats_caster_base::convert( + binary_value.value().data(), binary_value.value().size(), type); + } + if (not binary_value.has_value()) { + clear_bit_unsafe(null_mask.data(), index); + null_count++; + } + } + static inline std::tuple, rmm::device_uvector> + make_strings_children(host_span host_strings, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + { + auto const total_char_count = + std::accumulate(host_strings.begin(), host_strings.end(), 0, [](auto sum, auto const& str) { + return sum + str.size_bytes(); + }); + auto chars = cudf::detail::make_empty_host_vector(total_char_count, stream); + auto offsets = + cudf::detail::make_empty_host_vector(host_strings.size() + 1, stream); + offsets.push_back(0); + for (auto const& str : host_strings) { + auto tmp = + str.empty() ? std::string_view{} : std::string_view(str.data(), str.size_bytes()); + chars.insert(chars.end(), std::cbegin(tmp), std::cend(tmp)); + offsets.push_back(offsets.back() + tmp.length()); + } + auto d_chars = cudf::detail::make_device_uvector_async(chars, stream, mr); + auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, stream, mr); + return std::tuple{std::move(d_chars), std::move(d_offsets)}; + } + + std::unique_ptr inline to_device(cudf::data_type dtype, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + { + if constexpr (std::is_same_v) { + auto [d_chars, d_offsets] = make_strings_children(val, stream, mr); + return cudf::make_strings_column( + val.size(), + std::make_unique(std::move(d_offsets), rmm::device_buffer{}, 0), + d_chars.release(), + null_count, + rmm::device_buffer{ + null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}); + } + return std::make_unique( + dtype, + val.size(), + cudf::detail::make_device_uvector_async(val, stream, mr).release(), + rmm::device_buffer{ + null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}, + null_count); + } + }; +}; + +/** + * @brief Converts AST expression to StatsAST for comparing with column statistics + * + * This is used in row group filtering based on predicate. + * statistics min value of a column is referenced by column_index*2 + * statistics max value of a column is referenced by column_index*2+1 + */ +class stats_expression_converter : public ast::detail::expression_transformer { + public: + stats_expression_converter(ast::expression const& expr, size_type num_columns); + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::literal const& ) + */ + std::reference_wrapper visit(ast::literal const& expr) override; + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::column_reference const& ) + */ + std::reference_wrapper visit(ast::column_reference const& expr) override; + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::column_name_reference const& ) + */ + std::reference_wrapper visit( + ast::column_name_reference const& expr) override; + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::operation const& ) + */ + std::reference_wrapper visit(ast::operation const& expr) override; + + /** + * @brief Returns the AST to apply on Column chunk statistics. + * + * @return AST operation expression + */ + [[nodiscard]] std::reference_wrapper get_stats_expr() const; + + private: + std::vector> visit_operands( + cudf::host_span const> operands); + + ast::tree _stats_expr; + size_type _num_columns; +}; + +} // namespace cudf::io::parquet::detail