diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 11104037c5e..148861c0fa2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,6 +34,7 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} + node_type: "cpu16" python-build: needs: [cpp-build] secrets: inherit @@ -77,6 +78,7 @@ jobs: branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} + node_type: "cpu16" script: ci/build_wheel_libcudf.sh wheel-publish-libcudf: needs: wheel-build-libcudf diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 7046fd0e5dc..8357a12e221 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -46,18 +46,6 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" - static-configure: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 - with: - build_type: ${{ inputs.build_type }} - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - # Use the wheel container so we can skip conda solves and since our - # primary static consumers (Spark) are not in conda anyway. 
- container_image: "rapidsai/ci-wheel:latest" - run_script: "ci/configure_cpp_static.sh" cpp-linters: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index 3466edacfc5..1df7bb61834 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -26,6 +26,8 @@ git clone https://github.com/pola-rs/polars.git --branch "${TAG}" --depth 1 # Install requirements for running polars tests rapids-logger "Install polars test requirements" +# TODO: Remove sed command when polars-cloud supports 1.23 +sed -i '/^polars-cloud$/d' polars/py-polars/requirements-dev.txt rapids-pip-retry install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt # shellcheck disable=SC2317 diff --git a/ci/test_narwhals.sh b/ci/test_narwhals.sh index 4a32ff0b0fd..28eceff2f80 100755 --- a/ci/test_narwhals.sh +++ b/ci/test_narwhals.sh @@ -26,6 +26,7 @@ rapids-logger "Run narwhals tests for cuDF" python -m pytest \ --cache-clear \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-narwhals.xml" \ + -p cudf.testing.narwhals_test_plugin \ --numprocesses=8 \ --dist=worksteal \ --constructors=cudf diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index e7dbb765099..a23981b4e72 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.20,<1.23 +- polars>=1.20,<1.24 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<20.0.0a0 diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index 342ec8d4b59..e2b9302dc36 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -64,7 +64,7 @@ dependencies: - pandas - 
pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.20,<1.23 +- polars>=1.20,<1.24 - pre-commit - pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index 1d36ab2a3e4..64a147d3c63 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.20,<1.23 + - polars >=1.20,<1.24 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/cpp/benchmarks/common/random_distribution_factory.cuh b/cpp/benchmarks/common/random_distribution_factory.cuh index c27616132d0..32424fbaaa3 100644 --- a/cpp/benchmarks/common/random_distribution_factory.cuh +++ b/cpp/benchmarks/common/random_distribution_factory.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,7 @@ #include #include +#include #include #include diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 43f060fdafa..5f978a0d8ec 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -125,5 +125,99 @@ std::unique_ptr minhash64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each input row + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * The input row is first hashed using the given `seed` over a sliding window + * of `ngrams` of strings. 
These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint32 + * mp = (1 << 61) - 1 + * hv[i] = hash value of an ngram at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each set of ngrams and the minimum value + * is computed as follows: + * ``` + * mh[j,i] = min(pv[i]) for all ngrams in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the ngrams < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input List strings column to compute minhash + * @param ngrams The number of strings to hash within each row + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash_ngrams( + cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Returns the minhash values for each input row + * + * This function uses MurmurHash3_x64_128 for the hash algorithm. + * + * The input row is first hashed using the given `seed` over a sliding window + * of `ngrams` of strings. 
These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint64 + * mp = (1 << 61) - 1 + * hv[i] = hash value of an ngram at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each set of ngrams and the minimum value + * is computed as follows: + * ``` + * mh[j,i] = min(pv[i]) for all ngrams in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the ngrams < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input List strings column to compute minhash + * @param ngrams The number of strings to hash within each row + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash64_ngrams( + cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 9dc39f01ab3..c304d705f9b 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -1,5 +1,5 
@@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include +#include #include namespace cudf { diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 53c1d335a40..204aca8a69c 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -36,6 +36,7 @@ #include #include +#include #include namespace cudf::io { diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 7b9fc25d1cc..e506d60a2be 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -46,6 +46,7 @@ #include #include +#include namespace cudf::io::json::detail { diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 0c95c2b05e8..c265ac5e316 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -43,6 +43,7 @@ #include #include +#include #include namespace cudf::io::json::detail { diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 050bf692c14..77643d294e8 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -19,6 +19,7 @@ #include "io/utilities/row_selection.hpp" #include +#include #include namespace cudf::io::orc::detail { diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index dbf5e293c4e..3a20ffbce19 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -64,6 +64,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 03a37327e9b..be1e7d38fff 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -40,6 +40,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 9e50fafa8a7..4a410cec558 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -53,6 +53,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index 469442d46d4..d7b1bf360fe 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,6 +36,8 @@ #include #include +#include + namespace cudf::detail { namespace { /** diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index d22fb04696c..6071a9fdd2d 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,7 @@ #include #include +#include #include namespace cudf { diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 0777253bb38..af8b53ccd8c 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,6 +39,7 @@ #include #include +#include namespace cudf { namespace strings { diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 990c4855a14..d77cc0cf17a 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,8 @@ #include +#include + namespace cudf { namespace experimental { diff --git a/cpp/src/text/bpe/load_merge_pairs.cu b/cpp/src/text/bpe/load_merge_pairs.cu index a13a435a271..9118fe54ab2 100644 --- a/cpp/src/text/bpe/load_merge_pairs.cu +++ b/cpp/src/text/bpe/load_merge_pairs.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,7 @@ #include #include +#include #include #include diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 50c16c8ba6c..663595af5df 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -62,19 +63,20 @@ constexpr cudf::thread_index_type tile_size = block_size; constexpr cuda::std::size_t params_per_thread = 16; // Separate kernels are used to process strings above and below this value (in bytes). -constexpr cudf::size_type wide_string_threshold = 1 << 18; // 256K +constexpr cudf::size_type wide_row_threshold = 1 << 18; // 256K // The number of blocks per string for the above-threshold kernel processing. -constexpr cudf::size_type blocks_per_string = 64; +constexpr cudf::size_type blocks_per_row = 64; // The above values were determined using the redpajama and books_sample datasets /** * @brief Hashing kernel launched as a thread per tile-size (block or warp) + * for strings column * * This kernel computes the hashes for each string using the seed and the specified * hash function. The width is used to compute rolling substrings to hash over. * The hashes are stored in d_hashes to be used in the minhash_kernel. * - * This kernel also counts the number of strings above the wide_string_threshold + * This kernel also counts the number of strings above the wide_row_threshold * and proactively initializes the output values for those strings. 
* * @tparam HashFunction The hash function to use for this kernel @@ -84,7 +86,7 @@ constexpr cudf::size_type blocks_per_string = 64; * @param seed The seed used for the hash function * @param width Width in characters used for determining substrings to hash * @param d_hashes The resulting hash values are stored here - * @param threshold_count Stores the number of strings above wide_string_threshold + * @param threshold_count Stores the number of strings above wide_row_threshold * @param param_count Number of parameters (used for the proactive initialize) * @param d_results Final results vector (used for the proactive initialize) */ @@ -146,7 +148,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, } // logic appended here so an extra kernel is not required - if (size_bytes >= wide_string_threshold) { + if (size_bytes >= wide_row_threshold) { if (lane_idx == 0) { // count the number of wide strings cuda::atomic_ref ref{*threshold_count}; @@ -160,31 +162,130 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, } } +/** + * @brief Hashing kernel launched as a thread per tile-size (block or warp) + * for a lists column + * + * This kernel computes the hashes for each row using the seed and the specified + * hash function. The ngrams identifies consecutive strings to hash over in + * sliding window formation. The hashes are stored in d_hashes and used as input + * to the minhash_kernel. + * + * This kernel also counts the number of rows above the wide_row_threshold + * and proactively initializes the output values for those rows. 
+ * + * @tparam HashFunction The hash function to use for this kernel + * @tparam hash_value_type Derived from HashFunction result_type + * + * @param d_input The input column to hash + * @param seed The seed used for the hash function + * @param ngrams Number of strings in each row to hash + * @param d_hashes The resulting hash values are stored here + * @param threshold_count Stores the number of rows above wide_row_threshold + * @param param_count Number of parameters (used for the proactive initialize) + * @param d_results Final results vector (used for the proactive initialize) + */ +template +CUDF_KERNEL void minhash_ngrams_kernel(cudf::detail::lists_column_device_view const d_input, + hash_value_type seed, + cudf::size_type ngrams, + hash_value_type* d_hashes, + cudf::size_type* threshold_count, + cudf::size_type param_count, + hash_value_type* d_results) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const row_idx = tid / tile_size; + if (row_idx >= d_input.size()) { return; } + if (d_input.is_null(row_idx)) { return; } + + // retrieve this row's offset to locate the output position in d_hashes + auto const offsets_itr = d_input.offsets().data() + d_input.offset(); + auto const offset = offsets_itr[row_idx]; + auto const size_row = offsets_itr[row_idx + 1] - offset; + if (size_row == 0) { return; } + + auto const d_row = cudf::list_device_view(d_input, row_idx); + auto const lane_idx = static_cast(tid % tile_size); + + // hashes for this row/thread are stored here + auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx; + auto const hasher = HashFunction(seed); + + for (auto idx = lane_idx; idx < size_row; idx += tile_size, seed_hashes += tile_size) { + if (d_row.is_null(idx)) { + *seed_hashes = 0; + continue; + } + + auto next_idx = cuda::std::min(idx + ngrams, size_row - 1); + if ((idx != 0) && ((next_idx - idx) < ngrams)) { + *seed_hashes = 0; + continue; + } + + auto const first_str = d_row.element(idx); + auto const 
last_str = d_row.element(next_idx); + // build super-string since adjacent strings are contiguous in memory + auto const size = static_cast( + thrust::distance(first_str.data(), last_str.data()) + last_str.size_bytes()); + auto const hash_str = cudf::string_view(first_str.data(), size); + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + hv = cuda::std::get<0>(hasher(hash_str)); + } + // disallowing hash to zero case + *seed_hashes = cuda::std::max(hv, hash_value_type{1}); + } + + // logic appended here to count long rows so an extra kernel is not required + if (size_row >= wide_row_threshold) { + if (lane_idx == 0) { + // count the number of wide rows + cuda::atomic_ref ref{*threshold_count}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); + } + // initialize the output -- only needed for wider rows + auto d_output = d_results + (row_idx * param_count); + for (auto i = lane_idx; i < param_count; i += tile_size) { + d_output[i] = cuda::std::numeric_limits::max(); + } + } +} + /** * @brief Permutation calculation kernel * - * This kernel uses the hashes from the minhash_seed_kernel and the parameter_a and - * parameter_b values to compute the final output results. + * This kernel uses the hashes from the minhash_seed_kernel or minhash_ngrams_kernel + * and the 'parameter_a' and 'parameter_b' values to compute the final output. * The output is the number of input rows (N) by the number of parameter values (M). - * Each output[i] is the calculated result for parameter_a/b[0:M]. + * Each row output[i] is the calculated result for parameter_a/b[0:M]. + * + * This kernel is launched with either blocks per row of 1 for rows + * below the wide_row_threshold or blocks per row = blocks_per_rows + * for rows above wide_row_threshold. 
* - * This kernel is launched with either blocks per strings of 1 for strings - * below the wide_strings_threshold or blocks per string = blocks_per_strings - * for strings above wide_strings_threshold. + * Note that this was refactored to accommodate lists of strings which is possible + * since there is no need here to access the characters, only the hash values. + * The offsets and width are used to locate and count the hash values produced by + * kernels above for each input row. * + * @tparam offsets_type Type for the offsets iterator for the input column * @tparam hash_value_type Derived from HashFunction result_type - * @tparam blocks_per_string Number of blocks used to process each string + * @tparam blocks_per_row Number of blocks used to process each row * - * @param d_strings The input strings to hash - * @param indices The indices of the strings in d_strings to process + * @param offsets_itr The offsets are used to address the d_hashes + * @param indices The indices of the rows in the input column * @param parameter_a 1st set of parameters for the calculation result * @param parameter_b 2nd set of parameters for the calculation result - * @param width Used for calculating the number of available hashes in each string - * @param d_hashes The hash values computed in minhash_seed_kernel + * @param width Used for calculating the number of available hashes in each row + * @param d_hashes The hash values computed in one of the hash kernels * @param d_results Final results vector of calculate values */ -template -CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, +template +CUDF_KERNEL void minhash_kernel(offsets_type offsets_itr, cudf::device_span indices, cudf::device_span parameter_a, cudf::device_span parameter_b, @@ -193,41 +294,36 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, hash_value_type* d_results) { auto const tid = cudf::detail::grid_1d::global_thread_id(); - auto const idx = (tid / 
blocks_per_string) / block_size; + auto const idx = (tid / blocks_per_row) / block_size; if (idx >= indices.size()) { return; } - auto const str_idx = indices[idx]; - if (d_strings.is_null(str_idx)) { return; } + auto const row_idx = indices[idx]; auto const block = cooperative_groups::this_thread_block(); - int const section_idx = block.group_index().x % blocks_per_string; + int const section_idx = block.group_index().x % blocks_per_row; - auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); - auto const offsets_itr = - cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); - auto const offset = offsets_itr[str_idx]; - auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + auto const offset = offsets_itr[row_idx]; + auto const row_size = static_cast(offsets_itr[row_idx + 1] - offset); // number of items to process in this block; - // last block also includes any remainder values from the size_bytes/blocks_per_string truncation + // last block also includes any remainder values from the row_size/blocks_per_row truncation // example: - // each section_size for string with size 588090 and blocks_per_string=64 is 9188 + // each section_size for string with size 588090 and blocks_per_row=64 is 9188 // except the last section which is 9188 + (588090 % 64) = 9246 - auto const section_size = - (size_bytes / blocks_per_string) + - (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string); - auto const section_offset = section_idx * (size_bytes / blocks_per_string); + auto const section_size = (row_size / blocks_per_row) + + (section_idx < (blocks_per_row - 1) ? 
0 : row_size % blocks_per_row); + auto const section_offset = section_idx * (row_size / blocks_per_row); // hash values for this block/section auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset; // width used here as a max value since a string's char-count <= byte-count auto const hashes_size = - section_idx < (blocks_per_string - 1) + section_idx < (blocks_per_row - 1) ? section_size - : cuda::std::max(static_cast(size_bytes > 0), section_size - width + 1); + : cuda::std::max(static_cast(row_size > 0), section_size - width + 1); - auto const init = size_bytes == 0 ? 0 : cuda::std::numeric_limits::max(); + auto const init = row_size == 0 ? 0 : cuda::std::numeric_limits::max(); auto const lane_idx = block.thread_rank(); - auto const d_output = d_results + (str_idx * parameter_a.size()); + auto const d_output = d_results + (row_idx * parameter_a.size()); auto const begin = seed_hashes + lane_idx; auto const end = seed_hashes + hashes_size; @@ -273,7 +369,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, // cooperative groups does not have a min function and cub::BlockReduce was slower auto const minv = thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); - if constexpr (blocks_per_string > 1) { + if constexpr (blocks_per_row > 1) { // accumulates mins for each block into d_output cuda::atomic_ref ref{d_output[lane_idx + i]}; ref.fetch_min(minv, cuda::std::memory_order_relaxed); @@ -285,6 +381,46 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, } } +/** + * @brief Partition input rows by row size + * + * The returned index is the first row above the wide_row_threshold size. + * The returned vector are the indices partitioned above and below the + * wide_row_threshold size. 
+ * + * @param size Number of rows in the input column + * @param threshold_count Number of rows above wide_row_threshold + * @param tfn Transform function returns the size of each row + * @param stream Stream used for allocation and kernel launches + */ +template +std::pair> partition_input( + cudf::size_type size, + cudf::size_type threshold_count, + transform_fn tfn, + rmm::cuda_stream_view stream) +{ + auto indices = rmm::device_uvector(size, stream); + thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end()); + cudf::size_type threshold_index = threshold_count < size ? size : 0; + + // if we counted a split of above/below threshold then + // compute partitions based on the size of each string + if ((threshold_count > 0) && (threshold_count < size)) { + auto sizes = rmm::device_uvector(size, stream); + auto begin = thrust::counting_iterator(0); + auto end = begin + size; + thrust::transform(rmm::exec_policy_nosync(stream), begin, end, sizes.data(), tfn); + // these 2 are slightly faster than using partition() + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin()); + auto const lb = thrust::lower_bound( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_row_threshold); + threshold_index = static_cast(thrust::distance(sizes.begin(), lb)); + } + return {threshold_index, std::move(indices)}; +} + template std::unique_ptr minhash_fn(cudf::strings_column_view const& input, hash_value_type seed, @@ -334,40 +470,112 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, d_threshold_count.data(), parameter_a.size(), d_results); - auto const threshold_count = d_threshold_count.value(stream); - auto indices = rmm::device_uvector(input.size(), stream); - thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end()); - cudf::size_type threshold_index = threshold_count < input.size() ? 
input.size() : 0; + auto transform_fn = [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type { + if (d_strings.is_null(idx)) { return 0; } + return d_strings.element(idx).size_bytes(); + }; + auto [threshold_index, indices] = + partition_input(input.size(), d_threshold_count.value(stream), transform_fn, stream); - // if we counted a split of above/below threshold then - // compute partitions based on the size of each string - if ((threshold_count > 0) && (threshold_count < input.size())) { - auto sizes = rmm::device_uvector(input.size(), stream); - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - sizes.data(), - cuda::proclaim_return_type( - [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type { - if (d_strings.is_null(idx)) { return 0; } - return d_strings.element(idx).size_bytes(); - })); - thrust::sort_by_key( - rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin()); - auto const lb = thrust::lower_bound( - rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold); - threshold_index = static_cast(thrust::distance(sizes.begin(), lb)); + auto input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + using offsets_type = decltype(input_offsets); + + // handle the strings below the threshold width + if (threshold_index > 0) { + auto d_indices = cudf::device_span(indices.data(), threshold_index); + cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, + block_size}; + minhash_kernel + <<>>( + input_offsets, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + } + + // handle the strings above the threshold width + if (threshold_index < input.size()) { + auto const count = static_cast(input.size() - threshold_index); + auto d_indices = + cudf::device_span(indices.data() + threshold_index, count); + cudf::detail::grid_1d 
grid{count * block_size * blocks_per_row, block_size}; + minhash_kernel + <<>>( + input_offsets, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); } + return results; +} + +template +std::unique_ptr minhash_ngrams_fn( + cudf::lists_column_view const& input, + cudf::size_type ngrams, + hash_value_type seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(ngrams >= 2, + "Parameter ngrams should be an integer value of 2 or greater", + std::invalid_argument); + CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument); + CUDF_EXPECTS(parameter_a.size() == parameter_b.size(), + "Parameters A and B should have the same number of elements", + std::invalid_argument); + CUDF_EXPECTS( + (static_cast(input.size()) * parameter_a.size()) < + static_cast(std::numeric_limits::max()), + "The number of parameters times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_input = cudf::column_device_view::create(input.parent(), stream); + + auto results = + cudf::make_numeric_column(output_type, + input.size() * static_cast(parameter_a.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_results = results->mutable_view().data(); + + cudf::detail::grid_1d grid{static_cast(input.size()) * block_size, + block_size}; + auto const hashes_size = input.child().size(); + auto d_hashes = rmm::device_uvector(hashes_size, stream); + auto d_threshold_count = cudf::detail::device_scalar(0, stream); + + auto d_list = cudf::detail::lists_column_device_view(*d_input); + minhash_ngrams_kernel + <<>>(d_list, + seed, + ngrams, + d_hashes.data(), + d_threshold_count.data(), + parameter_a.size(), + d_results); + + auto sizes_fn = [d_list] 
__device__(auto idx) -> cudf::size_type { + if (d_list.is_null(idx)) { return 0; } + return cudf::list_device_view(d_list, idx).size(); + }; + auto [threshold_index, indices] = + partition_input(input.size(), d_threshold_count.value(stream), sizes_fn, stream); + + auto input_offsets = input.offsets_begin(); // already includes input.offset() + using offset_type = decltype(input_offsets); + // handle the strings below the threshold width if (threshold_index > 0) { auto d_indices = cudf::device_span(indices.data(), threshold_index); cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, block_size}; - minhash_kernel + minhash_kernel <<>>( - *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + input_offsets, d_indices, parameter_a, parameter_b, ngrams, d_hashes.data(), d_results); } // handle the strings above the threshold width @@ -375,10 +583,10 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, auto const count = static_cast(input.size() - threshold_index); auto d_indices = cudf::device_span(indices.data() + threshold_index, count); - cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; - minhash_kernel + cudf::detail::grid_1d grid{count * block_size * blocks_per_row, block_size}; + minhash_kernel <<>>( - *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + input_offsets, d_indices, parameter_a, parameter_b, ngrams, d_hashes.data(), d_results); } return results; @@ -426,6 +634,20 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } +std::unique_ptr minhash_ngrams(cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = 
cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = detail::minhash_ngrams_fn( + input, ngrams, seed, parameter_a, parameter_b, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -440,6 +662,20 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } +std::unique_ptr minhash64_ngrams(cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = detail::minhash_ngrams_fn( + input, ngrams, seed, parameter_a, parameter_b, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + } // namespace detail std::unique_ptr minhash(cudf::strings_column_view const& input, @@ -454,6 +690,19 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr); } +std::unique_ptr minhash_ngrams(cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + +{ + CUDF_FUNC_RANGE(); + return detail::minhash_ngrams(input, ngrams, seed, parameter_a, parameter_b, stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -466,4 +715,17 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); } +std::unique_ptr 
minhash64_ngrams(cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + +{ + CUDF_FUNC_RANGE(); + return detail::minhash64_ngrams(input, ngrams, seed, parameter_a, parameter_b, stream, mr); +} + } // namespace nvtext diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 883a5093bd1..ad92e322ee2 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,8 @@ #include #include +#include + namespace { /** * @brief Functor to generate a tdigest by key. diff --git a/cpp/tests/io/metadata_utilities.cpp b/cpp/tests/io/metadata_utilities.cpp index 380d66c53f9..980d8d8b3d1 100644 --- a/cpp/tests/io/metadata_utilities.cpp +++ b/cpp/tests/io/metadata_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,8 @@ #include #include +#include + namespace cudf::test { void expect_metadata_equal(cudf::io::table_input_metadata in_meta, diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index e201dc0565c..d99e19822c0 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -33,6 +33,7 @@ #include #include +#include using cudf::test::iterators::no_nulls; diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index 5f911597b02..c6c419706e0 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ #include #include +#include #include using aggregation = cudf::aggregation; diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 8bfb17e0efd..db43484ab09 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -187,6 +187,15 @@ TEST_F(MinHashTest, EmptyTest) auto params64 = cudf::test::fixed_width_column_wrapper({1, 2, 3}); results = nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); EXPECT_EQ(results->size(), 0); + + auto empty = cudf::test::lists_column_wrapper(); + auto lview = cudf::lists_column_view(empty); + results = + nvtext::minhash_ngrams(lview, 4, 0, cudf::column_view(params), cudf::column_view(params)); + EXPECT_EQ(results->size(), 0); + results = + nvtext::minhash64_ngrams(lview, 4, 0, cudf::column_view(params64), cudf::column_view(params64)); + EXPECT_EQ(results->size(), 0); } TEST_F(MinHashTest, ErrorsTest) @@ -194,17 +203,20 @@ TEST_F(MinHashTest, ErrorsTest) auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); auto empty = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), - std::invalid_argument); + auto eview = cudf::column_view(empty); + EXPECT_THROW(nvtext::minhash(view, 0, eview, eview, 0), std::invalid_argument); auto empty64 = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW( - nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), - std::invalid_argument); - EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), - std::invalid_argument); - EXPECT_THROW( - nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), - std::invalid_argument); + auto eview64 = cudf::column_view(empty64); + EXPECT_THROW(nvtext::minhash64(view, 0, eview64, eview64, 0), std::invalid_argument); + EXPECT_THROW(nvtext::minhash(view, 0, eview, eview, 4), std::invalid_argument); + EXPECT_THROW(nvtext::minhash64(view, 0, eview64, eview64, 4), std::invalid_argument); + + auto empty_list = cudf::test::lists_column_wrapper(); + auto lview = 
cudf::lists_column_view(empty_list); + EXPECT_THROW(nvtext::minhash_ngrams(lview, 0, 0, eview, eview), std::invalid_argument); + EXPECT_THROW(nvtext::minhash64_ngrams(lview, 0, 0, eview64, eview64), std::invalid_argument); + EXPECT_THROW(nvtext::minhash_ngrams(lview, 4, 0, eview, eview), std::invalid_argument); + EXPECT_THROW(nvtext::minhash64_ngrams(lview, 4, 0, eview64, eview64), std::invalid_argument); std::vector h_input(50000, ""); input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); @@ -212,16 +224,133 @@ TEST_F(MinHashTest, ErrorsTest) auto const zeroes = thrust::constant_iterator(0); auto params = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4), - std::overflow_error); + auto pview = cudf::column_view(params); + EXPECT_THROW(nvtext::minhash(view, 0, pview, pview, 4), std::overflow_error); auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW( - nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), - std::overflow_error); - - EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), - std::invalid_argument); - EXPECT_THROW( - nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), - std::invalid_argument); + auto pview64 = cudf::column_view(params64); + EXPECT_THROW(nvtext::minhash64(view, 0, pview64, pview64, 4), std::overflow_error); + + auto offsets = cudf::test::fixed_width_column_wrapper( + thrust::counting_iterator(0), + thrust::counting_iterator(h_input.size() + 1)); + auto input_ngrams = + cudf::make_lists_column(h_input.size(), offsets.release(), input.release(), 0, {}); + lview = cudf::lists_column_view(input_ngrams->view()); + EXPECT_THROW(nvtext::minhash_ngrams(lview, 4, 0, pview, pview), std::overflow_error); + EXPECT_THROW(nvtext::minhash64_ngrams(lview, 4, 0, pview64, 
pview64), std::overflow_error); +} + +TEST_F(MinHashTest, Ngrams) +{ + using LCWS = cudf::test::lists_column_wrapper; + auto input = + LCWS({LCWS{"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog."}, + LCWS{"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "", "dog."}, + LCWS{"short", "row"}}); + + auto view = cudf::lists_column_view(input); + + auto first = thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_ngrams(view, 4, 0, cudf::column_view(params), cudf::column_view(params)); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{ 230924604u, 55492793u, 963436400u}, + LCW32{ 230924604u, 367515795u, 963436400u}, + LCW32{2380648568u, 1330223236u, 279797904u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64_ngrams(view, 4, 0, cudf::column_view(params64), cudf::column_view(params64)); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 208926840193078200ul, 576399628675212695ul, 312927673584437419ul}, + LCW64{ 677038498284219393ul, 326338087730412201ul, 298455901014050223ul}, + LCW64{1493265692486268500ul, 720255058049417768ul, 2253087432826260995ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + +TEST_F(MinHashTest, NgramsWide) +{ + auto many = std::vector(1024, "hello"); + auto str_data = cudf::test::strings_column_wrapper(many.begin(), many.end()); + auto offsets = + cudf::test::fixed_width_column_wrapper({0ul, many.size() / 2, many.size()}); + auto input = cudf::make_lists_column(2, offsets.release(), str_data.release(), 0, {}); + + auto view = cudf::lists_column_view(input->view()); + + auto first = thrust::counting_iterator(10); + auto params = 
cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_ngrams(view, 4, 0, cudf::column_view(params), cudf::column_view(params)); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{ 571536396u, 2346676954u, 4121817512u}, + LCW32{ 571536396u, 2346676954u, 4121817512u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64_ngrams(view, 4, 0, cudf::column_view(params64), cudf::column_view(params64)); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 1947142336021414174ul, 1219519365938078011ul, 491896395854741840ul}, + LCW64{ 1947142336021414174ul, 1219519365938078011ul, 491896395854741840ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + +TEST_F(MinHashTest, NgramsSliced) +{ + using LCWS = cudf::test::lists_column_wrapper; + auto input = + LCWS({LCWS{"ignored", "row"}, + LCWS{"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog."}, + LCWS{"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "", "dog."}, + LCWS{"short", "row"}, + LCWS{"ignored", "row"}}); + + auto view = cudf::lists_column_view(cudf::slice(input, {1, 4}).front()); + auto first = thrust::counting_iterator(10); + + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_ngrams(view, 4, 0, cudf::column_view(params), cudf::column_view(params)); + + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{ 230924604u, 55492793u, 963436400u}, + LCW32{ 230924604u, 367515795u, 963436400u}, + LCW32{2380648568u, 1330223236u, 279797904u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto params64 = 
cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64_ngrams(view, 4, 0, cudf::column_view(params64), cudf::column_view(params64)); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 208926840193078200ul, 576399628675212695ul, 312927673584437419ul}, + LCW64{ 677038498284219393ul, 326338087730412201ul, 298455901014050223ul}, + LCW64{1493265692486268500ul, 720255058049417768ul, 2253087432826260995ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } diff --git a/dependencies.yaml b/dependencies.yaml index c7869eee922..1578dadc793 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -813,7 +813,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.20,<1.23 + - polars>=1.20,<1.24 run_cudf_polars_experimental: common: - output_types: [conda, requirements, pyproject] diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c74da8d0ca9..92b37c4b3f2 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -207,6 +207,7 @@ def clean_all_xml_files(path): exclude_patterns = [ "venv", "**/includes/**", + "narwhals_test_plugin", ] # The name of the Pygments (syntax highlighting) style to use. 
@@ -585,6 +586,7 @@ def on_missing_reference(app, env, node, contnode): ("py:class", "pd.DataFrame"), ("py:class", "pandas.core.indexes.frozen.FrozenList"), ("py:class", "pa.Array"), + ("py:class", "pa.Decimal128Type"), ("py:class", "ScalarLike"), ("py:class", "ParentType"), ("py:class", "pyarrow.lib.DataType"), diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a57ff9a7817..d41e448254c 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -36,6 +36,7 @@ ColumnBinaryOperand, ColumnLike, Dtype, + DtypeObj, ScalarLike, SeriesOrIndex, SeriesOrSingleColumnIndex, @@ -1168,7 +1169,7 @@ def _mimic_inplace( self._codes = other_col.codes return out - def view(self, dtype: Dtype) -> ColumnBase: + def view(self, dtype: DtypeObj) -> ColumnBase: raise NotImplementedError( "Categorical column views are not currently supported" ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 89ac39b2be5..61f4f7d52fb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -950,7 +950,7 @@ def copy(self, deep: bool = True) -> Self: ), ) - def view(self, dtype: Dtype) -> ColumnBase: + def view(self, dtype: DtypeObj) -> ColumnBase: """ View the data underlying a column as different dtype. 
The source column must divide evenly into the size of @@ -959,13 +959,9 @@ def view(self, dtype: Dtype) -> ColumnBase: Parameters ---------- - dtype : NumPy dtype, string + dtype : Dtype object The dtype to view the data as - """ - - dtype = cudf.dtype(dtype) - if dtype.kind in ("o", "u", "s"): raise TypeError( "Bytes viewed as str without metadata is ambiguous" @@ -1586,7 +1582,7 @@ def distinct_count(self, dropna: bool = True) -> int: self._distinct_count[dropna] = result return self._distinct_count[dropna] - def can_cast_safely(self, to_dtype: Dtype) -> bool: + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: raise NotImplementedError() @acquire_spill_lock() diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 92d5c39e69d..213e91d7b3f 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -47,6 +47,7 @@ ColumnBinaryOperand, DatetimeLikeScalar, Dtype, + DtypeObj, ScalarLike, ) from cudf.core.column.numerical import NumericalColumn @@ -837,7 +838,7 @@ def is_unique(self) -> bool: def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) - def can_cast_safely(self, to_dtype: Dtype) -> bool: + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: if to_dtype.kind == "M": # type: ignore[union-attr] to_res, _ = np.datetime_data(to_dtype) self_res, _ = np.datetime_data(self.dtype) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 3c603c8e6ef..8db6f805bce 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -13,7 +13,6 @@ import pylibcudf as plc import cudf -from cudf.api.types import is_scalar from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase @@ -73,11 +72,8 @@ def __cuda_array_interface__(self): def 
as_decimal_column( self, dtype: Dtype, - ) -> "DecimalBaseColumn": - if ( - isinstance(dtype, cudf.core.dtypes.DecimalDtype) - and dtype.scale < self.dtype.scale - ): + ) -> DecimalBaseColumn: + if isinstance(dtype, DecimalDtype) and dtype.scale < self.dtype.scale: warnings.warn( "cuDF truncates when downcasting decimals to a lower scale. " "To round, use Series.round() or DataFrame.round()." @@ -204,22 +200,17 @@ def normalize_binop_value(self, other) -> Self | cudf.Scalar: other = other.astype(self.dtype) return other if isinstance(other, cudf.Scalar) and isinstance( - # TODO: Should it be possible to cast scalars of other numerical - # types to decimal? other.dtype, - cudf.core.dtypes.DecimalDtype, + DecimalDtype, ): + # TODO: Should it be possible to cast scalars of other numerical + # types to decimal? if _same_precision_and_scale(self.dtype, other.dtype): other = other.astype(self.dtype) return other - elif is_scalar(other) and isinstance(other, (int, Decimal)): - other = Decimal(other) - metadata = other.as_tuple() - precision = max(len(metadata.digits), metadata.exponent) - scale = -cast(int, metadata.exponent) - return cudf.Scalar( - other, dtype=self.dtype.__class__(precision, scale) - ) + elif isinstance(other, (int, Decimal)): + dtype = self.dtype._from_decimal(Decimal(other)) + return cudf.Scalar(other, dtype=dtype) return NotImplemented def as_numerical_column( @@ -373,11 +364,6 @@ def __init__( children=children, ) - def __setitem__(self, key, value): - if isinstance(value, np.integer): - value = int(value) - super().__setitem__(key, value) - @classmethod def from_arrow(cls, data: pa.Array): dtype = Decimal64Dtype.from_arrow(data.type) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 28e8b98edfe..b82ec1958fb 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -43,6 +43,7 @@ ColumnBinaryOperand, ColumnLike, Dtype, + DtypeObj, ScalarLike, SeriesOrIndex, ) 
@@ -5532,6 +5533,120 @@ def minhash64( self._column.minhash64(seed, a_column, b_column, width) # type: ignore[arg-type] ) + def minhash_ngrams( + self, ngrams: int, seed: np.uint32, a: ColumnLike, b: ColumnLike + ) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a ngrams of strings within each row, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + ngrams : int + Number of strings to hash within each row. + seed : uint32 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint32. + b : ColumnLike + Values for minhash calculation. + Must be of type uint32. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series([['this', 'is', 'my'], ['favorite', 'book']]) + >>> a = cudf.Series([1, 2, 3], dtype=np.uint32) + >>> b = cudf.Series([4, 5, 6], dtype=np.uint32) + >>> s.str.minhash_ngrams(ngrams=2, seed=0, a=a, b=b) + 0 [416367551, 832735099, 1249102647] + 1 [1906668704, 3813337405, 1425038810] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(b)}" + ) + plc_column = plc.nvtext.minhash.minhash_ngrams( + self._column.to_pylibcudf(mode="read"), + ngrams, + seed, + a._column.to_pylibcudf(mode="read"), + b._column.to_pylibcudf(mode="read"), + ) + result = ColumnBase.from_pylibcudf(plc_column) + return self._return_or_inplace(result) + + def minhash64_ngrams( + self, ngrams: int, seed: np.uint64, a: ColumnLike, b: ColumnLike + ) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. 
+ + This uses the MurmurHash3_x64_128 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a ngrams of strings within each row, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + ngrams : int + Number of strings to hash within each row. + seed : uint64 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint64. + b : ColumnLike + Values for minhash calculation. + Must be of type uint64. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series([['this', 'is', 'my'], ['favorite', 'book']]) + >>> a = cudf.Series([2, 3], dtype=np.uint64) + >>> b = cudf.Series([5, 6], dtype=np.uint64) + >>> s.str.minhash64_ngrams(ngrams=2, seed=0, a=a, b=b) + 0 [1304293339825194559, 1956440009737791829] + 1 [472203876238918632, 1861227318965224922] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(b)}" + ) + plc_column = plc.nvtext.minhash.minhash64_ngrams( + self._column.to_pylibcudf(mode="read"), + ngrams, + seed, + a._column.to_pylibcudf(mode="read"), + b._column.to_pylibcudf(mode="read"), + ) + result = ColumnBase.from_pylibcudf(plc_column) + return self._return_or_inplace(result) + def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: """ Compute the Jaccard index between this column and the given @@ -5640,7 +5755,7 @@ def __init__( if not isinstance(data, Buffer): raise ValueError("data must be a Buffer") if dtype != CUDF_STRING_DTYPE: - raise ValueError(f"dtypy must be {CUDF_STRING_DTYPE}") + raise ValueError(f"dtype must be {CUDF_STRING_DTYPE}") if len(children) > 1: raise ValueError("StringColumn must have at 
most 1 offset column.") @@ -5826,23 +5941,22 @@ def __contains__(self, item: ScalarLike) -> bool: other = [item] if is_scalar(item) else item return self.contains(column.as_column(other, dtype=self.dtype)).any() - def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: - out_dtype = cudf.api.types.dtype(dtype) - if out_dtype.kind == "b": + def as_numerical_column(self, dtype: np.dtype) -> NumericalColumn: + if dtype.kind == "b": with acquire_spill_lock(): plc_column = plc.strings.attributes.count_characters( self.to_pylibcudf(mode="read") ) result = ColumnBase.from_pylibcudf(plc_column) return (result > np.int8(0)).fillna(False) - elif out_dtype.kind in {"i", "u"}: + elif dtype.kind in {"i", "u"}: if not self.is_integer().all(): raise ValueError( "Could not convert strings to integer " "type due to presence of non-integer values." ) cast_func = plc.strings.convert.convert_integers.to_integers - elif out_dtype.kind == "f": + elif dtype.kind == "f": if not self.is_float().all(): raise ValueError( "Could not convert strings to float " @@ -5850,10 +5964,8 @@ def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: ) cast_func = plc.strings.convert.convert_floats.to_floats else: - raise ValueError( - f"dtype must be a numerical type, not {out_dtype}" - ) - plc_dtype = dtype_to_pylibcudf_type(out_dtype) + raise ValueError(f"dtype must be a numerical type, not {dtype}") + plc_dtype = dtype_to_pylibcudf_type(dtype) with acquire_spill_lock(): return type(self).from_pylibcudf( # type: ignore[return-value] cast_func(self.to_pylibcudf(mode="read"), plc_dtype) @@ -5973,17 +6085,15 @@ def to_pandas( else: return super().to_pandas(nullable=nullable, arrow_type=arrow_type) - def can_cast_safely(self, to_dtype: Dtype) -> bool: - to_dtype = cudf.api.types.dtype(to_dtype) - + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: if self.dtype == to_dtype: return True - elif to_dtype.kind in {"i", "u"} and not self.is_integer().all(): - return False - elif 
to_dtype.kind == "f" and not self.is_float().all(): - return False - else: + elif to_dtype.kind in {"i", "u"} and self.is_integer().all(): + return True + elif to_dtype.kind == "f" and self.is_float().all(): return True + else: + return False def find_and_replace( self, @@ -6122,12 +6232,11 @@ def _binaryop( return NotImplemented @copy_docstring(ColumnBase.view) - def view(self, dtype) -> ColumnBase: + def view(self, dtype: DtypeObj) -> ColumnBase: if self.null_count > 0: raise ValueError( "Can not produce a view of a string column with nulls" ) - dtype = cudf.api.types.dtype(dtype) str_byte_offset = self.base_children[0].element_indexing(self.offset) str_end_byte_offset = self.base_children[0].element_indexing( self.offset + self.size diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 8b0ef9f0cc8..e4d47f492c2 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -28,7 +28,12 @@ if TYPE_CHECKING: from collections.abc import Sequence - from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype + from cudf._typing import ( + ColumnBinaryOperand, + DatetimeLikeScalar, + Dtype, + DtypeObj, + ) _unit_to_nanoseconds_conversion = { "ns": 1, @@ -309,7 +314,9 @@ def total_seconds(self) -> ColumnBase: # https://github.com/rapidsai/cudf/issues/17664 return ( (self.astype(np.dtype(np.int64)) * conversion) - .astype(cudf.Decimal128Dtype(38, 9)) + .astype( + cudf.Decimal128Dtype(cudf.Decimal128Dtype.MAX_PRECISION, 9) + ) .round(decimals=abs(int(math.log10(conversion)))) .astype(np.dtype(np.float64)) ) @@ -378,10 +385,10 @@ def find_and_replace( ), ) - def can_cast_safely(self, to_dtype: Dtype) -> bool: - if to_dtype.kind == "m": # type: ignore[union-attr] + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: + if to_dtype.kind == "m": to_res, _ = np.datetime_data(to_dtype) - self_res, _ = np.datetime_data(self.dtype) + self_res = self.time_unit max_int = 
np.iinfo(np.int64).max diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 977208f5eb4..ac9c4d23cc2 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -776,35 +776,36 @@ def _recursively_replace_fields(self, result: dict) -> dict: class DecimalDtype(_BaseDtype): _metadata = ("precision", "scale") - def __init__(self, precision, scale=0): + def __init__(self, precision: int, scale: int = 0) -> None: self._validate(precision, scale) - self._typ = pa.decimal128(precision, scale) + self._precision = precision + self._scale = scale @property - def str(self): + def str(self) -> str: return f"{self.name!s}({self.precision}, {self.scale})" @property - def precision(self): + def precision(self) -> int: """ The decimal precision, in number of decimal digits (an integer). """ - return self._typ.precision + return self._precision @precision.setter - def precision(self, value): + def precision(self, value: int) -> None: self._validate(value, self.scale) - self._typ = pa.decimal128(precision=value, scale=self.scale) + self._precision = value @property - def scale(self): + def scale(self) -> int: """ The decimal scale (an integer). """ - return self._typ.scale + return self._scale @property - def itemsize(self): + def itemsize(self) -> int: """ Length of one column element in bytes. """ @@ -815,14 +816,14 @@ def type(self): # might need to account for precision and scale here return decimal.Decimal - def to_arrow(self): + def to_arrow(self) -> pa.Decimal128Type: """ Return the equivalent ``pyarrow`` dtype. 
""" - return self._typ + return pa.decimal128(self.precision, self.scale) @classmethod - def from_arrow(cls, typ): + def from_arrow(cls, typ: pa.Decimal128Type) -> Self: """ Construct a cudf decimal dtype from a ``pyarrow`` dtype @@ -856,23 +857,23 @@ def __repr__(self): ) @classmethod - def _validate(cls, precision, scale=0): + def _validate(cls, precision: int, scale: int) -> None: if precision > cls.MAX_PRECISION: raise ValueError( f"Cannot construct a {cls.__name__}" f" with precision > {cls.MAX_PRECISION}" ) if abs(scale) > precision: - raise ValueError(f"scale={scale} exceeds precision={precision}") + raise ValueError(f"{scale=} cannot exceed {precision=}") @classmethod - def _from_decimal(cls, decimal): + def _from_decimal(cls, decimal: decimal.Decimal) -> Self: """ Create a cudf.DecimalDtype from a decimal.Decimal object """ metadata = decimal.as_tuple() - precision = max(len(metadata.digits), -metadata.exponent) - return cls(precision, -metadata.exponent) + precision = max(len(metadata.digits), -metadata.exponent) # type: ignore[operator] + return cls(precision, -metadata.exponent) # type: ignore[operator] def serialize(self) -> tuple[dict, list]: return ( @@ -885,7 +886,7 @@ def serialize(self) -> tuple[dict, list]: ) @classmethod - def deserialize(cls, header: dict, frames: list): + def deserialize(cls, header: dict, frames: list) -> Self: _check_type(cls, header, frames, is_valid_class=issubclass) return cls(header["precision"], header["scale"]) @@ -896,8 +897,8 @@ def __eq__(self, other: Dtype) -> bool: return False return self.precision == other.precision and self.scale == other.scale - def __hash__(self): - return hash(self._typ) + def __hash__(self) -> int: + return hash(self.to_arrow()) @doc_apply( diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index c5d2fd349e9..7d76907916f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1526,9 +1526,9 @@ def pivot_table( ---------- data : 
DataFrame values : column name or list of column names to aggregate, optional - index : list of column names + index : scalar or list of column names Values to group by in the rows. - columns : list of column names + columns : scalar or list of column names Values to group by in the columns. aggfunc : str or dict, default "mean" If dict is passed, the key is column to aggregate @@ -1562,6 +1562,11 @@ def pivot_table( if sort is not True: raise NotImplementedError("sort is not supported yet") + if is_scalar(index): + index = [index] + if is_scalar(columns): + columns = [columns] + keys = index + columns values_passed = values is not None @@ -1620,15 +1625,8 @@ def pivot_table( table = table.fillna(fill_value) # discard the top level - if values_passed and not values_multi and table._data.multiindex: - column_names = table._data.level_names[1:] - table_columns = tuple( - map(lambda column: column[1:], table._column_names) - ) - table.columns = pd.MultiIndex.from_tuples( - tuples=table_columns, names=column_names - ) - + if values_passed and not values_multi and table._data.nlevels > 1: + table.columns = table._data.to_pandas_index.droplevel(0) if len(index) == 0 and len(columns) > 0: table = table.T diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index cf85282cccb..29139768a36 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -85,9 +85,9 @@ def _preprocess_host_value(value, dtype) -> tuple[ScalarLike, Dtype]: return value.as_py(), dtype if isinstance(dtype, cudf.core.dtypes.DecimalDtype): - value = pa.scalar( - value, type=pa.decimal128(dtype.precision, dtype.scale) - ).as_py() + if isinstance(value, np.integer): + value = int(value) + value = pa.scalar(value, type=dtype.to_arrow()).as_py() if isinstance(value, decimal.Decimal) and dtype is None: dtype = cudf.Decimal128Dtype._from_decimal(value) diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index 
52fc945709e..742a6b57e59 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -8,12 +8,17 @@ import pylibcudf import rmm.mr -from .fast_slow_proxy import is_proxy_instance, is_proxy_object +from .fast_slow_proxy import ( + as_proxy_object, + is_proxy_instance, + is_proxy_object, +) from .magics import load_ipython_extension from .profiler import Profiler __all__ = [ "Profiler", + "as_proxy_object", "install", "is_proxy_instance", "is_proxy_object", diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 45944452c17..147971e8bee 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -151,7 +151,7 @@ def make_final_proxy_type( additional_attributes Mapping of additional attributes to add to the class (optional), these will override any defaulted attributes (e.g. - ``__init__`). If you want to remove a defaulted attribute + ``__init__``). If you want to remove a defaulted attribute completely, pass the special sentinel ``_DELETE`` as a value. postprocess Optional function called to allow the proxy to postprocess @@ -1335,6 +1335,31 @@ def _get_proxy_base_class(cls): return object +def as_proxy_object(obj: Any) -> Any: + """ + Wraps a cudf or pandas object in a proxy object if applicable. + + There will be no memory transfer, i.e., GPU objects stay on GPU and + CPU objects stay on CPU. The object will be wrapped in a + proxy object. This is useful for ensuring that the object is + compatible with the fast-slow proxy system. + + Parameters + ---------- + obj : Any + The object to wrap. + + Returns + ------- + Any + The wrapped proxy object if applicable, otherwise the original object. 
+ """ + if _is_final_type(obj): + typ = get_final_type_map()[type(obj)] + return typ._fsproxy_wrap(obj, None) + return obj + + def is_proxy_instance(obj, type): return is_proxy_object(obj) and obj.__class__.__name__ == type.__name__ diff --git a/python/cudf/cudf/testing/__init__.py b/python/cudf/cudf/testing/__init__.py index 4e92b43b9f9..a4afa54f754 100644 --- a/python/cudf/cudf/testing/__init__.py +++ b/python/cudf/cudf/testing/__init__.py @@ -1,5 +1,6 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. +from cudf.testing import narwhals_test_plugin from cudf.testing.testing import ( assert_eq, assert_frame_equal, diff --git a/python/cudf/cudf/testing/narwhals_test_plugin.py b/python/cudf/cudf/testing/narwhals_test_plugin.py new file mode 100644 index 00000000000..d794bd0120a --- /dev/null +++ b/python/cudf/cudf/testing/narwhals_test_plugin.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Plugin for running narwhals test suite with cudf.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Mapping + +EXPECTED_FAILURES: Mapping[str, str] = { + "tests/frame/select_test.py::test_select_duplicates[cudf]": "cuDF doesn't support having multiple columns with same names", +} + + +def pytest_collection_modifyitems(session, config, items) -> None: + """Mark known failing tests.""" + import pytest + + for item in items: + if item.nodeid in EXPECTED_FAILURES: + exp_val = EXPECTED_FAILURES[item.nodeid] + item.add_marker(pytest.mark.xfail(reason=exp_val)) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 2996a88c171..b7cd2388f30 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -290,6 +290,8 @@ def test_column_chunked_array_creation(): ], ) def test_column_view_valid_numeric_to_numeric(data, from_dtype, to_dtype): + from_dtype = np.dtype(from_dtype) + to_dtype = np.dtype(to_dtype) cpu_data = np.asarray(data, dtype=from_dtype) gpu_data = as_column(data, dtype=from_dtype) @@ -314,6 +316,8 @@ def test_column_view_valid_numeric_to_numeric(data, from_dtype, to_dtype): ], ) def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype): + from_dtype = np.dtype(from_dtype) + to_dtype = np.dtype(to_dtype) cpu_data = np.asarray(data, dtype=from_dtype) gpu_data = as_column(data, dtype=from_dtype) @@ -337,6 +341,7 @@ def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype): ], ) def test_column_view_valid_string_to_numeric(data, to_dtype): + to_dtype = np.dtype(to_dtype) expect = cudf.Series._from_column(cudf.Series(data)._column.view(to_dtype)) got = cudf.Series(str_host_view(data, to_dtype)) @@ -352,7 +357,7 @@ def test_column_view_nulls_widths_even(): sr = cudf.Series(data, dtype="int32") expect = cudf.Series(expect_data, 
dtype="float32") - got = cudf.Series._from_column(sr._column.view("float32")) + got = cudf.Series._from_column(sr._column.view(np.dtype(np.float32))) assert_eq(expect, got) @@ -364,7 +369,7 @@ def test_column_view_nulls_widths_even(): sr = cudf.Series(data, dtype="float64") expect = cudf.Series(expect_data, dtype="int64") - got = cudf.Series._from_column(sr._column.view("int64")) + got = cudf.Series._from_column(sr._column.view(np.dtype(np.int64))) assert_eq(expect, got) @@ -376,7 +381,7 @@ def test_column_view_numeric_slice(slc): expect = cudf.Series(data[slc].view("int64")) got = cudf.Series._from_column( - sr._column.slice(slc.start, slc.stop).view("int64") + sr._column.slice(slc.start, slc.stop).view(np.dtype(np.int64)) ) assert_eq(expect, got) @@ -389,7 +394,9 @@ def test_column_view_string_slice(slc): data = ["a", "bcde", "cd", "efg", "h"] expect = cudf.Series._from_column( - cudf.Series(data)._column.slice(slc.start, slc.stop).view("int8") + cudf.Series(data) + ._column.slice(slc.start, slc.stop) + .view(np.dtype(np.int8)) ) got = cudf.Series(str_host_view(data[slc], "int8")) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 7fbe072dde7..eae73e47955 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -798,6 +798,25 @@ def test_dataframe_pivot_table_simple(aggfunc, fill_value): assert_eq(expected, actual, check_dtype=False) +@pytest.mark.parametrize("index", ["A", ["A"]]) +@pytest.mark.parametrize("columns", ["C", ["C"]]) +def test_pivot_table_scalar_index_columns(index, columns): + data = { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": range(24), + "E": range(24), + } + result = cudf.DataFrame(data).pivot_table( + values="D", index=index, columns=columns, aggfunc="sum" + ) + expected = pd.DataFrame(data).pivot_table( + values="D", index=index, columns=columns, aggfunc="sum" + ) 
+ assert_eq(result, expected) + + def test_crosstab_simple(): a = np.array( [ diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index dc45827d2e8..47b41bd1e39 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -930,6 +930,48 @@ def test_minhash(): strings.str.minhash64(1, a=params, b=params, width=8) +def test_minhash_ngrams(): + strings = cudf.Series( + [["this", "is", "my"], ["favorite", "book", "today"]] + ) + + params = cudf.Series([1, 2, 3], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([416367548, 832735096, 1249102644], dtype=np.uint32), + cudf.Series([1408797893, 2817595786, 4226393679], dtype=np.uint32), + ] + ) + actual = strings.str.minhash_ngrams(ngrams=2, seed=0, a=params, b=params) + assert_eq(expected, actual) + + params = cudf.Series([1, 2, 3], dtype=np.uint64) + expected = cudf.Series( + [ + cudf.Series( + [652146669912597278, 1304293339825194556, 1956440009737791826], + dtype=np.uint64, + ), + cudf.Series( + [1776622609581023632, 1247402209948353305, 718181810315682986], + dtype=np.uint64, + ), + ] + ) + actual = strings.str.minhash64_ngrams(ngrams=2, seed=0, a=params, b=params) + assert_eq(expected, actual) + + # test wrong input types + with pytest.raises(ValueError): + strings.str.minhash_ngrams(ngrams=7, seed=1, a="a", b="b") + with pytest.raises(ValueError): + params = cudf.Series([0, 1, 2], dtype=np.int32) + strings.str.minhash_ngrams(ngrams=6, seed=1, a=params, b=params) + with pytest.raises(ValueError): + params = cudf.Series([0, 1, 2], dtype=np.uint32) + strings.str.minhash64_ngrams(ngrams=8, seed=1, a=params, b=params) + + def test_jaccard_index(): str1 = cudf.Series(["the brown dog", "jumped about"]) str2 = cudf.Series(["the black cat", "jumped around"]) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index fd946937945..2678a4f8116 100644 --- 
a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -18,9 +18,10 @@ import cudf.api.types from cudf.core import column from cudf.core.buffer import as_buffer +from cudf.utils.dtypes import SIZE_TYPE_DTYPE # The size of the mask in bytes -mask_dtype = cudf.api.types.dtype(np.int32) +mask_dtype = SIZE_TYPE_DTYPE mask_bitsize = mask_dtype.itemsize * 8 # Mapping from ufuncs to the corresponding binary operators. diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 47de8fb1435..d3bfd9298c2 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -44,6 +44,7 @@ OOMFallbackError, TypeFallbackError, _Unusable, + as_proxy_object, is_proxy_object, ) from cudf.testing import assert_eq @@ -1979,6 +1980,93 @@ def test_numpy_data_access(): assert type(expected) is type(actual) +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame({"a": [1, 2, 3]}), + pd.Series([1, 2, 3]), + pd.Index([1, 2, 3]), + pd.Categorical([1, 2, 3]), + pd.to_datetime(["2021-01-01", "2021-01-02"]), + pd.to_timedelta(["1 days", "2 days"]), + xpd.DataFrame({"a": [1, 2, 3]}), + xpd.Series([1, 2, 3]), + xpd.Index([1, 2, 3]), + xpd.Categorical([1, 2, 3]), + xpd.to_datetime(["2021-01-01", "2021-01-02"]), + xpd.to_timedelta(["1 days", "2 days"]), + cudf.DataFrame({"a": [1, 2, 3]}), + cudf.Series([1, 2, 3]), + cudf.Index([1, 2, 3]), + cudf.Index([1, 2, 3], dtype="category"), + cudf.to_datetime(["2021-01-01", "2021-01-02"]), + cudf.Index([1, 2, 3], dtype="timedelta64[ns]"), + [1, 2, 3], + {"a": 1, "b": 2}, + (1, 2, 3), + ], +) +def test_as_proxy_object(obj): + proxy_obj = as_proxy_object(obj) + if isinstance( + obj, + ( + pd.DataFrame, + pd.Series, + pd.Index, + pd.Categorical, + xpd.DataFrame, + xpd.Series, + xpd.Index, + xpd.Categorical, + cudf.DataFrame, + cudf.Series, + cudf.Index, + ), + ): + assert is_proxy_object(proxy_obj) + if isinstance(proxy_obj, 
xpd.DataFrame): + tm.assert_frame_equal(proxy_obj, xpd.DataFrame(obj)) + elif isinstance(proxy_obj, xpd.Series): + tm.assert_series_equal(proxy_obj, xpd.Series(obj)) + elif isinstance(proxy_obj, xpd.Index): + tm.assert_index_equal(proxy_obj, xpd.Index(obj)) + else: + tm.assert_equal(proxy_obj, obj) + else: + assert not is_proxy_object(proxy_obj) + assert proxy_obj == obj + + +def test_as_proxy_object_doesnot_copy_series(): + s = pd.Series([1, 2, 3]) + proxy_obj = as_proxy_object(s) + s[0] = 10 + assert proxy_obj[0] == 10 + tm.assert_series_equal(s, proxy_obj) + + +def test_as_proxy_object_doesnot_copy_dataframe(): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + proxy_obj = as_proxy_object(df) + df.iloc[0, 0] = 10 + assert proxy_obj.iloc[0, 0] == 10 + tm.assert_frame_equal(df, proxy_obj) + + +def test_as_proxy_object_doesnot_copy_index(): + idx = pd.Index([1, 2, 3]) + proxy_obj = as_proxy_object(idx) + assert proxy_obj._fsproxy_wrapped is idx + + +def test_as_proxy_object_no_op_for_intermediates(): + s = pd.Series(["abc", "def", "ghi"]) + str_attr = s.str + proxy_obj = as_proxy_object(str_attr) + assert proxy_obj is str_attr + + def test_pickle_round_trip_proxy_numpy_array(array): arr, proxy_arr = array pickled_arr = BytesIO() diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index a7b10a6e8fa..9b798688992 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -197,7 +197,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_multiscan.py::test_include_file_paths[scan_csv-write_csv]": "Need to expose include_file_paths xref: cudf#18012", "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics[False]": "Debug output on stderr doesn't match", # Maybe flaky, order-dependent? 
- "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", } diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 6bb5d78c488..85a4f007cf0 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Datatype utilities.""" @@ -71,7 +71,9 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: ------- True if casting is supported, False otherwise """ - has_empty = from_.id() == plc.TypeId.EMPTY or to.id() == plc.TypeId.EMPTY + to_is_empty = to.id() == plc.TypeId.EMPTY + from_is_empty = from_.id() == plc.TypeId.EMPTY + has_empty = to_is_empty or from_is_empty return ( ( from_ == to @@ -84,8 +86,16 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: ) ) ) - or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to)) - or (to.id() == plc.TypeId.STRING and is_numeric_not_bool(from_)) + or ( + from_.id() == plc.TypeId.STRING + and not to_is_empty + and is_numeric_not_bool(to) + ) + or ( + to.id() == plc.TypeId.STRING + and not from_is_empty + and is_numeric_not_bool(from_) + ) ) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 9026a0c29ca..e9fc054efc2 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.20,<1.23", + "polars>=1.20,<1.24", "pylibcudf==25.4.*,>=0.0.0a0", ] # This list was generated by 
`rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 9d1e8cba425..bfbb99e8eb0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -25,3 +25,19 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const column_view &b, const size_type width, ) except + + + cdef unique_ptr[column] minhash_ngrams( + const column_view &strings, + const size_type ngrams, + const uint32_t seed, + const column_view &a, + const column_view &b, + ) except + + + cdef unique_ptr[column] minhash64_ngrams( + const column_view &strings, + const size_type ngrams, + const uint64_t seed, + const column_view &a, + const column_view &b, + ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd index 93f13a7e11f..33749141590 100644 --- a/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.vector cimport vector from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -6,22 +6,22 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil: - cdef bool is_relationally_comparable(data_type) - cdef bool is_equality_comparable(data_type) - cdef bool is_numeric(data_type) - cdef bool is_numeric_not_bool(data_type) - cdef bool is_index_type(data_type) - cdef bool is_unsigned(data_type) - cdef bool is_integral(data_type) - cdef bool is_integral_not_bool(data_type) - cdef bool is_floating_point(data_type) - cdef bool is_boolean(data_type) - cdef bool is_timestamp(data_type) - cdef bool is_fixed_point(data_type) - cdef bool is_duration(data_type) - cdef bool is_chrono(data_type) - cdef bool is_dictionary(data_type) - cdef bool is_fixed_width(data_type) - cdef bool is_compound(data_type) - cdef bool is_nested(data_type) - cdef bool is_bit_castable(data_type, data_type) + cdef bool is_relationally_comparable(data_type) except +libcudf_exception_handler + cdef bool is_equality_comparable(data_type) except +libcudf_exception_handler + cdef bool is_numeric(data_type) except +libcudf_exception_handler + cdef bool is_numeric_not_bool(data_type) except +libcudf_exception_handler + cdef bool is_index_type(data_type) except +libcudf_exception_handler + cdef bool is_unsigned(data_type) except +libcudf_exception_handler + cdef bool is_integral(data_type) except +libcudf_exception_handler + cdef bool is_integral_not_bool(data_type) except +libcudf_exception_handler + cdef bool is_floating_point(data_type) except +libcudf_exception_handler + cdef bool is_boolean(data_type) except +libcudf_exception_handler + cdef bool is_timestamp(data_type) except +libcudf_exception_handler + cdef bool is_fixed_point(data_type) except +libcudf_exception_handler + cdef bool is_duration(data_type) except +libcudf_exception_handler + cdef bool is_chrono(data_type) except 
+libcudf_exception_handler + cdef bool is_dictionary(data_type) except +libcudf_exception_handler + cdef bool is_fixed_width(data_type) except +libcudf_exception_handler + cdef bool is_compound(data_type) except +libcudf_exception_handler + cdef bool is_nested(data_type) except +libcudf_exception_handler + cdef bool is_bit_castable(data_type, data_type) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd index 0af53748cdc..f1e099ca7da 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from pylibcudf.column cimport Column @@ -24,3 +24,19 @@ cpdef Column minhash64( Column b, size_type width ) + +cpdef Column minhash_ngrams( + Column input, + size_type width, + uint32_t seed, + Column a, + Column b +) + +cpdef Column minhash64_ngrams( + Column input, + size_type width, + uint64_t seed, + Column a, + Column b +) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi index 5d88cfbbea0..bb50a150798 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.column import Column @@ -8,3 +8,9 @@ def minhash( def minhash64( input: Column, seed: int, a: Column, b: Column, width: int ) -> Column: ... +def minhash_ngrams( + input: Column, ngrams: int, seed: int, a: Column, b: Column +) -> Column: ... +def minhash64_ngrams( + input: Column, ngrams: int, seed: int, a: Column, b: Column +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 84811cda867..cdc4a4f3ac8 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr @@ -8,12 +8,16 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + minhash_ngrams as cpp_minhash_ngrams, + minhash64_ngrams as cpp_minhash64_ngrams, ) from pylibcudf.libcudf.types cimport size_type __all__ = [ "minhash", "minhash64", + "minhash_ngrams", + "minhash64_ngrams", ] cpdef Column minhash( @@ -103,3 +107,93 @@ cpdef Column minhash64( ) return Column.from_libcudf(move(c_result)) + +cpdef Column minhash_ngrams( + Column input, + size_type ngrams, + uint32_t seed, + Column a, + Column b +): + """ + Returns the minhash values for each input row of strings. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`minhash_ngrams`. + + Parameters + ---------- + input : Column + List column of strings to compute minhash + ngrams : size_type + Number of consecutive strings to hash in each row + seed : uint32_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + + Returns + ------- + Column + List column of minhash values for each row per + value in columns a and b. 
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash_ngrams( + input.view(), + ngrams, + seed, + a.view(), + b.view() + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column minhash64_ngrams( + Column input, + size_type ngrams, + uint64_t seed, + Column a, + Column b +): + """ + Returns the minhash values for each input row of strings. + This function uses MurmurHash3_x64_128 for the hash algorithm. + + For details, see :cpp:func:`minhash64_ngrams`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + ngrams : size_type + Number of consecutive strings to hash in each row + seed : uint64_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + + Returns + ------- + Column + List column of minhash values for each row per + value in columns a and b. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash64_ngrams( + input.view(), + ngrams, + seed, + a.view(), + b.view() + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index ad7a6f7a762..ff8545f0617 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
import pyarrow as pa import pytest @@ -33,3 +33,49 @@ def test_minhash(minhash_input_data, width): assert pa_result.type == pa.list_( pa.field("element", seed_type, nullable=False) ) + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def minhash_ngrams_input_data(request): + input_arr = pa.array( + [ + ["foo", "bar", "foo foo", "bar bar", "foo bar", "bar foo"], + [ + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + "ten", + "eleven", + ], + ] + ) + ab = pa.array([2, 3, 4, 5], request.param) + return input_arr, ab, request.param + + +@pytest.mark.parametrize("ngrams", [5, 10]) +def test_minhash_ngrams(minhash_ngrams_input_data, ngrams): + input_arr, ab, seed_type = minhash_ngrams_input_data + minhash_func = ( + plc.nvtext.minhash.minhash_ngrams + if seed_type == pa.uint32() + else plc.nvtext.minhash.minhash64_ngrams + ) + result = minhash_func( + plc.interop.from_arrow(input_arr), + ngrams, + 0, + plc.interop.from_arrow(ab), + plc.interop.from_arrow(ab), + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(ab) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) + )