diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 38b890893d0..2c583598f54 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -24,7 +24,6 @@ jobs: - conda-python-cudf-tests - conda-python-other-tests - conda-java-tests - - static-configure - conda-notebook-tests - docs-build - wheel-build-libcudf @@ -192,16 +191,6 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" - static-configure: - needs: checks - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 - with: - build_type: pull-request - # Use the wheel container so we can skip conda solves and since our - # primary static consumers (Spark) are not in conda anyway. - container_image: "rapidsai/ci-wheel:latest" - run_script: "ci/configure_cpp_static.sh" conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 12f6d751493..7046fd0e5dc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -168,3 +168,14 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: "ci/test_cudf_polars_polars_tests.sh" + narwhals-tests: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + with: + build_type: ${{ inputs.build_type }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + node_type: "gpu-l4-latest-1" + container_image: "rapidsai/ci-conda:latest" + run_script: ci/test_narwhals.sh diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 3d06eacf9ff..0c324d01cdf 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. set -euo pipefail @@ -18,7 +18,7 @@ rapids-logger "Begin cpp build" sccache --zero-stats # With boa installed conda build forward to boa -RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry build \ conda/recipes/libcudf sccache --show-adv-stats diff --git a/ci/build_python.sh b/ci/build_python.sh index ed90041cc77..abbdc3f3a3b 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
set -euo pipefail @@ -25,7 +25,7 @@ sccache --zero-stats # node works correctly # With boa installed conda build forwards to the boa builder -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/pylibcudf @@ -33,7 +33,7 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ sccache --show-adv-stats sccache --zero-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -42,13 +42,13 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ sccache --show-adv-stats sccache --zero-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/dask-cudf -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -56,13 +56,13 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ sccache --show-adv-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/custreamz -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh deleted file mode 100755 index 3d0647a96f6..00000000000 --- a/ci/configure_cpp_static.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
- -set -euo pipefail - -source rapids-date-string - -rapids-logger "Configure static cpp build" - -ENV_YAML_DIR="$(mktemp -d)" -REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt" - -rapids-dependency-file-generator \ - --output requirements \ - --file-key test_static_build \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}" - -rapids-pip-retry install -r "${REQUIREMENTS_FILE}" -pyenv rehash - -cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DCUDF_USE_ARROW_STATIC=ON -DBUILD_TESTS=OFF diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index e7e0cb17606..a23981b4e72 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -54,9 +54,9 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0a0 -- numba>=0.59.1,<0.61.0a0 -- numpy>=1.23,<3.0a0 +- numba-cuda>=0.4.0,<0.5.0a0 +- numba>=0.59.1,<0.62.0a0 +- numpy>=1.23,<2.1 - numpydoc - nvcc_linux-64=11.8 - nvcomp==4.2.0.11 diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index c95b8b10299..e2b9302dc36 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -53,9 +53,9 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0a0 -- numba>=0.59.1,<0.61.0a0 -- numpy>=1.23,<3.0a0 +- numba-cuda>=0.4.0,<0.5.0a0 +- numba>=0.59.1,<0.62.0a0 +- numpy>=1.23,<2.1 - numpydoc - nvcomp==4.2.0.11 - nvtx>=0.2.1 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index f817bc12c5b..43060ef1c87 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -75,9 +75,9 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.2.0,<0.3.0a0 - - numba >=0.59.1,<0.61.0a0 - - numpy >=1.23,<3.0a0 + - numba-cuda >=0.4.0,<0.5.0a0 + - numba >=0.59.1,<0.62.0a0 + - numpy >=1.23,<2.1 - pyarrow>=14.0.0,<20.0.0a0 - libcudf ={{ version }} - pylibcudf ={{ version }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 14e2f31a5a5..ae02cf8d4e5 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -73,7 +73,7 @@ requirements: - python - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - - numpy >=1.23,<3.0a0 + - numpy >=1.23,<2.1 - pyarrow>=14.0.0,<20.0.0a0 - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 594dc0de28a..494d5722ae4 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -48,8 +48,11 @@ static void bench_normalize(nvbench::state& state)
                [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); });
   } else {
     bool const to_lower = (normalize_type == "to_lower");
+    // we expect the normalizer to be created once and re-used
+    // so creating it is not measured
+    auto normalizer = nvtext::create_character_normalizer(to_lower);
     state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-      auto result = nvtext::normalize_characters(input, to_lower);
+      auto result = nvtext::normalize_characters(input, *normalizer);
     });
   }
 }
@@ -57,6 +60,6 @@ static void bench_normalize(nvbench::state& state)
 NVBENCH_BENCH(bench_normalize)
   .set_name("normalize")
   .add_int64_axis("min_width", {0})
-  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("max_width", {128, 256})
   .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_string_axis("type", {"spaces", "characters", "to_lower"});
diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp
index d276c5df7dc..8fb1f30f961 100644
--- a/cpp/include/cudf/strings/detail/utilities.hpp
+++ b/cpp/include/cudf/strings/detail/utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -96,5 +96,17 @@ int64_t get_offset_value(cudf::column_view const& offsets,
                          size_type index,
                          rmm::cuda_stream_view stream);
 
+/**
+ * @brief Return the first and last offset in the given strings column
+ *
+ * This accounts for sliced input columns as well.
+ *
+ * @param input Strings column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return First and last offset values
+ */
+std::pair<int64_t, int64_t> get_first_and_last_offset(cudf::strings_column_view const& input,
+                                                      rmm::cuda_stream_view stream);
+
 }  // namespace strings::detail
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp
index 74325f4a406..70ee7891ad7 100644
--- a/cpp/include/nvtext/normalize.hpp
+++ b/cpp/include/nvtext/normalize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 #include
@@ -107,5 +108,113 @@ std::unique_ptr<cudf::column> normalize_characters(
   rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Normalizer object to be used with nvtext::normalize_characters
+ *
+ * Use nvtext::create_character_normalizer to create this object.
+ *
+ * This normalizer includes:
+ *
+ * - adding padding around punctuation (unicode category starts with "P")
+ *   as well as certain ASCII symbols like "^" and "$"
+ * - adding padding around the [CJK Unicode block
+ *   characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block))
+ * - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "`
+ * - removing control characters (unicode categories "Cc" and "Cf")
+ *
+ * The padding process adds a single space before and after the character.
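+ *
+ * For example, `"[a,b]"` is normalized to `" [ a , b ] "` and, with
+ * `do_lower_case=true`, `"P^NP"` becomes `"p ^ np"`.
+ *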
+ * Details on _unicode category_ can be found here:
+ * https://unicodebook.readthedocs.io/unicode.html#categories
+ *
+ * If `do_lower_case = true`, lower-casing also removes any accents. The
+ * accents cannot be removed from upper-case characters without lower-casing
+ * and lower-casing cannot be performed without also removing accents.
+ * However, if the accented character is already lower-case, then only the
+ * accent is removed.
+ *
+ * If `special_tokens` are included, the padding after `[` and before `]` is not
+ * inserted if the characters between them match one of the given tokens.
+ * Also, the `special_tokens` are expected to include the `[]` characters
+ * at the beginning and end of each string.
+ */
+struct character_normalizer {
+  /**
+   * @brief Normalizer object constructor
+   *
+   * This initializes and holds the character normalizing tables and settings.
+   *
+   * @param do_lower_case If true, upper-case characters are converted to
+   *        lower-case and accents are stripped from those characters.
+   *        If false, accented and upper-case characters are not transformed.
+   * @param special_tokens Each row is a token including the `[]` brackets.
+   *        For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]`
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr Device memory resource used to allocate the returned column's device memory
+   */
+  character_normalizer(bool do_lower_case,
+                       cudf::strings_column_view const& special_tokens,
+                       rmm::cuda_stream_view stream = cudf::get_default_stream(),
+                       rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+  ~character_normalizer();
+
+  struct character_normalizer_impl;
+  std::unique_ptr<character_normalizer_impl> _impl;
+};
+
+/**
+ * @brief Create a normalizer object
+ *
+ * Creates a normalizer object which can be reused on multiple calls to
+ * nvtext::normalize_characters
+ *
+ * @see nvtext::character_normalizer
+ *
+ * @param do_lower_case If true, upper-case characters are converted to
+ *        lower-case and accents are stripped from those characters.
+ *        If false, accented and upper-case characters are not transformed.
+ * @param special_tokens Individual tokens including `[]` brackets.
+ *        Default is no special tokens.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Object to be used with nvtext::normalize_characters
+ */
+std::unique_ptr<character_normalizer> create_character_normalizer(
+  bool do_lower_case,
+  cudf::strings_column_view const& special_tokens = cudf::strings_column_view(cudf::column_view{
+    cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0}),
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
+/**
+ * @brief Normalizes the text in the input strings column
+ *
+ * @see nvtext::character_normalizer for details on the normalizer behavior
+ *
+ * @code{.pseudo}
+ * cn = create_character_normalizer(true)
+ * s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"]
+ * s1 = normalize_characters(s,cn)
+ * s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "]
+ *
+ * cn = create_character_normalizer(false)
+ * s2 = normalize_characters(s,cn)
+ * s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "]
+ * @endcode
+ *
+ * A null input element at row `i` produces a corresponding null entry
+ * for row `i` in the output column.
+ *
+ * @param input The input strings to normalize
+ * @param normalizer Normalizer to use for this function
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Memory resource to allocate any returned objects
+ * @return Normalized strings column
+ */
+std::unique_ptr<cudf::column> normalize_characters(
+  cudf::strings_column_view const& input,
+  character_normalizer const& normalizer,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index 45bd4615435..c5d46598d4a 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -180,6 +180,18 @@ int64_t get_offset_value(cudf::column_view const& offsets,
                            : cudf::detail::get_value(offsets, index, stream);
 }
 
+std::pair<int64_t, int64_t> get_first_and_last_offset(cudf::strings_column_view const& input,
+                                                      rmm::cuda_stream_view stream)
+{
+  if (input.is_empty()) { return {0L, 0L}; }
+  auto const first_offset = (input.offset() == 0) ? 0
+                                                  : cudf::strings::detail::get_offset_value(
+                                                      input.offsets(), input.offset(), stream);
+  auto const last_offset =
+    cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream);
+  return {first_offset, last_offset};
+}
+
 }  // namespace detail
 
 rmm::device_uvector<string_view> create_string_vector_from_column(
diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu
index 7e2b766862d..0e680e98ec5 100644
--- a/cpp/src/text/normalize.cu
+++ b/cpp/src/text/normalize.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
 * limitations under the License.
 */
 
+#include "text/normalize.cuh"
 #include "text/subword/detail/data_normalizer.hpp"
 #include "text/subword/detail/tokenizer_utils.cuh"
 #include "text/utilities/tokenize_ops.cuh"
@@ -22,10 +23,11 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -38,9 +40,13 @@
 #include
 
+#include
+#include
+#include
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -103,6 +109,12 @@ constexpr uint32_t UTF8_1BYTE = 0x0080;
 constexpr uint32_t UTF8_2BYTE = 0x0800;
 constexpr uint32_t UTF8_3BYTE = 0x01'0000;
 
+__device__ int8_t cp_to_utf8(uint32_t codepoint, char* out)
+{
+  auto utf8 = cudf::strings::detail::codepoint_to_utf8(codepoint);
+  return cudf::strings::detail::from_char_utf8(utf8, out);
+}
+
 /**
 * @brief Convert code-point arrays into UTF-8 bytes for each string.
*/ @@ -148,26 +160,8 @@ struct codepoint_to_utf8_fn { // convert each code-point to 1-4 UTF-8 encoded bytes char* out_ptr = d_chars + d_offsets[idx]; for (uint32_t jdx = 0; jdx < count; ++jdx) { - uint32_t code_point = *str_cps++; - if (code_point < UTF8_1BYTE) // ASCII range - *out_ptr++ = static_cast(code_point); - else if (code_point < UTF8_2BYTE) { // create two-byte UTF-8 - // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 2) & 0x00'1F00) | 0x00'C000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else if (code_point < UTF8_3BYTE) { // create three-byte UTF-8 - // bxxxxxxxx:byyyyyyyy => b1110xxxx:b10xxxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 4) & 0x0F'0000) | 0x00E0'0000) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00) | 0x00'8000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else { // create four-byte UTF-8 - // maximum code-point value is 0x0011'0000 - // b000xxxxx:byyyyyyyy:bzzzzzzzz => b11110xxx:b10xxyyyy:b10yyyyzz:b10zzzzzz - *out_ptr++ = static_cast((((code_point << 6) & 0x0700'0000u) | 0xF000'0000u) >> 24); - *out_ptr++ = static_cast((((code_point << 4) & 0x003F'0000u) | 0x0080'0000u) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00u) | 0x00'8000u) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } + uint32_t codepoint = *str_cps++; + out_ptr += cp_to_utf8(codepoint, out_ptr); } } }; @@ -261,4 +255,361 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con return detail::normalize_characters(input, do_lower_case, stream, mr); } +struct character_normalizer::character_normalizer_impl { + rmm::device_uvector cp_metadata; + rmm::device_uvector aux_table; + bool do_lower_case; + std::unique_ptr special_tokens; + rmm::device_uvector special_tokens_view; + + cudf::device_span get_special_tokens() const + { + return special_tokens_view; + } + + character_normalizer_impl(rmm::device_uvector&& cp_metadata, + rmm::device_uvector&& aux_table, + bool do_lower_case, + std::unique_ptr&& special_tokens, + rmm::device_uvector&& special_tokens_view) + : cp_metadata(std::move(cp_metadata)), + aux_table(std::move(aux_table)), + do_lower_case{do_lower_case}, + special_tokens{std::move(special_tokens)}, + special_tokens_view{std::move(special_tokens_view)} + { + } +}; + +character_normalizer::character_normalizer(bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref) +{ + auto cp_metadata = nvtext::detail::get_codepoint_metadata(stream); + auto aux_table = nvtext::detail::get_aux_codepoint_data(stream); + CUDF_EXPECTS( + !special_tokens.has_nulls(), "special tokens should not have nulls", std::invalid_argument); + + auto sorted = std::move( + cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); + if (do_lower_case) { + // lower-case the tokens so they will match the normalized input + sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); + } + + auto tokens_view = cudf::strings::detail::create_string_vector_from_column( + cudf::strings_column_view(sorted->view()), stream, cudf::get_current_device_resource_ref()); + + _impl = std::make_unique(std::move(cp_metadata), + std::move(aux_table), + do_lower_case, + std::move(sorted), + std::move(tokens_view)); +} + +character_normalizer::~character_normalizer() {} + +std::unique_ptr create_character_normalizer( + bool do_lower_case, + 
cudf::strings_column_view const& special_tokens,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return std::make_unique<character_normalizer>(do_lower_case, special_tokens, stream, mr);
+}
+
+namespace detail {
+namespace {
+
+/**
+ * @brief Kernel handles fixing up the normalized data to account for any special tokens
+ *
+ * This undoes the padding added around the `[]` for patterns matching the strings in the
+ * special_tokens array.
+ *
+ * Launched as a thread per input byte (total_count).
+ *
+ * @param d_normalized The normalized set of UTF-8 characters; 3 uints per input byte
+ * @param total_count Number of bytes represented by d_normalized; len(d_normalized)/3
+ * @param special_tokens Tokens to check against
+ */
+CUDF_KERNEL void special_tokens_kernel(uint32_t* d_normalized,
+                                       int64_t total_count,
+                                       cudf::device_span<cudf::string_view const> special_tokens)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= total_count) { return; }
+  auto const begin = d_normalized + (idx * MAX_NEW_CHARS) + 1;
+  if (*begin != '[') { return; }
+  auto const end   = begin + cuda::std::min(6L, total_count - idx) * MAX_NEW_CHARS;
+  auto const match = thrust::find(thrust::seq, begin, end, static_cast<uint32_t>(']'));
+  if (match == end) { return; }
+  char candidate[8];
+  auto const ch_begin =
+    thrust::transform_iterator(begin, [](auto v) { return static_cast<char>(v); });
+  auto const ch_end = ch_begin + thrust::distance(begin, match + 1);
+  auto last         = thrust::copy_if(
+    thrust::seq, ch_begin, ch_end, candidate, [](auto c) { return c != 0 && c != ' '; });
+  *last = 0;  // only needed for debug
+
+  auto const size  = static_cast<cudf::size_type>(thrust::distance(candidate, last));
+  auto const token = cudf::string_view(candidate, size);
+  // the binary_search expects the special_tokens to be sorted
+  if (!thrust::binary_search(thrust::seq, special_tokens.begin(), special_tokens.end(), token)) {
+    return;
+  }
+
+  // fix up chars to remove the extra spaces
+  *(begin + 1) = 0;  // removes space after '['
+  *(match - 1) = 0;  // removes space before ']'
+}
+
+/**
+ * @brief The normalizer kernel
+ *
+ * Launched as a thread per input byte (total_bytes).
+ *
+ * Converts the input d_chars into codepoints to lookup in the provided tables.
+ * Once processed, the d_output contains 3 uints per input byte each encoded
+ * as output UTF-8. Any zero values are removed by a subsequent kernel call.
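+ *
+ * Bytes that are not the start of a UTF-8 character and characters that are
+ * removed produce all-zero entries, while padded characters use all three
+ * values (space, character, space).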
+ * + * @param d_chars The characters for the input strings column to normalize + * @param total_bytes The number of bytes in the d_chars + * @param cp_metadata First lookup table for codepoint metadata + * @param aux_table Second lookup table containing possible replacement characters + * @param do_lower_case True if the normalization includes lower-casing characters + * @param d_output The output of the normalization (UTF-8 encoded) + */ +CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, + int64_t total_bytes, + codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case, + uint32_t* d_output) +{ + uint32_t replacement[MAX_NEW_CHARS] = {0}; + + auto const idx = cudf::detail::grid_1d::global_thread_id(); + + if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { + auto const cp = [utf8 = d_chars + idx] { + cudf::char_utf8 ch_utf8 = *utf8; + if (ch_utf8 > 0x7F) { cudf::strings::detail::to_char_utf8(utf8, ch_utf8); } + return cudf::strings::detail::utf8_to_codepoint(ch_utf8); + }(); + auto const metadata = cp_metadata[cp]; + + if (!should_remove_cp(metadata, do_lower_case)) { + int8_t num_new_chars = 1; + // retrieve the normalized value for cp + auto const new_cp = do_lower_case || always_replace(metadata) ? get_first_cp(metadata) : cp; + replacement[0] = new_cp == 0 ? cp : new_cp; + + if (do_lower_case && is_multi_char_transform(metadata)) { + auto const next_cps = aux_table[cp]; + replacement[1] = static_cast(next_cps >> 32); + replacement[2] = static_cast(next_cps & 0xFFFFFFFF); + num_new_chars = 2 + (replacement[2] != 0); + } + + if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { + replacement[1] = replacement[0]; + replacement[0] = SPACE_CODE_POINT; // add spaces around the new codepoint + replacement[2] = SPACE_CODE_POINT; + num_new_chars = 3; + } + + // convert codepoints back to UTF-8 in-place + for (int k = 0; k < num_new_chars; ++k) { + auto const new_cp = replacement[k]; + if (new_cp) { cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); } + } + } + } + + // employ an optimized coalesced writer to output replacement as a block of transposed data + using block_store = + cub::BlockStore; + __shared__ typename block_store::TempStorage bs_stg; + auto block_base = d_output + blockIdx.x * blockDim.x * MAX_NEW_CHARS; + block_store(bs_stg).Store(block_base, replacement); +} + +/** + * @brief Computes the output sizes for each row + * + * The input offsets are used with segmented-reduce to count the number of + * non-zero values for each output row. 
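+ * For example, an input row covering bytes `[i, j)` of the chars data is
+ * reduced over `d_normalized[i * MAX_NEW_CHARS, j * MAX_NEW_CHARS)` by
+ * counting the non-zero bytes.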
+ *
+ * @param d_normalized The UTF-8 encoded normalized values
+ * @param offsets These identify the row boundaries
+ * @param offset Only non-zero if the input column has been sliced
+ * @param size The number of output rows (same as the number of input rows)
+ * @param stream Stream used for allocating device memory and launching kernels
+ * @return The sizes of each output row
+ */
+template <typename OffsetType>
+rmm::device_uvector<cudf::size_type> compute_sizes(cudf::device_span<uint32_t const> d_normalized,
+                                                   OffsetType offsets,
+                                                   int64_t offset,
+                                                   cudf::size_type size,
+                                                   rmm::cuda_stream_view stream)
+{
+  auto output_sizes = rmm::device_uvector<cudf::size_type>(size, stream);
+
+  auto d_data = d_normalized.data();
+
+  // counts the non-zero bytes in the d_data array
+  auto d_in = cudf::detail::make_counting_transform_iterator(
+    0, cuda::proclaim_return_type<cudf::size_type>([d_data] __device__(auto idx) {
+      idx = idx * MAX_NEW_CHARS;
+      // transform function counts number of non-zero bytes in uint32_t value
+      auto tfn = [](uint32_t v) -> cudf::size_type {
+        return ((v & 0xFF) > 0) + ((v & 0xFF00) > 0) + ((v & 0xFF0000) > 0) +
+               ((v & 0xFF000000) > 0);
+      };
+      auto const begin = d_data + idx;
+      auto const end   = begin + MAX_NEW_CHARS;
+      return thrust::transform_reduce(thrust::seq, begin, end, tfn, 0, thrust::plus<cudf::size_type>{});
+    }));
+
+  // DeviceSegmentedReduce is used to compute the size of each output row
+  auto d_out = output_sizes.begin();
+  auto temp  = std::size_t{0};
+  if (offset == 0) {
+    cub::DeviceSegmentedReduce::Sum(
+      nullptr, temp, d_in, d_out, size, offsets, offsets + 1, stream.value());
+    auto d_temp = rmm::device_buffer{temp, stream};
+    cub::DeviceSegmentedReduce::Sum(
+      d_temp.data(), temp, d_in, d_out, size, offsets, offsets + 1, stream.value());
+  } else {
+    // offsets need to be normalized for segmented-reduce to work efficiently
+    auto offsets_itr = thrust::transform_iterator(
+      offsets,
+      cuda::proclaim_return_type<int64_t>([offset] __device__(auto o) { return o - offset; }));
+    cub::DeviceSegmentedReduce::Sum(
+      nullptr, temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value());
+    auto d_temp = rmm::device_buffer{temp, stream};
+    cub::DeviceSegmentedReduce::Sum(
+      d_temp.data(), temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value());
+  }
+
+  return output_sizes;
+}
+
+// handles ranges above int32 max
+template <typename InputIterator, typename OutputIterator, typename T>
+OutputIterator remove_copy_safe(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T const& value,
+                                rmm::cuda_stream_view stream)
+{
+  auto const copy_size = std::min(static_cast<int64_t>(std::distance(first, last)),
+                                  static_cast<int64_t>(std::numeric_limits<int32_t>::max()));
+
+  auto itr = first;
+  while (itr != last) {
+    auto const copy_end =
+      static_cast<int64_t>(std::distance(itr, last)) <= copy_size ? last : itr + copy_size;
+    result = thrust::remove_copy(rmm::exec_policy(stream), itr, copy_end, result, value);
+    itr    = copy_end;
+  }
+  return result;
+}
+
+// handles ranges above int32 max
+template <typename Iterator, typename T>
+Iterator remove_safe(Iterator first, Iterator last, T const& value, rmm::cuda_stream_view stream)
+{
+  auto const size = std::min(static_cast<int64_t>(std::distance(first, last)),
+                             static_cast<int64_t>(std::numeric_limits<int32_t>::max()));
+
+  auto result = first;
+  auto itr    = first;
+  while (itr != last) {
+    auto end = static_cast<int64_t>(std::distance(itr, last)) <= size ?
last : itr + size; + result = thrust::remove(rmm::exec_policy(stream), itr, end, value); + itr = end; + } + return result; +} +} // namespace + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + if (input.is_empty()) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } + + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars_begin(stream) + first_offset; + + if (chars_size == 0) { return std::make_unique(input.parent(), stream, mr); } + + constexpr int64_t block_size = 256; + cudf::detail::grid_1d grid{chars_size, block_size}; + auto const max_new_char_total = cudf::util::round_up_safe(chars_size, block_size) * MAX_NEW_CHARS; + + auto const& parameters = normalizer._impl; + + auto d_normalized = rmm::device_uvector(max_new_char_total, stream); + data_normalizer_kernel<<>>( + d_input_chars, + chars_size, + parameters->cp_metadata.data(), + parameters->aux_table.data(), + parameters->do_lower_case, + d_normalized.data()); + + // This removes space added around any special tokens in the form of [ttt]. + // An alternate approach is to do a multi-replace of '[ ttt ]' with '[ttt]' right + // before returning the output strings column. + auto const special_tokens = parameters->get_special_tokens(); + if (!special_tokens.empty()) { + special_tokens_kernel<<>>( + d_normalized.data(), chars_size, special_tokens); + } + + // Use segmented-reduce over the non-zero codepoints to get the size of the output rows + auto const input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + auto output_sizes = + compute_sizes(d_normalized, input_offsets, first_offset, input.size(), stream); + + // convert the sizes to offsets + auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( + output_sizes.begin(), output_sizes.end(), stream, mr); + + // create output chars by calling remove_copy(0) on the bytes in d_normalized + auto chars = rmm::device_uvector(total_size, stream, mr); + auto const begin = reinterpret_cast(d_normalized.begin()); + // the remove() above speeds up the remove_copy() by roughly 10% + auto const end = + reinterpret_cast(remove_safe(d_normalized.begin(), d_normalized.end(), 0, stream)); + remove_copy_safe(begin, end, chars.data(), 0, stream); + + return cudf::make_strings_column(input.size(), + std::move(offsets), + chars.release(), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); +} + +} // namespace detail + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::normalize_characters(input, normalizer, stream, mr); +} + } // namespace nvtext diff --git a/cpp/src/text/normalize.cuh b/cpp/src/text/normalize.cuh new file mode 100644 index 00000000000..3972726d536 --- /dev/null +++ b/cpp/src/text/normalize.cuh @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "text/subword/detail/cp_data.h" + +namespace nvtext { +namespace detail { + +/** + * @brief Bit used to filter out invalid code points. + * + * When normalizing characters to code point values, if this bit is set, + * the code point should be filtered out before returning from the normalizer. + */ +constexpr uint32_t FILTER_BIT = 22; + +/** + * @brief Retrieve new code point from metadata value. + * + * @param metadata Value from the codepoint_metadata table. + * @return The replacement character if appropriate. + */ +__device__ constexpr uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } + +/** + * @brief Retrieve token category from the metadata value. + * + * Category values are 0-5: + * 0 - character should be padded + * 1 - pad character if lower-case + * 2 - character should be removed + * 3 - remove character if lower-case + * 4 - whitespace character -- always replace + * 5 - uncategorized + * + * @param metadata Value from the codepoint_metadata table. + * @return Category value. + */ +__device__ constexpr uint32_t extract_token_cat(uint32_t metadata) +{ + return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; +} + +/** + * @brief Return true if category of metadata value specifies the character should be replaced. + */ +__device__ constexpr bool should_remove_cp(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be padded. + */ +__device__ constexpr bool should_add_spaces(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be replaced. + */ +__device__ constexpr bool always_replace(uint32_t metadata) +{ + return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; +} + +/** + * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. + */ +__device__ constexpr bool is_multi_char_transform(uint32_t metadata) +{ + return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; +} + +/** + * @brief Returns true if the byte passed in could be a valid head byte for + * a utf8 character. That is, not binary `10xxxxxx` + */ +__device__ constexpr bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } + +} // namespace detail +} // namespace nvtext diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 7a39199011e..4c54409c41a 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" @@ -38,81 +39,6 @@ namespace nvtext { namespace detail { namespace { -/** - * @brief Bit used to filter out invalid code points. - * - * When normalizing characters to code point values, if this bit is set, - * the code point should be filtered out before returning from the normalizer. - */ -constexpr uint32_t FILTER_BIT = 22; - -/** - * @brief Retrieve new code point from metadata value. - * - * @param metadata Value from the codepoint_metadata table. - * @return The replacement character if appropriate. - */ -__device__ uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } - -/** - * @brief Retrieve token category from the metadata value. - * - * Category values are 0-5: - * 0 - character should be padded - * 1 - pad character if lower-case - * 2 - character should be removed - * 3 - remove character if lower-case - * 4 - whitespace character -- always replace - * 5 - uncategorized - * - * @param metadata Value from the codepoint_metadata table. - * @return Category value. - */ -__device__ uint32_t extract_token_cat(uint32_t metadata) -{ - return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. - */ -__device__ bool should_remove_cp(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be padded. - */ -__device__ bool should_add_spaces(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. - */ -__device__ bool always_replace(uint32_t metadata) -{ - return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; -} - -/** - * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. - */ -__device__ bool is_multi_char_transform(uint32_t metadata) -{ - return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; -} - -/** - * @brief Returns true if the byte passed in could be a valid head byte for - * a utf8 character. That is, not binary `10xxxxxx` - */ -__device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } - /** * @brief Converts a UTF-8 character into a unicode code point value. * diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 2515cc917fa..530148eb654 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -74,6 +74,10 @@ TEST_F(TextNormalizeTest, NormalizeEmptyTest) EXPECT_EQ(results->size(), 0); results = nvtext::normalize_characters(strings_view, false); EXPECT_EQ(results->size(), 0); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + EXPECT_EQ(results->size(), 0); } TEST_F(TextNormalizeTest, AllNullStrings) @@ -84,6 +88,10 @@ TEST_F(TextNormalizeTest, AllNullStrings) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); results = nvtext::normalize_characters(strings_view, false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } TEST_F(TextNormalizeTest, SomeNullStrings) @@ -93,27 +101,21 @@ TEST_F(TextNormalizeTest, SomeNullStrings) auto results = nvtext::normalize_characters(strings_view, false); cudf::test::strings_column_wrapper expected({"", " . ", "a"}, {false, true, true}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, NormalizeCharacters) { // These include punctuation, accents, whitespace, and CJK characters - std::vector h_strings{"abc£def", - nullptr, - "éè â îô\taeio", - "\tĂĆĖÑ Ü", - "ACEN U", - "P^NP", - "$41.07", - "[a,b]", - "丏丟", - ""}; - auto validity = - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); - cudf::strings_column_view strings_view(strings); + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto sv = cudf::strings_column_view(input); { - auto results = nvtext::normalize_characters(strings_view, true); + auto results = nvtext::normalize_characters(sv, true); cudf::test::strings_column_wrapper expected({"abc£def", "", "ee a io aeio", @@ -124,11 +126,11 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = nvtext::normalize_characters(strings_view, false); + auto results = nvtext::normalize_characters(sv, false); cudf::test::strings_column_wrapper expected({"abc£def", "", "éè â îô aeio", @@ -139,11 +141,117 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } +TEST_F(TextNormalizeTest, WithNormalizer) +{ + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + // the following include punctuation, accents, whitespace, and CJK characters + auto input = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô\taeio", + "\tĂĆĖÑ Ü", + "ACEN U", + "P^NP", + "$41.07", + "[a,b]", + "丏丟", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto const sv = cudf::strings_column_view(input); + + auto normalizer = nvtext::create_character_normalizer(true); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = 
cudf::test::strings_column_wrapper({"abc£def", + "", + "ee a io aeio", + " acen u", + "acen u", + "p ^ np", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // test normalizer re-use + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false); + results = nvtext::normalize_characters(sv, *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô aeio", + " ĂĆĖÑ Ü", + "ACEN U", + "P ^ NP", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(TextNormalizeTest, SpecialTokens) +{ + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + auto input = + cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", + "[bos]these should[sep]work too[eos]", + "some[non]tokens[eol]too", + long_row, + long_row, + long_row}); + + auto sv = cudf::strings_column_view(input); + auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); + auto stv = cudf::strings_column_view(special_tokens); + + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = cudf::test::strings_column_wrapper( + {" [bos] some strings with [pad] special [sep] tokens [eos] ", + " [bos] these should [sep] work too [eos] ", + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false, stv); + results = nvtext::normalize_characters(sv, *normalizer); + expected = cudf::test::strings_column_wrapper( + {" [BOS] Some strings with [PAD] special [SEP] tokens [EOS] ", + " [ bos ] these should [ sep ] work too [ eos ] ", + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(TextNormalizeTest, NormalizeSlicedColumn) { cudf::test::strings_column_wrapper strings( @@ -151,10 +259,21 @@ TEST_F(TextNormalizeTest, NormalizeSlicedColumn) std::vector sliced = cudf::split(strings, {4}); auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), true); - cudf::test::strings_column_wrapper expected({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + auto expected = + cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); + expected = cudf::test::strings_column_wrapper({" $ 41 . 
07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); - cudf::test::strings_column_wrapper expected2({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); + normalizer = nvtext::create_character_normalizer(false); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), *normalizer); + expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } diff --git a/dependencies.yaml b/dependencies.yaml index c5216a9cb1f..1578dadc793 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -723,7 +723,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - - &numpy numpy>=1.23,<3.0a0 + - &numpy numpy>=1.23,<2.1 - pandas>=2.0,<2.2.4dev0 run_pylibcudf: common: @@ -753,8 +753,8 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba-cuda-dep numba-cuda>=0.2.0,<0.3.0a0 - - &numba-dep numba>=0.59.1,<0.61.0a0 + - &numba-cuda-dep numba-cuda>=0.4.0,<0.5.0a0 + - &numba-dep numba>=0.59.1,<0.62.0a0 - nvtx>=0.2.1 - packaging - rich @@ -885,7 +885,8 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - - numba-cuda==0.2.0 + - numba-cuda==0.4.0 + - numba==0.59.1 - pandas==2.0.* - matrix: {dependencies: "latest"} packages: diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 2a17bc5dbb7..090e475471d 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -37,7 +37,3 @@ rapids_cython_init() add_subdirectory(cudf/_lib) add_subdirectory(udf_cpp) - -if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}") -endif() diff --git a/python/cudf/cudf/core/character_normalizer.py b/python/cudf/cudf/core/character_normalizer.py new file mode 100644 index 00000000000..1240c0e1eb7 --- /dev/null +++ b/python/cudf/cudf/core/character_normalizer.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from __future__ import annotations + +import pylibcudf as plc + +import cudf + + +class CharacterNormalizer: + """ + A normalizer object used to normalize input text. + + Parameters + ---------- + do_lower : bool + If True, the normalizer should also lower-case + while normalizing. + special_tokens : cudf.Series + Series of special tokens. + """ + + def __init__( + self, + do_lower: bool, + special_tokens: cudf.Series = cudf.Series([], dtype="object"), + ) -> None: + self.normalizer = plc.nvtext.normalize.CharacterNormalizer( + do_lower, special_tokens._column.to_pylibcudf(mode="read") + ) + + def normalize(self, text: cudf.Series) -> cudf.Series: + """ + Parameters + ---------- + text : cudf.Series + The strings to be normalized. 
+ + Returns + ------- + cudf.Series + Normalized strings + """ + result = text._column.normalize_characters(self.normalizer) + + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 04a72017c33..c0ad33ec7d6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4679,8 +4679,10 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" Normalizes strings characters for tokenizing. - This uses the normalizer that is built into the - subword_tokenize function which includes: + .. deprecated:: 25.04 + Use `CharacterNormalizer` instead. + + The normalizer function includes: - adding padding around punctuation (unicode category starts with "P") as well as certain ASCII symbols like "^" and "$" @@ -4720,8 +4722,13 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: 2 $ 99 dtype: object """ + warnings.warn( + "normalize_characters is deprecated and will be removed in a future " + "version. Use CharacterNormalizer instead.", + FutureWarning, + ) return self._return_or_inplace( - self._column.normalize_characters(do_lower) + self._column.characters_normalize(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -6256,14 +6263,25 @@ def normalize_spaces(self) -> Self: ) @acquire_spill_lock() - def normalize_characters(self, do_lower: bool = True) -> Self: + def characters_normalize(self, do_lower: bool = True) -> Self: return ColumnBase.from_pylibcudf( # type: ignore[return-value] - plc.nvtext.normalize.normalize_characters( + plc.nvtext.normalize.characters_normalize( self.to_pylibcudf(mode="read"), do_lower, ) ) + @acquire_spill_lock() + def normalize_characters( + self, normalizer: plc.nvtext.normalize.CharacterNormalizer + ) -> Self: + return ColumnBase.from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + normalizer, + ) + ) + @acquire_spill_lock() def replace_tokens( self, targets: Self, replacements: Self, delimiter: plc.Scalar diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9c48b31a309..211d161696e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3308,9 +3308,13 @@ def _split(self, splits, keep_index: bool = True) -> list[Self]: splits, ) + @acquire_spill_lock() + def split_from_pylibcudf(split: list[plc.Column]) -> list[ColumnBase]: + return [ColumnBase.from_pylibcudf(col) for col in split] + return [ self._from_columns_like_self( - [ColumnBase.from_pylibcudf(col) for col in split], + split_from_pylibcudf(split), self._column_names, self.index.names if keep_index else None, ) diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 13d98e43ddc..08226dd7f6d 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
from __future__ import annotations import contextlib @@ -784,3 +784,12 @@ def test_spilling_and_copy_on_write(manager: SpillManager): assert not a.is_spilled assert a.owner.exposed assert not b.owner.exposed + + +def test_scatter_by_map(): + data = range(10) + with cudf.option_context("spill", True): + df = cudf.DataFrame(data) + result = df.scatter_by_map(data) + for i, res in zip(data, result): + assert_eq(res, cudf.DataFrame([i], index=[i])) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 86e1e46c1a2..dc45827d2e8 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -8,6 +8,7 @@ import cudf from cudf.core.byte_pair_encoding import BytePairEncoder +from cudf.core.character_normalizer import CharacterNormalizer from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing import assert_eq @@ -251,7 +252,8 @@ def test_normalize_characters(): ] ) - actual = strings.str.normalize_characters() + normalizer_lower = CharacterNormalizer(True) + actual = normalizer_lower.normalize(strings.str) assert type(expected) is type(actual) assert_eq(expected, actual) @@ -265,7 +267,9 @@ def test_normalize_characters(): "Stock ^ $ 1", ] ) - actual = strings.str.normalize_characters(do_lower=False) + + normalizer = CharacterNormalizer(False) + actual = normalizer.normalize(strings.str) assert type(expected) is type(actual) assert_eq(expected, actual) diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 16cd97677ef..8b8abe90ac9 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -24,9 +24,9 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.4.*,>=0.0.0a0", - "numba-cuda>=0.2.0,<0.3.0a0", - "numba>=0.59.1,<0.61.0a0", - "numpy>=1.23,<3.0a0", + "numba-cuda>=0.4.0,<0.5.0a0", + "numba>=0.59.1,<0.62.0a0", + "numpy>=1.23,<2.1", "nvtx>=0.2.1", "packaging", "pandas>=2.0,<2.2.4dev0", diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt index 3e12eb6aa41..13b859bc33b 100644 --- a/python/cudf_kafka/CMakeLists.txt +++ b/python/cudf_kafka/CMakeLists.txt @@ -35,7 +35,3 @@ include(rapids-cython-core) rapids_cython_init() add_subdirectory(cudf_kafka/_lib) - -if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET cudf_kafka PATHS "${cython_lib_dir}") -endif() diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index f8c0a7d9d4b..e9fc054efc2 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -35,7 +35,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "numpy>=1.23,<3.0a0", + "numpy>=1.23,<2.1", "pytest-cov", "pytest-xdist", "pytest<8", diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 9c58a24c065..8ff0db084b1 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -1,9 +1,7 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import os - import pytest import polars as pl @@ -203,8 +201,11 @@ def test_scan_csv_multi(tmp_path, filename, glob, nrows_skiprows): f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test*.csv").open("w") as f: f.write("""foo,bar,baz\n1,2,3\n3,4,5""") - os.chdir(tmp_path) - q = pl.scan_csv(filename, glob=glob, n_rows=n_rows, skip_rows=skiprows) + if isinstance(filename, list): + source = [tmp_path / fn for fn in filename] + else: + source = tmp_path / filename + q = pl.scan_csv(source, glob=glob, n_rows=n_rows, skip_rows=skiprows) assert_gpu_result_equal(q) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py index c0792663c7e..c0b9d71653c 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -434,18 +434,12 @@ def set_object_dtypes_from_pa_schema(df, schema): # pyarrow schema. if schema: for col_name, col in df._data.items(): - if col_name is None: - # Pyarrow cannot handle `None` as a field name. - # However, this should be a simple range index that - # we can ignore anyway - continue - typ = cudf_dtype_from_pa_type(schema.field(col_name).type) - if ( - col_name in schema.names - and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype)) - and isinstance(col, cudf.core.column.StringColumn) - ): - df._data[col_name] = col.astype(typ) + if col_name in schema.names: + typ = cudf_dtype_from_pa_type(schema.field(col_name).type) + if not isinstance( + typ, (cudf.ListDtype, cudf.StructDtype) + ) and isinstance(col, cudf.core.column.StringColumn): + df._data[col_name] = col.astype(typ) to_parquet = dd.to_parquet diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 9f7031f4d2a..3a88668e6d2 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import dask @@ -486,6 +487,52 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): dd.assert_eq(ddf1.compute(), ddf2.compute()) +@pytest.mark.parametrize("specify_schema", [True, False]) +def test_read_inconsistent_schema(tmpdir, specify_schema): + if specify_schema: + # If we specify the expected schema, + # we also need to specify the partitioning. 
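+        # Setting partitioning=None below keeps pyarrow from inferring
+        # hive partitioning from the file paths.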
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
index c0792663c7e..c0b9d71653c 100644
--- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
@@ -434,18 +434,12 @@ def set_object_dtypes_from_pa_schema(df, schema):
     # pyarrow schema.
     if schema:
         for col_name, col in df._data.items():
-            if col_name is None:
-                # Pyarrow cannot handle `None` as a field name.
-                # However, this should be a simple range index that
-                # we can ignore anyway
-                continue
-            typ = cudf_dtype_from_pa_type(schema.field(col_name).type)
-            if (
-                col_name in schema.names
-                and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype))
-                and isinstance(col, cudf.core.column.StringColumn)
-            ):
-                df._data[col_name] = col.astype(typ)
+            if col_name in schema.names:
+                typ = cudf_dtype_from_pa_type(schema.field(col_name).type)
+                if not isinstance(
+                    typ, (cudf.ListDtype, cudf.StructDtype)
+                ) and isinstance(col, cudf.core.column.StringColumn):
+                    df._data[col_name] = col.astype(typ)
 
 
 to_parquet = dd.to_parquet
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 9f7031f4d2a..3a88668e6d2 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 import pytest
 
 import dask
@@ -486,6 +487,52 @@ def test_create_metadata_file_inconsistent_schema(tmpdir):
     dd.assert_eq(ddf1.compute(), ddf2.compute())
 
 
+@pytest.mark.parametrize("specify_schema", [True, False])
+def test_read_inconsistent_schema(tmpdir, specify_schema):
+    if specify_schema:
+        # If we specify the expected schema,
+        # we also need to specify the partitioning.
+        kwargs = {
+            "dataset": {
+                "schema": pa.schema(
+                    [
+                        ("id", pa.int64()),
+                        ("text", pa.string()),
+                        ("meta1", pa.struct([("field1", pa.string())])),
+                    ]
+                ),
+                "partitioning": None,
+            },
+        }
+    else:
+        kwargs = {}
+
+    records = [
+        {"id": 123, "text": "foo"},
+        {
+            "text": "bar",
+            "meta1": [{"field1": "cat"}],
+            "id": 456,
+        },
+    ]
+    columns = ["text", "id"]
+    pd.DataFrame(records[:1]).to_parquet(tmpdir / "part.0.parquet")
+    pd.DataFrame(records[1:]).to_parquet(tmpdir / "part.1.parquet")
+    # Check that cuDF and Dask cuDF match
+    dd.assert_eq(
+        cudf.read_parquet(
+            tmpdir, columns=columns, allow_mismatched_pq_schemas=True
+        ),
+        dask_cudf.read_parquet(tmpdir, columns=columns, **kwargs),
+        check_index=False,
+    )
+    # Check that "pandas" and "cudf" backends match
+    dd.assert_eq(
+        dd.read_parquet(tmpdir, columns=columns),
+        dask_cudf.read_parquet(tmpdir, columns=columns, **kwargs),
+    )
+
+
 @pytest.mark.parametrize(
     "data",
     [
@@ -526,7 +573,6 @@ def test_cudf_list_struct_write(tmpdir):
 
 
 def test_null_partition(tmpdir):
-    import pyarrow as pa
     from pyarrow.dataset import HivePartitioning
 
     ids = pd.Series([0, 1, None], dtype="Int64")
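The new test exercises two ways of coping with parquet files whose schemas differ from file to file: cuDF's reader takes `allow_mismatched_pq_schemas=True`, while Dask cuDF can be handed an explicit pyarrow schema through the `dataset` kwargs, in which case `partitioning` must be pinned to `None` so pyarrow does not also try to infer one. A hedged sketch of the Dask cuDF side, with illustrative file and column names:

```python
import pandas as pd
import pyarrow as pa

import dask_cudf

# Two files whose schemas disagree: the second has an extra column.
pd.DataFrame([{"id": 1, "text": "foo"}]).to_parquet("part.0.parquet")
pd.DataFrame([{"id": 2, "text": "bar", "extra": 3.0}]).to_parquet(
    "part.1.parquet"
)

ddf = dask_cudf.read_parquet(
    ".",
    columns=["id", "text"],
    # Forwarded to pyarrow: a unified schema plus partitioning=None,
    # since an explicit schema disables partitioning inference.
    dataset={
        "schema": pa.schema(
            [
                ("id", pa.int64()),
                ("text", pa.string()),
                ("extra", pa.float64()),
            ]
        ),
        "partitioning": None,
    },
)
print(ddf.compute())
```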
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 87bf282f376..83493d7f2a4 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
     "cudf==25.4.*,>=0.0.0a0",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
-    "numpy>=1.23,<3.0a0",
+    "numpy>=1.23,<2.1",
     "pandas>=2.0,<2.2.4dev0",
     "pynvml>=12.0.0,<13.0.0a0",
     "rapids-dask-dependency==25.4.*,>=0.0.0a0",
@@ -47,8 +47,8 @@ cudf = "dask_cudf.backends:CudfBackendEntrypoint"
 [project.optional-dependencies]
 test = [
     "dask-cuda==25.4.*,>=0.0.0a0",
-    "numba-cuda>=0.2.0,<0.3.0a0",
-    "numba>=0.59.1,<0.61.0a0",
+    "numba-cuda>=0.4.0,<0.5.0a0",
+    "numba>=0.59.1,<0.62.0a0",
     "pytest-cov",
     "pytest-xdist",
     "pytest<8",
diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt
index fe6e73a3f14..153570a4a7e 100644
--- a/python/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/CMakeLists.txt
@@ -37,7 +37,3 @@ include(rapids-cython-core)
 rapids_cython_init()
 
 add_subdirectory(pylibcudf)
-
-if(DEFINED cython_lib_dir)
-  rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}")
-endif()
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd
index f8b082c8429..2cf2bfb8ac9 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from pylibcudf.exception_handler cimport libcudf_exception_handler
@@ -16,3 +16,16 @@ cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil:
         const column_view & strings,
         bool do_lower_case
     ) except +libcudf_exception_handler
+
+    cdef struct character_normalizer "nvtext::character_normalizer":
+        pass
+
+    cdef unique_ptr[character_normalizer] create_character_normalizer(
+        bool do_lower_case,
+        const column_view & strings
+    ) except +libcudf_exception_handler
+
+    cdef unique_ptr[column] normalize_characters(
+        const column_view & strings,
+        const character_normalizer & normalizer
+    ) except +libcudf_exception_handler
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd
index 90676145afa..e6688e19762 100644
--- a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd
@@ -1,9 +1,18 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
 from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.normalize cimport character_normalizer
+
+
+cdef class CharacterNormalizer:
+    cdef unique_ptr[character_normalizer] c_obj
 
 cpdef Column normalize_spaces(Column input)
 
-cpdef Column normalize_characters(Column input, bool do_lower_case)
+cpdef Column characters_normalize(Column input, bool do_lower_case)
+
+cpdef Column normalize_characters(
+    Column input,
+    CharacterNormalizer normalizer
+)
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi
index 1d90a5a8960..d722ef6c79e 100644
--- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi
+++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi
@@ -1,6 +1,12 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 from pylibcudf.column import Column
 
+class CharacterNormalizer:
+    def __init__(self, do_lower_case: bool, special_tokens: Column): ...
+
 def normalize_spaces(input: Column) -> Column: ...
-def normalize_characters(input: Column, do_lower_case: bool) -> Column: ...
+def characters_normalize(input: Column, do_lower_case: bool) -> Column: ...
+def normalize_characters(
+    input: Column, normalizer: CharacterNormalizer
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx
index b259ccaefa6..6a18c205841 100644
--- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx
@@ -1,16 +1,37 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
+from cython.operator cimport dereference
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from pylibcudf.column cimport Column
 from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.nvtext.normalize cimport (
-    normalize_characters as cpp_normalize_characters,
-    normalize_spaces as cpp_normalize_spaces,
-)
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext cimport normalize as cpp_normalize
 
-__all__ = ["normalize_characters", "normalize_spaces"]
+__all__ = [
+    "CharacterNormalizer",
+    "normalize_characters",
+    "normalize_spaces",
+    "characters_normalize"
+]
+
+cdef class CharacterNormalizer:
+    """The normalizer object to be used with ``normalize_characters``.
+
+    For details, see :cpp:class:`nvtext::character_normalizer`.
+    """
+    def __cinit__(self, bool do_lower_case, Column tokens):
+        cdef column_view c_tokens = tokens.view()
+        with nogil:
+            self.c_obj = move(
+                cpp_normalize.create_character_normalizer(
+                    do_lower_case,
+                    c_tokens
+                )
+            )
+
+    __hash__ = None
 
 cpdef Column normalize_spaces(Column input):
     """
@@ -32,12 +53,12 @@ cpdef Column normalize_spaces(Column input):
     cdef unique_ptr[column] c_result
 
     with nogil:
-        c_result = cpp_normalize_spaces(input.view())
+        c_result = cpp_normalize.normalize_spaces(input.view())
 
     return Column.from_libcudf(move(c_result))
 
 
-cpdef Column normalize_characters(Column input, bool do_lower_case):
+cpdef Column characters_normalize(Column input, bool do_lower_case):
     """
     Normalizes strings characters for tokenizing.
 
@@ -60,6 +81,38 @@ cpdef Column normalize_characters(Column input, bool do_lower_case):
     cdef unique_ptr[column] c_result
 
     with nogil:
-        c_result = cpp_normalize_characters(input.view(), do_lower_case)
+        c_result = cpp_normalize.normalize_characters(
+            input.view(),
+            do_lower_case
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column normalize_characters(Column input, CharacterNormalizer normalizer):
+    """
+    Normalizes string characters for tokenizing.
+
+    For details, see :cpp:func:`normalize_characters`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    normalizer : CharacterNormalizer
+        Normalizer object used for modifying the input column text
+
+    Returns
+    -------
+    Column
+        Normalized strings column
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_normalize.normalize_characters(
+            input.view(),
+            dereference(normalizer.c_obj.get())
+        )
 
     return Column.from_libcudf(move(c_result))
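With the Cython pieces above in place, the pylibcudf call sequence is: build a `CharacterNormalizer` once (an empty strings column means no special tokens), then pass it to `normalize_characters` for each column. A short usage sketch, mirroring the tests that follow:

```python
import pyarrow as pa

import pylibcudf as plc

strings = plc.interop.from_arrow(pa.array(["$24.08", "[pad]"]))

# do_lower_case=True; the empty strings column opts out of
# special-token handling.
normalizer = plc.nvtext.normalize.CharacterNormalizer(
    True,
    plc.column_factories.make_empty_column(plc.types.TypeId.STRING),
)
result = plc.nvtext.normalize.normalize_characters(strings, normalizer)
print(plc.interop.to_arrow(result))
```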
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
index 25b6d1389ec..47bbb191be6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 import pyarrow as pa
 import pytest
@@ -15,7 +15,7 @@ def norm_spaces_input_data():
 
 @pytest.fixture(scope="module")
 def norm_chars_input_data():
-    arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"]
+    arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]", "[pad]"]
     return pa.array(arr)
 
 
@@ -29,15 +29,98 @@ def test_normalize_spaces(norm_spaces_input_data):
 
 @pytest.mark.parametrize("do_lower", [True, False])
 def test_normalize_characters(norm_chars_input_data, do_lower):
-    result = plc.nvtext.normalize.normalize_characters(
+    result = plc.nvtext.normalize.characters_normalize(
         plc.interop.from_arrow(norm_chars_input_data),
         do_lower,
     )
-    expected = pa.array(
-        ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "]
+    if do_lower:
+        expected = pa.array(
+            [
+                "eaio eaio",
+                "acenu",
+                "acenu",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [ pad ] ",
+            ]
+        )
+    else:
+        expected = pa.array(
+            [
+                "éâîô eaio",
+                "ĂĆĖÑÜ",
+                "ACENU",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [ pad ] ",
+            ]
+        )
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("do_lower", [True, False])
+def test_normalizer(norm_chars_input_data, do_lower):
+    result = plc.nvtext.normalize.normalize_characters(
+        plc.interop.from_arrow(norm_chars_input_data),
+        plc.nvtext.normalize.CharacterNormalizer(
+            do_lower,
+            plc.column_factories.make_empty_column(plc.types.TypeId.STRING),
+        ),
+    )
+    if do_lower:
+        expected = pa.array(
+            [
+                "eaio eaio",
+                "acenu",
+                "acenu",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [ pad ] ",
+            ]
+        )
+    else:
+        expected = pa.array(
+            [
+                "éâîô eaio",
+                "ĂĆĖÑÜ",
+                "ACENU",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [ pad ] ",
+            ]
+        )
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("do_lower", [True, False])
+def test_normalizer_with_special_tokens(norm_chars_input_data, do_lower):
+    special_tokens = pa.array(["[pad]"])
+    result = plc.nvtext.normalize.normalize_characters(
+        plc.interop.from_arrow(norm_chars_input_data),
+        plc.nvtext.normalize.CharacterNormalizer(
+            do_lower, plc.interop.from_arrow(special_tokens)
+        ),
     )
-    if not do_lower:
+    if do_lower:
+        expected = pa.array(
+            [
+                "eaio eaio",
+                "acenu",
+                "acenu",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [pad] ",
+            ]
+        )
+    else:
         expected = pa.array(
-            ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "]
+            [
+                "éâîô eaio",
+                "ĂĆĖÑÜ",
+                "ACENU",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [pad] ",
+            ]
         )
     assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index 939da65c1ec..e12d1ffdb39 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -42,7 +42,7 @@ classifiers = [
 test = [
     "fastavro>=0.22.9",
     "hypothesis",
-    "numpy>=1.23,<3.0a0",
+    "numpy>=1.23,<2.1",
     "pandas",
     "pytest-cov",
     "pytest-xdist",
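As the expected values in these tests show, the practical difference special tokens make is that a registered token such as `"[pad]"` survives normalization intact (`" [pad] "`) rather than being split on its punctuation (`" [ pad ] "`). A closing sketch of the contrast, reusing only API calls that appear in the tests above:

```python
import pyarrow as pa

import pylibcudf as plc

data = plc.interop.from_arrow(pa.array(["[pad] $24.08"]))
empty = plc.column_factories.make_empty_column(plc.types.TypeId.STRING)
special = plc.interop.from_arrow(pa.array(["[pad]"]))

plain = plc.nvtext.normalize.CharacterNormalizer(True, empty)
aware = plc.nvtext.normalize.CharacterNormalizer(True, special)

# Without special tokens, "[pad]" is split like any other punctuation;
# with them it is kept whole (per the test expectations above).
for normalizer in (plain, aware):
    out = plc.nvtext.normalize.normalize_characters(data, normalizer)
    print(plc.interop.to_arrow(out))
```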