From 54c15b2a1a61f4d88437ab0433eecf27241bda77 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 25 Feb 2025 16:21:01 -0600 Subject: [PATCH 01/13] Use conda-build instead of conda-mambabuild (#18092) This changes from `conda mambabuild` to `conda build`. Conda now uses the mamba solver so no performance regressions are expected. This is a temporary change as we plan to migrate to `rattler-build` in the near future. However, this is needed sooner to drop `boa` and unblock Python 3.13 migrations. xref: https://github.com/rapidsai/build-planning/issues/149 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cudf/pull/18092 --- ci/build_cpp.sh | 4 ++-- ci/build_python.sh | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 3d06eacf9ff..0c324d01cdf 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. set -euo pipefail @@ -18,7 +18,7 @@ rapids-logger "Begin cpp build" sccache --zero-stats # With boa installed conda build forward to boa -RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry build \ conda/recipes/libcudf sccache --show-adv-stats diff --git a/ci/build_python.sh b/ci/build_python.sh index ed90041cc77..abbdc3f3a3b 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. set -euo pipefail @@ -25,7 +25,7 @@ sccache --zero-stats # node works correctly # With boa installed conda build forwards to the boa builder -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/pylibcudf @@ -33,7 +33,7 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ sccache --show-adv-stats sccache --zero-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -42,13 +42,13 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ sccache --show-adv-stats sccache --zero-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/dask-cudf -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -56,13 +56,13 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ sccache --show-adv-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/custreamz -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ From 0f7a17f8767dfe5c00ea31feb894cf38a9fc1b6d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 25 Feb 2025 15:17:40 -0800 Subject: [PATCH 02/13] Update numba dep and upper-bound numpy (#18078) This PR updates to numba-cuda 0.4 and numba 0.61. A numpy upper-bound is added since it looks like numpy 2.1 made some changes with which we are currently incompatible. Previously numba provided that upper bound for us. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/18078 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 6 +++--- conda/environments/all_cuda-128_arch-x86_64.yaml | 6 +++--- conda/recipes/cudf/meta.yaml | 6 +++--- conda/recipes/pylibcudf/meta.yaml | 2 +- dependencies.yaml | 9 +++++---- python/cudf/pyproject.toml | 6 +++--- python/cudf_polars/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 6 +++--- python/pylibcudf/pyproject.toml | 2 +- 9 files changed, 23 insertions(+), 22 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cc674732ba4..e7dbb765099 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -54,9 +54,9 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0a0 -- numba>=0.59.1,<0.61.0a0 -- numpy>=1.23,<3.0a0 +- numba-cuda>=0.4.0,<0.5.0a0 +- numba>=0.59.1,<0.62.0a0 +- numpy>=1.23,<2.1 - numpydoc - nvcc_linux-64=11.8 - nvcomp==4.2.0.11 diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index 7593a72cc68..342ec8d4b59 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -53,9 +53,9 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0a0 -- numba>=0.59.1,<0.61.0a0 -- numpy>=1.23,<3.0a0 +- numba-cuda>=0.4.0,<0.5.0a0 +- numba>=0.59.1,<0.62.0a0 +- numpy>=1.23,<2.1 - numpydoc - nvcomp==4.2.0.11 - nvtx>=0.2.1 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index f817bc12c5b..43060ef1c87 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -75,9 +75,9 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.2.0,<0.3.0a0 - - numba >=0.59.1,<0.61.0a0 - - numpy >=1.23,<3.0a0 + - numba-cuda >=0.4.0,<0.5.0a0 + - numba >=0.59.1,<0.62.0a0 + - numpy >=1.23,<2.1 - pyarrow>=14.0.0,<20.0.0a0 - libcudf ={{ version }} - pylibcudf ={{ version }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 14e2f31a5a5..ae02cf8d4e5 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -73,7 +73,7 @@ requirements: - python - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - - numpy >=1.23,<3.0a0 + - numpy >=1.23,<2.1 - pyarrow>=14.0.0,<20.0.0a0 - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/dependencies.yaml b/dependencies.yaml index e7840d56880..c7869eee922 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -723,7 +723,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - - &numpy numpy>=1.23,<3.0a0 + - &numpy numpy>=1.23,<2.1 - pandas>=2.0,<2.2.4dev0 run_pylibcudf: common: @@ -753,8 +753,8 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba-cuda-dep numba-cuda>=0.2.0,<0.3.0a0 - - &numba-dep numba>=0.59.1,<0.61.0a0 + - &numba-cuda-dep numba-cuda>=0.4.0,<0.5.0a0 + - &numba-dep numba>=0.59.1,<0.62.0a0 - nvtx>=0.2.1 - packaging - rich @@ -885,7 +885,8 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - - numba-cuda==0.2.0 + - numba-cuda==0.4.0 + - numba==0.59.1 - pandas==2.0.* - matrix: {dependencies: "latest"} packages: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 16cd97677ef..8b8abe90ac9 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -24,9 +24,9 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.4.*,>=0.0.0a0", - "numba-cuda>=0.2.0,<0.3.0a0", - "numba>=0.59.1,<0.61.0a0", - "numpy>=1.23,<3.0a0", + "numba-cuda>=0.4.0,<0.5.0a0", + "numba>=0.59.1,<0.62.0a0", + "numpy>=1.23,<2.1", "nvtx>=0.2.1", "packaging", "pandas>=2.0,<2.2.4dev0", diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 872c08a66f9..9026a0c29ca 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -35,7 +35,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "numpy>=1.23,<3.0a0", + "numpy>=1.23,<2.1", "pytest-cov", "pytest-xdist", "pytest<8", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 87bf282f376..83493d7f2a4 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "cudf==25.4.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "numpy>=1.23,<3.0a0", + "numpy>=1.23,<2.1", "pandas>=2.0,<2.2.4dev0", "pynvml>=12.0.0,<13.0.0a0", "rapids-dask-dependency==25.4.*,>=0.0.0a0", @@ -47,8 +47,8 @@ cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ "dask-cuda==25.4.*,>=0.0.0a0", - "numba-cuda>=0.2.0,<0.3.0a0", - "numba>=0.59.1,<0.61.0a0", + "numba-cuda>=0.4.0,<0.5.0a0", + "numba>=0.59.1,<0.62.0a0", "pytest-cov", "pytest-xdist", "pytest<8", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 939da65c1ec..e12d1ffdb39 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -42,7 +42,7 @@ classifiers = [ test = [ "fastavro>=0.22.9", "hypothesis", - "numpy>=1.23,<3.0a0", + "numpy>=1.23,<2.1", "pandas", "pytest-cov", "pytest-xdist", From 8d6bdc34c4b2d0d6be614c04af16b8064d2c723d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 25 Feb 2025 15:21:19 -0800 Subject: [PATCH 03/13] Remove static configure step (#18091) This check has been superseded by #17781. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/18091 --- .github/workflows/pr.yaml | 11 ----------- ci/configure_cpp_static.sh | 21 --------------------- 2 files changed, 32 deletions(-) delete mode 100755 ci/configure_cpp_static.sh diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 38b890893d0..2c583598f54 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -24,7 +24,6 @@ jobs: - conda-python-cudf-tests - conda-python-other-tests - conda-java-tests - - static-configure - conda-notebook-tests - docs-build - wheel-build-libcudf @@ -192,16 +191,6 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" - static-configure: - needs: checks - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 - with: - build_type: pull-request - # Use the wheel container so we can skip conda solves and since our - # primary static consumers (Spark) are not in conda anyway. - container_image: "rapidsai/ci-wheel:latest" - run_script: "ci/configure_cpp_static.sh" conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh deleted file mode 100755 index 3d0647a96f6..00000000000 --- a/ci/configure_cpp_static.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024-2025, NVIDIA CORPORATION. - -set -euo pipefail - -source rapids-date-string - -rapids-logger "Configure static cpp build" - -ENV_YAML_DIR="$(mktemp -d)" -REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt" - -rapids-dependency-file-generator \ - --output requirements \ - --file-key test_static_build \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}" - -rapids-pip-retry install -r "${REQUIREMENTS_FILE}" -pyenv rehash - -cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DCUDF_USE_ARROW_STATIC=ON -DBUILD_TESTS=OFF From e365986cf886fe3a9531952fe5b91a34ca466c45 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 25 Feb 2025 17:32:23 -0600 Subject: [PATCH 04/13] Run narwhals tests nightly. (#18093) This enables narwhals tests in nightly CI. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Matthew Murray (https://github.com/Matt711) - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/cudf/pull/18093 --- .github/workflows/test.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 12f6d751493..7046fd0e5dc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -168,3 +168,14 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: "ci/test_cudf_polars_polars_tests.sh" + narwhals-tests: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + with: + build_type: ${{ inputs.build_type }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + node_type: "gpu-l4-latest-1" + container_image: "rapidsai/ci-conda:latest" + run_script: ci/test_narwhals.sh From 18a5412ced238630bb1a6f5b15e6f319dd388090 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 25 Feb 2025 18:57:00 -0500 Subject: [PATCH 05/13] Add new nvtext::normalize_characters API (#17818) Adds new normalizer APIs as part of the rework for the subword-tokenizer. The new API is split into 2 parts. First a normalizer object is created with appropriate state: lower-case and special-tokens. The normalizing tables are currently hardcoded inside libcudf. Future versions of the this may load these tables from some other source. The 2nd API is given the input strings column and the normalizer object and returns a normalized strings column. The normalizer object can be reused on all subsequent `normalize_characters` calls. The current `nvtext::normalize_characters` loads the normalizing tables on each call which can be significant overhead. This API will be deprecated and replaced by these 2 new ones. Some utility functions from that implementation have been refactored to be used by both until the old one is removed. The first API creates the normalizer object. ```cpp std::unique_ptr create_character_normalizer( bool do_lower_case, cudf::strings_column_view const& special_tokens, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` The 2nd API uses the normalizer on a strings column: ```cpp std::unique_ptr normalize_characters( cudf::strings_column_view const& input, character_normalizer const& normalizer, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` Using the python interface: ```python import cudf from cudf.core.character_normalizer import CharacterNormalizer cn = CharacterNormalizer(do_lower=False) sn = cn.normalize(input_strings) ``` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Karthikeyan (https://github.com/karthikeyann) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17818 --- cpp/benchmarks/text/normalize.cpp | 9 +- cpp/include/cudf/strings/detail/utilities.hpp | 14 +- cpp/include/nvtext/normalize.hpp | 111 ++++- cpp/src/strings/utilities.cu | 14 +- cpp/src/text/normalize.cu | 395 +++++++++++++++++- cpp/src/text/normalize.cuh | 100 +++++ cpp/src/text/subword/data_normalizer.cu | 76 +--- cpp/tests/text/normalize_tests.cpp | 165 +++++++- python/cudf/cudf/core/character_normalizer.py | 46 ++ python/cudf/cudf/core/column/string.py | 28 +- .../cudf/cudf/tests/text/test_text_methods.py | 8 +- .../pylibcudf/libcudf/nvtext/normalize.pxd | 15 +- .../pylibcudf/pylibcudf/nvtext/normalize.pxd | 13 +- .../pylibcudf/pylibcudf/nvtext/normalize.pyi | 10 +- .../pylibcudf/pylibcudf/nvtext/normalize.pyx | 71 +++- .../pylibcudf/tests/test_nvtext_normalize.py | 97 ++++- 16 files changed, 1018 insertions(+), 154 deletions(-) create mode 100644 cpp/src/text/normalize.cuh create mode 100644 python/cudf/cudf/core/character_normalizer.py diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 594dc0de28a..494d5722ae4 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,8 +48,11 @@ static void bench_normalize(nvbench::state& state) [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); }); } else { bool const to_lower = (normalize_type == "to_lower"); + // we expect the normalizer to be created once and re-used + // so creating it is not measured + auto normalizer = nvtext::create_character_normalizer(to_lower); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = nvtext::normalize_characters(input, to_lower); + auto result = nvtext::normalize_characters(input, *normalizer); }); } } @@ -57,6 +60,6 @@ static void bench_normalize(nvbench::state& state) NVBENCH_BENCH(bench_normalize) .set_name("normalize") .add_int64_axis("min_width", {0}) - .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("max_width", {128, 256}) .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index d276c5df7dc..8fb1f30f961 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -96,5 +96,17 @@ int64_t get_offset_value(cudf::column_view const& offsets, size_type index, rmm::cuda_stream_view stream); +/** + * @brief Return the first and last offset in the given strings column + * + * This accounts for sliced input columns as well. + * + * @param input Strings column + * @param stream CUDA stream used for device memory operations and kernel launches + * @return First and last offset values + */ +std::pair get_first_and_last_offset(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream); + } // namespace strings::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 74325f4a406..70ee7891ad7 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -107,5 +108,113 @@ std::unique_ptr normalize_characters( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Normalizer object to be used with nvtext::normalize_characters + * + * Use nvtext::create_normalizer to create this object. + * + * This normalizer includes: + * + * - adding padding around punctuation (unicode category starts with "P") + * as well as certain ASCII symbols like "^" and "$" + * - adding padding around the [CJK Unicode block + * characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)) + * - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "` + * - removing control characters (unicode categories "Cc" and "Cf") + * + * The padding process adds a single space before and after the character. + * Details on _unicode category_ can be found here: + * https://unicodebook.readthedocs.io/unicode.html#categories + * + * If `do_lower_case = true`, lower-casing also removes any accents. The + * accents cannot be removed from upper-case characters without lower-casing + * and lower-casing cannot be performed without also removing accents. + * However, if the accented character is already lower-case, then only the + * accent is removed. + * + * If `special_tokens` are included the padding after `[` and before `]` is not + * inserted if the characters between them match one of the given tokens. + * Also, the `special_tokens` are expected to include the `[]` characters + * at the beginning of and end of each string appropriately. + */ +struct character_normalizer { + /** + * @brief Normalizer object constructor + * + * This initializes and holds the character normalizing tables and settings. + * + * @param do_lower_case If true, upper-case characters are converted to + * lower-case and accents are stripped from those characters. + * If false, accented and upper-case characters are not transformed. + * @param special_tokens Each row is a token including the `[]` brackets. + * For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ + character_normalizer(bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + ~character_normalizer(); + + struct character_normalizer_impl; + std::unique_ptr _impl; +}; + +/** + * @brief Create a normalizer object + * + * Creates a normalizer object which can be reused on multiple calls to + * nvtext::normalize_characters + * + * @see nvtext::character_normalizer + * + * @param do_lower_case If true, upper-case characters are converted to + * lower-case and accents are stripped from those characters. + * If false, accented and upper-case characters are not transformed. + * @param special_tokens Individual tokens including `[]` brackets. + * Default is no special tokens. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Object to be used with nvtext::normalize_characters + */ +std::unique_ptr create_character_normalizer( + bool do_lower_case, + cudf::strings_column_view const& special_tokens = cudf::strings_column_view(cudf::column_view{ + cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0}), + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Normalizes the text in input strings column + * + * @see nvtext::character_normalizer for details on the normalizer behavior + * + * @code{.pseudo} + * cn = create_character_normalizer(true) + * s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + * s1 = normalize_characters(s,cn) + * s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + * + * cn = create_character_normalizer(false) + * s2 = normalize_characters(s,cn) + * s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + * @endcode + * + * A null input element at row `i` produces a corresponding null entry + * for row `i` in the output column. + * + * @param input The input strings to normalize + * @param normalizer Normalizer to use for this function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects + * @return Normalized strings column + */ +std::unique_ptr normalize_characters( + cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 45bd4615435..c5d46598d4a 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -180,6 +180,18 @@ int64_t get_offset_value(cudf::column_view const& offsets, : cudf::detail::get_value(offsets, index, stream); } +std::pair get_first_and_last_offset(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream) +{ + if (input.is_empty()) { return {0L, 0L}; } + auto const first_offset = (input.offset() == 0) ? 0 + : cudf::strings::detail::get_offset_value( + input.offsets(), input.offset(), stream); + auto const last_offset = + cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream); + return {first_offset, last_offset}; +} + } // namespace detail rmm::device_uvector create_string_vector_from_column( diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 7e2b766862d..0e680e98ec5 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" #include "text/utilities/tokenize_ops.cuh" @@ -22,10 +23,11 @@ #include #include #include -#include #include #include #include +#include +#include #include #include #include @@ -38,9 +40,13 @@ #include +#include +#include +#include #include #include #include +#include #include #include @@ -103,6 +109,12 @@ constexpr uint32_t UTF8_1BYTE = 0x0080; constexpr uint32_t UTF8_2BYTE = 0x0800; constexpr uint32_t UTF8_3BYTE = 0x01'0000; +__device__ int8_t cp_to_utf8(uint32_t codepoint, char* out) +{ + auto utf8 = cudf::strings::detail::codepoint_to_utf8(codepoint); + return cudf::strings::detail::from_char_utf8(utf8, out); +} + /** * @brief Convert code-point arrays into UTF-8 bytes for each string. */ @@ -148,26 +160,8 @@ struct codepoint_to_utf8_fn { // convert each code-point to 1-4 UTF-8 encoded bytes char* out_ptr = d_chars + d_offsets[idx]; for (uint32_t jdx = 0; jdx < count; ++jdx) { - uint32_t code_point = *str_cps++; - if (code_point < UTF8_1BYTE) // ASCII range - *out_ptr++ = static_cast(code_point); - else if (code_point < UTF8_2BYTE) { // create two-byte UTF-8 - // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 2) & 0x00'1F00) | 0x00'C000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else if (code_point < UTF8_3BYTE) { // create three-byte UTF-8 - // bxxxxxxxx:byyyyyyyy => b1110xxxx:b10xxxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 4) & 0x0F'0000) | 0x00E0'0000) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00) | 0x00'8000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else { // create four-byte UTF-8 - // maximum code-point value is 0x0011'0000 - // b000xxxxx:byyyyyyyy:bzzzzzzzz => b11110xxx:b10xxyyyy:b10yyyyzz:b10zzzzzz - *out_ptr++ = static_cast((((code_point << 6) & 0x0700'0000u) | 0xF000'0000u) >> 24); - *out_ptr++ = static_cast((((code_point << 4) & 0x003F'0000u) | 0x0080'0000u) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00u) | 0x00'8000u) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } + uint32_t codepoint = *str_cps++; + out_ptr += cp_to_utf8(codepoint, out_ptr); } } }; @@ -261,4 +255,361 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con return detail::normalize_characters(input, do_lower_case, stream, mr); } +struct character_normalizer::character_normalizer_impl { + rmm::device_uvector cp_metadata; + rmm::device_uvector aux_table; + bool do_lower_case; + std::unique_ptr special_tokens; + rmm::device_uvector special_tokens_view; + + cudf::device_span get_special_tokens() const + { + return special_tokens_view; + } + + character_normalizer_impl(rmm::device_uvector&& cp_metadata, + rmm::device_uvector&& aux_table, + bool do_lower_case, + std::unique_ptr&& special_tokens, + rmm::device_uvector&& special_tokens_view) + : cp_metadata(std::move(cp_metadata)), + aux_table(std::move(aux_table)), + do_lower_case{do_lower_case}, + special_tokens{std::move(special_tokens)}, + special_tokens_view{std::move(special_tokens_view)} + { + } +}; + +character_normalizer::character_normalizer(bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref) +{ + auto cp_metadata = nvtext::detail::get_codepoint_metadata(stream); + auto aux_table = nvtext::detail::get_aux_codepoint_data(stream); + CUDF_EXPECTS( + !special_tokens.has_nulls(), "special tokens should not have nulls", std::invalid_argument); + + auto sorted = std::move( + cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); + if (do_lower_case) { + // lower-case the tokens so they will match the normalized input + sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); + } + + auto tokens_view = cudf::strings::detail::create_string_vector_from_column( + cudf::strings_column_view(sorted->view()), stream, cudf::get_current_device_resource_ref()); + + _impl = std::make_unique(std::move(cp_metadata), + std::move(aux_table), + do_lower_case, + std::move(sorted), + std::move(tokens_view)); +} + +character_normalizer::~character_normalizer() {} + +std::unique_ptr create_character_normalizer( + bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return std::make_unique(do_lower_case, special_tokens, stream, mr); +} + +namespace detail { +namespace { + +/** + * @brief Kernel handles fixing up the normalized data to account for any special tokens + * + * This undoes the padding added around the `[]` for patterns matching the strings in the + * special_tokens array. + * + * Launched as a thread per input byte (total_count). + * + * @param d_normalized The normalized set of UTF-8 characters; 3 uints per input byte + * @param total_count Number of bytes represented by d_normalized; len(d_normalized)/3 + * @param special_tokens Tokens to check against + */ +CUDF_KERNEL void special_tokens_kernel(uint32_t* d_normalized, + int64_t total_count, + cudf::device_span special_tokens) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= total_count) { return; } + auto const begin = d_normalized + (idx * MAX_NEW_CHARS) + 1; + if (*begin != '[') { return; } + auto const end = begin + cuda::std::min(6L, total_count - idx) * MAX_NEW_CHARS; + auto const match = thrust::find(thrust::seq, begin, end, static_cast(']')); + if (match == end) { return; } + char candidate[8]; + auto const ch_begin = + thrust::transform_iterator(begin, [](auto v) { return static_cast(v); }); + auto const ch_end = ch_begin + thrust::distance(begin, match + 1); + auto last = thrust::copy_if( + thrust::seq, ch_begin, ch_end, candidate, [](auto c) { return c != 0 && c != ' '; }); + *last = 0; // only needed for debug + + auto const size = static_cast(thrust::distance(candidate, last)); + auto const token = cudf::string_view(candidate, size); + // the binary_search expects the special_tokens to be sorted + if (!thrust::binary_search(thrust::seq, special_tokens.begin(), special_tokens.end(), token)) { + return; + } + + // fix up chars to remove the extra spaces + *(begin + 1) = 0; // removes space after '[' + *(match - 1) = 0; // removes space before ']' +} + +/** + * @brief The normalizer kernel + * + * Launched as a thread per input byte (total_bytes). + * + * Converts the input d_chars into codepoints to lookup in the provided tables. + * Once processed, the d_output contains 3 uints per input byte each encoded + * as output UTF-8. Any zero values are to removed by a subsequent kernel call. + * + * @param d_chars The characters for the input strings column to normalize + * @param total_bytes The number of bytes in the d_chars + * @param cp_metadata First lookup table for codepoint metadata + * @param aux_table Second lookup table containing possible replacement characters + * @param do_lower_case True if the normalization includes lower-casing characters + * @param d_output The output of the normalization (UTF-8 encoded) + */ +CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, + int64_t total_bytes, + codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case, + uint32_t* d_output) +{ + uint32_t replacement[MAX_NEW_CHARS] = {0}; + + auto const idx = cudf::detail::grid_1d::global_thread_id(); + + if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { + auto const cp = [utf8 = d_chars + idx] { + cudf::char_utf8 ch_utf8 = *utf8; + if (ch_utf8 > 0x7F) { cudf::strings::detail::to_char_utf8(utf8, ch_utf8); } + return cudf::strings::detail::utf8_to_codepoint(ch_utf8); + }(); + auto const metadata = cp_metadata[cp]; + + if (!should_remove_cp(metadata, do_lower_case)) { + int8_t num_new_chars = 1; + // retrieve the normalized value for cp + auto const new_cp = do_lower_case || always_replace(metadata) ? get_first_cp(metadata) : cp; + replacement[0] = new_cp == 0 ? cp : new_cp; + + if (do_lower_case && is_multi_char_transform(metadata)) { + auto const next_cps = aux_table[cp]; + replacement[1] = static_cast(next_cps >> 32); + replacement[2] = static_cast(next_cps & 0xFFFFFFFF); + num_new_chars = 2 + (replacement[2] != 0); + } + + if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { + replacement[1] = replacement[0]; + replacement[0] = SPACE_CODE_POINT; // add spaces around the new codepoint + replacement[2] = SPACE_CODE_POINT; + num_new_chars = 3; + } + + // convert codepoints back to UTF-8 in-place + for (int k = 0; k < num_new_chars; ++k) { + auto const new_cp = replacement[k]; + if (new_cp) { cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); } + } + } + } + + // employ an optimized coalesced writer to output replacement as a block of transposed data + using block_store = + cub::BlockStore; + __shared__ typename block_store::TempStorage bs_stg; + auto block_base = d_output + blockIdx.x * blockDim.x * MAX_NEW_CHARS; + block_store(bs_stg).Store(block_base, replacement); +} + +/** + * @brief Computes the output sizes for each row + * + * The input offsets are used with segmented-reduce to count the number of + * non-zero values for each output row. + * + * @param d_normalized The UTF-8 encoded normalized values + * @param offsets These identify the row boundaries + * @param offset Only non-zero if the input column has been sliced + * @param size The number of output rows (sames as the number of input rows) + * @param stream Stream used for allocating device memory and launching kernels + * @return The sizes of each output row + */ +template +rmm::device_uvector compute_sizes(cudf::device_span d_normalized, + OffsetType offsets, + int64_t offset, + cudf::size_type size, + rmm::cuda_stream_view stream) +{ + auto output_sizes = rmm::device_uvector(size, stream); + + auto d_data = d_normalized.data(); + + // counts the non-zero bytes in the d_data array + auto d_in = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([d_data] __device__(auto idx) { + idx = idx * MAX_NEW_CHARS; + // transform function counts number of non-zero bytes in uint32_t value + auto tfn = [](uint32_t v) -> cudf::size_type { + return ((v & 0xFF) > 0) + ((v & 0xFF00) > 0) + ((v & 0xFF0000) > 0) + + ((v & 0xFF000000) > 0); + }; + auto const begin = d_data + idx; + auto const end = begin + MAX_NEW_CHARS; + return thrust::transform_reduce(thrust::seq, begin, end, tfn, 0, thrust::plus{}); + })); + + // DeviceSegmentedReduce is used to compute the size of each output row + auto d_out = output_sizes.begin(); + auto temp = std::size_t{0}; + if (offset == 0) { + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); + } else { + // offsets need to be normalized for segmented-reduce to work efficiently + auto offsets_itr = thrust::transform_iterator( + offsets, + cuda::proclaim_return_type([offset] __device__(auto o) { return o - offset; })); + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value()); + } + + return output_sizes; +} + +// handles ranges above int32 max +template +OutputIterator remove_copy_safe(InputIterator first, + InputIterator last, + OutputIterator result, + T const& value, + rmm::cuda_stream_view stream) +{ + auto const copy_size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto itr = first; + while (itr != last) { + auto const copy_end = + static_cast(std::distance(itr, last)) <= copy_size ? last : itr + copy_size; + result = thrust::remove_copy(rmm::exec_policy(stream), itr, copy_end, result, value); + itr = copy_end; + } + return result; +} + +// handles ranges above int32 max +template +Iterator remove_safe(Iterator first, Iterator last, T const& value, rmm::cuda_stream_view stream) +{ + auto const size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto result = first; + auto itr = first; + while (itr != last) { + auto end = static_cast(std::distance(itr, last)) <= size ? last : itr + size; + result = thrust::remove(rmm::exec_policy(stream), itr, end, value); + itr = end; + } + return result; +} +} // namespace + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + if (input.is_empty()) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } + + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars_begin(stream) + first_offset; + + if (chars_size == 0) { return std::make_unique(input.parent(), stream, mr); } + + constexpr int64_t block_size = 256; + cudf::detail::grid_1d grid{chars_size, block_size}; + auto const max_new_char_total = cudf::util::round_up_safe(chars_size, block_size) * MAX_NEW_CHARS; + + auto const& parameters = normalizer._impl; + + auto d_normalized = rmm::device_uvector(max_new_char_total, stream); + data_normalizer_kernel<<>>( + d_input_chars, + chars_size, + parameters->cp_metadata.data(), + parameters->aux_table.data(), + parameters->do_lower_case, + d_normalized.data()); + + // This removes space added around any special tokens in the form of [ttt]. + // An alternate approach is to do a multi-replace of '[ ttt ]' with '[ttt]' right + // before returning the output strings column. + auto const special_tokens = parameters->get_special_tokens(); + if (!special_tokens.empty()) { + special_tokens_kernel<<>>( + d_normalized.data(), chars_size, special_tokens); + } + + // Use segmented-reduce over the non-zero codepoints to get the size of the output rows + auto const input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + auto output_sizes = + compute_sizes(d_normalized, input_offsets, first_offset, input.size(), stream); + + // convert the sizes to offsets + auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( + output_sizes.begin(), output_sizes.end(), stream, mr); + + // create output chars by calling remove_copy(0) on the bytes in d_normalized + auto chars = rmm::device_uvector(total_size, stream, mr); + auto const begin = reinterpret_cast(d_normalized.begin()); + // the remove() above speeds up the remove_copy() by roughly 10% + auto const end = + reinterpret_cast(remove_safe(d_normalized.begin(), d_normalized.end(), 0, stream)); + remove_copy_safe(begin, end, chars.data(), 0, stream); + + return cudf::make_strings_column(input.size(), + std::move(offsets), + chars.release(), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); +} + +} // namespace detail + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::normalize_characters(input, normalizer, stream, mr); +} + } // namespace nvtext diff --git a/cpp/src/text/normalize.cuh b/cpp/src/text/normalize.cuh new file mode 100644 index 00000000000..3972726d536 --- /dev/null +++ b/cpp/src/text/normalize.cuh @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "text/subword/detail/cp_data.h" + +namespace nvtext { +namespace detail { + +/** + * @brief Bit used to filter out invalid code points. + * + * When normalizing characters to code point values, if this bit is set, + * the code point should be filtered out before returning from the normalizer. + */ +constexpr uint32_t FILTER_BIT = 22; + +/** + * @brief Retrieve new code point from metadata value. + * + * @param metadata Value from the codepoint_metadata table. + * @return The replacement character if appropriate. + */ +__device__ constexpr uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } + +/** + * @brief Retrieve token category from the metadata value. + * + * Category values are 0-5: + * 0 - character should be padded + * 1 - pad character if lower-case + * 2 - character should be removed + * 3 - remove character if lower-case + * 4 - whitespace character -- always replace + * 5 - uncategorized + * + * @param metadata Value from the codepoint_metadata table. + * @return Category value. + */ +__device__ constexpr uint32_t extract_token_cat(uint32_t metadata) +{ + return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; +} + +/** + * @brief Return true if category of metadata value specifies the character should be replaced. + */ +__device__ constexpr bool should_remove_cp(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be padded. + */ +__device__ constexpr bool should_add_spaces(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be replaced. + */ +__device__ constexpr bool always_replace(uint32_t metadata) +{ + return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; +} + +/** + * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. + */ +__device__ constexpr bool is_multi_char_transform(uint32_t metadata) +{ + return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; +} + +/** + * @brief Returns true if the byte passed in could be a valid head byte for + * a utf8 character. That is, not binary `10xxxxxx` + */ +__device__ constexpr bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } + +} // namespace detail +} // namespace nvtext diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 7a39199011e..4c54409c41a 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" @@ -38,81 +39,6 @@ namespace nvtext { namespace detail { namespace { -/** - * @brief Bit used to filter out invalid code points. - * - * When normalizing characters to code point values, if this bit is set, - * the code point should be filtered out before returning from the normalizer. - */ -constexpr uint32_t FILTER_BIT = 22; - -/** - * @brief Retrieve new code point from metadata value. - * - * @param metadata Value from the codepoint_metadata table. - * @return The replacement character if appropriate. - */ -__device__ uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } - -/** - * @brief Retrieve token category from the metadata value. - * - * Category values are 0-5: - * 0 - character should be padded - * 1 - pad character if lower-case - * 2 - character should be removed - * 3 - remove character if lower-case - * 4 - whitespace character -- always replace - * 5 - uncategorized - * - * @param metadata Value from the codepoint_metadata table. - * @return Category value. - */ -__device__ uint32_t extract_token_cat(uint32_t metadata) -{ - return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. - */ -__device__ bool should_remove_cp(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be padded. - */ -__device__ bool should_add_spaces(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. - */ -__device__ bool always_replace(uint32_t metadata) -{ - return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; -} - -/** - * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. - */ -__device__ bool is_multi_char_transform(uint32_t metadata) -{ - return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; -} - -/** - * @brief Returns true if the byte passed in could be a valid head byte for - * a utf8 character. That is, not binary `10xxxxxx` - */ -__device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } - /** * @brief Converts a UTF-8 character into a unicode code point value. * diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 2515cc917fa..530148eb654 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,6 +74,10 @@ TEST_F(TextNormalizeTest, NormalizeEmptyTest) EXPECT_EQ(results->size(), 0); results = nvtext::normalize_characters(strings_view, false); EXPECT_EQ(results->size(), 0); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + EXPECT_EQ(results->size(), 0); } TEST_F(TextNormalizeTest, AllNullStrings) @@ -84,6 +88,10 @@ TEST_F(TextNormalizeTest, AllNullStrings) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); results = nvtext::normalize_characters(strings_view, false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } TEST_F(TextNormalizeTest, SomeNullStrings) @@ -93,27 +101,21 @@ TEST_F(TextNormalizeTest, SomeNullStrings) auto results = nvtext::normalize_characters(strings_view, false); cudf::test::strings_column_wrapper expected({"", " . ", "a"}, {false, true, true}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, NormalizeCharacters) { // These include punctuation, accents, whitespace, and CJK characters - std::vector h_strings{"abc£def", - nullptr, - "éè â îô\taeio", - "\tĂĆĖÑ Ü", - "ACEN U", - "P^NP", - "$41.07", - "[a,b]", - "丏丟", - ""}; - auto validity = - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); - cudf::strings_column_view strings_view(strings); + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto sv = cudf::strings_column_view(input); { - auto results = nvtext::normalize_characters(strings_view, true); + auto results = nvtext::normalize_characters(sv, true); cudf::test::strings_column_wrapper expected({"abc£def", "", "ee a io aeio", @@ -124,11 +126,11 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = nvtext::normalize_characters(strings_view, false); + auto results = nvtext::normalize_characters(sv, false); cudf::test::strings_column_wrapper expected({"abc£def", "", "éè â îô aeio", @@ -139,11 +141,117 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } +TEST_F(TextNormalizeTest, WithNormalizer) +{ + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + // the following include punctuation, accents, whitespace, and CJK characters + auto input = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô\taeio", + "\tĂĆĖÑ Ü", + "ACEN U", + "P^NP", + "$41.07", + "[a,b]", + "丏丟", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto const sv = cudf::strings_column_view(input); + + auto normalizer = nvtext::create_character_normalizer(true); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "ee a io aeio", + " acen u", + "acen u", + "p ^ np", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // test normalizer re-use + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false); + results = nvtext::normalize_characters(sv, *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô aeio", + " ĂĆĖÑ Ü", + "ACEN U", + "P ^ NP", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(TextNormalizeTest, SpecialTokens) +{ + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + auto input = + cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", + "[bos]these should[sep]work too[eos]", + "some[non]tokens[eol]too", + long_row, + long_row, + long_row}); + + auto sv = cudf::strings_column_view(input); + auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); + auto stv = cudf::strings_column_view(special_tokens); + + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = cudf::test::strings_column_wrapper( + {" [bos] some strings with [pad] special [sep] tokens [eos] ", + " [bos] these should [sep] work too [eos] ", + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false, stv); + results = nvtext::normalize_characters(sv, *normalizer); + expected = cudf::test::strings_column_wrapper( + {" [BOS] Some strings with [PAD] special [SEP] tokens [EOS] ", + " [ bos ] these should [ sep ] work too [ eos ] ", + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(TextNormalizeTest, NormalizeSlicedColumn) { cudf::test::strings_column_wrapper strings( @@ -151,10 +259,21 @@ TEST_F(TextNormalizeTest, NormalizeSlicedColumn) std::vector sliced = cudf::split(strings, {4}); auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), true); - cudf::test::strings_column_wrapper expected({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + auto expected = + cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); + expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); - cudf::test::strings_column_wrapper expected2({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); + normalizer = nvtext::create_character_normalizer(false); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), *normalizer); + expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } diff --git a/python/cudf/cudf/core/character_normalizer.py b/python/cudf/cudf/core/character_normalizer.py new file mode 100644 index 00000000000..1240c0e1eb7 --- /dev/null +++ b/python/cudf/cudf/core/character_normalizer.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from __future__ import annotations + +import pylibcudf as plc + +import cudf + + +class CharacterNormalizer: + """ + A normalizer object used to normalize input text. + + Parameters + ---------- + do_lower : bool + If True, the normalizer should also lower-case + while normalizing. + special_tokens : cudf.Series + Series of special tokens. + """ + + def __init__( + self, + do_lower: bool, + special_tokens: cudf.Series = cudf.Series([], dtype="object"), + ) -> None: + self.normalizer = plc.nvtext.normalize.CharacterNormalizer( + do_lower, special_tokens._column.to_pylibcudf(mode="read") + ) + + def normalize(self, text: cudf.Series) -> cudf.Series: + """ + Parameters + ---------- + text : cudf.Series + The strings to be normalized. + + Returns + ------- + cudf.Series + Normalized strings + """ + result = text._column.normalize_characters(self.normalizer) + + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 04a72017c33..c0ad33ec7d6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4679,8 +4679,10 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" Normalizes strings characters for tokenizing. - This uses the normalizer that is built into the - subword_tokenize function which includes: + .. deprecated:: 25.04 + Use `CharacterNormalizer` instead. + + The normalizer function includes: - adding padding around punctuation (unicode category starts with "P") as well as certain ASCII symbols like "^" and "$" @@ -4720,8 +4722,13 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: 2 $ 99 dtype: object """ + warnings.warn( + "normalize_characters is deprecated and will be removed in a future " + "version. Use CharacterNormalizer instead.", + FutureWarning, + ) return self._return_or_inplace( - self._column.normalize_characters(do_lower) + self._column.characters_normalize(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -6256,14 +6263,25 @@ def normalize_spaces(self) -> Self: ) @acquire_spill_lock() - def normalize_characters(self, do_lower: bool = True) -> Self: + def characters_normalize(self, do_lower: bool = True) -> Self: return ColumnBase.from_pylibcudf( # type: ignore[return-value] - plc.nvtext.normalize.normalize_characters( + plc.nvtext.normalize.characters_normalize( self.to_pylibcudf(mode="read"), do_lower, ) ) + @acquire_spill_lock() + def normalize_characters( + self, normalizer: plc.nvtext.normalize.CharacterNormalizer + ) -> Self: + return ColumnBase.from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + normalizer, + ) + ) + @acquire_spill_lock() def replace_tokens( self, targets: Self, replacements: Self, delimiter: plc.Scalar diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 86e1e46c1a2..dc45827d2e8 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -8,6 +8,7 @@ import cudf from cudf.core.byte_pair_encoding import BytePairEncoder +from cudf.core.character_normalizer import CharacterNormalizer from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing import assert_eq @@ -251,7 +252,8 @@ def test_normalize_characters(): ] ) - actual = strings.str.normalize_characters() + normalizer_lower = CharacterNormalizer(True) + actual = normalizer_lower.normalize(strings.str) assert type(expected) is type(actual) assert_eq(expected, actual) @@ -265,7 +267,9 @@ def test_normalize_characters(): "Stock ^ $ 1", ] ) - actual = strings.str.normalize_characters(do_lower=False) + + normalizer = CharacterNormalizer(False) + actual = normalizer.normalize(strings.str) assert type(expected) is type(actual) assert_eq(expected, actual) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd index f8b082c8429..2cf2bfb8ac9 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -16,3 +16,16 @@ cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil: const column_view & strings, bool do_lower_case ) except +libcudf_exception_handler + + cdef struct character_normalizer "nvtext::character_normalizer": + pass + + cdef unique_ptr[character_normalizer] create_character_normalizer( + bool do_lower_case, + const column_view & strings + ) except +libcudf_exception_handler + + cdef unique_ptr[column] normalize_characters( + const column_view & strings, + const character_normalizer & normalizer + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd index 90676145afa..e6688e19762 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd @@ -1,9 +1,18 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp cimport bool +from libcpp.memory cimport unique_ptr from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.normalize cimport character_normalizer +cdef class CharacterNormalizer: + cdef unique_ptr[character_normalizer] c_obj cpdef Column normalize_spaces(Column input) -cpdef Column normalize_characters(Column input, bool do_lower_case) +cpdef Column characters_normalize(Column input, bool do_lower_case) + +cpdef Column normalize_characters( + Column input, + CharacterNormalizer normalizer +) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi index 1d90a5a8960..d722ef6c79e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi @@ -1,6 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.column import Column +class CharacterNormalizer: + def __init__(self, do_lower_case: bool, special_tokens: Column): ... + def normalize_spaces(input: Column) -> Column: ... -def normalize_characters(input: Column, do_lower_case: bool) -> Column: ... +def characters_normalize(input: Column, do_lower_case: bool) -> Column: ... +def normalize_characters( + input: Column, normalizer: CharacterNormalizer +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx index b259ccaefa6..6a18c205841 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -1,16 +1,37 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. +from cython.operator cimport dereference from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.nvtext.normalize cimport ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext cimport normalize as cpp_normalize -__all__ = ["normalize_characters", "normalize_spaces"] +__all__ = [ + "CharacterNormalizer" + "normalize_characters", + "normalize_spaces", + "characters_normalize" +] + +cdef class CharacterNormalizer: + """The normalizer object to be used with ``normalize_characters``. + + For details, see :cpp:class:`cudf::nvtext::character_normalizer`. + """ + def __cinit__(self, bool do_lower_case, Column tokens): + cdef column_view c_tokens = tokens.view() + with nogil: + self.c_obj = move( + cpp_normalize.create_character_normalizer( + do_lower_case, + c_tokens + ) + ) + + __hash__ = None cpdef Column normalize_spaces(Column input): """ @@ -32,12 +53,12 @@ cpdef Column normalize_spaces(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_normalize_spaces(input.view()) + c_result = cpp_normalize.normalize_spaces(input.view()) return Column.from_libcudf(move(c_result)) -cpdef Column normalize_characters(Column input, bool do_lower_case): +cpdef Column characters_normalize(Column input, bool do_lower_case): """ Normalizes strings characters for tokenizing. @@ -60,6 +81,38 @@ cpdef Column normalize_characters(Column input, bool do_lower_case): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_normalize_characters(input.view(), do_lower_case) + c_result = cpp_normalize.normalize_characters( + input.view(), + do_lower_case + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column normalize_characters(Column input, CharacterNormalizer normalizer): + """ + Normalizes strings characters for tokenizing. + + For details, see :cpp:func:`normalize_characters` + + Parameters + ---------- + input : Column + Input strings + normalizer : CharacterNormalizer + Normalizer object used for modifying the input column text + + Returns + ------- + Column + Normalized strings column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize.normalize_characters( + input.view(), + dereference(normalizer.c_obj.get()) + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py index 25b6d1389ec..47bbb191be6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import pyarrow as pa import pytest @@ -15,7 +15,7 @@ def norm_spaces_input_data(): @pytest.fixture(scope="module") def norm_chars_input_data(): - arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]", "[pad]"] return pa.array(arr) @@ -29,15 +29,98 @@ def test_normalize_spaces(norm_spaces_input_data): @pytest.mark.parametrize("do_lower", [True, False]) def test_normalize_characters(norm_chars_input_data, do_lower): - result = plc.nvtext.normalize.normalize_characters( + result = plc.nvtext.normalize.characters_normalize( plc.interop.from_arrow(norm_chars_input_data), do_lower, ) - expected = pa.array( - ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + else: + expected = pa.array( + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalizer(norm_chars_input_data, do_lower): + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + plc.nvtext.normalize.CharacterNormalizer( + do_lower, + plc.column_factories.make_empty_column(plc.types.TypeId.STRING), + ), + ) + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + else: + expected = pa.array( + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalizer_with_special_tokens(norm_chars_input_data, do_lower): + special_tokens = pa.array(["[pad]"]) + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + plc.nvtext.normalize.CharacterNormalizer( + do_lower, plc.interop.from_arrow(special_tokens) + ), ) - if not do_lower: + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [pad] ", + ] + ) + else: expected = pa.array( - ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 08", + " [ a , bb ] ", + " [pad] ", + ] ) assert_column_eq(result, expected) From 5eb552754020bed652f3f278a6b5cc494eeb9bce Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 25 Feb 2025 16:15:40 -0800 Subject: [PATCH 06/13] Remove unused var (#18096) The `cython_lib_dir` was removed as part of the switch to publishing a libcudf wheel. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/18096 --- python/cudf/CMakeLists.txt | 4 ---- python/cudf_kafka/CMakeLists.txt | 4 ---- python/pylibcudf/CMakeLists.txt | 4 ---- 3 files changed, 12 deletions(-) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 2a17bc5dbb7..090e475471d 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -37,7 +37,3 @@ rapids_cython_init() add_subdirectory(cudf/_lib) add_subdirectory(udf_cpp) - -if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}") -endif() diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt index 3e12eb6aa41..13b859bc33b 100644 --- a/python/cudf_kafka/CMakeLists.txt +++ b/python/cudf_kafka/CMakeLists.txt @@ -35,7 +35,3 @@ include(rapids-cython-core) rapids_cython_init() add_subdirectory(cudf_kafka/_lib) - -if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET cudf_kafka PATHS "${cython_lib_dir}") -endif() diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt index fe6e73a3f14..153570a4a7e 100644 --- a/python/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/CMakeLists.txt @@ -37,7 +37,3 @@ include(rapids-cython-core) rapids_cython_init() add_subdirectory(pylibcudf) - -if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}") -endif() From d8b3d801ec4830102242db1fa60a88e1a0bb7299 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 25 Feb 2025 16:58:14 -0800 Subject: [PATCH 07/13] Fix scatter_by_map with spilling enabled (#18095) closes https://github.com/rapidsai/cudf/issues/18088 Before the old Cython bindings of `columns_split` spill locked the conversion from libcudf to a cudf Python column. When I replaced these bindings, this spill locking was removed during the refactor. I'm spot checking that other APIs are not affected. If so I can open PRs for those Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/18095 --- python/cudf/cudf/core/indexed_frame.py | 6 +++++- python/cudf/cudf/tests/test_spilling.py | 11 ++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9c48b31a309..211d161696e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3308,9 +3308,13 @@ def _split(self, splits, keep_index: bool = True) -> list[Self]: splits, ) + @acquire_spill_lock() + def split_from_pylibcudf(split: list[plc.Column]) -> list[ColumnBase]: + return [ColumnBase.from_pylibcudf(col) for col in split] + return [ self._from_columns_like_self( - [ColumnBase.from_pylibcudf(col) for col in split], + split_from_pylibcudf(split), self._column_names, self.index.names if keep_index else None, ) diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 13d98e43ddc..08226dd7f6d 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. from __future__ import annotations import contextlib @@ -784,3 +784,12 @@ def test_spilling_and_copy_on_write(manager: SpillManager): assert not a.is_spilled assert a.owner.exposed assert not b.owner.exposed + + +def test_scatter_by_map(): + data = range(10) + with cudf.option_context("spill", True): + df = cudf.DataFrame(data) + result = df.scatter_by_map(data) + for i, res in zip(data, result): + assert_eq(res, cudf.DataFrame([i], index=[i])) From 46b9799ea55b899e08f6b758ec90e9742a72d159 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 25 Feb 2025 19:14:57 -0600 Subject: [PATCH 08/13] Fix `test_scan_csv_multi` cudf-polars test (#18064) The current implementation of `test_scan_csv_multi` does not work if the compute task is run on distinct worker processes (because it changes directory in lieu of using a proper file path). Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/18064 --- python/cudf_polars/tests/test_scan.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 9c58a24c065..8ff0db084b1 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -1,9 +1,7 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import os - import pytest import polars as pl @@ -203,8 +201,11 @@ def test_scan_csv_multi(tmp_path, filename, glob, nrows_skiprows): f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test*.csv").open("w") as f: f.write("""foo,bar,baz\n1,2,3\n3,4,5""") - os.chdir(tmp_path) - q = pl.scan_csv(filename, glob=glob, n_rows=n_rows, skip_rows=skiprows) + if isinstance(filename, list): + source = [tmp_path / fn for fn in filename] + else: + source = tmp_path / filename + q = pl.scan_csv(source, glob=glob, n_rows=n_rows, skip_rows=skiprows) assert_gpu_result_equal(q) From 72d5792c79f11c90f43c6991dd54e082b3c0ad98 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 26 Feb 2025 08:14:26 -0600 Subject: [PATCH 09/13] Relax inconsistent schema handling in `dask_cudf.read_parquet` (#17554) Addresses an issue raised offline by @praateekmahajan Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/cudf/pull/17554 --- .../dask_cudf/dask_cudf/_legacy/io/parquet.py | 18 +++---- .../dask_cudf/io/tests/test_parquet.py | 48 ++++++++++++++++++- 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py index c0792663c7e..c0b9d71653c 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -434,18 +434,12 @@ def set_object_dtypes_from_pa_schema(df, schema): # pyarrow schema. if schema: for col_name, col in df._data.items(): - if col_name is None: - # Pyarrow cannot handle `None` as a field name. - # However, this should be a simple range index that - # we can ignore anyway - continue - typ = cudf_dtype_from_pa_type(schema.field(col_name).type) - if ( - col_name in schema.names - and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype)) - and isinstance(col, cudf.core.column.StringColumn) - ): - df._data[col_name] = col.astype(typ) + if col_name in schema.names: + typ = cudf_dtype_from_pa_type(schema.field(col_name).type) + if not isinstance( + typ, (cudf.ListDtype, cudf.StructDtype) + ) and isinstance(col, cudf.core.column.StringColumn): + df._data[col_name] = col.astype(typ) to_parquet = dd.to_parquet diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 9f7031f4d2a..3a88668e6d2 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import dask @@ -486,6 +487,52 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): dd.assert_eq(ddf1.compute(), ddf2.compute()) +@pytest.mark.parametrize("specify_schema", [True, False]) +def test_read_inconsistent_schema(tmpdir, specify_schema): + if specify_schema: + # If we specify the expected schema, + # we also need to specify the partitioning. + kwargs = { + "dataset": { + "schema": pa.schema( + [ + ("id", pa.int64()), + ("text", pa.string()), + ("meta1", pa.struct([("field1", pa.string())])), + ] + ), + "partitioning": None, + }, + } + else: + kwargs = {} + + records = [ + {"id": 123, "text": "foo"}, + { + "text": "bar", + "meta1": [{"field1": "cat"}], + "id": 456, + }, + ] + columns = ["text", "id"] + pd.DataFrame(records[:1]).to_parquet(tmpdir / "part.0.parquet") + pd.DataFrame(records[1:]).to_parquet(tmpdir / "part.1.parquet") + # Check that cuDF and Dask cuDF match + dd.assert_eq( + cudf.read_parquet( + tmpdir, columns=columns, allow_mismatched_pq_schemas=True + ), + dask_cudf.read_parquet(tmpdir, columns=columns, **kwargs), + check_index=False, + ) + # Check that "pandas" and "cudf" backends match + dd.assert_eq( + dd.read_parquet(tmpdir, columns=columns), + dask_cudf.read_parquet(tmpdir, columns=columns, **kwargs), + ) + + @pytest.mark.parametrize( "data", [ @@ -526,7 +573,6 @@ def test_cudf_list_struct_write(tmpdir): def test_null_partition(tmpdir): - import pyarrow as pa from pyarrow.dataset import HivePartitioning ids = pd.Series([0, 1, None], dtype="Int64") From e5d866bc68c4762ebd6e3e888e4abeaf4ccd9302 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 26 Feb 2025 07:01:27 -0800 Subject: [PATCH 10/13] Short circuit Index.equal if compared Index isn't same type (#18067) closes https://github.com/rapidsai/cudf/issues/8689 Before, comparing two different Index subclasses would execute a GPU kernel when we know they wouldn't be equal (e.g. DatetimeIndex equals RangeIndex). This PR add a short circuit clause to check that we are comparing the same subclasses. Also ensures we don't return a `np.bool_` object from this result. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/18067 --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/index.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 06dc4058115..67a0aa7a781 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -713,7 +713,7 @@ def all(self, skipna: bool = True) -> bool: # is empty. if self.null_count == self.size: return True - return self.reduce("all") + return bool(self.reduce("all")) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1730a692dc1..f4e5f6e96ae 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1286,6 +1286,15 @@ def equals(self, other) -> bool: elif other_is_categorical and not self_is_categorical: self = self.astype(other.dtype) check_dtypes = True + elif ( + not self_is_categorical + and not other_is_categorical + and not isinstance(other, RangeIndex) + and not isinstance(self, type(other)) + ): + # Can compare Index to CategoricalIndex or RangeIndex + # Other comparisons are invalid + return False try: return self._column.equals( From 1a8d6368405fac3c5e55592fef2d9259081b045c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 26 Feb 2025 11:00:27 -0800 Subject: [PATCH 11/13] Enforce deprecation of dtype parameter in sum/product (#18070) xref https://github.com/rapidsai/cudf/pull/16313 Deprecated in 24.08 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18070 --- python/cudf/cudf/core/column/column.py | 20 +++++--------------- python/cudf/cudf/core/column/timedelta.py | 3 +-- python/cudf/cudf/core/indexed_frame.py | 8 -------- python/cudf/cudf/tests/test_reductions.py | 8 -------- 4 files changed, 6 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 67a0aa7a781..b57d1f03981 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,7 +2,6 @@ from __future__ import annotations -import warnings from collections import abc from collections.abc import MutableSequence, Sequence from functools import cached_property @@ -1946,8 +1945,7 @@ def _reduce( skipna=skipna, min_count=min_count ) if isinstance(preprocessed, ColumnBase): - dtype = kwargs.pop("dtype", None) - return preprocessed.reduce(op, dtype, **kwargs) + return preprocessed.reduce(op, **kwargs) return preprocessed def _can_return_nan(self, skipna: bool | None = None) -> bool: @@ -2110,16 +2108,8 @@ def scan(self, scan_op: str, inclusive: bool, **kwargs) -> Self: ) ) - def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: - if dtype is not None: - warnings.warn( - "dtype is deprecated and will be remove in a future release. " - "Cast the result (e.g. .astype) after the operation instead.", - FutureWarning, - ) - col_dtype = dtype - else: - col_dtype = self._reduction_result_dtype(reduction_op) + def reduce(self, reduction_op: str, **kwargs) -> ScalarLike: + col_dtype = self._reduction_result_dtype(reduction_op) # check empty case if len(self) <= self.null_count: @@ -2148,7 +2138,7 @@ def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: }: scale = -plc_scalar.type().scale() # https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql - p = col_dtype.precision + p = col_dtype.precision # type: ignore[union-attr] nrows = len(self) if reduction_op in {"min", "max"}: new_p = p @@ -2162,7 +2152,7 @@ def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: raise NotImplementedError( f"{reduction_op} not implemented for decimal types." ) - precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) + precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) # type: ignore[union-attr] new_dtype = type(col_dtype)(precision, scale) result_col = result_col.astype(new_dtype) elif isinstance(col_dtype, IntervalDtype): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 1cbbac0f8cc..8b0ef9f0cc8 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -452,14 +452,13 @@ def sum( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype | None = None, ) -> pd.Timedelta: return pd.Timedelta( # Since sum isn't overridden in Numerical[Base]Column, mypy only # sees the signature from Reducible (which doesn't have the extra # parameters from ColumnBase._reduce) so we have to ignore this. self.astype(np.dtype(np.int64)).sum( # type: ignore - skipna=skipna, min_count=min_count, dtype=dtype + skipna=skipna, min_count=min_count ), unit=self.time_unit, ).as_unit(self.time_unit) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 211d161696e..9d426ad6bf7 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1328,7 +1328,6 @@ def sum( self, axis=no_default, skipna=True, - dtype=None, numeric_only=False, min_count=0, **kwargs, @@ -1342,8 +1341,6 @@ def sum( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. numeric_only : bool, default False If True, includes only float, int, boolean columns. If False, will raise error in-case there are @@ -1373,7 +1370,6 @@ def sum( "sum", axis=axis, skipna=skipna, - dtype=dtype, numeric_only=numeric_only, min_count=min_count, **kwargs, @@ -1384,7 +1380,6 @@ def product( self, axis=no_default, skipna=True, - dtype=None, numeric_only=False, min_count=0, **kwargs, @@ -1398,8 +1393,6 @@ def product( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. numeric_only : bool, default False If True, includes only float, int, boolean columns. If False, will raise error in-case there are @@ -1432,7 +1425,6 @@ def product( "prod" if axis in {1, "columns"} else "product", axis=axis, skipna=skipna, - dtype=dtype, numeric_only=numeric_only, min_count=min_count, **kwargs, diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 80ffce9e8be..75e38b9246a 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -512,14 +512,6 @@ def test_reduction_column_multiindex(): assert_eq(result, expected) -@pytest.mark.parametrize("op", ["sum", "product"]) -def test_dtype_deprecated(op): - ser = cudf.Series(range(5)) - with pytest.warns(FutureWarning): - result = getattr(ser, op)(dtype=np.dtype(np.int8)) - assert isinstance(result, np.int8) - - @pytest.mark.parametrize( "columns", [pd.RangeIndex(2), pd.Index([0, 1], dtype="int8")] ) From 54e740af7a08b99cca84f4f668886031a2c36e71 Mon Sep 17 00:00:00 2001 From: MithunR Date: Wed, 26 Feb 2025 11:12:10 -0800 Subject: [PATCH 12/13] Remove static column vectors from window function tests. (#18099) Fixes #18079. This commit fixes the failures reported in #18079, where the use of static column vector objects in the tests causes the use of a CUDA runtime context before it's been initialized, causing the tests to fail with: ``` parallel_for failed: cudaErrorInvalidResourceHandle: invalid resource handle ``` The solution is to switch the static column vectors to runtime, as a member of the test utility class `rolling_runner`. Authors: - MithunR (https://github.com/mythrocks) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/18099 --- cpp/tests/rolling/offset_row_window_test.cpp | 28 +++++++++++--------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp index dcaa47e722b..4477ca388df 100644 --- a/cpp/tests/rolling/offset_row_window_test.cpp +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,18 +43,21 @@ auto constexpr null = int32_t{0}; // NULL representation for int32_t; auto no_nulls_list() { return nulls_at({}); } struct OffsetRowWindowTest : public cudf::test::BaseFixture { - static ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; - static ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - struct rolling_runner { cudf::window_bounds _preceding, _following; cudf::size_type _min_periods; bool _grouped = true; + ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; + ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; rolling_runner(cudf::window_bounds const& preceding, cudf::window_bounds const& following, cudf::size_type min_periods_ = 1) - : _preceding{preceding}, _following{following}, _min_periods{min_periods_} + : _preceding{preceding}, + _following{following}, + _min_periods{min_periods_}, + _keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1}, + _values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} { } @@ -80,9 +83,6 @@ struct OffsetRowWindowTest : public cudf::test::BaseFixture { }; }; -ints_column const OffsetRowWindowTest::_keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; -ints_column const OffsetRowWindowTest::_values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - auto const AGG_COUNT_NON_NULL = cudf::make_count_aggregation(cudf::null_policy::EXCLUDE); auto const AGG_COUNT_ALL = @@ -96,7 +96,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_3_to_Minus_1) { auto const preceding = cudf::window_bounds::get(3); auto const following = cudf::window_bounds::get(-1); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(true); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), ints_column{{0, 1, 2, 2, 2, 2, 0, 1, 2, 2}, nulls_at({0, 6})}); @@ -136,7 +137,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_3_to_Minus_1) { auto const preceding = cudf::window_bounds::get(3); auto const following = cudf::window_bounds::get(-1); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(false); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), ints_column{{0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, nulls_at({0})}); @@ -176,7 +178,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_0_to_2) { auto const preceding = cudf::window_bounds::get(0); auto const following = cudf::window_bounds::get(2); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(true); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(true); CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COUNT_NON_NULL), @@ -219,7 +222,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) { auto const preceding = cudf::window_bounds::get(0); auto const following = cudf::window_bounds::get(2); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(false); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), ints_column{{2, 2, 2, 2, 2, 2, 2, 2, 1, null}, nulls_at({9})}); From 79d0b75a5327f72cdc14297885257a8979bdf0f2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 26 Feb 2025 12:01:22 -0800 Subject: [PATCH 13/13] Align StringColumn constructor with ColumnBase base class (#18086) With this PR, the constructors of all subclasses of `ColumnBase` are aligned. This will allow us to, in the future, more easily align on an interface for a public `ColumnBase` that other libraries can use to extend cudf. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18086 --- python/cudf/cudf/core/column/column.py | 9 +++++---- python/cudf/cudf/core/column/string.py | 24 ++++++++++++++---------- python/cudf/cudf/tests/test_string.py | 10 +++++++++- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b57d1f03981..89ac39b2be5 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2312,13 +2312,14 @@ def build_column( offset=offset, null_count=null_count, ) - elif dtype.type in (np.object_, np.str_): + elif dtype == CUDF_STRING_DTYPE: return cudf.core.column.StringColumn( - data=data, - mask=mask, + data=data, # type: ignore[arg-type] size=size, + dtype=dtype, + mask=mask, offset=offset, - children=children, + children=children, # type: ignore[arg-type] null_count=null_count, ) elif isinstance(dtype, ListDtype): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c0ad33ec7d6..28e8b98edfe 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -21,7 +21,7 @@ import cudf.core.column.datetime as datetime from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop -from cudf.core.buffer import acquire_spill_lock +from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.core.scalar import pa_scalar_to_plc_scalar @@ -46,7 +46,6 @@ ScalarLike, SeriesOrIndex, ) - from cudf.core.buffer import Buffer from cudf.core.column.lists import ListColumn from cudf.core.column.numerical import NumericalColumn @@ -5595,13 +5594,14 @@ class StringColumn(column.ColumnBase): Parameters ---------- + data : Buffer + Buffer of the string data mask : Buffer The validity mask offset : int Data offset children : Tuple[Column] - Two non-null columns containing the string data and offsets - respectively + Columns containing the offsets """ _start_offset: int | None @@ -5629,14 +5629,20 @@ class StringColumn(column.ColumnBase): def __init__( self, - data: Buffer | None = None, + data: Buffer, + size: int | None, + dtype: np.dtype, mask: Buffer | None = None, - size: int | None = None, # TODO: make non-optional offset: int = 0, null_count: int | None = None, - children: tuple["column.ColumnBase", ...] = (), + children: tuple[column.ColumnBase] = (), # type: ignore[assignment] ): - dtype = cudf.api.types.dtype("object") + if not isinstance(data, Buffer): + raise ValueError("data must be a Buffer") + if dtype != CUDF_STRING_DTYPE: + raise ValueError(f"dtypy must be {CUDF_STRING_DTYPE}") + if len(children) > 1: + raise ValueError("StringColumn must have at most 1 offset column.") if size is None: for child in children: @@ -5731,8 +5737,6 @@ def base_size(self) -> int: # override for string column @property def data(self): - if self.base_data is None: - return None if self._data is None: if ( self.offset == 0 diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 164fcb06624..18aee0001c4 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -13,8 +13,11 @@ import pyarrow as pa import pytest +import rmm + import cudf from cudf import concat +from cudf.core.buffer import as_buffer from cudf.core.column.string import StringColumn from cudf.core.index import Index from cudf.testing import assert_eq @@ -1202,7 +1205,12 @@ def test_string_misc_name(ps_gs, name): def test_string_no_children_properties(): - empty_col = StringColumn(children=()) + empty_col = StringColumn( + as_buffer(rmm.DeviceBuffer(size=0)), + size=0, + dtype=np.dtype("object"), + children=(), + ) assert empty_col.base_children == () assert empty_col.base_size == 0