Merge remote-tracking branch 'upstream/branch-25.04' into cln/dtype/a…

…stype
rapidsai · Feb 26, 2025 · d58363f · d58363f
2 parents 9b500b4 + 79d0b75
commit d58363f
Show file tree

Hide file tree

Showing 42 changed files with 1,186 additions and 301 deletions.
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -24,7 +24,6 @@ jobs:
       - conda-python-cudf-tests
       - conda-python-other-tests
       - conda-java-tests
-      - static-configure
       - conda-notebook-tests
       - docs-build
       - wheel-build-libcudf
@@ -192,16 +191,6 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_java.sh"
-  static-configure:
-    needs: checks
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
-    with:
-      build_type: pull-request
-      # Use the wheel container so we can skip conda solves and since our
-      # primary static consumers (Spark) are not in conda anyway.
-      container_image: "rapidsai/ci-wheel:latest"
-      run_script: "ci/configure_cpp_static.sh"
   conda-notebook-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -168,3 +168,14 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       script: "ci/test_cudf_polars_polars_tests.sh"
+  narwhals-tests:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: ${{ inputs.build_type }}
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      node_type: "gpu-l4-latest-1"
+      container_image: "rapidsai/ci-conda:latest"
+      run_script: ci/test_narwhals.sh
diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -54,9 +54,9 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.2.0,<0.3.0a0
-- numba>=0.59.1,<0.61.0a0
-- numpy>=1.23,<3.0a0
+- numba-cuda>=0.4.0,<0.5.0a0
+- numba>=0.59.1,<0.62.0a0
+- numpy>=1.23,<2.1
 - numpydoc
 - nvcc_linux-64=11.8
 - nvcomp==4.2.0.11

diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml
@@ -53,9 +53,9 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.2.0,<0.3.0a0
-- numba>=0.59.1,<0.61.0a0
-- numpy>=1.23,<3.0a0
+- numba-cuda>=0.4.0,<0.5.0a0
+- numba>=0.59.1,<0.62.0a0
+- numpy>=1.23,<2.1
 - numpydoc
 - nvcomp==4.2.0.11
 - nvtx>=0.2.1

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
@@ -75,9 +75,9 @@ requirements:
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.2.4dev0
     - cupy >=12.0.0
-    - numba-cuda >=0.2.0,<0.3.0a0
-    - numba >=0.59.1,<0.61.0a0
-    - numpy >=1.23,<3.0a0
+    - numba-cuda >=0.4.0,<0.5.0a0
+    - numba >=0.59.1,<0.62.0a0
+    - numpy >=1.23,<2.1
     - pyarrow>=14.0.0,<20.0.0a0
     - libcudf ={{ version }}
     - pylibcudf ={{ version }}

diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml
@@ -73,7 +73,7 @@ requirements:
     - python
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.2.4dev0
-    - numpy >=1.23,<3.0a0
+    - numpy >=1.23,<2.1
     - pyarrow>=14.0.0,<20.0.0a0
     - libcudf ={{ version }}
     - {{ pin_compatible('rmm', max_pin='x.x') }}

diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -48,15 +48,18 @@ static void bench_normalize(nvbench::state& state)
                [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); });
   } else {
     bool const to_lower = (normalize_type == "to_lower");
+    // we expect the normalizer to be created once and re-used
+    // so creating it is not measured
+    auto normalizer = nvtext::create_character_normalizer(to_lower);
     state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-      auto result = nvtext::normalize_characters(input, to_lower);
+      auto result = nvtext::normalize_characters(input, *normalizer);
     });
   }
 }
 
 NVBENCH_BENCH(bench_normalize)
   .set_name("normalize")
   .add_int64_axis("min_width", {0})
-  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("max_width", {128, 256})
   .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_string_axis("type", {"spaces", "characters", "to_lower"});
diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -96,5 +96,17 @@ int64_t get_offset_value(cudf::column_view const& offsets,
                          size_type index,
                          rmm::cuda_stream_view stream);
 
+/**
+ * @brief Return the first and last offset in the given strings column
+ *
+ * This accounts for sliced input columns as well.
+ *
+ * @param input Strings column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return First and last offset values
+ */
+std::pair<int64_t, int64_t> get_first_and_last_offset(cudf::strings_column_view const& input,
+                                                      rmm::cuda_stream_view stream);
+
 }  // namespace strings::detail
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
 #include <cudf/utilities/memory_resource.hpp>
@@ -107,5 +108,113 @@ std::unique_ptr<cudf::column> normalize_characters(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Normalizer object to be used with nvtext::normalize_characters
+ *
+ * Use nvtext::create_character_normalizer to create this object.
+ *
+ * This normalizer includes:
+ *
+ * - adding padding around punctuation (unicode category starts with "P")
+ *   as well as certain ASCII symbols like "^" and "$"
+ * - adding padding around the [CJK Unicode block
+ * characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block))
+ * - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "`
+ * - removing control characters (unicode categories "Cc" and "Cf")
+ *
+ * The padding process adds a single space before and after the character.
+ * Details on _unicode category_ can be found here:
+ * https://unicodebook.readthedocs.io/unicode.html#categories
+ *
+ * If `do_lower_case = true`, lower-casing also removes any accents. The
+ * accents cannot be removed from upper-case characters without lower-casing
+ * and lower-casing cannot be performed without also removing accents.
+ * However, if the accented character is already lower-case, then only the
+ * accent is removed.
+ *
+ * If `special_tokens` are included the padding after `[` and before `]` is not
+ * inserted if the characters between them match one of the given tokens.
+ * Also, the `special_tokens` are expected to include the `[]` characters
+ * at the beginning of and end of each string appropriately.
+ */
+struct character_normalizer {
+  /**
+   * @brief Normalizer object constructor
+   *
+   * This initializes and holds the character normalizing tables and settings.
+   *
+   * @param do_lower_case If true, upper-case characters are converted to
+   *        lower-case and accents are stripped from those characters.
+   *        If false, accented and upper-case characters are not transformed.
+   * @param special_tokens Each row is a token including the `[]` brackets.
+   *        For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]`
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr Device memory resource used to allocate device memory for this normalizer
+   */
+  character_normalizer(bool do_lower_case,
+                       cudf::strings_column_view const& special_tokens,
+                       rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                       rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+  ~character_normalizer();  // declared only; presumably defined where the impl type is complete
+
+  struct character_normalizer_impl;  // forward declaration; the definition is not in this header
+  std::unique_ptr<character_normalizer_impl> _impl;  // owning pointer to the implementation object
+};
+
+/**
+ * @brief Create a normalizer object
+ *
+ * Creates a normalizer object which can be reused on multiple calls to
+ * nvtext::normalize_characters
+ *
+ * @see nvtext::character_normalizer
+ *
+ * @param do_lower_case If true, upper-case characters are converted to
+ *        lower-case and accents are stripped from those characters.
+ *        If false, accented and upper-case characters are not transformed.
+ * @param special_tokens Individual tokens including `[]` brackets.
+ *        Default is no special tokens.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned normalizer's device memory
+ * @return Object to be used with nvtext::normalize_characters
+ */
+std::unique_ptr<character_normalizer> create_character_normalizer(
+  bool do_lower_case,
+  cudf::strings_column_view const& special_tokens = cudf::strings_column_view(cudf::column_view{
+    cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0}),
+  rmm::cuda_stream_view stream                    = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr               = cudf::get_current_device_resource_ref());
+
+/**
+ * @brief Normalizes the text in input strings column
+ *
+ * @see nvtext::character_normalizer for details on the normalizer behavior
+ *
+ * @code{.pseudo}
+ * cn = create_character_normalizer(true)
+ * s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"]
+ * s1 = normalize_characters(s,cn)
+ * s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "]
+ *
+ * cn = create_character_normalizer(false)
+ * s2 = normalize_characters(s,cn)
+ * s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "]
+ * @endcode
+ *
+ * A null input element at row `i` produces a corresponding null entry
+ * for row `i` in the output column.
+ *
+ * @param input The input strings to normalize
+ * @param normalizer Normalizer to use for this function
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Memory resource to allocate any returned objects
+ * @return Normalized strings column
+ */
+std::unique_ptr<cudf::column> normalize_characters(
+  cudf::strings_column_view const& input,
+  character_normalizer const& normalizer,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -180,6 +180,18 @@ int64_t get_offset_value(cudf::column_view const& offsets,
                                 : cudf::detail::get_value<int32_t>(offsets, index, stream);
 }
 
+std::pair<int64_t, int64_t> get_first_and_last_offset(cudf::strings_column_view const& input,
+                                                      rmm::cuda_stream_view stream)
+{  // returns {first, last} offset values for the rows of a possibly sliced strings column
+  if (input.is_empty()) { return {0L, 0L}; }  // empty column: return zeros, no offset lookups
+  auto const first_offset = (input.offset() == 0) ? 0  // unsliced: taken as 0, lookup skipped
+                                                  : cudf::strings::detail::get_offset_value(
+                                                      input.offsets(), input.offset(), stream);
+  auto const last_offset =  // offsets entry at index size() + offset(), i.e. past the last row
+    cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream);
+  return {first_offset, last_offset};
+}
+
 }  // namespace detail
 
 rmm::device_uvector<string_view> create_string_vector_from_column(