Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-25.04' into cln/dtype/a…
Browse files Browse the repository at this point in the history
…stype
  • Loading branch information
mroeschke committed Feb 26, 2025
2 parents 9b500b4 + 79d0b75 commit d58363f
Show file tree
Hide file tree
Showing 42 changed files with 1,186 additions and 301 deletions.
11 changes: 0 additions & 11 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ jobs:
- conda-python-cudf-tests
- conda-python-other-tests
- conda-java-tests
- static-configure
- conda-notebook-tests
- docs-build
- wheel-build-libcudf
Expand Down Expand Up @@ -192,16 +191,6 @@ jobs:
arch: "amd64"
container_image: "rapidsai/ci-conda:latest"
run_script: "ci/test_java.sh"
static-configure:
needs: checks
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
build_type: pull-request
# Use the wheel container so we can skip conda solves and since our
# primary static consumers (Spark) are not in conda anyway.
container_image: "rapidsai/ci-wheel:latest"
run_script: "ci/configure_cpp_static.sh"
conda-notebook-tests:
needs: [conda-python-build, changed-files]
secrets: inherit
Expand Down
11 changes: 11 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,14 @@ jobs:
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
script: "ci/test_cudf_polars_polars_tests.sh"
narwhals-tests:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
build_type: ${{ inputs.build_type }}
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
node_type: "gpu-l4-latest-1"
container_image: "rapidsai/ci-conda:latest"
run_script: ci/test_narwhals.sh
21 changes: 0 additions & 21 deletions ci/configure_cpp_static.sh

This file was deleted.

6 changes: 3 additions & 3 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ dependencies:
- nbsphinx
- ninja
- notebook
- numba-cuda>=0.2.0,<0.3.0a0
- numba>=0.59.1,<0.61.0a0
- numpy>=1.23,<3.0a0
- numba-cuda>=0.4.0,<0.5.0a0
- numba>=0.59.1,<0.62.0a0
- numpy>=1.23,<2.1
- numpydoc
- nvcc_linux-64=11.8
- nvcomp==4.2.0.11
Expand Down
6 changes: 3 additions & 3 deletions conda/environments/all_cuda-128_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ dependencies:
- nbsphinx
- ninja
- notebook
- numba-cuda>=0.2.0,<0.3.0a0
- numba>=0.59.1,<0.61.0a0
- numpy>=1.23,<3.0a0
- numba-cuda>=0.4.0,<0.5.0a0
- numba>=0.59.1,<0.62.0a0
- numpy>=1.23,<2.1
- numpydoc
- nvcomp==4.2.0.11
- nvtx>=0.2.1
Expand Down
6 changes: 3 additions & 3 deletions conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,9 @@ requirements:
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.4dev0
- cupy >=12.0.0
- numba-cuda >=0.2.0,<0.3.0a0
- numba >=0.59.1,<0.61.0a0
- numpy >=1.23,<3.0a0
- numba-cuda >=0.4.0,<0.5.0a0
- numba >=0.59.1,<0.62.0a0
- numpy >=1.23,<2.1
- pyarrow>=14.0.0,<20.0.0a0
- libcudf ={{ version }}
- pylibcudf ={{ version }}
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/pylibcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ requirements:
- python
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.4dev0
- numpy >=1.23,<3.0a0
- numpy >=1.23,<2.1
- pyarrow>=14.0.0,<20.0.0a0
- libcudf ={{ version }}
- {{ pin_compatible('rmm', max_pin='x.x') }}
Expand Down
9 changes: 6 additions & 3 deletions cpp/benchmarks/text/normalize.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -48,15 +48,18 @@ static void bench_normalize(nvbench::state& state)
[&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); });
} else {
bool const to_lower = (normalize_type == "to_lower");
// we expect the normalizer to be created once and re-used
// so creating it is not measured
auto normalizer = nvtext::create_character_normalizer(to_lower);
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::normalize_characters(input, to_lower);
auto result = nvtext::normalize_characters(input, *normalizer);
});
}
}

NVBENCH_BENCH(bench_normalize)
.set_name("normalize")
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("max_width", {128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_string_axis("type", {"spaces", "characters", "to_lower"});
14 changes: 13 additions & 1 deletion cpp/include/cudf/strings/detail/utilities.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -96,5 +96,17 @@ int64_t get_offset_value(cudf::column_view const& offsets,
size_type index,
rmm::cuda_stream_view stream);

/**
* @brief Return the first and last offset in the given strings column
*
* This accounts for sliced input columns as well.
* An empty input column yields `{0, 0}`.
*
* @param input Strings column
* @param stream CUDA stream used for device memory operations and kernel launches
* @return Pair holding the first offset value followed by the last offset value
*/
std::pair<int64_t, int64_t> get_first_and_last_offset(cudf::strings_column_view const& input,
rmm::cuda_stream_view stream);

} // namespace strings::detail
} // namespace CUDF_EXPORT cudf
111 changes: 110 additions & 1 deletion cpp/include/nvtext/normalize.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,6 +16,7 @@
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/export.hpp>
#include <cudf/utilities/memory_resource.hpp>
Expand Down Expand Up @@ -107,5 +108,113 @@ std::unique_ptr<cudf::column> normalize_characters(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Normalizer object to be used with nvtext::normalize_characters
*
* Use nvtext::create_character_normalizer to create this object.
*
* This normalizer includes:
*
* - adding padding around punctuation (unicode category starts with "P")
* as well as certain ASCII symbols like "^" and "$"
* - adding padding around the [CJK Unicode block
* characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block))
* - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "`
* - removing control characters (unicode categories "Cc" and "Cf")
*
* The padding process adds a single space before and after the character.
* Details on _unicode category_ can be found here:
* https://unicodebook.readthedocs.io/unicode.html#categories
*
* If `do_lower_case = true`, lower-casing also removes any accents. The
* accents cannot be removed from upper-case characters without lower-casing
* and lower-casing cannot be performed without also removing accents.
* However, if the accented character is already lower-case, then only the
* accent is removed.
*
* If `special_tokens` are included, the padding after `[` and before `]` is not
* inserted if the characters between them match one of the given tokens.
* Also, the `special_tokens` are expected to include the `[]` characters
* at the beginning and end of each string appropriately.
*/
struct character_normalizer {
/**
* @brief Normalizer object constructor
*
* This initializes and holds the character normalizing tables and settings.
*
* @param do_lower_case If true, upper-case characters are converted to
* lower-case and accents are stripped from those characters.
* If false, accented and upper-case characters are not transformed.
* @param special_tokens Each row is a token including the `[]` brackets.
* For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]`
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the device memory held by this object
*/
character_normalizer(bool do_lower_case,
cudf::strings_column_view const& special_tokens,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/**
* @brief Destructor
*
* Only declared here; the definition lives outside this header because
* `character_normalizer_impl` is an incomplete type at this point.
*/
~character_normalizer();

/// Implementation type; only forward-declared in this header
struct character_normalizer_impl;
/// Pointer owning the implementation object
std::unique_ptr<character_normalizer_impl> _impl;
};

/**
* @brief Create a normalizer object
*
* Creates a normalizer object which can be reused on multiple calls to
* nvtext::normalize_characters
*
* @see nvtext::character_normalizer
*
* @param do_lower_case If true, upper-case characters are converted to
* lower-case and accents are stripped from those characters.
* If false, accented and upper-case characters are not transformed.
* @param special_tokens Individual tokens including `[]` brackets.
* Default is no special tokens (a zero-row strings column).
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned object's device memory
* @return Object to be used with nvtext::normalize_characters
*/
std::unique_ptr<character_normalizer> create_character_normalizer(
bool do_lower_case,
cudf::strings_column_view const& special_tokens = cudf::strings_column_view(cudf::column_view{
cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0}),
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Normalizes the text in input strings column
*
* @see nvtext::character_normalizer for details on the normalizer behavior
*
* @code{.pseudo}
* cn = create_character_normalizer(true)
* s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"]
* s1 = normalize_characters(s,cn)
* s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "]
*
* cn = create_character_normalizer(false)
* s2 = normalize_characters(s,cn)
* s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "]
* @endcode
*
* A null input element at row `i` produces a corresponding null entry
* for row `i` in the output column.
*
* @param input The input strings to normalize
* @param normalizer Normalizer to use for this function;
* see nvtext::create_character_normalizer for creating one
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Memory resource to allocate any returned objects
* @return Normalized strings column
*/
std::unique_ptr<cudf::column> normalize_characters(
cudf::strings_column_view const& input,
character_normalizer const& normalizer,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/** @} */ // end of group
} // namespace CUDF_EXPORT nvtext
14 changes: 13 additions & 1 deletion cpp/src/strings/utilities.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -180,6 +180,18 @@ int64_t get_offset_value(cudf::column_view const& offsets,
: cudf::detail::get_value<int32_t>(offsets, index, stream);
}

/**
 * Fetch the offset values that bound the rows of `input`.
 *
 * An empty column gives `{0, 0}`. Otherwise the first value is read at index
 * `input.offset()` of the offsets column (or taken as 0 when that index is 0)
 * and the last value is read at index `input.offset() + input.size()`.
 *
 * @param input Strings column whose offsets are inspected
 * @param stream CUDA stream passed through to the offset lookups
 * @return Pair of {first offset, last offset}
 */
std::pair<int64_t, int64_t> get_first_and_last_offset(cudf::strings_column_view const& input,
                                                      rmm::cuda_stream_view stream)
{
  // Nothing to look up for a column without rows
  if (input.is_empty()) { return {0L, 0L}; }

  // A zero column offset means the first offset is taken to be 0 without a lookup
  int64_t first_offset = 0;
  if (input.offset() != 0) {
    first_offset =
      cudf::strings::detail::get_offset_value(input.offsets(), input.offset(), stream);
  }

  // The entry just past the final row marks where the last row ends
  auto const last_index = input.offset() + input.size();
  auto const last_offset =
    cudf::strings::detail::get_offset_value(input.offsets(), last_index, stream);

  return {first_offset, last_offset};
}

} // namespace detail

rmm::device_uvector<string_view> create_string_vector_from_column(
Expand Down
Loading

0 comments on commit d58363f

Please sign in to comment.