diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 11104037c5e..148861c0fa2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,6 +34,7 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} + node_type: "cpu16" python-build: needs: [cpp-build] secrets: inherit @@ -77,6 +78,7 @@ jobs: branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} + node_type: "cpu16" script: ci/build_wheel_libcudf.sh wheel-publish-libcudf: needs: wheel-build-libcudf diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 38b890893d0..2c583598f54 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -24,7 +24,6 @@ jobs: - conda-python-cudf-tests - conda-python-other-tests - conda-java-tests - - static-configure - conda-notebook-tests - docs-build - wheel-build-libcudf @@ -192,16 +191,6 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" - static-configure: - needs: checks - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 - with: - build_type: pull-request - # Use the wheel container so we can skip conda solves and since our - # primary static consumers (Spark) are not in conda anyway. 
- container_image: "rapidsai/ci-wheel:latest" - run_script: "ci/configure_cpp_static.sh" conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 12f6d751493..8357a12e221 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -46,18 +46,6 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" - static-configure: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 - with: - build_type: ${{ inputs.build_type }} - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - # Use the wheel container so we can skip conda solves and since our - # primary static consumers (Spark) are not in conda anyway. - container_image: "rapidsai/ci-wheel:latest" - run_script: "ci/configure_cpp_static.sh" cpp-linters: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 @@ -168,3 +156,14 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: "ci/test_cudf_polars_polars_tests.sh" + narwhals-tests: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + with: + build_type: ${{ inputs.build_type }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + node_type: "gpu-l4-latest-1" + container_image: "rapidsai/ci-conda:latest" + run_script: ci/test_narwhals.sh diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 3d06eacf9ff..78a15bc8092 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
set -euo pipefail @@ -17,10 +17,24 @@ rapids-logger "Begin cpp build" sccache --zero-stats -# With boa installed conda build forward to boa -RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \ - conda/recipes/libcudf +RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) +export RAPIDS_PACKAGE_VERSION + +source rapids-rattler-channel-string + +# --no-build-id allows for caching with `sccache` +# more info is available at +# https://rattler.build/latest/tips_and_tricks/#using-sccache-or-ccache-with-rattler-build +rattler-build build --recipe conda/recipes/libcudf \ + --experimental \ + --no-build-id \ + --channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats +# remove build_cache directory +rm -rf "$RAPIDS_CONDA_BLD_OUTPUT_DIR"/build_cache + rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index ed90041cc77..1dd8b67dfbb 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -1,10 +1,8 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
set -euo pipefail -rapids-configure-conda-channels - source rapids-configure-sccache source rapids-date-string @@ -19,53 +17,100 @@ rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) +export RAPIDS_PACKAGE_VERSION + +# populates `RATTLER_CHANNELS` array +source rapids-rattler-channel-string + +rapids-logger "Prepending channel ${CPP_CHANNEL} to RATTLER_CHANNELS" + +RATTLER_CHANNELS=("--channel" "${CPP_CHANNEL}" "${RATTLER_CHANNELS[@]}") + sccache --zero-stats -# TODO: Remove `--no-test` flag once importing on a CPU -# node works correctly -# With boa installed conda build forwards to the boa builder +rapids-logger "Building pylibcudf" -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ - --no-test \ - --channel "${CPP_CHANNEL}" \ - conda/recipes/pylibcudf +# TODO: Remove `--test skip` flag once importing on a CPU node works correctly +# --no-build-id allows for caching with `sccache` +# more info is available at +# https://rattler.build/latest/tips_and_tricks/#using-sccache-or-ccache-with-rattler-build +rattler-build build --recipe conda/recipes/pylibcudf \ + --experimental \ + --no-build-id \ + --channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + --test skip \ + "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats sccache --zero-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ - --no-test \ - --channel "${CPP_CHANNEL}" \ - --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/cudf +rapids-logger "Building cudf" + +rattler-build build --recipe conda/recipes/cudf \ + --experimental \ + --no-build-id \ + --channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + --test skip \ + "${RATTLER_CHANNELS[@]}" + +sccache --show-adv-stats +sccache --zero-stats + +rapids-logger "Building dask-cudf" + +rattler-build build --recipe conda/recipes/dask-cudf \ + --experimental \ + --no-build-id \ + 
--channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + --test skip \ + "${RATTLER_CHANNELS[@]}" + +sccache --show-adv-stats +sccache --zero-stats + +rapids-logger "Building cudf_kafka" + +rattler-build build --recipe conda/recipes/cudf_kafka \ + --experimental \ + --no-build-id \ + --channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + --test skip \ + "${RATTLER_CHANNELS[@]}" + +sccache --show-adv-stats +sccache --zero-stats + +rapids-logger "Building custreamz" + +rattler-build build --recipe conda/recipes/custreamz \ + --experimental \ + --no-build-id \ + --channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + --test skip \ + "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats sccache --zero-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ - --no-test \ - --channel "${CPP_CHANNEL}" \ - --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/dask-cudf +rapids-logger "Building cudf-polars" -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ - --no-test \ - --channel "${CPP_CHANNEL}" \ - --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/cudf_kafka +rattler-build build --recipe conda/recipes/cudf-polars \ + --experimental \ + --no-build-id \ + --channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + --test skip \ + "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ - --no-test \ - --channel "${CPP_CHANNEL}" \ - --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/custreamz - -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ - --no-test \ - --channel "${CPP_CHANNEL}" \ - --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/cudf-polars +# remove build_cache directory +rm -rf "$RAPIDS_CONDA_BLD_OUTPUT_DIR"/build_cache rapids-upload-conda-to-s3 python diff --git 
a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh deleted file mode 100755 index 3d0647a96f6..00000000000 --- a/ci/configure_cpp_static.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024-2025, NVIDIA CORPORATION. - -set -euo pipefail - -source rapids-date-string - -rapids-logger "Configure static cpp build" - -ENV_YAML_DIR="$(mktemp -d)" -REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt" - -rapids-dependency-file-generator \ - --output requirements \ - --file-key test_static_build \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}" - -rapids-pip-retry install -r "${REQUIREMENTS_FILE}" -pyenv rehash - -cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DCUDF_USE_ARROW_STATIC=ON -DBUILD_TESTS=OFF diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh index e881055e9e3..5a1d5f56bf0 100755 --- a/ci/run_cudf_polars_pytests.sh +++ b/ci/run_cudf_polars_pytests.sh @@ -17,5 +17,5 @@ python -m pytest --cache-clear "$@" tests --executor dask-experimental # Test the "dask-experimental" executor with Distributed cluster # Not all tests pass yet, deselecting by name those that are failing. 
python -m pytest --cache-clear "$@" tests --executor dask-experimental --dask-cluster \ - -k "not test_groupby_maintain_order_random and not test_scan_csv_multi and not test_select_literal_series" \ - --cov-fail-under=89 # Override coverage, Distributed cluster coverage not yet 100% + -k "not test_groupby_maintain_order_random and not test_scan_csv_multi and not test_select_literal_series and not test_can_convert_lists and not test_executor_basics and not test_replace_literal and not test_hconcat_different_heights and not test_join and not test_dataframescan and not test_strip_chars" \ + --cov-fail-under=80 # Override coverage, Distributed cluster coverage not yet 100% diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index 3466edacfc5..1df7bb61834 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -26,6 +26,8 @@ git clone https://github.com/pola-rs/polars.git --branch "${TAG}" --depth 1 # Install requirements for running polars tests rapids-logger "Install polars test requirements" +# TODO: Remove sed command when polars-cloud supports 1.23 +sed -i '/^polars-cloud$/d' polars/py-polars/requirements-dev.txt rapids-pip-retry install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt # shellcheck disable=SC2317 diff --git a/ci/test_narwhals.sh b/ci/test_narwhals.sh index 4a32ff0b0fd..28eceff2f80 100755 --- a/ci/test_narwhals.sh +++ b/ci/test_narwhals.sh @@ -26,6 +26,7 @@ rapids-logger "Run narwhals tests for cuDF" python -m pytest \ --cache-clear \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-narwhals.xml" \ + -p cudf.testing.narwhals_test_plugin \ --numprocesses=8 \ --dist=worksteal \ --constructors=cudf diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cc674732ba4..a23981b4e72 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml 
@@ -54,9 +54,9 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0a0 -- numba>=0.59.1,<0.61.0a0 -- numpy>=1.23,<3.0a0 +- numba-cuda>=0.4.0,<0.5.0a0 +- numba>=0.59.1,<0.62.0a0 +- numpy>=1.23,<2.1 - numpydoc - nvcc_linux-64=11.8 - nvcomp==4.2.0.11 @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.20,<1.23 +- polars>=1.20,<1.24 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<20.0.0a0 diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index 7593a72cc68..e2b9302dc36 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -53,9 +53,9 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0a0 -- numba>=0.59.1,<0.61.0a0 -- numpy>=1.23,<3.0a0 +- numba-cuda>=0.4.0,<0.5.0a0 +- numba>=0.59.1,<0.62.0a0 +- numpy>=1.23,<2.1 - numpydoc - nvcomp==4.2.0.11 - nvtx>=0.2.1 @@ -64,7 +64,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.20,<1.23 +- polars>=1.20,<1.24 - pre-commit - pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/recipes/cudf-polars/build.sh b/conda/recipes/cudf-polars/build.sh deleted file mode 100644 index 06e2f1bcb99..00000000000 --- a/conda/recipes/cudf-polars/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -# This assumes the script is executed from the root of the repo directory -./build.sh cudf_polars diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml deleted file mode 100644 index 1d36ab2a3e4..00000000000 --- a/conda/recipes/cudf-polars/meta.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. - -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: cudf-polars - version: {{ version }} - -source: - path: ../../.. - -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=cudf-polars-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=cudf-polars-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - -requirements: - host: - - python - - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - setuptools - - cuda-version ={{ cuda_version }} - run: - - python - - pylibcudf ={{ version }} - - polars >=1.20,<1.23 - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - -test: - requires: - - cuda-version ={{ cuda_version }} - imports: - - cudf_polars - - -about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: cudf-polars library diff --git a/conda/recipes/cudf-polars/recipe.yaml b/conda/recipes/cudf-polars/recipe.yaml new file mode 100644 index 00000000000..8eaf7e4f843 --- /dev/null +++ b/conda/recipes/cudf-polars/recipe.yaml @@ -0,0 +1,67 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
+schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + py_version: ${{ env.get("RAPIDS_PY_VERSION") }} + py_buildstring: ${{ py_version | version_to_buildstring }} + head_rev: ${{ git.head_rev(".")[:8] }} + +package: + name: cudf-polars + version: ${{ version }} + +source: + path: ../../.. + +build: + string: cuda${{ cuda_major }}_py${{ py_buildstring }}_${{ date_string }}_${{ head_rev }} + script: + content: | + ./build.sh cudf_polars + secrets: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + env: + CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} + CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} + CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} + CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + SCCACHE_S3_KEY_PREFIX: cudf-polars-${{ env.get("RAPIDS_CONDA_ARCH") }} + +requirements: + host: + - python =${{ py_version }} + - pip + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - setuptools + - cuda-version =${{ cuda_version }} + run: + - python + - pylibcudf =${{ version }} + - polars >=1.20,<1.24 + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + ignore_run_exports: + by_name: + - cuda-version + +tests: + - python: + imports: + - cudf_polars + pip_check: false + +about: + homepage: ${{ 
load_from_file("python/cudf_polars/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/cudf_polars/pyproject.toml").project.license.text }} + summary: ${{ load_from_file("python/cudf_polars/pyproject.toml").project.description }} diff --git a/conda/recipes/cudf/build.sh b/conda/recipes/cudf/build.sh deleted file mode 100644 index 43d046402c7..00000000000 --- a/conda/recipes/cudf/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. - -# This assumes the script is executed from the root of the repo directory -./build.sh cudf diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml deleted file mode 100644 index f817bc12c5b..00000000000 --- a/conda/recipes/cudf/meta.yaml +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. - -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: cudf - version: {{ version }} - -source: - path: ../../.. 
- -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=cudf-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=cudf-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - ignore_run_exports_from: - - {{ compiler('cuda') }} - {% if cuda_major != "11" %} - - cuda-cudart-dev - - libcufile-dev # [linux64] - {% endif %} - -requirements: - build: - - cmake {{ cmake_version }} - - ninja - - {{ compiler('c') }} - - {{ compiler('cxx') }} - {% if cuda_major == "11" %} - - {{ compiler('cuda') }} ={{ cuda_version }} - {% else %} - - {{ compiler('cuda') }} - {% endif %} - - cuda-version ={{ cuda_version }} - - {{ stdlib("c") }} - host: - - python - - cython >=3.0.3 - - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - scikit-build-core >=0.10.0 - - dlpack >=0.8,<1.0 - - libcudf ={{ version }} - - pylibcudf ={{ version }} - - rmm ={{ minor_version }} - {% if cuda_major == "11" %} - - cudatoolkit - {% else %} - - cuda-cudart-dev - - cuda-nvrtc - - libcufile-dev # [linux64] - {% endif %} - - cuda-version ={{ cuda_version }} - run: - - python - - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.4dev0 - - cupy >=12.0.0 - - numba-cuda >=0.2.0,<0.3.0a0 - - numba >=0.59.1,<0.61.0a0 - - numpy >=1.23,<3.0a0 - - pyarrow>=14.0.0,<20.0.0a0 - - libcudf ={{ version }} - - pylibcudf ={{ version }} - - {{ pin_compatible('rmm', max_pin='x.x') }} - - fsspec >=0.6.0 - {% if cuda_major == "11" %} - - cudatoolkit - - ptxcompiler >=0.7.0 - - cubinlinker # CUDA enhanced compatibility. 
- - cuda-python >=11.8.5,<12.0a0 - {% else %} - - cuda-cudart - - libcufile # [linux64] - # Needed by Numba for CUDA support - - cuda-nvcc-impl - # TODO: Add nvjitlink here - # xref: https://github.com/rapidsai/cudf/issues/12822 - - cuda-nvrtc - - cuda-python >=12.6.2,<13.0a0 - - pynvjitlink - {% endif %} - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - - nvtx >=0.2.1 - - packaging - - cachetools - - rich - -test: - requires: - - cuda-version ={{ cuda_version }} - imports: - - cudf - -about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: cuDF GPU DataFrame core library diff --git a/conda/recipes/cudf/recipe.yaml b/conda/recipes/cudf/recipe.yaml new file mode 100644 index 00000000000..2cb330fb76d --- /dev/null +++ b/conda/recipes/cudf/recipe.yaml @@ -0,0 +1,126 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. +schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + py_version: ${{ env.get("RAPIDS_PY_VERSION") }} + py_buildstring: ${{ py_version | version_to_buildstring }} + head_rev: ${{ git.head_rev(".")[:8] }} + +package: + name: cudf + version: ${{ version }} + +source: + path: ../../.. 
+ +build: + string: cuda${{ cuda_major }}_py${{ py_buildstring }}_${{ date_string }}_${{ head_rev }} + script: + content: | + ./build.sh cudf + secrets: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + env: + CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} + CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} + CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} + CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + SCCACHE_S3_KEY_PREFIX: cudf-${{ env.get("RAPIDS_CONDA_ARCH") }} + +requirements: + build: + - cmake ${{ cmake_version }} + - ninja + - ${{ compiler("c") }} + - ${{ compiler("cxx") }} + - ${{ compiler("cuda") }} + - cuda-version =${{ cuda_version }} + - ${{ stdlib("c") }} + host: + - python =${{ py_version }} + - pip + - cython >=3.0.3 + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - scikit-build-core >=0.10.0 + - dlpack >=0.8,<1.0 + - libcudf =${{ version }} + - pylibcudf =${{ version }} + - rmm =${{ minor_version }} + - if: cuda_major == "11" + then: + - cudatoolkit + else: + - cuda-cudart-dev + - cuda-nvrtc + - if: linux64 + then: + - libcufile-dev + - cuda-version =${{ cuda_version }} + run: + - python + - typing_extensions >=4.0.0 + - pandas >=2.0,<2.2.4dev0 + - cupy >=12.0.0 + - numba-cuda >=0.4.0,<0.5.0a0 + - numba >=0.59.1,<0.62.0a0 + - numpy >=1.23,<2.1 + - pyarrow>=14.0.0,<20.0.0a0 + - libcudf =${{ version }} + - pylibcudf =${{ version }} + - ${{ pin_compatible("rmm", upper_bound="x.x") }} + - fsspec >=0.6.0 + - if: cuda_major == "11" + then: + - cudatoolkit + - ptxcompiler >=0.7.0 + - cubinlinker # CUDA enhanced compatibility. 
+ - cuda-python >=11.8.5,<12.0a0 + else: + - cuda-cudart + # Needed by Numba for CUDA support + - cuda-nvcc-impl + # TODO: Add nvjitlink here + # xref: https://github.com/rapidsai/cudf/issues/12822 + - cuda-nvrtc + - cuda-python >=12.6.2,<13.0a0 + - pynvjitlink + - if: linux64 + then: + - libcufile + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + - nvtx >=0.2.1 + - packaging + - cachetools + - rich + ignore_run_exports: + from_package: + - if: cuda_major != "11" + then: + - cuda-cudart-dev + - if: linux64 + then: libcufile-dev + by_name: + - cuda-version + +tests: + - python: + imports: + - cudf + pip_check: false + +about: + homepage: ${{ load_from_file("python/cudf/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/cudf/pyproject.toml").project.license.text }} + summary: ${{ load_from_file("python/cudf/pyproject.toml").project.description }} diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh deleted file mode 100644 index 9458349d101..00000000000 --- a/conda/recipes/cudf_kafka/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -./build.sh -v cudf_kafka diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml deleted file mode 100644 index a070c041d99..00000000000 --- a/conda/recipes/cudf_kafka/meta.yaml +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. - -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: cudf_kafka - version: {{ version }} - -source: - path: ../../.. 
- -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=cudf-kafka-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=cudf-kafka-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - ignore_run_exports_from: - - {{ compiler('cuda') }} - {% if cuda_major != "11" %} - - cuda-cudart-dev - {% endif %} - -requirements: - build: - - cmake {{ cmake_version }} - - ninja - - {{ compiler('c') }} - - {{ compiler('cxx') }} - {% if cuda_major == "11" %} - - {{ compiler('cuda') }} ={{ cuda_version }} - {% else %} - - {{ compiler('cuda') }} - {% endif %} - - cuda-version ={{ cuda_version }} - - {{ stdlib("c") }} - host: - - python - - cython >=3.0.3 - - cuda-version ={{ cuda_version }} - - pylibcudf ={{ version }} - - libcudf_kafka ={{ version }} - - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - scikit-build-core >=0.10.0 - {% if cuda_major != "11" %} - - cuda-cudart-dev - {% endif %} - run: - - python - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - - libcudf_kafka ={{ version }} - - pylibcudf ={{ version }} - {% if cuda_major != "11" %} - - cuda-cudart - {% endif %} - -test: - requires: - - cuda-version ={{ cuda_version }} - imports: - - cudf_kafka - -about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: libcudf_kafka library diff --git a/conda/recipes/cudf_kafka/recipe.yaml b/conda/recipes/cudf_kafka/recipe.yaml new file mode 100644 index 00000000000..aba9d979e44 --- /dev/null +++ b/conda/recipes/cudf_kafka/recipe.yaml @@ -0,0 +1,85 @@ +# Copyright (c) 2018-2025, NVIDIA 
CORPORATION. +schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + py_version: ${{ env.get("RAPIDS_PY_VERSION") }} + py_buildstring: ${{ py_version | version_to_buildstring }} + head_rev: ${{ git.head_rev(".")[:8] }} + +package: + name: cudf_kafka + version: ${{ version }} + +source: + path: ../../.. + +build: + string: cuda${{ cuda_major }}_py${{ py_buildstring }}_${{ date_string }}_${{ head_rev }} + script: + content: | + ./build.sh cudf_kafka + secrets: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + env: + CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} + CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} + CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} + CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + SCCACHE_S3_KEY_PREFIX: cudf-kafka-${{ env.get("RAPIDS_CONDA_ARCH") }} + +requirements: + build: + - cmake ${{ cmake_version }} + - ninja + - ${{ compiler("c") }} + - ${{ compiler("cxx") }} + - ${{ compiler("cuda") }} + - cuda-version =${{ cuda_version }} + - ${{ stdlib("c") }} + host: + - python =${{ py_version }} + - pip + - cython >=3.0.3 + - cuda-version =${{ cuda_version }} + - pylibcudf =${{ version }} + - libcudf_kafka =${{ version }} + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - scikit-build-core >=0.10.0 + - if: cuda_major != "11" + then: cuda-cudart-dev + 
run: + - python + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + - libcudf_kafka =${{ version }} + - pylibcudf =${{ version }} + - if: cuda_major != "11" + then: cuda-cudart + ignore_run_exports: + from_package: + - if: cuda_major != "11" + then: cuda-cudart-dev + by_name: + - cuda-version + +tests: + - python: + imports: + - cudf_kafka + pip_check: false + +about: + homepage: ${{ load_from_file("python/cudf_kafka/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/cudf_kafka/pyproject.toml").project.license.text }} + summary: ${{ load_from_file("python/cudf_kafka/pyproject.toml").project.description }} diff --git a/conda/recipes/custreamz/build.sh b/conda/recipes/custreamz/build.sh deleted file mode 100644 index 88fccf90c69..00000000000 --- a/conda/recipes/custreamz/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. - -# This assumes the script is executed from the root of the repo directory -./build.sh -v custreamz diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml deleted file mode 100644 index a031f05a73a..00000000000 --- a/conda/recipes/custreamz/meta.yaml +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: custreamz - version: {{ version }} - -source: - path: ../../.. 
- -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=custreamz-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=custreamz-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - -requirements: - host: - - python - - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - setuptools - - python-confluent-kafka >=2.5.0,<2.6.0a0 - - cudf_kafka ={{ version }} - - cuda-version ={{ cuda_version }} - run: - - python - - streamz - - cudf ={{ version }} - - cudf_kafka ={{ version }} - - rapids-dask-dependency ={{ minor_version }} - - python-confluent-kafka >=2.5.0,<2.6.0a0 - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - -test: - requires: - - cuda-version ={{ cuda_version }} - imports: - - custreamz - -about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: cuStreamz library diff --git a/conda/recipes/custreamz/recipe.yaml b/conda/recipes/custreamz/recipe.yaml new file mode 100644 index 00000000000..4713df9efad --- /dev/null +++ b/conda/recipes/custreamz/recipe.yaml @@ -0,0 +1,54 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
+schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + py_version: ${{ env.get("RAPIDS_PY_VERSION") }} + py_buildstring: ${{ py_version | version_to_buildstring }} + head_rev: ${{ git.head_rev(".")[:8] }} + +package: + name: custreamz + version: ${{ version }} + +source: + path: ../../.. + +build: + string: cuda${{ cuda_major }}_py${{ py_buildstring }}_${{ date_string }}_${{ head_rev }} + script: + content: | + ./build.sh custreamz + +requirements: + host: + - python =${{ py_version }} + - pip + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - setuptools + - python-confluent-kafka >=2.5.0,<2.6.0a0 + - cudf_kafka =${{ version }} + - cuda-version =${{ cuda_version }} + run: + - python + - streamz + - cudf =${{ version }} + - cudf_kafka =${{ version }} + - rapids-dask-dependency =${{ minor_version }} + - python-confluent-kafka >=2.5.0,<2.6.0a0 + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + +tests: + - python: + imports: + - custreamz + pip_check: false + +about: + homepage: ${{ load_from_file("python/custreamz/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/custreamz/pyproject.toml").project.license.text }} + summary: ${{ load_from_file("python/custreamz/pyproject.toml").project.description }} diff --git a/conda/recipes/dask-cudf/build.sh b/conda/recipes/dask-cudf/build.sh deleted file mode 100644 index 473f52c28a0..00000000000 --- a/conda/recipes/dask-cudf/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2018-2019, NVIDIA CORPORATION. 
- -# This assumes the script is executed from the root of the repo directory -./build.sh dask_cudf diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml deleted file mode 100644 index a476d5d53df..00000000000 --- a/conda/recipes/dask-cudf/meta.yaml +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: dask-cudf - version: {{ version }} - -source: - path: ../../.. - -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=dask-cudf-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=dask-cudf-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - -requirements: - host: - - python - - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - setuptools - - cuda-version ={{ cuda_version }} - run: - - python - - cudf ={{ version }} - - pynvml >=12.0.0,<13.0.0a0 - - rapids-dask-dependency ={{ minor_version }} - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - -test: - requires: - - cuda-version ={{ cuda_version }} - imports: - - dask_cudf - - -about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: dask-cudf library diff --git 
a/conda/recipes/dask-cudf/recipe.yaml b/conda/recipes/dask-cudf/recipe.yaml new file mode 100644 index 00000000000..997150d2832 --- /dev/null +++ b/conda/recipes/dask-cudf/recipe.yaml @@ -0,0 +1,50 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. +schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + py_version: ${{ env.get("RAPIDS_PY_VERSION") }} + py_buildstring: ${{ py_version | version_to_buildstring }} + head_rev: ${{ git.head_rev(".")[:8] }} + +package: + name: dask-cudf + version: ${{ version }} + +source: + path: ../../.. + +build: + string: cuda${{ cuda_major }}_py${{ py_buildstring }}_${{ date_string }}_${{ head_rev }} + script: + content: | + ./build.sh dask_cudf + +requirements: + host: + - python =${{ py_version }} + - pip + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - setuptools + - cuda-version =${{ cuda_version }} + run: + - python + - cudf =${{ version }} + - pynvml >=12.0.0,<13.0.0a0 + - rapids-dask-dependency =${{ minor_version }} + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + +tests: + - python: + imports: + - dask_cudf + pip_check: false + +about: + homepage: ${{ load_from_file("python/dask_cudf/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/dask_cudf/pyproject.toml").project.license.text }} + summary: ${{ load_from_file("python/dask_cudf/pyproject.toml").project.description }} diff --git a/conda/recipes/libcudf/build.sh b/conda/recipes/libcudf/build.sh deleted file mode 100644 index a3a0415575b..00000000000 --- a/conda/recipes/libcudf/build.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -export cudf_ROOT="$(realpath ./cpp/build)" - -./build.sh -n -v \ - libcudf libcudf_kafka benchmarks tests \ - --build_metrics --incl_cache_stats --allgpuarch \ - --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib -DCUDF_ENABLE_ARROW_S3=ON\" diff --git a/conda/recipes/libcudf/install_libcudf.sh b/conda/recipes/libcudf/install_libcudf.sh deleted file mode 100644 index 173f8cfa90f..00000000000 --- a/conda/recipes/libcudf/install_libcudf.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018-2022, NVIDIA CORPORATION. - -cmake --install cpp/build diff --git a/conda/recipes/libcudf/install_libcudf_example.sh b/conda/recipes/libcudf/install_libcudf_example.sh deleted file mode 100644 index 1a52dec99e3..00000000000 --- a/conda/recipes/libcudf/install_libcudf_example.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -# build and install libcudf examples -./cpp/examples/build.sh --install diff --git a/conda/recipes/libcudf/install_libcudf_kafka.sh b/conda/recipes/libcudf/install_libcudf_kafka.sh deleted file mode 100644 index 9eae2510027..00000000000 --- a/conda/recipes/libcudf/install_libcudf_kafka.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018-2022, NVIDIA CORPORATION. - -cmake --install cpp/libcudf_kafka/build diff --git a/conda/recipes/libcudf/install_libcudf_tests.sh b/conda/recipes/libcudf/install_libcudf_tests.sh deleted file mode 100644 index 069462eec9d..00000000000 --- a/conda/recipes/libcudf/install_libcudf_tests.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018-2022, NVIDIA CORPORATION. - -cmake --install cpp/build --component testing -cmake --install cpp/libcudf_kafka/build --component testing diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml deleted file mode 100644 index f7bd7280f0f..00000000000 --- a/conda/recipes/libcudf/meta.yaml +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
- -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: libcudf-split - -source: - path: ../../.. - -build: - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - RAPIDS_ARTIFACTS_DIR - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=libcudf-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=libcudf-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - -requirements: - build: - - cmake {{ cmake_version }} - - {{ compiler('c') }} - - {{ compiler('cxx') }} - {% if cuda_major == "11" %} - - {{ compiler('cuda') }} ={{ cuda_version }} - {% else %} - - {{ compiler('cuda') }} - {% endif %} - - cuda-version ={{ cuda_version }} - - ninja - - {{ stdlib("c") }} - host: - - librmm ={{ minor_version }} - - libkvikio ={{ minor_version }} - {% if cuda_major == "11" %} - - cudatoolkit - - libcufile {{ cuda11_libcufile_host_version }} # [linux64] - - libcufile-dev {{ cuda11_libcufile_host_version }} # [linux64] - - libcurand {{ cuda11_libcurand_host_version }} - - libcurand-dev {{ cuda11_libcurand_host_version }} - - cuda-nvrtc ={{ cuda_version }} - - cuda-nvrtc-dev ={{ cuda_version }} - - cuda-nvtx ={{ cuda_version }} - {% else %} - - cuda-nvrtc-dev - - cuda-nvtx-dev - - libcufile-dev # [linux64] - - libcurand-dev - {% endif %} - - cuda-version ={{ cuda_version }} - - nvcomp {{ nvcomp_version }} - - dlpack {{ dlpack_version }} - - librdkafka {{ librdkafka_version }} - - flatbuffers {{ flatbuffers_version }} - - rapids-logger =0.1 - - zlib {{ zlib_version }} - -outputs: 
- - name: libcudf - version: {{ version }} - script: install_libcudf.sh - build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - run_exports: - - {{ pin_subpackage("libcudf", max_pin="x.x") }} - ignore_run_exports_from: - - {{ compiler('cuda') }} - requirements: - build: - - cmake {{ cmake_version }} - host: - - cuda-version ={{ cuda_version }} - run: - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - {% if cuda_major == "11" %} - - cudatoolkit - - libcufile {{ cuda11_libcufile_run_version }} # [linux64] - {% else %} - - cuda-nvrtc - - libcufile # [linux64] - {% endif %} - - nvcomp {{ nvcomp_version }} - - librmm ={{ minor_version }} - - libkvikio ={{ minor_version }} - - dlpack {{ dlpack_version }} - - rapids-logger =0.1 - test: - commands: - - test -f $PREFIX/lib/libcudf.so - - test -f $PREFIX/include/cudf/column/column.hpp - about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: libcudf library - - name: libcudf_kafka - version: {{ version }} - script: install_libcudf_kafka.sh - build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - ignore_run_exports_from: - - {{ compiler('cuda') }} - requirements: - build: - - cmake {{ cmake_version }} - host: - - librdkafka {{ librdkafka_version }} - - {{ pin_subpackage('libcudf', exact=True) }} - run: - - librdkafka {{ librdkafka_version }} - - {{ pin_subpackage('libcudf', exact=True) }} - test: - commands: - - test -f $PREFIX/lib/libcudf_kafka.so - about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: libcudf_kafka library - - name: libcudf-example - version: {{ version }} - script: install_libcudf_example.sh - build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_{{ date_string }}_{{ 
GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - ignore_run_exports_from: - - {{ compiler('cuda') }} - {% if cuda_major != "11" %} - - cuda-nvtx-dev - {% endif %} - requirements: - build: - - cmake {{ cmake_version }} - - {{ compiler('c') }} - - {{ compiler('cxx') }} - {% if cuda_major == "11" %} - - {{ compiler('cuda') }} ={{ cuda_version }} - {% else %} - - {{ compiler('cuda') }} - {% endif %} - - cuda-version ={{ cuda_version }} - - ninja - - {{ stdlib("c") }} - host: - - {{ pin_subpackage('libcudf', exact=True) }} - {% if cuda_major == "11" %} - - cuda-nvtx ={{ cuda_version }} - {% else %} - - cuda-nvtx-dev - {% endif %} - - cuda-version ={{ cuda_version }} - run: - - {{ pin_subpackage('libcudf', exact=True) }} - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - {% if cuda_major != "11" %} - - cuda-nvtx - {% endif %} - about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: libcudf example executables - - name: libcudf-tests - version: {{ version }} - script: install_libcudf_tests.sh - build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - ignore_run_exports_from: - - {{ compiler('cuda') }} - {% if cuda_major != "11" %} - - libcurand-dev - {% endif %} - requirements: - build: - - cmake {{ cmake_version }} - host: - - {{ pin_subpackage('libcudf', exact=True) }} - - {{ pin_subpackage('libcudf_kafka', exact=True) }} - - cuda-version ={{ cuda_version }} - {% if cuda_major == "11" %} - - libcurand {{ cuda11_libcurand_run_version }} - {% else %} - - libcurand-dev - {% endif %} - run: - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - - {{ pin_subpackage('libcudf', exact=True) }} - - {{ pin_subpackage('libcudf_kafka', exact=True) }} - {% if cuda_major == "11" %} - - libcurand {{ cuda11_libcurand_run_version }} - {% else %} - - libcurand - {% endif %} - about: - home: https://rapids.ai/ - 
license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: libcudf test & benchmark executables diff --git a/conda/recipes/libcudf/recipe.yaml b/conda/recipes/libcudf/recipe.yaml new file mode 100644 index 00000000000..8653dc68a9f --- /dev/null +++ b/conda/recipes/libcudf/recipe.yaml @@ -0,0 +1,323 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. +schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + head_rev: ${{ git.head_rev(".")[:8] }} + +recipe: + name: libcudf-split + +cache: + source: + path: ../../.. + + build: + script: + content: | + + # Remove `-fdebug-prefix-map` line from CFLAGS and CXXFLAGS so the + # incrementing version number in the compile line doesn't break the + # cache + set -x + export CFLAGS=$(echo $CFLAGS | sed -E 's@\-fdebug\-prefix\-map[^ ]*@@g') + export CXXFLAGS=$(echo $CXXFLAGS | sed -E 's@\-fdebug\-prefix\-map[^ ]*@@g') + set +x + + ./build.sh -n -v \ + libcudf libcudf_kafka benchmarks tests \ + --build_metrics --incl_cache_stats --allgpuarch \ + --cmake-args=\"-DCUDF_ENABLE_ARROW_S3=ON\" + secrets: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + env: + CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} + CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} + CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} + CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} + PARALLEL_LEVEL: ${{ env.get("PARALLEL_LEVEL") }} + RAPIDS_ARTIFACTS_DIR: ${{ env.get("RAPIDS_ARTIFACTS_DIR") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} + SCCACHE_REGION: ${{ 
env.get("SCCACHE_REGION") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + SCCACHE_S3_KEY_PREFIX: libcudf-${{ env.get("RAPIDS_CONDA_ARCH") }} + + requirements: + build: + - ${{ compiler("c") }} + - ${{ compiler("cxx") }} + - ${{ compiler("cuda") }} + - cuda-version =${{ cuda_version }} + - cmake ${{ cmake_version }} + - ninja + - ${{ stdlib("c") }} + host: + - librmm =${{ minor_version }} + - libkvikio =${{ minor_version }} + - if: cuda_major == "11" + then: + - cudatoolkit + - libcurand =${{ cuda11_libcurand_host_version }} + - libcurand-dev =${{ cuda11_libcurand_host_version }} + - cuda-nvrtc =${{ cuda_version }} + - cuda-nvrtc-dev =${{ cuda_version }} + - cuda-nvtx =${{ cuda_version }} + - if: linux64 + then: + - libcufile =${{ cuda11_libcufile_host_version }} + - libcufile-dev =${{ cuda11_libcufile_host_version }} + else: + - cuda-nvrtc-dev + - cuda-nvtx-dev + - libcurand-dev + - if: linux64 + then: + - libcufile-dev + - cuda-version =${{ cuda_version }} + - nvcomp ${{ nvcomp_version }} + - dlpack ${{ dlpack_version }} + - librdkafka ${{ librdkafka_version }} + - flatbuffers =${{ flatbuffers_version }} + - rapids-logger =0.1 + - zlib ${{ zlib_version }} + +outputs: + - package: + name: libcudf + version: ${{ version }} + build: + script: + - cmake --install cpp/build + string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }} + dynamic_linking: + overlinking_behavior: "error" + requirements: + build: + - cmake ${{ cmake_version }} + - ${{ compiler("c") }} + host: + - cuda-version =${{ cuda_version }} + - libkvikio =${{ minor_version }} + - nvcomp ${{ nvcomp_version }} + - rapids-logger =0.1 + - zlib ${{ zlib_version }} + - if: cuda_major == "11" + then: cudatoolkit + else: cuda-cudart-dev + run: + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + - if: cuda_major == "11" + then: + - cudatoolkit + - if: linux64 + then: + - libcufile ${{ 
cuda11_libcufile_run_version }} + else: + - cuda-nvrtc + - if: linux64 + then: + - libcufile + - nvcomp ${{ nvcomp_version }} + - librmm =${{ minor_version }} + - libkvikio =${{ minor_version }} + - dlpack ${{ dlpack_version }} + - rapids-logger =0.1 + run_exports: + - ${{ pin_subpackage("libcudf", upper_bound="x.x") }} + ignore_run_exports: + by_name: + - cuda-cudart + - cuda-nvrtc + - cuda-nvtx + - cuda-version + - flatbuffers + - libcurand + - libkvikio + - librdkafka + - librmm + - nvcomp + tests: + - script: + - test -f $PREFIX/lib/libcudf.so + - test -f $PREFIX/include/cudf/column/column.hpp + about: + homepage: ${{ load_from_file("python/libcudf/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/libcudf/pyproject.toml").project.license.text }} + summary: ${{ load_from_file("python/libcudf/pyproject.toml").project.description }} + + - package: + name: libcudf_kafka + version: ${{ version }} + build: + script: + - cmake --install cpp/libcudf_kafka/build + string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }} + dynamic_linking: + overlinking_behavior: "error" + requirements: + build: + - cmake ${{ cmake_version }} + - ${{ stdlib("c") }} + host: + - librdkafka ${{ librdkafka_version }} + - ${{ pin_subpackage("libcudf", exact=True) }} + run: + - librdkafka ${{ librdkafka_version }} + - ${{ pin_subpackage("libcudf", exact=True) }} + ignore_run_exports: + by_name: + - cuda-cudart + - cuda-nvrtc + - cuda-nvtx + - cuda-version + - flatbuffers + - libcurand + - libkvikio + - librdkafka + - librmm + - nvcomp + tests: + - script: + - test -f $PREFIX/lib/libcudf_kafka.so + about: + homepage: https://rapids.ai/ + license: Apache-2.0 + summary: libcudf_kafka library + + - package: + name: libcudf-example + version: ${{ version }} + build: + script: + content: | + ./cpp/examples/build.sh --install + secrets: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + env: + CMAKE_C_COMPILER_LAUNCHER: ${{ 
env.get("CMAKE_C_COMPILER_LAUNCHER") }} + CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} + CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} + CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} + PARALLEL_LEVEL: ${{ env.get("PARALLEL_LEVEL") }} + RAPIDS_ARTIFACTS_DIR: ${{ env.get("RAPIDS_ARTIFACTS_DIR") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + SCCACHE_S3_KEY_PREFIX: libcudf-${{ env.get("RAPIDS_CONDA_ARCH") }} + string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }} + dynamic_linking: + overlinking_behavior: "error" + requirements: + build: + - ${{ compiler("c") }} + - ${{ compiler("cuda") }} + - ${{ compiler("cxx") }} + - ${{ stdlib("c") }} + - cmake ${{ cmake_version }} + - cuda-version =${{ cuda_version }} + - ninja + host: + - ${{ pin_subpackage("libcudf", exact=True) }} + - cuda-version =${{ cuda_version }} + - if: cuda_major == "11" + then: + - cuda-nvtx =${{ cuda_version }} + - cudatoolkit + else: + - cuda-nvtx-dev + - cuda-cudart-dev + run: + - ${{ pin_subpackage("libcudf", exact=True) }} + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + - if: cuda_major != "11" + then: + - cuda-nvtx + ignore_run_exports: + from_package: + - if: cuda_major != "11" + then: + - cuda-nvtx-dev + by_name: + - cuda-cudart + - cuda-nvrtc + - cuda-nvtx + - cuda-version + - flatbuffers + - libcurand + - libkvikio + - librdkafka + - librmm + - nvcomp + about: + homepage: ${{ load_from_file("python/libcudf/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/libcudf/pyproject.toml").project.license.text }} + summary: libcudf example executables + + - package: + name: libcudf-tests + version: ${{ version }} + 
build: + script: + - cmake --install cpp/build --component testing + - cmake --install cpp/libcudf_kafka/build --component testing + string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }} + dynamic_linking: + overlinking_behavior: "error" + missing_dso_allowlist: + - "libnvidia-ml.so.1" + requirements: + build: + - cmake ${{ cmake_version }} + - ${{ stdlib("c") }} + host: + - ${{ pin_subpackage("libcudf", exact=True) }} + - ${{ pin_subpackage("libcudf_kafka", exact=True) }} + - cuda-version =${{ cuda_version }} + - if: cuda_major == "11" + then: + - libcurand ${{ cuda11_libcurand_run_version }} + - cudatoolkit + else: + - libcurand-dev + - cuda-cudart-dev + run: + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + - ${{ pin_subpackage("libcudf", exact=True) }} + - ${{ pin_subpackage("libcudf_kafka", exact=True) }} + - if: cuda_major == "11" + then: + - libcurand ${{ cuda11_libcurand_run_version }} + else: + - libcurand + ignore_run_exports: + from_package: + - if: cuda_major != "11" + then: + - libcurand-dev + by_name: + - cuda-cudart + - cuda-nvrtc + - cuda-nvtx + - cuda-version + - flatbuffers + - libcurand + - libkvikio + - librdkafka + - librmm + - nvcomp + about: + homepage: ${{ load_from_file("python/libcudf/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/libcudf/pyproject.toml").project.license.text }} + summary: libcudf test & benchmark executables diff --git a/conda/recipes/pylibcudf/build.sh b/conda/recipes/pylibcudf/build.sh deleted file mode 100644 index 483346504db..00000000000 --- a/conda/recipes/pylibcudf/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -# This assumes the script is executed from the root of the repo directory -./build.sh pylibcudf diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml deleted file mode 100644 index 14e2f31a5a5..00000000000 --- a/conda/recipes/pylibcudf/meta.yaml +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. - -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: pylibcudf - version: {{ version }} - -source: - path: ../../.. - -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=pylibcudf-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=pylibcudf-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - ignore_run_exports_from: - - {{ compiler('cuda') }} - {% if cuda_major != "11" %} - - cuda-cudart-dev - - libcufile-dev # [linux64] - {% endif %} - -requirements: - build: - - cmake {{ cmake_version }} - - ninja - - {{ compiler('c') }} - - {{ compiler('cxx') }} - {% if cuda_major == "11" %} - - {{ compiler('cuda') }} ={{ cuda_version }} - {% else %} - - {{ compiler('cuda') }} - {% endif %} - - cuda-version ={{ cuda_version }} - - {{ stdlib("c") }} - host: - - python - - cython >=3.0.3 - - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - scikit-build-core >=0.10.0 
- - dlpack >=0.8,<1.0 - - libcudf ={{ version }} - - rmm ={{ minor_version }} - {% if cuda_major == "11" %} - - cudatoolkit - {% else %} - - cuda-cudart-dev - - cuda-nvrtc - - libcufile-dev # [linux64] - {% endif %} - - cuda-version ={{ cuda_version }} - run: - - python - - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.4dev0 - - numpy >=1.23,<3.0a0 - - pyarrow>=14.0.0,<20.0.0a0 - - libcudf ={{ version }} - - {{ pin_compatible('rmm', max_pin='x.x') }} - - fsspec >=0.6.0 - {% if cuda_major == "11" %} - - cuda-python >=11.8.5,<12.0a0 - {% else %} - - cuda-python >=12.6.2,<13.0a0 - {% endif %} - - nvtx >=0.2.1 - - packaging - -test: - requires: - - cuda-version ={{ cuda_version }} - imports: - - pylibcudf - -about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: pylibcudf library diff --git a/conda/recipes/pylibcudf/recipe.yaml b/conda/recipes/pylibcudf/recipe.yaml new file mode 100644 index 00000000000..476f4d83960 --- /dev/null +++ b/conda/recipes/pylibcudf/recipe.yaml @@ -0,0 +1,106 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. +schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + py_version: ${{ env.get("RAPIDS_PY_VERSION") }} + py_buildstring: ${{ py_version | version_to_buildstring }} + head_rev: ${{ git.head_rev(".")[:8] }} + +package: + name: pylibcudf + version: ${{ version }} + +source: + path: ../../.. 
+ +build: + string: cuda${{ cuda_major }}_py${{ py_buildstring }}_${{ date_string }}_${{ head_rev }} + script: + content: | + ./build.sh pylibcudf + secrets: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + env: + CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} + CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} + CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} + CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + SCCACHE_S3_KEY_PREFIX: pylibcudf-${{ env.get("RAPIDS_CONDA_ARCH") }} + +requirements: + build: + - cmake ${{ cmake_version }} + - ninja + - ${{ compiler("c") }} + - ${{ compiler("cxx") }} + - ${{ compiler("cuda") }} + - cuda-version =${{ cuda_version }} + - ${{ stdlib("c") }} + host: + - python =${{ py_version }} + - pip + - cython >=3.0.3 + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - scikit-build-core >=0.10.0 + - dlpack >=0.8,<1.0 + - libcudf =${{ version }} + - rmm =${{ minor_version }} + - if: cuda_major == "11" + then: + - cudatoolkit + else: + - cuda-cudart-dev + - cuda-nvrtc + - if: linux64 + then: + - libcufile-dev + - cuda-version =${{ cuda_version }} + run: + - python + - typing_extensions >=4.0.0 + - pandas >=2.0,<2.2.4dev0 + - numpy >=1.23,<2.1 + - pyarrow>=14.0.0,<20.0.0a0 + - libcudf =${{ version }} + - ${{ pin_compatible("rmm", upper_bound="x.x") }} + - fsspec >=0.6.0 + - if: cuda_major == "11" + then: + - cuda-python >=11.8.5,<12.0a0 + else: + - cuda-python >=12.6.2,<13.0a0 + - nvtx >=0.2.1 + - packaging + ignore_run_exports: + from_package: + - if: cuda_major != "11" + then: + - cuda-cudart-dev + - if: linux64 + then: + - libcufile-dev + 
by_name: + - cuda-version + +tests: + - python: + imports: + - pylibcudf + pip_check: false + +about: + homepage: ${{ load_from_file("python/pylibcudf/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/pylibcudf/pyproject.toml").project.license.text }} + summary: ${{ load_from_file("python/pylibcudf/pyproject.toml").project.description }} diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 03f11cc957b..549cb8e5d5d 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -344,11 +344,18 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureNVBench(AST_NVBENCH ast/transform.cpp) +ConfigureNVBench(AST_NVBENCH ast/polynomials.cpp ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) +ConfigureNVBench( + BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp binaryop/polynomials.cpp +) + +# ################################################################################################## +# * transform benchmark +# --------------------------------------------------------------------------------- +ConfigureNVBench(TRANSFORM_NVBENCH transform/polynomials.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- diff --git a/cpp/benchmarks/ast/polynomials.cpp b/cpp/benchmarks/ast/polynomials.cpp new file mode 100644 index 00000000000..b8e4ca46b72 --- /dev/null +++ 
b/cpp/benchmarks/ast/polynomials.cpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +template +static void BM_ast_polynomials(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const order = static_cast(state.get_int64("order")); + + CUDF_EXPECTS(order > 0, "Polynomial order must be greater than 0"); + + data_profile profile; + profile.set_distribution_params(cudf::type_to_id(), + distribution_id::NORMAL, + static_cast(0), + static_cast(1)); + auto table = create_random_table({cudf::type_to_id()}, row_count{num_rows}, profile); + auto column_view = table->get_column(0); + + std::vector> constants; + { + std::random_device random_device; + std::mt19937 generator; + std::uniform_real_distribution distribution{0, 1}; + + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(order + 1), + std::back_inserter(constants), + [&](int) { return distribution(generator); }); + } + + cudf::ast::tree tree{}; + + auto& column_ref = tree.push(cudf::ast::column_reference{0}); + + // computes polynomials: (((ax + b)x + c)x + d)x + e... = ax**4 + bx**3 + cx**2 + dx + e.... 
+ tree.push(cudf::ast::literal{constants[0]}); + + for (cudf::size_type i = 0; i < order; i++) { + auto& product = + tree.push(cudf::ast::operation{cudf::ast::ast_operator::MUL, tree.back(), column_ref}); + auto& constant = tree.push(cudf::ast::literal{constants[i + 1]}); + tree.push(cudf::ast::operation{cudf::ast::ast_operator::ADD, product, constant}); + } + + // Use the number of bytes read from global memory + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::scoped_range range{"benchmark_iteration"}; + cudf::compute_column(*table, tree.back(), launch.get_stream().get_stream()); + }); +} + +#define AST_POLYNOMIAL_BENCHMARK_DEFINE(name, key_type) \ + static void name(::nvbench::state& st) { ::BM_ast_polynomials(st); } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) \ + .add_int64_axis("order", {1, 2, 4, 8, 16, 32}) + +AST_POLYNOMIAL_BENCHMARK_DEFINE(ast_polynomials_float32, float); + +AST_POLYNOMIAL_BENCHMARK_DEFINE(ast_polynomials_float64, double); diff --git a/cpp/benchmarks/binaryop/polynomials.cpp b/cpp/benchmarks/binaryop/polynomials.cpp new file mode 100644 index 00000000000..782ae1db927 --- /dev/null +++ b/cpp/benchmarks/binaryop/polynomials.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +template +static void BM_binaryop_polynomials(nvbench::state& state) +{ + auto const num_rows{static_cast(state.get_int64("num_rows"))}; + auto const order{static_cast(state.get_int64("order"))}; + + CUDF_EXPECTS(order > 0, "Polynomial order must be greater than 0"); + + data_profile profile; + profile.set_distribution_params(cudf::type_to_id(), + distribution_id::NORMAL, + static_cast(0), + static_cast(1)); + auto table = create_random_table({cudf::type_to_id()}, row_count{num_rows}, profile); + auto column_view = table->get_column(0); + + std::vector> constants; + { + std::random_device random_device; + std::mt19937 generator; + std::uniform_real_distribution distribution{0, 1}; + + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(order + 1), + std::back_inserter(constants), + [&](int) { return cudf::numeric_scalar(distribution(generator)); }); + } + + // Use the number of bytes read from global memory + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + // computes polynomials: (((ax + b)x + c)x + d)x + e... = ax**4 + bx**3 + cx**2 + dx + e.... 
+ cudf::scoped_range range{"benchmark_iteration"}; + rmm::cuda_stream_view stream{launch.get_stream().get_stream()}; + std::vector> intermediates; + + auto result = cudf::make_column_from_scalar(constants[0], num_rows, stream); + + for (cudf::size_type i = 0; i < order; i++) { + auto product = cudf::binary_operation(result->view(), + column_view, + cudf::binary_operator::MUL, + cudf::data_type{cudf::type_to_id()}, + stream); + auto sum = cudf::binary_operation(product->view(), + constants[i + 1], + cudf::binary_operator::ADD, + cudf::data_type{cudf::type_to_id()}, + stream); + intermediates.push_back(std::move(product)); + intermediates.push_back(std::move(result)); + result = std::move(sum); + } + }); +} + +#define BINARYOP_POLYNOMIALS_BENCHMARK_DEFINE(name, key_type) \ + \ + static void name(::nvbench::state& st) { ::BM_binaryop_polynomials(st); } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) \ + .add_int64_axis("order", {1, 2, 4, 8, 16, 32}) + +BINARYOP_POLYNOMIALS_BENCHMARK_DEFINE(binaryop_polynomials_float32, float); + +BINARYOP_POLYNOMIALS_BENCHMARK_DEFINE(binaryop_polynomials_float64, double); diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 8d6aacd2ef1..f1af62eaa87 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -580,7 +580,7 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons null_mask.begin(), lengths.begin(), cuda::proclaim_return_type([] __device__(auto) { return 0; }), - thrust::logical_not{}); + cuda::std::logical_not{}); auto valid_lengths = thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple(lengths.begin(), null_mask.begin())), valid_or_zero{}); diff --git a/cpp/benchmarks/common/random_distribution_factory.cuh b/cpp/benchmarks/common/random_distribution_factory.cuh index c27616132d0..32424fbaaa3 100644 --- 
a/cpp/benchmarks/common/random_distribution_factory.cuh +++ b/cpp/benchmarks/common/random_distribution_factory.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,7 @@ #include #include +#include #include #include diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 594dc0de28a..494d5722ae4 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,8 +48,11 @@ static void bench_normalize(nvbench::state& state) [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); }); } else { bool const to_lower = (normalize_type == "to_lower"); + // we expect the normalizer to be created once and re-used + // so creating it is not measured + auto normalizer = nvtext::create_character_normalizer(to_lower); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = nvtext::normalize_characters(input, to_lower); + auto result = nvtext::normalize_characters(input, *normalizer); }); } } @@ -57,6 +60,6 @@ static void bench_normalize(nvbench::state& state) NVBENCH_BENCH(bench_normalize) .set_name("normalize") .add_int64_axis("min_width", {0}) - .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("max_width", {128, 256}) .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/benchmarks/transform/polynomials.cpp b/cpp/benchmarks/transform/polynomials.cpp new file mode 100644 index 00000000000..07f8a47c771 --- /dev/null 
+++ b/cpp/benchmarks/transform/polynomials.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +template +static void BM_transform_polynomials(nvbench::state& state) +{ + auto const num_rows{static_cast(state.get_int64("num_rows"))}; + auto const order{static_cast(state.get_int64("order"))}; + + CUDF_EXPECTS(order > 0, "Polynomial order must be greater than 0"); + + data_profile profile; + profile.set_distribution_params(cudf::type_to_id(), + distribution_id::NORMAL, + static_cast(0), + static_cast(1)); + auto column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + std::vector> constants; + + std::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(order + 1), + std::back_inserter(constants), + [&](int) { return create_random_column(cudf::type_to_id(), row_count{1}, profile); }); + + // Use the number of bytes read from global memory + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(num_rows); + + std::vector inputs{*column}; + std::transform(constants.begin(), + constants.end(), + std::back_inserter(inputs), + [](auto& col) -> cudf::column_view { return *col; }); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + // computes polynomials: (((ax + b)x + c)x + d)x + e... 
= ax**4 + bx**3 + cx**2 + dx + e.... + + cudf::scoped_range range{"benchmark_iteration"}; + + std::string type = cudf::type_to_name(cudf::data_type{cudf::type_to_id()}); + + std::string params_decl = type + " c0"; + std::string expr = "c0"; + + for (cudf::size_type i = 1; i < order + 1; i++) { + expr = "( " + expr + " ) * x + c" + std::to_string(i); + params_decl += ", " + type + " c" + std::to_string(i); + } + + static_assert(std::is_same_v || std::is_same_v); + + // clang-format off + std::string udf = + "__device__ inline void compute_polynomial(" + type + "* out, " + type + " x, " + params_decl + ")" + +"{ " +" *out = " + expr + ";" +"}"; + + // clang-format on + + cudf::transform(inputs, + udf, + cudf::data_type{cudf::type_to_id()}, + false, + launch.get_stream().get_stream()); + }); +} + +#define TRANSFORM_POLYNOMIALS_BENCHMARK_DEFINE(name, key_type) \ + \ + static void name(::nvbench::state& st) { ::BM_transform_polynomials(st); } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) \ + .add_int64_axis("order", {1, 2, 4, 8, 16, 32}) + +TRANSFORM_POLYNOMIALS_BENCHMARK_DEFINE(transform_polynomials_float32, float); + +TRANSFORM_POLYNOMIALS_BENCHMARK_DEFINE(transform_polynomials_float64, double); diff --git a/cpp/examples/interop/interop.cpp b/cpp/examples/interop/interop.cpp index 133a4e3a514..b01b04489a6 100644 --- a/cpp/examples/interop/interop.cpp +++ b/cpp/examples/interop/interop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,7 +24,7 @@ #include #include -// Helper functuons to create StringViews +// Helper functions to create StringViews inline arrow::StringViewType::c_type to_inline_string_view(const void* data, int32_t const& size) { arrow::StringViewType::c_type out; diff --git a/cpp/include/cudf/detail/utilities/functional.hpp b/cpp/include/cudf/detail/utilities/functional.hpp new file mode 100644 index 00000000000..114c69bbe46 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/functional.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf::detail { + +#if CCCL_MAJOR_VERSION >= 3 +using cuda::maximum; +using cuda::minimum; +#else +using thrust::maximum; +using thrust::minimum; +#endif + +} // namespace cudf::detail diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 810f0377597..276a1ea77e2 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -327,6 +327,8 @@ unique_device_array_t to_arrow_host( * * @throws cudf::data_type_error if the input array is not a struct array. 
* + * @throws std::overflow_error if the input arrow object exceeds the column size limit. + * * The conversion will not call release on the input Array. * * @param schema `ArrowSchema` pointer to describe the type of the data @@ -367,6 +369,8 @@ std::unique_ptr from_arrow_column( * * @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU` * + * @throws std::overflow_error if the input arrow object exceeds the column size limit. + * * @throws cudf::data_type_error if the input array is not a struct array, * non-struct arrays should be passed to `from_arrow_host_column` instead. * @@ -411,6 +415,8 @@ std::unique_ptr from_arrow_stream( * * @throws cudf::data_type_error if input arrow data type is not supported in cudf. * + * @throws std::overflow_error if the input arrow object exceeds the column size limit. + * * The conversion will not call release on the input Array. * * @param schema `ArrowSchema` pointer to describe the type of the data @@ -483,6 +489,8 @@ using unique_table_view_t = * * @throws cudf::data_type_error if the input arrow data type is not supported. * + * @throws std::overflow_error if the input arrow object exceeds the column size limit. + * * Each child of the input struct will be the columns of the resulting table_view. * * @note The custom deleter used for the unique_ptr to the table_view maintains ownership @@ -528,6 +536,8 @@ using unique_column_view_t = * * @throws cudf::data_type_error input arrow data type is not supported. * + * @throws std::overflow_error if the input arrow object exceeds the column size limit. + * * @note The custom deleter used for the unique_ptr to the table_view maintains ownership * over any memory which is allocated, such as converting boolean columns from the bitmap * used by Arrow to the 1-byte per value for cudf. 
diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 7bec40893fd..92859ec0895 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,7 +67,7 @@ class datasource { /** * @brief Base class destructor */ - virtual ~buffer() {} + virtual ~buffer() = default; /** * @brief Factory to construct a datasource buffer object from a container. @@ -156,7 +156,7 @@ class datasource { /** * @brief Base class destructor */ - virtual ~datasource(){}; + virtual ~datasource() = default; /** * @brief Returns a buffer with a subset of data from the source. @@ -168,6 +168,21 @@ class datasource { */ virtual std::unique_ptr host_read(size_t offset, size_t size) = 0; + /** + * @brief Asynchronously reads a specified portion of data from the datasource. + * + * This function initiates an asynchronous read operation that reads `size` bytes of data + * starting from the given `offset` in the datasource. Depending on the concrete datasource + * implementation, the read operation may be deferred until the returned future is waited upon. + * + * @param offset The starting position in the datasource from which to read. + * @param size The number of bytes to read from the datasource. + * @return A std::future that will hold a unique pointer to a datasource::buffer containing + * the read data once the operation completes. + */ + virtual std::future> host_read_async(size_t offset, + size_t size); + /** * @brief Reads a selected range into a preallocated buffer. * @@ -179,6 +194,22 @@ class datasource { */ virtual size_t host_read(size_t offset, size_t size, uint8_t* dst) = 0; + /** + * @brief Asynchronously reads data from the source into the provided host memory buffer. 
+ * + * This function initiates an asynchronous read operation from the data source starting at the + * specified offset and reads the specified number of bytes into the destination buffer. Depending + * on the concrete datasource implementation, the read operation may be deferred and will be + * executed when the returned future is waited upon. + * + * @param offset The starting position in the data source from which to read. + * @param size The number of bytes to read from the data source. + * @param dst Pointer to the destination buffer where the read data will be stored. + * @return A std::future object that will hold the number of bytes read once the operation + * completes. + */ + virtual std::future host_read_async(size_t offset, size_t size, uint8_t* dst); + /** * @brief Whether or not this source supports reading directly into device memory. * @@ -296,7 +327,7 @@ class datasource { */ class non_owning_buffer : public buffer { public: - non_owning_buffer() {} + non_owning_buffer() = default; /** * @brief Construct a new non owning buffer object diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index d276c5df7dc..8fb1f30f961 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -96,5 +96,17 @@ int64_t get_offset_value(cudf::column_view const& offsets, size_type index, rmm::cuda_stream_view stream); +/** + * @brief Return the first and last offset in the given strings column + * + * This accounts for sliced input columns as well. 
+ * + * @param input Strings column + * @param stream CUDA stream used for device memory operations and kernel launches + * @return First and last offset values + */ +std::pair get_first_and_last_offset(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream); + } // namespace strings::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index b91748cfc7d..15539c50da9 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -443,10 +443,12 @@ __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, siz __device__ inline string_view string_view::substr(size_type pos, size_type count) const { if (pos < 0 || pos >= length()) { return string_view{}; } - auto const itr = begin() + pos; - auto const spos = itr.byte_offset(); - auto const epos = count >= 0 ? (itr + count).byte_offset() : size_bytes(); - return {data() + spos, epos - spos}; + auto const spos = begin() + pos; + auto const epos = count >= 0 ? (spos + count) : const_iterator{*this, _length, size_bytes()}; + auto ss = string_view{data() + spos.byte_offset(), epos.byte_offset() - spos.byte_offset()}; + // this potentially saves redundant character counting downstream + if (_length != UNKNOWN_STRING_LENGTH) { ss._length = epos.position() - spos.position(); } + return ss; } __device__ inline size_type string_view::character_offset(size_type bytepos) const diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 8214ea6e83b..6ace930c1fe 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -1466,9 +1467,9 @@ class device_row_comparator { auto rvalid = detail::make_validity_iterator(rcol); if (nulls_are_equal == null_equality::UNEQUAL) { if (thrust::any_of( - thrust::seq, lvalid, lvalid + lcol.size(), thrust::logical_not()) or + thrust::seq, lvalid, lvalid + lcol.size(), cuda::std::logical_not()) or thrust::any_of( - thrust::seq, rvalid, rvalid + rcol.size(), thrust::logical_not())) { + thrust::seq, rvalid, rvalid + rcol.size(), cuda::std::logical_not())) { return false; } } else { diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 43f060fdafa..5f978a0d8ec 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -125,5 +125,99 @@ std::unique_ptr minhash64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each input row + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * The input row is first hashed using the given `seed` over a sliding window + * of `ngrams` of strings. These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint32 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a ngrams at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each set of ngrams and the minimum value + * is computed as follows: + * ``` + * mh[j,i] = min(pv[i]) for all ngrams in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. 
+ * + * @throw std::invalid_argument if the ngrams < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input List strings column to compute minhash + * @param ngrams The number of strings to hash within each row + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash_ngrams( + cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Returns the minhash values for each input row + * + * This function uses MurmurHash3_x64_128 for the hash algorithm. + * + * The input row is first hashed using the given `seed` over a sliding window + * of `ngrams` of strings. These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint64 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a ngrams at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each set of ngrams and the minimum value + * is computed as follows: + * ``` + * mh[j,i] = min(pv[i]) for all ngrams in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. 
+ * + * @throw std::invalid_argument if the ngrams < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input List strings column to compute minhash + * @param ngrams The number of strings to hash within each row + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash64_ngrams( + cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 74325f4a406..70ee7891ad7 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -107,5 +108,113 @@ std::unique_ptr normalize_characters( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Normalizer object to be used with nvtext::normalize_characters + * + * Use nvtext::create_character_normalizer to create this object. + * + * This normalizer includes: + * + * - adding padding around punctuation (unicode category starts with "P") + * as well as certain ASCII symbols like "^" and "$" + * - adding padding around the [CJK Unicode block + * characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)) + * - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "` + * - removing control characters (unicode categories "Cc" and "Cf") + * + * The padding process adds a single space before and after the character. + * Details on _unicode category_ can be found here: + * https://unicodebook.readthedocs.io/unicode.html#categories + * + * If `do_lower_case = true`, lower-casing also removes any accents. The + * accents cannot be removed from upper-case characters without lower-casing + * and lower-casing cannot be performed without also removing accents. + * However, if the accented character is already lower-case, then only the + * accent is removed. + * + * If `special_tokens` are included the padding after `[` and before `]` is not + * inserted if the characters between them match one of the given tokens. + * Also, the `special_tokens` are expected to include the `[]` characters + * at the beginning and end of each string appropriately. + */ +struct character_normalizer { + /** + * @brief Normalizer object constructor + * + * This initializes and holds the character normalizing tables and settings. + * + * @param do_lower_case If true, upper-case characters are converted to + * lower-case and accents are stripped from those characters. 
+ * If false, accented and upper-case characters are not transformed. + * @param special_tokens Each row is a token including the `[]` brackets. + * For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ + character_normalizer(bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + ~character_normalizer(); + + struct character_normalizer_impl; + std::unique_ptr _impl; +}; + +/** + * @brief Create a normalizer object + * + * Creates a normalizer object which can be reused on multiple calls to + * nvtext::normalize_characters + * + * @see nvtext::character_normalizer + * + * @param do_lower_case If true, upper-case characters are converted to + * lower-case and accents are stripped from those characters. + * If false, accented and upper-case characters are not transformed. + * @param special_tokens Individual tokens including `[]` brackets. + * Default is no special tokens. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Object to be used with nvtext::normalize_characters + */ +std::unique_ptr create_character_normalizer( + bool do_lower_case, + cudf::strings_column_view const& special_tokens = cudf::strings_column_view(cudf::column_view{ + cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0}), + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Normalizes the text in input strings column + * + * @see nvtext::character_normalizer for details on the normalizer behavior + * + * @code{.pseudo} + * cn = create_character_normalizer(true) + * s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + * s1 = normalize_characters(s,cn) + * s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + * + * cn = create_character_normalizer(false) + * s2 = normalize_characters(s,cn) + * s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + * @endcode + * + * A null input element at row `i` produces a corresponding null entry + * for row `i` in the output column. 
+ * + * @param input The input strings to normalize + * @param normalizer Normalizer to use for this function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects + * @return Normalized strings column + */ +std::unique_ptr normalize_characters( + cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 3c558f1e264..70e26ae4285 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -241,8 +242,8 @@ struct null_considering_binop { return invalid_str; else if (lhs_valid && rhs_valid) { return (op == binary_operator::NULL_MAX) - ? thrust::maximum()(lhs_value, rhs_value) - : thrust::minimum()(lhs_value, rhs_value); + ? cudf::detail::maximum()(lhs_value, rhs_value) + : cudf::detail::minimum()(lhs_value, rhs_value); } else if (lhs_valid) return lhs_value; else diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 9dc39f01ab3..c304d705f9b 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include +#include #include namespace cudf { diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 2e78954d78a..2695288af64 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -81,7 +82,7 @@ struct count_checker { if (static_cast(std::numeric_limits::max()) > std::numeric_limits::max()) { auto max = thrust::reduce( - rmm::exec_policy(stream), count.begin(), count.end(), 0, thrust::maximum()); + rmm::exec_policy(stream), count.begin(), count.end(), 0, cudf::detail::maximum()); CUDF_EXPECTS(max <= std::numeric_limits::max(), "count exceeds the column size limit", std::overflow_error); diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 583357d9090..a0ba81bccb2 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -146,7 +147,7 @@ std::unique_ptr rank_generator(column_view const& grouped_values, group_labels_begin + group_labels.size(), mutable_rank_begin, mutable_rank_begin, - thrust::equal_to{}, + cuda::std::equal_to{}, scan_op); return ranks; } diff --git a/cpp/src/groupby/sort/group_replace_nulls.cu b/cpp/src/groupby/sort/group_replace_nulls.cu index 088ed05e5eb..f94ae71a23c 100644 --- a/cpp/src/groupby/sort/group_replace_nulls.cu +++ b/cpp/src/groupby/sort/group_replace_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include +#include #include #include #include @@ -55,7 +56,7 @@ std::unique_ptr group_replace_nulls(cudf::column_view const& grouped_val thrust::make_tuple(gather_map.begin(), thrust::make_discard_iterator())); auto func = cudf::detail::replace_policy_functor(); - thrust::equal_to eq; + cuda::std::equal_to eq; if (replace_policy == cudf::replace_policy::PRECEDING) { thrust::inclusive_scan_by_key(rmm::exec_policy(stream), group_labels.begin(), diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index a90445fabe1..160d0a3b276 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -37,6 +37,7 @@ #include #include +#include #include #include #include @@ -122,7 +123,7 @@ struct group_scan_functor() group_labels.end(), inp_iter, out_iter, - thrust::equal_to{}, + cuda::std::equal_to{}, binop); }; @@ -167,7 +168,7 @@ struct group_scan_functor(0), gather_map.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, binop_generator.binop()); // diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh 
index 662c380eff5..9dba468bf14 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -175,7 +175,7 @@ struct group_reduction_functor< inp_iter, thrust::make_discard_iterator(), out_iter, - thrust::equal_to{}, + cuda::std::equal_to{}, binop); }; @@ -201,7 +201,7 @@ struct group_reduction_functor< rmm::device_uvector validity(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + cuda::std::logical_or{}); auto [null_mask, null_count] = cudf::detail::valid_if(validity.begin(), validity.end(), cuda::std::identity{}, stream, mr); @@ -238,7 +238,7 @@ struct group_reduction_functor< inp_iter, thrust::make_discard_iterator(), out_iter, - thrust::equal_to{}, + cuda::std::equal_to{}, binop); }; @@ -254,7 +254,7 @@ struct group_reduction_functor< auto validity = rmm::device_uvector(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + cuda::std::logical_or{}); auto [null_mask, null_count] = cudf::detail::valid_if(validity.begin(), validity.end(), cuda::std::identity{}, stream, mr); diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 29c4dfd35ac..836da2987e2 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -40,6 +40,10 @@ #include #include +#include +#include +#include + namespace cudf { namespace detail { @@ -317,6 +321,11 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS( + input->length <= static_cast(std::numeric_limits::max()), + "Total number of rows in Arrow column exceeds the column size limit.", + std::overflow_error); + return type.id() != type_id::EMPTY ? 
std::move(type_dispatcher( type, dispatch_from_arrow_device{}, schema, input, type, skip_mask, stream, mr)) diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index ea5487a2960..0be1557faaf 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -43,6 +43,10 @@ #include #include +#include +#include +#include + namespace cudf { namespace detail { @@ -381,6 +385,11 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS( + input->length <= static_cast(std::numeric_limits::max()), + "Total number of rows in Arrow column exceeds the column size limit.", + std::overflow_error); + return type.id() != type_id::EMPTY ? std::move(type_dispatcher( type, dispatch_copy_from_arrow_host{stream, mr}, schema, input, type, skip_mask)) diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu index deff62be576..ce1db96ca43 100644 --- a/cpp/src/interop/from_arrow_stream.cu +++ b/cpp/src/interop/from_arrow_stream.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -121,6 +121,7 @@ std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, schema.release(&schema); + if (chunks.size() == 1) { return std::move(chunks[0]); } auto chunk_views = std::vector{}; chunk_views.reserve(chunks.size()); std::transform( diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 11d5749ee38..2be2e42c2b3 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -21,6 +21,7 @@ #include "io/utilities/hostdevice_vector.hpp" #include +#include #include #include #include @@ -300,8 +301,10 @@ rmm::device_buffer decompress_data(datasource& source, size_t const uncompressed_data_size = std::reduce(uncompressed_data_sizes.begin(), uncompressed_data_sizes.end()); - size_t const max_uncomp_block_size = std::reduce( - uncompressed_data_sizes.begin(), uncompressed_data_sizes.end(), 0, thrust::maximum()); + size_t const max_uncomp_block_size = std::reduce(uncompressed_data_sizes.begin(), + uncompressed_data_sizes.end(), + 0, + cudf::detail::maximum()); size_t temp_size = 0; status = diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index cf5996dfd93..30501c3f2e2 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ */ #include "nvcomp_adapter.cuh" +#include #include #include @@ -122,7 +123,7 @@ std::pair max_chunk_and_total_input_size(device_span()); + cudf::detail::maximum()); auto const sum = thrust::reduce(rmm::exec_policy(stream), input_sizes.begin(), input_sizes.end()); return {max, sum}; } diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 7b217d08da3..4b80b981030 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -400,7 +401,7 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, d_kv_operations.Current(), detail::AddStackLevelFromStackOp{symbol_to_stack_op}, num_symbols_in, - cub::Equality{}, + cuda::std::equal_to{}, stream)); stack_level_scan_bytes = std::max(gen_segments_scan_bytes, scan_by_key_bytes); } else { @@ -499,7 +500,7 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, d_kv_operations.Current(), detail::AddStackLevelFromStackOp{symbol_to_stack_op}, num_symbols_in, - cub::Equality{}, + cuda::std::equal_to{}, stream)); } else { CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 53c1d335a40..204aca8a69c 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -36,6 +36,7 @@ #include #include +#include #include namespace cudf::io { diff --git a/cpp/src/io/json/column_tree_construction.cu b/cpp/src/io/json/column_tree_construction.cu index c4fe7926706..13d1751e03d 100644 --- a/cpp/src/io/json/column_tree_construction.cu +++ b/cpp/src/io/json/column_tree_construction.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include "nested_json.hpp" #include +#include #include #include #include @@ -208,7 +209,7 @@ std::tuple reduce_to_column_tree( thrust::make_constant_iterator(1), non_leaf_nodes.begin(), non_leaf_nodes_children.begin(), - thrust::equal_to()); + cuda::std::equal_to()); thrust::scatter(rmm::exec_policy_nosync(stream), non_leaf_nodes_children.begin(), diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 7b9fc25d1cc..712d280c11f 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -46,6 +47,7 @@ #include #include +#include namespace cudf::io::json::detail { @@ -1006,13 +1008,13 @@ void scatter_offsets(tree_meta_t const& tree, col.string_offsets.begin(), col.string_offsets.end(), col.string_offsets.begin(), - thrust::maximum{}); + cudf::detail::maximum{}); } else if (col.type == json_col_t::ListColumn) { thrust::inclusive_scan(rmm::exec_policy_nosync(stream), col.child_offsets.begin(), col.child_offsets.end(), col.child_offsets.begin(), - thrust::maximum{}); + cudf::detail::maximum{}); } } stream.synchronize(); diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 1fe58a0449f..c0790c2f73d 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -130,8 +131,8 @@ reduce_to_column_tree(tree_meta_t const& tree, ordered_row_offsets, unique_col_ids.begin(), max_row_offsets.begin(), - thrust::equal_to(), - thrust::maximum()); + cuda::std::equal_to(), + cudf::detail::maximum()); // 3. reduce_by_key {col_id}, {node_categories} - custom opp (*+v=*, v+v=v, *+#=E) rmm::device_uvector column_categories(num_columns, stream); @@ -142,7 +143,7 @@ reduce_to_column_tree(tree_meta_t const& tree, thrust::make_permutation_iterator(tree.node_categories.begin(), ordered_node_ids.begin()), unique_col_ids.begin(), column_categories.begin(), - thrust::equal_to(), + cuda::std::equal_to(), [] __device__(NodeT type_a, NodeT type_b) -> NodeT { auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index e2fe926ea19..e0d6f51aad9 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -213,8 +214,8 @@ void propagate_first_sibling_to_other(cudf::device_span node_l sorted_node_levels.end(), thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), - thrust::equal_to{}, - thrust::maximum{}); + cuda::std::equal_to{}, + cudf::detail::maximum{}); } // Generates a tree representation of the given tokens, token_indices. 
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 0c95c2b05e8..c265ac5e316 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -43,6 +43,7 @@ #include #include +#include #include namespace cudf::io::json::detail { diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 1587c4da9c8..b8f0fe7cb07 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -333,8 +333,8 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, validity_iterator, d_str_separator.begin(), false, - thrust::equal_to{}, - thrust::logical_or{}); + cuda::std::equal_to{}, + cuda::std::logical_or{}); thrust::for_each(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(total_rows), diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 050bf692c14..77643d294e8 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -19,6 +19,7 @@ #include "io/utilities/row_selection.hpp" #include +#include #include namespace cudf::io::orc::detail { diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 5c663950b00..5b0c7ae11a9 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -486,13 +486,11 @@ void reader_impl::load_next_stripe_data(read_mode mode) // Load stripe data into memory: // - // If we load data from sources into host buffers, we need to transfer (async) data to device - // memory. Such host buffers need to be kept alive until we sync the transfers. - std::vector> host_read_buffers; - - // If we load data directly from sources into device memory, the loads are also async. - // Thus, we need to make sure to sync all them at the end. 
+ // Storing the future and the expected size of the read data std::vector, std::size_t>> device_read_tasks; + // Storing the future, the expected size of the read data and the device destination pointer + std::vector>, std::size_t, uint8_t*>> + host_read_tasks; // Range of the read info (offset, length) to read for the current being loaded stripes. auto const [read_begin, read_end] = @@ -518,24 +516,22 @@ void reader_impl::load_next_stripe_data(read_mode mode) source_ptr->device_read_async( read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), read_info.length); - } else { - auto buffer = source_ptr->host_read(read_info.offset, read_info.length); - CUDF_EXPECTS(buffer->size() == read_info.length, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY(cudaMemcpyAsync(dst_base + read_info.dst_pos, - buffer->data(), - read_info.length, - cudaMemcpyDefault, - _stream.value())); - host_read_buffers.emplace_back(std::move(buffer)); + host_read_tasks.emplace_back(source_ptr->host_read_async(read_info.offset, read_info.length), + read_info.length, + dst_base + read_info.dst_pos); } } - - if (host_read_buffers.size() > 0) { // if there was host read - _stream.synchronize(); - host_read_buffers.clear(); // its data was copied to device memory after stream sync + std::vector> host_read_buffers; + for (auto& [fut, expected_size, dev_dst] : host_read_tasks) { // if there were host reads + host_read_buffers.emplace_back(fut.get()); + auto* host_buffer = host_read_buffers.back().get(); + CUDF_EXPECTS(host_buffer->size() == expected_size, "Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY(cudaMemcpyAsync( + dev_dst, host_buffer->data(), host_buffer->size(), cudaMemcpyDefault, _stream.value())); } - for (auto& task : device_read_tasks) { // if there was device read + + for (auto& task : device_read_tasks) { // if there were device reads CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } diff --git 
a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index c0887304db9..426e470a151 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -18,6 +18,7 @@ #include "io/utilities/column_buffer.hpp" #include "orc_gpu.hpp" +#include #include #include @@ -1511,10 +1512,11 @@ static __device__ void DecodeRowPositions(orcdec_state_s* s, } if (t == nrows - 1) { s->u.rowdec.nz_count = min(nz_count, s->top.data.max_vals); } __syncthreads(); + // TBD: Brute-forcing this, there might be a more efficient way to find the thread with the // last row last_row = (nz_count == s->u.rowdec.nz_count) ? row_plus1 : 0; - last_row = block_reduce(temp_storage).Reduce(last_row, cub::Max()); + last_row = block_reduce(temp_storage).Reduce(last_row, cudf::detail::maximum{}); nz_pos = (valid) ? nz_count : 0; if (t == 0) { s->top.data.nrows = last_row; } if (valid && nz_pos - 1 < s->u.rowdec.nz_count) { s->u.rowdec.row[nz_pos - 1] = row_plus1; } diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 3a1f3a88da4..2ccf3f5d284 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -366,8 +367,9 @@ static __device__ uint32_t IntegerRLE( orcenc_state_s* s, T const* inbuf, uint32_t inpos, uint32_t numvals, int t, Storage& temp_storage) { using block_reduce = cub::BlockReduce; - uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; - uint32_t out_cnt = 0; + + uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; + uint32_t out_cnt = 0; __shared__ uint64_t block_vmin; while (numvals > 0) { @@ -413,9 +415,9 @@ static __device__ uint32_t IntegerRLE( T vmin = (t < literal_run) ? v0 : cuda::std::numeric_limits::max(); T vmax = (t < literal_run) ? 
v0 : cuda::std::numeric_limits::min(); uint32_t literal_mode, literal_w; - vmin = block_reduce(temp_storage).Reduce(vmin, cub::Min()); + vmin = block_reduce(temp_storage).Reduce(vmin, cudf::detail::minimum{}); __syncthreads(); - vmax = block_reduce(temp_storage).Reduce(vmax, cub::Max()); + vmax = block_reduce(temp_storage).Reduce(vmax, cudf::detail::maximum{}); if (t == 0) { uint32_t mode1_w, mode2_w; typename std::make_unsigned::type vrange_mode1, vrange_mode2; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index dbf5e293c4e..217aff48d5e 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -64,6 +64,7 @@ #include #include +#include #include #include #include @@ -2225,6 +2226,22 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, std::move(dict_order_owner)}; } +[[nodiscard]] uint32_t find_largest_stream_size(device_2dspan ss, + rmm::cuda_stream_view stream) +{ + auto const longest_stream = thrust::max_element( + rmm::exec_policy(stream), + ss.data(), + ss.data() + ss.count(), + cuda::proclaim_return_type([] __device__(auto const& lhs, auto const& rhs) { + return lhs.stream_size < rhs.stream_size; + })); + + auto const h_longest_stream = cudf::detail::make_host_vector_sync( + device_span{longest_stream, 1}, stream); + return h_longest_stream[0].stream_size; +} + /** * @brief Perform the processing steps needed to convert the input table into the output ORC data * for writing, such as compression and ORC encoding. 
@@ -2318,7 +2335,9 @@ auto convert_table_to_orc_data(table_view const& input, size_t compressed_bfr_size = 0; size_t num_compressed_blocks = 0; - auto const max_compressed_block_size = max_compressed_size(compression, compression_blocksize); + auto const largest_stream_size = find_largest_stream_size(strm_descs, stream); + auto const max_compressed_block_size = + max_compressed_size(compression, std::min(largest_stream_size, compression_blocksize)); auto const padded_max_compressed_block_size = util::round_up_unsafe(max_compressed_block_size, block_align); auto const padded_block_header_size = diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh index 56b7c8065ee..8dba755b73a 100644 --- a/cpp/src/io/parquet/delta_enc.cuh +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -19,6 +19,7 @@ #include "parquet_gpu.hpp" #include +#include #include #include @@ -221,6 +222,7 @@ class delta_binary_packer { inline __device__ uint8_t* flush() { using cudf::detail::warp_size; + __shared__ T block_min; int const t = threadIdx.x; @@ -240,7 +242,7 @@ class delta_binary_packer { : cuda::std::numeric_limits::max(); // Find min delta for the block. - auto const min_delta = block_reduce(*_block_tmp).Reduce(delta, cub::Min()); + auto const min_delta = block_reduce(*_block_tmp).Reduce(delta, cudf::detail::minimum{}); if (t == 0) { block_min = min_delta; } __syncthreads(); @@ -250,7 +252,7 @@ class delta_binary_packer { // Get max normalized delta for each warp, and use that to determine how many bits to use // for the bitpacking of this warp. 
- U const warp_max = warp_reduce(_warp_tmp[warp_id]).Reduce(norm_delta, cub::Max()); + U const warp_max = warp_reduce(_warp_tmp[warp_id]).Reduce(norm_delta, cudf::detail::maximum{}); __syncwarp(); if (lane_id == 0) { _mb_bits[warp_id] = sizeof(long long) * 8 - __clzll(warp_max); } diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 7d670057cf9..fe9b05c8054 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -21,6 +21,7 @@ #include "rle_stream.cuh" #include +#include #include #include @@ -498,6 +499,7 @@ __device__ thrust::pair totalDeltaByteArraySize(uint8_t const* d { using cudf::detail::warp_size; using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage[2]; __shared__ __align__(16) delta_binary_decoder prefixes; @@ -550,7 +552,8 @@ __device__ thrust::pair totalDeltaByteArraySize(uint8_t const* d // note: warp_sum will only be valid on lane 0. auto const warp_sum = WarpReduce(temp_storage[warp_id]).Sum(lane_sum); __syncwarp(); - auto const warp_max = WarpReduce(temp_storage[warp_id]).Reduce(lane_max, cub::Max()); + auto const warp_max = + WarpReduce(temp_storage[warp_id]).Reduce(lane_max, cudf::detail::maximum{}); if (lane_id == 0) { total_bytes += warp_sum; diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 03a37327e9b..5242b18b574 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -40,6 +40,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { @@ -1148,7 +1149,7 @@ void include_decompression_scratch_size(device_span chunk page_keys + pages.size(), decomp_iter, decomp_info.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, decomp_sum{}); // retrieve to host so we can call nvcomp to get compression scratch sizes @@ -1387,7 +1388,7 @@ void reader::impl::setup_next_subpass(read_mode 
mode) page_keys + pass.pages.size(), page_size, c_info.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, cumulative_page_sum{}); // include scratch space needed for decompression. for certain codecs (eg ZSTD) this @@ -1702,7 +1703,7 @@ void reader::impl::compute_output_chunks_for_subpass() page_keys + subpass.pages.size(), page_input, c_info.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, cumulative_page_sum{}); auto iter = thrust::make_counting_iterator(0); // cap the max row in all pages by the max row we expect in the subpass. input chunking diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index e1e9bac5a07..052ed80bc14 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -608,7 +609,7 @@ void decode_page_headers(pass_intermediate_data& pass, level_bit_size, level_bit_size + pass.chunks.size(), 0, - thrust::maximum()); + cudf::detail::maximum()); pass.level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); // sort the pages in chunk/schema order. diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 9e50fafa8a7..4a410cec558 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -53,6 +53,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh index dc023e69423..34e663447e3 100644 --- a/cpp/src/io/statistics/typed_statistics_chunk.cuh +++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,6 +27,7 @@ #include "statistics_type_identification.cuh" #include "temp_storage_wrapper.cuh" +#include #include #include @@ -202,11 +203,12 @@ __inline__ __device__ typed_statistics_chunk block_reduce( using E = typename detail::extrema_type::type; using extrema_reduce = cub::BlockReduce; using count_reduce = cub::BlockReduce; - output_chunk.minimum_value = - extrema_reduce(storage.template get()).Reduce(output_chunk.minimum_value, cub::Min()); + + output_chunk.minimum_value = extrema_reduce(storage.template get()) + .Reduce(output_chunk.minimum_value, cudf::detail::minimum{}); __syncthreads(); - output_chunk.maximum_value = - extrema_reduce(storage.template get()).Reduce(output_chunk.maximum_value, cub::Max()); + output_chunk.maximum_value = extrema_reduce(storage.template get()) + .Reduce(output_chunk.maximum_value, cudf::detail::maximum{}); __syncthreads(); output_chunk.non_nulls = count_reduce(storage.template get()).Sum(output_chunk.non_nulls); diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index 46816604918..fa6f04eed73 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -36,10 +36,10 @@ void set_up_kvikio() cudaFree(nullptr); auto const compat_mode = kvikio::getenv_or("KVIKIO_COMPAT_MODE", kvikio::CompatMode::ON); - kvikio::defaults::compat_mode_reset(compat_mode); + kvikio::defaults::set_compat_mode(compat_mode); auto const nthreads = getenv_or("KVIKIO_NTHREADS", 4u); - kvikio::defaults::thread_pool_nthreads_reset(nthreads); + kvikio::defaults::set_thread_pool_nthreads(nthreads); }); } diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 2750a17d328..c6391d49294 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -814,7 +815,7 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, 
str_tuples + col_size, cuda::proclaim_return_type([] __device__(auto t) { return t.second; }), size_type{0}, - thrust::maximum{}); + cudf::detail::maximum{}); auto sizes = rmm::device_uvector(col_size, stream); auto d_sizes = sizes.data(); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 2cb2b303cb3..2f181188fb2 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -44,37 +44,56 @@ namespace io { namespace { /** - * @brief Base class for file input. Only implements direct device reads. + * @brief Base class for kvikIO-based data sources. */ -class file_source : public datasource { - public: - explicit file_source(char const* filepath) - { - kvikio_integration::set_up_kvikio(); - _kvikio_file = kvikio::FileHandle(filepath, "r"); - CUDF_EXPECTS(!_kvikio_file.closed(), "KvikIO did not open the file successfully."); - CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode %s.", - _kvikio_file.get_compat_mode_manager().is_compat_mode_preferred() ? 
"on" : "off"); - } +template +class kvikio_source : public datasource { + class kvikio_initializer { + public: + kvikio_initializer() { kvikio_integration::set_up_kvikio(); } + }; - std::unique_ptr host_read(size_t offset, size_t size) override + std::pair, std::future> clamped_read_to_vector(size_t offset, + size_t size) { // Clamp length to available data auto const read_size = std::min(size, this->size() - offset); std::vector v(read_size); - CUDF_EXPECTS(_kvikio_file.pread(v.data(), read_size, offset).get() == read_size, "read failed"); + return {std::move(v), _kvikio_handle.pread(v.data(), read_size, offset)}; + } + + public: + kvikio_source(HandleT&& h) : _kvikio_handle(std::move(h)) {} + std::unique_ptr host_read(size_t offset, size_t size) override + { + auto [v, fut] = clamped_read_to_vector(offset, size); + fut.get(); return buffer::create(std::move(v)); } + std::future> host_read_async(size_t offset, + size_t size) override + { + auto clamped_read = clamped_read_to_vector(offset, size); + return std::async(std::launch::deferred, [cr = std::move(clamped_read)]() mutable { + cr.second.get(); + return buffer::create(std::move(cr.first)); + }); + } + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + return host_read_async(offset, size, dst).get(); + } + + std::future host_read_async(size_t offset, size_t size, uint8_t* dst) override { // Clamp length to available data auto const read_size = std::min(size, this->size() - offset); - CUDF_EXPECTS(_kvikio_file.pread(dst, read_size, offset).get() == read_size, "read failed"); - return read_size; + return _kvikio_handle.pread(dst, read_size, offset); } - ~file_source() override = default; + ~kvikio_source() override = default; [[nodiscard]] bool supports_device_read() const override { return true; } @@ -91,7 +110,7 @@ class file_source : public datasource { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); auto const read_size = std::min(size, 
this->size() - offset); - return _kvikio_file.pread(dst, read_size, offset); + return _kvikio_handle.pread(dst, read_size, offset); } size_t device_read(size_t offset, @@ -113,10 +132,29 @@ class file_source : public datasource { return datasource::buffer::create(std::move(out_data)); } - [[nodiscard]] size_t size() const override { return _kvikio_file.nbytes(); } + [[nodiscard]] size_t size() const override { return _kvikio_handle.nbytes(); } + + kvikio_initializer _; protected: - kvikio::FileHandle _kvikio_file; + HandleT _kvikio_handle; +}; + +/** + * @brief A class representing a file source using kvikIO. + * + * This class is derived from `kvikio_source` and is used to handle file operations + * using kvikIO library. + */ +class file_source : public kvikio_source { + public: + explicit file_source(char const* filepath) : kvikio_source{kvikio::FileHandle(filepath, "r")} + { + CUDF_EXPECTS(!_kvikio_handle.closed(), "KvikIO did not open the file successfully."); + CUDF_LOG_INFO( + "Reading a file using kvikIO, with compatibility mode %s.", + _kvikio_handle.get_compat_mode_manager().is_compat_mode_preferred() ? 
"on" : "off"); + } }; /** @@ -132,7 +170,7 @@ class memory_mapped_source : public file_source { { if (this->size() != 0) { // Memory mapping is not exclusive, so we can include the whole region we expect to read - map(_kvikio_file.fd(), offset, max_size_estimate); + map(_kvikio_handle.fd(), offset, max_size_estimate); } } @@ -331,6 +369,17 @@ class user_datasource_wrapper : public datasource { return source->host_read(offset, size); } + std::future host_read_async(size_t offset, size_t size, uint8_t* dst) override + { + return source->host_read_async(offset, size, dst); + } + + std::future> host_read_async(size_t offset, + size_t size) override + { + return source->host_read_async(offset, size); + } + [[nodiscard]] bool supports_device_read() const override { return source->supports_device_read(); @@ -376,68 +425,18 @@ class user_datasource_wrapper : public datasource { /** * @brief Remote file source backed by KvikIO, which handles S3 filepaths seamlessly. */ -class remote_file_source : public datasource { - static std::unique_ptr create_s3_endpoint(char const* filepath) +class remote_file_source : public kvikio_source { + static auto create_s3_handle(char const* filepath) { auto [bucket_name, bucket_object] = kvikio::S3Endpoint::parse_s3_url(filepath); - return std::make_unique(bucket_name, bucket_object); + return kvikio::RemoteHandle{std::make_unique(bucket_name, bucket_object)}; } public: - explicit remote_file_source(char const* filepath) : _kvikio_file{create_s3_endpoint(filepath)} {} + explicit remote_file_source(char const* filepath) : kvikio_source{create_s3_handle(filepath)} {} ~remote_file_source() override = default; - [[nodiscard]] bool supports_device_read() const override { return true; } - - [[nodiscard]] bool is_device_read_preferred(size_t size) const override { return true; } - - [[nodiscard]] size_t size() const override { return _kvikio_file.nbytes(); } - - std::future device_read_async(size_t offset, - size_t size, - uint8_t* dst, - 
rmm::cuda_stream_view stream) override - { - CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - - auto const read_size = std::min(size, this->size() - offset); - return _kvikio_file.pread(dst, read_size, offset); - } - - size_t device_read(size_t offset, - size_t size, - uint8_t* dst, - rmm::cuda_stream_view stream) override - { - return device_read_async(offset, size, dst, stream).get(); - } - - std::unique_ptr device_read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) override - { - rmm::device_buffer out_data(size, stream); - size_t const read = - device_read(offset, size, reinterpret_cast(out_data.data()), stream); - out_data.resize(read, stream); - return datasource::buffer::create(std::move(out_data)); - } - - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - auto const read_size = std::min(size, this->size() - offset); - return _kvikio_file.pread(dst, read_size, offset).get(); - } - - std::unique_ptr host_read(size_t offset, size_t size) override - { - auto const count = std::min(size, this->size() - offset); - std::vector h_data(count); - this->host_read(offset, count, h_data.data()); - return datasource::buffer::create(std::move(h_data)); - } - /** * @brief Is `url` referring to a remote file supported by KvikIO? 
* @@ -449,9 +448,6 @@ class remote_file_source : public datasource { static std::regex const pattern{R"(^s3://)", std::regex_constants::icase}; return std::regex_search(url, pattern); } - - private: - kvikio::RemoteHandle _kvikio_file; }; #else /** @@ -509,5 +505,18 @@ std::unique_ptr datasource::create(datasource* source) return std::make_unique(source); } +std::future> datasource::host_read_async(size_t offset, + size_t size) +{ + return std::async(std::launch::deferred, + [this, offset, size] { return host_read(offset, size); }); +} + +std::future datasource::host_read_async(size_t offset, size_t size, uint8_t* dst) +{ + return std::async(std::launch::deferred, + [this, offset, size, dst] { return host_read(offset, size, dst); }); +} + } // namespace io } // namespace cudf diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index 469442d46d4..d7b1bf360fe 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,6 +36,8 @@ #include #include +#include + namespace cudf::detail { namespace { /** diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index 6f2acbb0712..0ed4b5193b7 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -103,8 +104,8 @@ std::unique_ptr have_overlap(lists_column_view const& lhs, contained.begin(), // values to reduce list_indices.begin(), // out keys overlap_results.begin(), // out values - thrust::equal_to{}, // comp for keys - thrust::logical_or{}); // reduction op for values + cuda::std::equal_to{}, // comp for keys + cuda::std::logical_or{}); // reduction op for values auto const num_non_empty_segments = thrust::distance(overlap_results.begin(), end.second); auto [null_mask, null_count] = diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 3a365477366..83423649507 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -395,7 +396,7 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, return std::pair{rmm::device_buffer{}, null_count}; } return cudf::detail::valid_if( - tdigest_is_empty, tdigest_is_empty + tdv.size(), thrust::logical_not{}, stream, mr); + tdigest_is_empty, tdigest_is_empty + tdv.size(), cuda::std::logical_not{}, stream, mr); }(); return cudf::make_lists_column(input.size(), diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index fd98d262154..f07b8695024 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -793,7 +794,7 @@ std::unique_ptr compute_tdigests(int delta, centroids_begin, // values thrust::make_discard_iterator(), // key output output, // output - thrust::equal_to{}, // key equality check + cuda::std::equal_to{}, // key equality check merge_centroids{}); // create final tdigest column @@ -1161,8 +1162,8 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, min_iter, thrust::make_discard_iterator(), merged_min_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::minimum{}); + cuda::std::equal_to{}, // key equality check + cudf::detail::minimum{}); auto merged_max_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); @@ -1176,8 +1177,8 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, max_iter, thrust::make_discard_iterator(), merged_max_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::maximum{}); + cuda::std::equal_to{}, // key equality check + cudf::detail::maximum{}); auto tdigest_offsets = tdv.centroids().offsets(); diff --git a/cpp/src/reductions/segmented/simple.cuh 
b/cpp/src/reductions/segmented/simple.cuh index 6c35e750e6b..d9b1fefe09a 100644 --- a/cpp/src/reductions/segmented/simple.cuh +++ b/cpp/src/reductions/segmented/simple.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -249,7 +250,7 @@ std::unique_ptr fixed_point_segmented_reduction( counts.begin(), counts.end(), size_type{0}, - thrust::maximum{}); + cudf::detail::maximum{}); auto const new_scale = numeric::scale_type{col.type().scale() * max_count}; diff --git a/cpp/src/rolling/detail/rolling_collect_list.cu b/cpp/src/rolling/detail/rolling_collect_list.cu index 8a98b65b406..d189b397afd 100644 --- a/cpp/src/rolling/detail/rolling_collect_list.cu +++ b/cpp/src/rolling/detail/rolling_collect_list.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -53,6 +54,7 @@ std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view con // offsets == [0, 2, 5, 5, 8, 11, 13] // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] // + auto const num_child_rows{ cudf::detail::get_value(offsets, offsets.size() - 1, stream)}; auto per_row_mapping = make_fixed_width_column( @@ -83,7 +85,7 @@ std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view con per_row_mapping_begin, per_row_mapping_begin + num_child_rows, per_row_mapping_begin, - thrust::maximum{}); + cudf::detail::maximum{}); return per_row_mapping; } diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index e7dca2277ec..35a9a3ec38d 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -145,7 +146,7 @@ void tie_break_ranks_transform(cudf::device_span dense_rank_sor tie_iter, thrust::make_discard_iterator(), tie_sorted.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, tie_breaker); using TransformerReturnType = cuda::std::decay_t>; @@ -202,7 +203,7 @@ void rank_min(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::minimum{}, + cudf::detail::minimum{}, cuda::std::identity{}, stream); } @@ -220,7 +221,7 @@ void rank_max(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::maximum{}, + cudf::detail::maximum{}, cuda::std::identity{}, stream); } diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index d22fb04696c..6071a9fdd2d 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,7 @@ #include #include +#include #include namespace cudf { diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 0777253bb38..af8b53ccd8c 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,6 +39,7 @@ #include #include +#include namespace cudf { namespace strings { diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 352ca83c8b2..9d30e3d0026 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -135,7 +136,7 @@ std::unique_ptr
split_fn(strings_column_view const& input, return static_cast(d_offsets[idx + 1] - d_offsets[idx]); }), 0, - thrust::maximum{}); + cudf::detail::maximum{}); // build strings columns for each token position for (size_type col = 0; col < columns_count; ++col) { @@ -346,7 +347,7 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, // column count is the maximum number of tokens for any string size_type const columns_count = thrust::reduce( - rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); + rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, cudf::detail::maximum{}); std::vector> results; // boundary case: if no columns, return one null column (issue #119) diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index ef96b9d3f36..68b610bcb93 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -227,7 +228,7 @@ std::unique_ptr
split_re(strings_column_view const& input, return static_cast(d_offsets[idx + 1] - d_offsets[idx]); }), 0, - thrust::maximum{}); + cudf::detail::maximum{}); // boundary case: if no columns, return one all-null column (custrings issue #119) if (columns_count == 0) { diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 45bd4615435..c5d46598d4a 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -180,6 +180,18 @@ int64_t get_offset_value(cudf::column_view const& offsets, : cudf::detail::get_value(offsets, index, stream); } +std::pair get_first_and_last_offset(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream) +{ + if (input.is_empty()) { return {0L, 0L}; } + auto const first_offset = (input.offset() == 0) ? 0 + : cudf::strings::detail::get_offset_value( + input.offsets(), input.offset(), stream); + auto const last_offset = + cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream); + return {first_offset, last_offset}; +} + } // namespace detail rmm::device_uvector create_string_vector_from_column( diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 990c4855a14..d77cc0cf17a 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -33,6 +33,8 @@ #include +#include + namespace cudf { namespace experimental { diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index 0aacfd16f67..972bcc32077 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -212,7 +213,8 @@ CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings, } } // compute the min rank across the block - auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + auto const reduce_rank = + block_reduce(temp_storage).Reduce(min_rank, cudf::detail::minimum{}, num_valid); if (lane_idx == 0) { block_min_rank = reduce_rank; } __syncthreads(); @@ -277,7 +279,8 @@ CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings, } // re-compute the minimum rank across the block (since new pairs are created above) - auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + auto const reduce_rank = + block_reduce(temp_storage).Reduce(min_rank, cudf::detail::minimum{}, num_valid); if (lane_idx == 0) { block_min_rank = reduce_rank; } __syncthreads(); } // if no min ranks are found we are done, otherwise start again diff --git a/cpp/src/text/bpe/load_merge_pairs.cu b/cpp/src/text/bpe/load_merge_pairs.cu index a13a435a271..9118fe54ab2 100644 --- a/cpp/src/text/bpe/load_merge_pairs.cu +++ b/cpp/src/text/bpe/load_merge_pairs.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,7 @@ #include #include +#include #include #include diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 50c16c8ba6c..61a7375772b 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -21,8 +21,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -62,19 +64,20 @@ constexpr cudf::thread_index_type tile_size = block_size; constexpr cuda::std::size_t params_per_thread = 16; // Separate kernels are used to process strings above and below this value (in bytes). -constexpr cudf::size_type wide_string_threshold = 1 << 18; // 256K +constexpr cudf::size_type wide_row_threshold = 1 << 18; // 256K // The number of blocks per string for the above-threshold kernel processing. -constexpr cudf::size_type blocks_per_string = 64; +constexpr cudf::size_type blocks_per_row = 64; // The above values were determined using the redpajama and books_sample datasets /** * @brief Hashing kernel launched as a thread per tile-size (block or warp) + * for strings column * * This kernel computes the hashes for each string using the seed and the specified * hash function. The width is used to compute rolling substrings to hash over. * The hashes are stored in d_hashes to be used in the minhash_kernel. * - * This kernel also counts the number of strings above the wide_string_threshold + * This kernel also counts the number of strings above the wide_row_threshold * and proactively initializes the output values for those strings. 
* * @tparam HashFunction The hash function to use for this kernel @@ -84,7 +87,7 @@ constexpr cudf::size_type blocks_per_string = 64; * @param seed The seed used for the hash function * @param width Width in characters used for determining substrings to hash * @param d_hashes The resulting hash values are stored here - * @param threshold_count Stores the number of strings above wide_string_threshold + * @param threshold_count Stores the number of strings above wide_row_threshold * @param param_count Number of parameters (used for the proactive initialize) * @param d_results Final results vector (used for the proactive initialize) */ @@ -146,7 +149,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, } // logic appended here so an extra kernel is not required - if (size_bytes >= wide_string_threshold) { + if (size_bytes >= wide_row_threshold) { if (lane_idx == 0) { // count the number of wide strings cuda::atomic_ref ref{*threshold_count}; @@ -160,31 +163,130 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, } } +/** + * @brief Hashing kernel launched as a thread per tile-size (block or warp) + * for a lists column + * + * This kernel computes the hashes for each row using the seed and the specified + * hash function. The ngrams identifies consecutive strings to hash over in + * sliding window formation. The hashes are stored in d_hashes and used as input + * to the minhash_kernel. + * + * This kernel also counts the number of rows above the wide_row_threshold + * and proactively initializes the output values for those rows. 
+ * + * @tparam HashFunction The hash function to use for this kernel + * @tparam hash_value_type Derived from HashFunction result_type + * + * @param d_input The input column to hash + * @param seed The seed used for the hash function + * @param ngrams Number of strings in each row to hash + * @param d_hashes The resulting hash values are stored here + * @param threshold_count Stores the number of rows above wide_row_threshold + * @param param_count Number of parameters (used for the proactive initialize) + * @param d_results Final results vector (used for the proactive initialize) + */ +template +CUDF_KERNEL void minhash_ngrams_kernel(cudf::detail::lists_column_device_view const d_input, + hash_value_type seed, + cudf::size_type ngrams, + hash_value_type* d_hashes, + cudf::size_type* threshold_count, + cudf::size_type param_count, + hash_value_type* d_results) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const row_idx = tid / tile_size; + if (row_idx >= d_input.size()) { return; } + if (d_input.is_null(row_idx)) { return; } + + // retrieve this row's offset to locate the output position in d_hashes + auto const offsets_itr = d_input.offsets().data() + d_input.offset(); + auto const offset = offsets_itr[row_idx]; + auto const size_row = offsets_itr[row_idx + 1] - offset; + if (size_row == 0) { return; } + + auto const d_row = cudf::list_device_view(d_input, row_idx); + auto const lane_idx = static_cast(tid % tile_size); + + // hashes for this row/thread are stored here + auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx; + auto const hasher = HashFunction(seed); + + for (auto idx = lane_idx; idx < size_row; idx += tile_size, seed_hashes += tile_size) { + if (d_row.is_null(idx)) { + *seed_hashes = 0; + continue; + } + + auto next_idx = cuda::std::min(idx + ngrams, size_row - 1); + if ((idx != 0) && ((next_idx - idx) < ngrams)) { + *seed_hashes = 0; + continue; + } + + auto const first_str = d_row.element(idx); + auto const 
last_str = d_row.element(next_idx); + // build super-string since adjacent strings are contiguous in memory + auto const size = static_cast( + thrust::distance(first_str.data(), last_str.data()) + last_str.size_bytes()); + auto const hash_str = cudf::string_view(first_str.data(), size); + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + hv = cuda::std::get<0>(hasher(hash_str)); + } + // disallowing hash to zero case + *seed_hashes = cuda::std::max(hv, hash_value_type{1}); + } + + // logic appended here to count long rows so an extra kernel is not required + if (size_row >= wide_row_threshold) { + if (lane_idx == 0) { + // count the number of wide rows + cuda::atomic_ref ref{*threshold_count}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); + } + // initialize the output -- only needed for wider rows + auto d_output = d_results + (row_idx * param_count); + for (auto i = lane_idx; i < param_count; i += tile_size) { + d_output[i] = cuda::std::numeric_limits::max(); + } + } +} + /** * @brief Permutation calculation kernel * - * This kernel uses the hashes from the minhash_seed_kernel and the parameter_a and - * parameter_b values to compute the final output results. + * This kernel uses the hashes from the minhash_seed_kernel or minhash_ngrams_kernel + * and the 'parameter_a' and 'parameter_b' values to compute the final output. * The output is the number of input rows (N) by the number of parameter values (M). - * Each output[i] is the calculated result for parameter_a/b[0:M]. + * Each row output[i] is the calculated result for parameter_a/b[0:M]. + * + * This kernel is launched with either blocks per row of 1 for rows + * below the wide_row_threshold or blocks per row = blocks_per_rows + * for rows above wide_row_threshold. 
* - * This kernel is launched with either blocks per strings of 1 for strings - * below the wide_strings_threshold or blocks per string = blocks_per_strings - * for strings above wide_strings_threshold. + * Note that this was refactored to accommodate lists of strings which is possible + * since there is no need here to access the characters, only the hash values. + * The offsets and width are used to locate and count the hash values produced by + * kernels above for each input row. * + * @tparam offsets_type Type for the offsets iterator for the input column * @tparam hash_value_type Derived from HashFunction result_type - * @tparam blocks_per_string Number of blocks used to process each string + * @tparam blocks_per_row Number of blocks used to process each row * - * @param d_strings The input strings to hash - * @param indices The indices of the strings in d_strings to process + * @param offsets_itr The offsets are used to address the d_hashes + * @param indices The indices of the rows in the input column * @param parameter_a 1st set of parameters for the calculation result * @param parameter_b 2nd set of parameters for the calculation result - * @param width Used for calculating the number of available hashes in each string - * @param d_hashes The hash values computed in minhash_seed_kernel + * @param width Used for calculating the number of available hashes in each row + * @param d_hashes The hash values computed in one of the hash kernels * @param d_results Final results vector of calculate values */ -template -CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, +template +CUDF_KERNEL void minhash_kernel(offsets_type offsets_itr, cudf::device_span indices, cudf::device_span parameter_a, cudf::device_span parameter_b, @@ -193,41 +295,36 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, hash_value_type* d_results) { auto const tid = cudf::detail::grid_1d::global_thread_id(); - auto const idx = (tid / 
blocks_per_string) / block_size; + auto const idx = (tid / blocks_per_row) / block_size; if (idx >= indices.size()) { return; } - auto const str_idx = indices[idx]; - if (d_strings.is_null(str_idx)) { return; } + auto const row_idx = indices[idx]; auto const block = cooperative_groups::this_thread_block(); - int const section_idx = block.group_index().x % blocks_per_string; + int const section_idx = block.group_index().x % blocks_per_row; - auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); - auto const offsets_itr = - cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); - auto const offset = offsets_itr[str_idx]; - auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + auto const offset = offsets_itr[row_idx]; + auto const row_size = static_cast(offsets_itr[row_idx + 1] - offset); // number of items to process in this block; - // last block also includes any remainder values from the size_bytes/blocks_per_string truncation + // last block also includes any remainder values from the row_size/blocks_per_row truncation // example: - // each section_size for string with size 588090 and blocks_per_string=64 is 9188 + // each section_size for string with size 588090 and blocks_per_row=64 is 9188 // except the last section which is 9188 + (588090 % 64) = 9246 - auto const section_size = - (size_bytes / blocks_per_string) + - (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string); - auto const section_offset = section_idx * (size_bytes / blocks_per_string); + auto const section_size = (row_size / blocks_per_row) + + (section_idx < (blocks_per_row - 1) ? 
0 : row_size % blocks_per_row); + auto const section_offset = section_idx * (row_size / blocks_per_row); // hash values for this block/section auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset; // width used here as a max value since a string's char-count <= byte-count auto const hashes_size = - section_idx < (blocks_per_string - 1) + section_idx < (blocks_per_row - 1) ? section_size - : cuda::std::max(static_cast(size_bytes > 0), section_size - width + 1); + : cuda::std::max(static_cast(row_size > 0), section_size - width + 1); - auto const init = size_bytes == 0 ? 0 : cuda::std::numeric_limits::max(); + auto const init = row_size == 0 ? 0 : cuda::std::numeric_limits::max(); auto const lane_idx = block.thread_rank(); - auto const d_output = d_results + (str_idx * parameter_a.size()); + auto const d_output = d_results + (row_idx * parameter_a.size()); auto const begin = seed_hashes + lane_idx; auto const end = seed_hashes + hashes_size; @@ -272,8 +369,8 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, auto const values = block_values + (lane_idx * block_size); // cooperative groups does not have a min function and cub::BlockReduce was slower auto const minv = - thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); - if constexpr (blocks_per_string > 1) { + thrust::reduce(thrust::seq, values, values + block_size, init, cudf::detail::minimum{}); + if constexpr (blocks_per_row > 1) { // accumulates mins for each block into d_output cuda::atomic_ref ref{d_output[lane_idx + i]}; ref.fetch_min(minv, cuda::std::memory_order_relaxed); @@ -285,6 +382,46 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, } } +/** + * @brief Partition input rows by row size + * + * The returned index is the first row above the wide_row_threshold size. + * The returned vector are the indices partitioned above and below the + * wide_row_threshold size. 
+ * + * @param size Number of rows in the input column + * @param threshold_count Number of rows above wide_row_threshold + * @param tfn Transform function returns the size of each row + * @param stream Stream used for allocation and kernel launches + */ +template +std::pair> partition_input( + cudf::size_type size, + cudf::size_type threshold_count, + transform_fn tfn, + rmm::cuda_stream_view stream) +{ + auto indices = rmm::device_uvector(size, stream); + thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end()); + cudf::size_type threshold_index = threshold_count < size ? size : 0; + + // if we counted a split of above/below threshold then + // compute partitions based on the size of each string + if ((threshold_count > 0) && (threshold_count < size)) { + auto sizes = rmm::device_uvector(size, stream); + auto begin = thrust::counting_iterator(0); + auto end = begin + size; + thrust::transform(rmm::exec_policy_nosync(stream), begin, end, sizes.data(), tfn); + // these 2 are slightly faster than using partition() + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin()); + auto const lb = thrust::lower_bound( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_row_threshold); + threshold_index = static_cast(thrust::distance(sizes.begin(), lb)); + } + return {threshold_index, std::move(indices)}; +} + template std::unique_ptr minhash_fn(cudf::strings_column_view const& input, hash_value_type seed, @@ -334,40 +471,112 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, d_threshold_count.data(), parameter_a.size(), d_results); - auto const threshold_count = d_threshold_count.value(stream); - auto indices = rmm::device_uvector(input.size(), stream); - thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end()); - cudf::size_type threshold_index = threshold_count < input.size() ? 
input.size() : 0; + auto transform_fn = [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type { + if (d_strings.is_null(idx)) { return 0; } + return d_strings.element(idx).size_bytes(); + }; + auto [threshold_index, indices] = + partition_input(input.size(), d_threshold_count.value(stream), transform_fn, stream); - // if we counted a split of above/below threshold then - // compute partitions based on the size of each string - if ((threshold_count > 0) && (threshold_count < input.size())) { - auto sizes = rmm::device_uvector(input.size(), stream); - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - sizes.data(), - cuda::proclaim_return_type( - [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type { - if (d_strings.is_null(idx)) { return 0; } - return d_strings.element(idx).size_bytes(); - })); - thrust::sort_by_key( - rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin()); - auto const lb = thrust::lower_bound( - rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold); - threshold_index = static_cast(thrust::distance(sizes.begin(), lb)); + auto input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + using offsets_type = decltype(input_offsets); + + // handle the strings below the threshold width + if (threshold_index > 0) { + auto d_indices = cudf::device_span(indices.data(), threshold_index); + cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, + block_size}; + minhash_kernel + <<>>( + input_offsets, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + } + + // handle the strings above the threshold width + if (threshold_index < input.size()) { + auto const count = static_cast(input.size() - threshold_index); + auto d_indices = + cudf::device_span(indices.data() + threshold_index, count); + cudf::detail::grid_1d 
grid{count * block_size * blocks_per_row, block_size}; + minhash_kernel + <<>>( + input_offsets, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); } + return results; +} + +template +std::unique_ptr minhash_ngrams_fn( + cudf::lists_column_view const& input, + cudf::size_type ngrams, + hash_value_type seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(ngrams >= 2, + "Parameter ngrams should be an integer value of 2 or greater", + std::invalid_argument); + CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument); + CUDF_EXPECTS(parameter_a.size() == parameter_b.size(), + "Parameters A and B should have the same number of elements", + std::invalid_argument); + CUDF_EXPECTS( + (static_cast(input.size()) * parameter_a.size()) < + static_cast(std::numeric_limits::max()), + "The number of parameters times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_input = cudf::column_device_view::create(input.parent(), stream); + + auto results = + cudf::make_numeric_column(output_type, + input.size() * static_cast(parameter_a.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_results = results->mutable_view().data(); + + cudf::detail::grid_1d grid{static_cast(input.size()) * block_size, + block_size}; + auto const hashes_size = input.child().size(); + auto d_hashes = rmm::device_uvector(hashes_size, stream); + auto d_threshold_count = cudf::detail::device_scalar(0, stream); + + auto d_list = cudf::detail::lists_column_device_view(*d_input); + minhash_ngrams_kernel + <<>>(d_list, + seed, + ngrams, + d_hashes.data(), + d_threshold_count.data(), + parameter_a.size(), + d_results); + + auto sizes_fn = [d_list] 
__device__(auto idx) -> cudf::size_type { + if (d_list.is_null(idx)) { return 0; } + return cudf::list_device_view(d_list, idx).size(); + }; + auto [threshold_index, indices] = + partition_input(input.size(), d_threshold_count.value(stream), sizes_fn, stream); + + auto input_offsets = input.offsets_begin(); // already includes input.offset() + using offset_type = decltype(input_offsets); + // handle the strings below the threshold width if (threshold_index > 0) { auto d_indices = cudf::device_span(indices.data(), threshold_index); cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, block_size}; - minhash_kernel + minhash_kernel <<>>( - *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + input_offsets, d_indices, parameter_a, parameter_b, ngrams, d_hashes.data(), d_results); } // handle the strings above the threshold width @@ -375,10 +584,10 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, auto const count = static_cast(input.size() - threshold_index); auto d_indices = cudf::device_span(indices.data() + threshold_index, count); - cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; - minhash_kernel + cudf::detail::grid_1d grid{count * block_size * blocks_per_row, block_size}; + minhash_kernel <<>>( - *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + input_offsets, d_indices, parameter_a, parameter_b, ngrams, d_hashes.data(), d_results); } return results; @@ -426,6 +635,20 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } +std::unique_ptr minhash_ngrams(cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = 
cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = detail::minhash_ngrams_fn( + input, ngrams, seed, parameter_a, parameter_b, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -440,6 +663,20 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } +std::unique_ptr minhash64_ngrams(cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = detail::minhash_ngrams_fn( + input, ngrams, seed, parameter_a, parameter_b, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + } // namespace detail std::unique_ptr minhash(cudf::strings_column_view const& input, @@ -454,6 +691,19 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr); } +std::unique_ptr minhash_ngrams(cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + +{ + CUDF_FUNC_RANGE(); + return detail::minhash_ngrams(input, ngrams, seed, parameter_a, parameter_b, stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -466,4 +716,17 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); } +std::unique_ptr 
minhash64_ngrams(cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + +{ + CUDF_FUNC_RANGE(); + return detail::minhash64_ngrams(input, ngrams, seed, parameter_a, parameter_b, stream, mr); +} + } // namespace nvtext diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 7e2b766862d..0e680e98ec5 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" #include "text/utilities/tokenize_ops.cuh" @@ -22,10 +23,11 @@ #include #include #include -#include #include #include #include +#include +#include #include #include #include @@ -38,9 +40,13 @@ #include +#include +#include +#include #include #include #include +#include #include #include @@ -103,6 +109,12 @@ constexpr uint32_t UTF8_1BYTE = 0x0080; constexpr uint32_t UTF8_2BYTE = 0x0800; constexpr uint32_t UTF8_3BYTE = 0x01'0000; +__device__ int8_t cp_to_utf8(uint32_t codepoint, char* out) +{ + auto utf8 = cudf::strings::detail::codepoint_to_utf8(codepoint); + return cudf::strings::detail::from_char_utf8(utf8, out); +} + /** * @brief Convert code-point arrays into UTF-8 bytes for each string. 
*/ @@ -148,26 +160,8 @@ struct codepoint_to_utf8_fn { // convert each code-point to 1-4 UTF-8 encoded bytes char* out_ptr = d_chars + d_offsets[idx]; for (uint32_t jdx = 0; jdx < count; ++jdx) { - uint32_t code_point = *str_cps++; - if (code_point < UTF8_1BYTE) // ASCII range - *out_ptr++ = static_cast(code_point); - else if (code_point < UTF8_2BYTE) { // create two-byte UTF-8 - // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 2) & 0x00'1F00) | 0x00'C000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else if (code_point < UTF8_3BYTE) { // create three-byte UTF-8 - // bxxxxxxxx:byyyyyyyy => b1110xxxx:b10xxxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 4) & 0x0F'0000) | 0x00E0'0000) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00) | 0x00'8000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else { // create four-byte UTF-8 - // maximum code-point value is 0x0011'0000 - // b000xxxxx:byyyyyyyy:bzzzzzzzz => b11110xxx:b10xxyyyy:b10yyyyzz:b10zzzzzz - *out_ptr++ = static_cast((((code_point << 6) & 0x0700'0000u) | 0xF000'0000u) >> 24); - *out_ptr++ = static_cast((((code_point << 4) & 0x003F'0000u) | 0x0080'0000u) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00u) | 0x00'8000u) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } + uint32_t codepoint = *str_cps++; + out_ptr += cp_to_utf8(codepoint, out_ptr); } } }; @@ -261,4 +255,361 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con return detail::normalize_characters(input, do_lower_case, stream, mr); } +struct character_normalizer::character_normalizer_impl { + rmm::device_uvector cp_metadata; + rmm::device_uvector aux_table; + bool do_lower_case; + std::unique_ptr special_tokens; + rmm::device_uvector special_tokens_view; + + cudf::device_span get_special_tokens() const + { + return special_tokens_view; + } + + 
character_normalizer_impl(rmm::device_uvector&& cp_metadata, + rmm::device_uvector&& aux_table, + bool do_lower_case, + std::unique_ptr&& special_tokens, + rmm::device_uvector&& special_tokens_view) + : cp_metadata(std::move(cp_metadata)), + aux_table(std::move(aux_table)), + do_lower_case{do_lower_case}, + special_tokens{std::move(special_tokens)}, + special_tokens_view{std::move(special_tokens_view)} + { + } +}; + +character_normalizer::character_normalizer(bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref) +{ + auto cp_metadata = nvtext::detail::get_codepoint_metadata(stream); + auto aux_table = nvtext::detail::get_aux_codepoint_data(stream); + CUDF_EXPECTS( + !special_tokens.has_nulls(), "special tokens should not have nulls", std::invalid_argument); + + auto sorted = std::move( + cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); + if (do_lower_case) { + // lower-case the tokens so they will match the normalized input + sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); + } + + auto tokens_view = cudf::strings::detail::create_string_vector_from_column( + cudf::strings_column_view(sorted->view()), stream, cudf::get_current_device_resource_ref()); + + _impl = std::make_unique(std::move(cp_metadata), + std::move(aux_table), + do_lower_case, + std::move(sorted), + std::move(tokens_view)); +} + +character_normalizer::~character_normalizer() {} + +std::unique_ptr create_character_normalizer( + bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return std::make_unique(do_lower_case, special_tokens, stream, mr); +} + +namespace detail { +namespace { + +/** + * @brief Kernel handles fixing up the normalized data to account for any special tokens + * + * This undoes the padding added around the 
`[]` for patterns matching the strings in the + * special_tokens array. + * + * Launched as a thread per input byte (total_count). + * + * @param d_normalized The normalized set of UTF-8 characters; 3 uints per input byte + * @param total_count Number of bytes represented by d_normalized; len(d_normalized)/3 + * @param special_tokens Tokens to check against + */ +CUDF_KERNEL void special_tokens_kernel(uint32_t* d_normalized, + int64_t total_count, + cudf::device_span special_tokens) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= total_count) { return; } + auto const begin = d_normalized + (idx * MAX_NEW_CHARS) + 1; + if (*begin != '[') { return; } + auto const end = begin + cuda::std::min(6L, total_count - idx) * MAX_NEW_CHARS; + auto const match = thrust::find(thrust::seq, begin, end, static_cast(']')); + if (match == end) { return; } + char candidate[8]; + auto const ch_begin = + thrust::transform_iterator(begin, [](auto v) { return static_cast(v); }); + auto const ch_end = ch_begin + thrust::distance(begin, match + 1); + auto last = thrust::copy_if( + thrust::seq, ch_begin, ch_end, candidate, [](auto c) { return c != 0 && c != ' '; }); + *last = 0; // only needed for debug + + auto const size = static_cast(thrust::distance(candidate, last)); + auto const token = cudf::string_view(candidate, size); + // the binary_search expects the special_tokens to be sorted + if (!thrust::binary_search(thrust::seq, special_tokens.begin(), special_tokens.end(), token)) { + return; + } + + // fix up chars to remove the extra spaces + *(begin + 1) = 0; // removes space after '[' + *(match - 1) = 0; // removes space before ']' +} + +/** + * @brief The normalizer kernel + * + * Launched as a thread per input byte (total_bytes). + * + * Converts the input d_chars into codepoints to lookup in the provided tables. + * Once processed, the d_output contains 3 uints per input byte each encoded + * as output UTF-8. 
Any zero values are to be removed by a subsequent kernel call. + * + * @param d_chars The characters for the input strings column to normalize + * @param total_bytes The number of bytes in the d_chars + * @param cp_metadata First lookup table for codepoint metadata + * @param aux_table Second lookup table containing possible replacement characters + * @param do_lower_case True if the normalization includes lower-casing characters + * @param d_output The output of the normalization (UTF-8 encoded) + */ +CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, + int64_t total_bytes, + codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case, + uint32_t* d_output) +{ + uint32_t replacement[MAX_NEW_CHARS] = {0}; + + auto const idx = cudf::detail::grid_1d::global_thread_id(); + + if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { + auto const cp = [utf8 = d_chars + idx] { + cudf::char_utf8 ch_utf8 = *utf8; + if (ch_utf8 > 0x7F) { cudf::strings::detail::to_char_utf8(utf8, ch_utf8); } + return cudf::strings::detail::utf8_to_codepoint(ch_utf8); + }(); + auto const metadata = cp_metadata[cp]; + + if (!should_remove_cp(metadata, do_lower_case)) { + int8_t num_new_chars = 1; + // retrieve the normalized value for cp + auto const new_cp = do_lower_case || always_replace(metadata) ? get_first_cp(metadata) : cp; + replacement[0] = new_cp == 0 ? 
cp : new_cp; + + if (do_lower_case && is_multi_char_transform(metadata)) { + auto const next_cps = aux_table[cp]; + replacement[1] = static_cast(next_cps >> 32); + replacement[2] = static_cast(next_cps & 0xFFFFFFFF); + num_new_chars = 2 + (replacement[2] != 0); + } + + if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { + replacement[1] = replacement[0]; + replacement[0] = SPACE_CODE_POINT; // add spaces around the new codepoint + replacement[2] = SPACE_CODE_POINT; + num_new_chars = 3; + } + + // convert codepoints back to UTF-8 in-place + for (int k = 0; k < num_new_chars; ++k) { + auto const new_cp = replacement[k]; + if (new_cp) { cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); } + } + } + } + + // employ an optimized coalesced writer to output replacement as a block of transposed data + using block_store = + cub::BlockStore; + __shared__ typename block_store::TempStorage bs_stg; + auto block_base = d_output + blockIdx.x * blockDim.x * MAX_NEW_CHARS; + block_store(bs_stg).Store(block_base, replacement); +} + +/** + * @brief Computes the output sizes for each row + * + * The input offsets are used with segmented-reduce to count the number of + * non-zero values for each output row. 
+ * + * @param d_normalized The UTF-8 encoded normalized values + * @param offsets These identify the row boundaries + * @param offset Only non-zero if the input column has been sliced + * @param size The number of output rows (same as the number of input rows) + * @param stream Stream used for allocating device memory and launching kernels + * @return The sizes of each output row + */ +template +rmm::device_uvector compute_sizes(cudf::device_span d_normalized, + OffsetType offsets, + int64_t offset, + cudf::size_type size, + rmm::cuda_stream_view stream) +{ + auto output_sizes = rmm::device_uvector(size, stream); + + auto d_data = d_normalized.data(); + + // counts the non-zero bytes in the d_data array + auto d_in = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([d_data] __device__(auto idx) { + idx = idx * MAX_NEW_CHARS; + // transform function counts number of non-zero bytes in uint32_t value + auto tfn = [](uint32_t v) -> cudf::size_type { + return ((v & 0xFF) > 0) + ((v & 0xFF00) > 0) + ((v & 0xFF0000) > 0) + + ((v & 0xFF000000) > 0); + }; + auto const begin = d_data + idx; + auto const end = begin + MAX_NEW_CHARS; + return thrust::transform_reduce(thrust::seq, begin, end, tfn, 0, thrust::plus{}); + })); + + // DeviceSegmentedReduce is used to compute the size of each output row + auto d_out = output_sizes.begin(); + auto temp = std::size_t{0}; + if (offset == 0) { + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); + } else { + // offsets need to be normalized for segmented-reduce to work efficiently + auto offsets_itr = thrust::transform_iterator( + offsets, + cuda::proclaim_return_type([offset] __device__(auto o) { return o - offset; })); + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, 
size, offsets_itr, offsets_itr + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value()); + } + + return output_sizes; +} + +// handles ranges above int32 max +template +OutputIterator remove_copy_safe(InputIterator first, + InputIterator last, + OutputIterator result, + T const& value, + rmm::cuda_stream_view stream) +{ + auto const copy_size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto itr = first; + while (itr != last) { + auto const copy_end = + static_cast(std::distance(itr, last)) <= copy_size ? last : itr + copy_size; + result = thrust::remove_copy(rmm::exec_policy(stream), itr, copy_end, result, value); + itr = copy_end; + } + return result; +} + +// handles ranges above int32 max +template +Iterator remove_safe(Iterator first, Iterator last, T const& value, rmm::cuda_stream_view stream) +{ + auto const size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto result = first; + auto itr = first; + while (itr != last) { + auto end = static_cast(std::distance(itr, last)) <= size ? 
last : itr + size; + result = thrust::remove(rmm::exec_policy(stream), itr, end, value); + itr = end; + } + return result; +} +} // namespace + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + if (input.is_empty()) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } + + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars_begin(stream) + first_offset; + + if (chars_size == 0) { return std::make_unique(input.parent(), stream, mr); } + + constexpr int64_t block_size = 256; + cudf::detail::grid_1d grid{chars_size, block_size}; + auto const max_new_char_total = cudf::util::round_up_safe(chars_size, block_size) * MAX_NEW_CHARS; + + auto const& parameters = normalizer._impl; + + auto d_normalized = rmm::device_uvector(max_new_char_total, stream); + data_normalizer_kernel<<>>( + d_input_chars, + chars_size, + parameters->cp_metadata.data(), + parameters->aux_table.data(), + parameters->do_lower_case, + d_normalized.data()); + + // This removes space added around any special tokens in the form of [ttt]. + // An alternate approach is to do a multi-replace of '[ ttt ]' with '[ttt]' right + // before returning the output strings column. 
+ auto const special_tokens = parameters->get_special_tokens(); + if (!special_tokens.empty()) { + special_tokens_kernel<<>>( + d_normalized.data(), chars_size, special_tokens); + } + + // Use segmented-reduce over the non-zero codepoints to get the size of the output rows + auto const input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + auto output_sizes = + compute_sizes(d_normalized, input_offsets, first_offset, input.size(), stream); + + // convert the sizes to offsets + auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( + output_sizes.begin(), output_sizes.end(), stream, mr); + + // create output chars by calling remove_copy(0) on the bytes in d_normalized + auto chars = rmm::device_uvector(total_size, stream, mr); + auto const begin = reinterpret_cast(d_normalized.begin()); + // the remove() above speeds up the remove_copy() by roughly 10% + auto const end = + reinterpret_cast(remove_safe(d_normalized.begin(), d_normalized.end(), 0, stream)); + remove_copy_safe(begin, end, chars.data(), 0, stream); + + return cudf::make_strings_column(input.size(), + std::move(offsets), + chars.release(), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); +} + +} // namespace detail + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::normalize_characters(input, normalizer, stream, mr); +} + } // namespace nvtext diff --git a/cpp/src/text/normalize.cuh b/cpp/src/text/normalize.cuh new file mode 100644 index 00000000000..3972726d536 --- /dev/null +++ b/cpp/src/text/normalize.cuh @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "text/subword/detail/cp_data.h" + +namespace nvtext { +namespace detail { + +/** + * @brief Bit used to filter out invalid code points. + * + * When normalizing characters to code point values, if this bit is set, + * the code point should be filtered out before returning from the normalizer. + */ +constexpr uint32_t FILTER_BIT = 22; + +/** + * @brief Retrieve new code point from metadata value. + * + * @param metadata Value from the codepoint_metadata table. + * @return The replacement character if appropriate. + */ +__device__ constexpr uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } + +/** + * @brief Retrieve token category from the metadata value. + * + * Category values are 0-5: + * 0 - character should be padded + * 1 - pad character if lower-case + * 2 - character should be removed + * 3 - remove character if lower-case + * 4 - whitespace character -- always replace + * 5 - uncategorized + * + * @param metadata Value from the codepoint_metadata table. + * @return Category value. + */ +__device__ constexpr uint32_t extract_token_cat(uint32_t metadata) +{ + return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; +} + +/** + * @brief Return true if category of metadata value specifies the character should be removed. 
+ */ +__device__ constexpr bool should_remove_cp(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be padded. + */ +__device__ constexpr bool should_add_spaces(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be replaced. + */ +__device__ constexpr bool always_replace(uint32_t metadata) +{ + return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; +} + +/** + * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. + */ +__device__ constexpr bool is_multi_char_transform(uint32_t metadata) +{ + return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; +} + +/** + * @brief Returns true if the byte passed in could be a valid head byte for + * a utf8 character. That is, not binary `10xxxxxx` + */ +__device__ constexpr bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } + +} // namespace detail +} // namespace nvtext diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 7a39199011e..4c54409c41a 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" @@ -38,81 +39,6 @@ namespace nvtext { namespace detail { namespace { -/** - * @brief Bit used to filter out invalid code points. 
- * - * When normalizing characters to code point values, if this bit is set, - * the code point should be filtered out before returning from the normalizer. - */ -constexpr uint32_t FILTER_BIT = 22; - -/** - * @brief Retrieve new code point from metadata value. - * - * @param metadata Value from the codepoint_metadata table. - * @return The replacement character if appropriate. - */ -__device__ uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } - -/** - * @brief Retrieve token category from the metadata value. - * - * Category values are 0-5: - * 0 - character should be padded - * 1 - pad character if lower-case - * 2 - character should be removed - * 3 - remove character if lower-case - * 4 - whitespace character -- always replace - * 5 - uncategorized - * - * @param metadata Value from the codepoint_metadata table. - * @return Category value. - */ -__device__ uint32_t extract_token_cat(uint32_t metadata) -{ - return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. - */ -__device__ bool should_remove_cp(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be padded. - */ -__device__ bool should_add_spaces(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. 
- */ -__device__ bool always_replace(uint32_t metadata) -{ - return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; -} - -/** - * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. - */ -__device__ bool is_multi_char_transform(uint32_t metadata) -{ - return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; -} - -/** - * @brief Returns true if the byte passed in could be a valid head byte for - * a utf8 character. That is, not binary `10xxxxxx` - */ -__device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } - /** * @brief Converts a UTF-8 character into a unicode code point value. * diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 94d27d976c3..e41d772a479 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -29,6 +29,7 @@ namespace cudf { namespace { + class fixed_pinned_pool_memory_resource { using upstream_mr = rmm::mr::pinned_host_memory_resource; using host_pooled_mr = rmm::mr::pool_memory_resource; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index cfc6a0dc425..e3ca8b70b87 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -309,7 +309,7 @@ ConfigureTest( ConfigureTest( ORC_TEST io/orc_chunked_reader_test.cu io/orc_test.cpp GPUS 1 - PERCENT 30 + PERCENT 100 ) ConfigureTest( PARQUET_TEST @@ -340,7 +340,7 @@ ConfigureTest(JSON_TREE_CSR io/json/json_tree_csr.cu) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 - PERCENT 30 + PERCENT 100 ) target_link_libraries(DATA_CHUNK_SOURCE_TEST PRIVATE ZLIB::ZLIB) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 883a5093bd1..ad92e322ee2 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,8 @@ #include #include +#include + namespace { /** * @brief Functor to generate a tdigest by key. diff --git a/cpp/tests/io/metadata_utilities.cpp b/cpp/tests/io/metadata_utilities.cpp index 380d66c53f9..980d8d8b3d1 100644 --- a/cpp/tests/io/metadata_utilities.cpp +++ b/cpp/tests/io/metadata_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include #include +#include + namespace cudf::test { void expect_metadata_equal(cudf::io::table_input_metadata in_meta, diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index e201dc0565c..d99e19822c0 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -33,6 +33,7 @@ #include #include +#include using cudf::test::iterators::no_nulls; diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh index 119d8e7b138..d6a991f675c 100644 --- a/cpp/tests/iterator/iterator_tests.cuh +++ b/cpp/tests/iterator/iterator_tests.cuh @@ -19,6 +19,7 @@ #include #include +#include #include // for meanvar #include #include @@ -28,7 +29,7 @@ #include #include -#include +#include #include #include #include @@ -59,7 +60,7 @@ struct IteratorTest : public cudf::test::BaseFixture { d_in, dev_result.begin(), num_items, - thrust::minimum{}, + cudf::detail::minimum{}, init, cudf::get_default_stream().value()); @@ -72,7 +73,7 @@ struct IteratorTest : public cudf::test::BaseFixture { d_in, dev_result.begin(), num_items, - thrust::minimum{}, + cudf::detail::minimum{}, init, cudf::get_default_stream().value()); @@ -98,7 +99,7 @@ 
struct IteratorTest : public cudf::test::BaseFixture { d_in_last, dev_expected.begin(), dev_results.begin(), - thrust::equal_to{}); + cuda::std::equal_to{}); auto result = thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), dev_results.begin(), dev_results.end(), diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index 5f911597b02..c6c419706e0 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ #include #include +#include #include using aggregation = cudf::aggregation; diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp index dcaa47e722b..4477ca388df 100644 --- a/cpp/tests/rolling/offset_row_window_test.cpp +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -43,18 +43,21 @@ auto constexpr null = int32_t{0}; // NULL representation for int32_t; auto no_nulls_list() { return nulls_at({}); } struct OffsetRowWindowTest : public cudf::test::BaseFixture { - static ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; - static ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - struct rolling_runner { cudf::window_bounds _preceding, _following; cudf::size_type _min_periods; bool _grouped = true; + ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; + ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; rolling_runner(cudf::window_bounds const& preceding, cudf::window_bounds const& following, cudf::size_type min_periods_ = 1) - : _preceding{preceding}, _following{following}, _min_periods{min_periods_} + : _preceding{preceding}, + _following{following}, + _min_periods{min_periods_}, + _keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1}, + _values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} { } @@ -80,9 +83,6 @@ struct OffsetRowWindowTest : public cudf::test::BaseFixture { }; }; -ints_column const OffsetRowWindowTest::_keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; -ints_column const OffsetRowWindowTest::_values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - auto const AGG_COUNT_NON_NULL = cudf::make_count_aggregation(cudf::null_policy::EXCLUDE); auto const AGG_COUNT_ALL = @@ -96,7 +96,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_3_to_Minus_1) { auto const preceding = cudf::window_bounds::get(3); auto const following = cudf::window_bounds::get(-1); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(true); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), ints_column{{0, 1, 2, 2, 2, 2, 0, 1, 2, 2}, nulls_at({0, 6})}); @@ -136,7 +137,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_3_to_Minus_1) { auto const preceding = cudf::window_bounds::get(3); auto const following = 
cudf::window_bounds::get(-1); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(false); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), ints_column{{0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, nulls_at({0})}); @@ -176,7 +178,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_0_to_2) { auto const preceding = cudf::window_bounds::get(0); auto const following = cudf::window_bounds::get(2); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(true); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(true); CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COUNT_NON_NULL), @@ -219,7 +222,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) { auto const preceding = cudf::window_bounds::get(0); auto const following = cudf::window_bounds::get(2); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(false); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), ints_column{{2, 2, 2, 2, 2, 2, 2, 2, 1, null}, nulls_at({9})}); diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 8bfb17e0efd..db43484ab09 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -187,6 +187,15 @@ TEST_F(MinHashTest, EmptyTest) auto params64 = cudf::test::fixed_width_column_wrapper({1, 2, 3}); results = nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); EXPECT_EQ(results->size(), 0); + + auto empty = cudf::test::lists_column_wrapper(); + auto lview = cudf::lists_column_view(empty); + results = + nvtext::minhash_ngrams(lview, 4, 0, cudf::column_view(params), cudf::column_view(params)); + EXPECT_EQ(results->size(), 0); + results = + nvtext::minhash64_ngrams(lview, 4, 0, cudf::column_view(params64), cudf::column_view(params64)); + EXPECT_EQ(results->size(), 0); } TEST_F(MinHashTest, ErrorsTest) @@ -194,17 +203,20 @@ TEST_F(MinHashTest, ErrorsTest) auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); auto empty = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), - std::invalid_argument); + auto eview = cudf::column_view(empty); + EXPECT_THROW(nvtext::minhash(view, 0, eview, eview, 0), std::invalid_argument); auto empty64 = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW( - nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), - std::invalid_argument); - EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), - std::invalid_argument); - EXPECT_THROW( - nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), - std::invalid_argument); + auto eview64 = cudf::column_view(empty64); + EXPECT_THROW(nvtext::minhash64(view, 0, eview64, eview64, 0), std::invalid_argument); + EXPECT_THROW(nvtext::minhash(view, 0, eview, eview, 4), std::invalid_argument); + EXPECT_THROW(nvtext::minhash64(view, 0, eview64, eview64, 4), std::invalid_argument); + + auto empty_list = cudf::test::lists_column_wrapper(); + auto lview = 
cudf::lists_column_view(empty_list); + EXPECT_THROW(nvtext::minhash_ngrams(lview, 0, 0, eview, eview), std::invalid_argument); + EXPECT_THROW(nvtext::minhash64_ngrams(lview, 0, 0, eview64, eview64), std::invalid_argument); + EXPECT_THROW(nvtext::minhash_ngrams(lview, 4, 0, eview, eview), std::invalid_argument); + EXPECT_THROW(nvtext::minhash64_ngrams(lview, 4, 0, eview64, eview64), std::invalid_argument); std::vector h_input(50000, ""); input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); @@ -212,16 +224,133 @@ TEST_F(MinHashTest, ErrorsTest) auto const zeroes = thrust::constant_iterator(0); auto params = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4), - std::overflow_error); + auto pview = cudf::column_view(params); + EXPECT_THROW(nvtext::minhash(view, 0, pview, pview, 4), std::overflow_error); auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW( - nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), - std::overflow_error); - - EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), - std::invalid_argument); - EXPECT_THROW( - nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), - std::invalid_argument); + auto pview64 = cudf::column_view(params64); + EXPECT_THROW(nvtext::minhash64(view, 0, pview64, pview64, 4), std::overflow_error); + + auto offsets = cudf::test::fixed_width_column_wrapper( + thrust::counting_iterator(0), + thrust::counting_iterator(h_input.size() + 1)); + auto input_ngrams = + cudf::make_lists_column(h_input.size(), offsets.release(), input.release(), 0, {}); + lview = cudf::lists_column_view(input_ngrams->view()); + EXPECT_THROW(nvtext::minhash_ngrams(lview, 4, 0, pview, pview), std::overflow_error); + EXPECT_THROW(nvtext::minhash64_ngrams(lview, 4, 0, pview64, 
pview64), std::overflow_error); +} + +TEST_F(MinHashTest, Ngrams) +{ + using LCWS = cudf::test::lists_column_wrapper; + auto input = + LCWS({LCWS{"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog."}, + LCWS{"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "", "dog."}, + LCWS{"short", "row"}}); + + auto view = cudf::lists_column_view(input); + + auto first = thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_ngrams(view, 4, 0, cudf::column_view(params), cudf::column_view(params)); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{ 230924604u, 55492793u, 963436400u}, + LCW32{ 230924604u, 367515795u, 963436400u}, + LCW32{2380648568u, 1330223236u, 279797904u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64_ngrams(view, 4, 0, cudf::column_view(params64), cudf::column_view(params64)); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 208926840193078200ul, 576399628675212695ul, 312927673584437419ul}, + LCW64{ 677038498284219393ul, 326338087730412201ul, 298455901014050223ul}, + LCW64{1493265692486268500ul, 720255058049417768ul, 2253087432826260995ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + +TEST_F(MinHashTest, NgramsWide) +{ + auto many = std::vector(1024, "hello"); + auto str_data = cudf::test::strings_column_wrapper(many.begin(), many.end()); + auto offsets = + cudf::test::fixed_width_column_wrapper({0ul, many.size() / 2, many.size()}); + auto input = cudf::make_lists_column(2, offsets.release(), str_data.release(), 0, {}); + + auto view = cudf::lists_column_view(input->view()); + + auto first = thrust::counting_iterator(10); + auto params = 
cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_ngrams(view, 4, 0, cudf::column_view(params), cudf::column_view(params)); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{ 571536396u, 2346676954u, 4121817512u}, + LCW32{ 571536396u, 2346676954u, 4121817512u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64_ngrams(view, 4, 0, cudf::column_view(params64), cudf::column_view(params64)); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 1947142336021414174ul, 1219519365938078011ul, 491896395854741840ul}, + LCW64{ 1947142336021414174ul, 1219519365938078011ul, 491896395854741840ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + +TEST_F(MinHashTest, NgramsSliced) +{ + using LCWS = cudf::test::lists_column_wrapper; + auto input = + LCWS({LCWS{"ignored", "row"}, + LCWS{"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog."}, + LCWS{"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "", "dog."}, + LCWS{"short", "row"}, + LCWS{"ignored", "row"}}); + + auto view = cudf::lists_column_view(cudf::slice(input, {1, 4}).front()); + auto first = thrust::counting_iterator(10); + + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_ngrams(view, 4, 0, cudf::column_view(params), cudf::column_view(params)); + + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{ 230924604u, 55492793u, 963436400u}, + LCW32{ 230924604u, 367515795u, 963436400u}, + LCW32{2380648568u, 1330223236u, 279797904u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto params64 = 
cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64_ngrams(view, 4, 0, cudf::column_view(params64), cudf::column_view(params64)); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 208926840193078200ul, 576399628675212695ul, 312927673584437419ul}, + LCW64{ 677038498284219393ul, 326338087730412201ul, 298455901014050223ul}, + LCW64{1493265692486268500ul, 720255058049417768ul, 2253087432826260995ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 2515cc917fa..530148eb654 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -74,6 +74,10 @@ TEST_F(TextNormalizeTest, NormalizeEmptyTest) EXPECT_EQ(results->size(), 0); results = nvtext::normalize_characters(strings_view, false); EXPECT_EQ(results->size(), 0); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + EXPECT_EQ(results->size(), 0); } TEST_F(TextNormalizeTest, AllNullStrings) @@ -84,6 +88,10 @@ TEST_F(TextNormalizeTest, AllNullStrings) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); results = nvtext::normalize_characters(strings_view, false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } TEST_F(TextNormalizeTest, SomeNullStrings) @@ -93,27 +101,21 @@ TEST_F(TextNormalizeTest, SomeNullStrings) auto results = nvtext::normalize_characters(strings_view, false); cudf::test::strings_column_wrapper expected({"", " . 
", "a"}, {false, true, true}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, NormalizeCharacters) { // These include punctuation, accents, whitespace, and CJK characters - std::vector h_strings{"abc£def", - nullptr, - "éè â îô\taeio", - "\tĂĆĖÑ Ü", - "ACEN U", - "P^NP", - "$41.07", - "[a,b]", - "丏丟", - ""}; - auto validity = - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); - cudf::strings_column_view strings_view(strings); + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto sv = cudf::strings_column_view(input); { - auto results = nvtext::normalize_characters(strings_view, true); + auto results = nvtext::normalize_characters(sv, true); cudf::test::strings_column_wrapper expected({"abc£def", "", "ee a io aeio", @@ -124,11 +126,11 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = nvtext::normalize_characters(strings_view, false); + auto results = nvtext::normalize_characters(sv, false); cudf::test::strings_column_wrapper expected({"abc£def", "", "éè â îô aeio", @@ -139,11 +141,117 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } +TEST_F(TextNormalizeTest, WithNormalizer) +{ + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + // the following 
include punctuation, accents, whitespace, and CJK characters + auto input = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô\taeio", + "\tĂĆĖÑ Ü", + "ACEN U", + "P^NP", + "$41.07", + "[a,b]", + "丏丟", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto const sv = cudf::strings_column_view(input); + + auto normalizer = nvtext::create_character_normalizer(true); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "ee a io aeio", + " acen u", + "acen u", + "p ^ np", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // test normalizer re-use + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false); + results = nvtext::normalize_characters(sv, *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô aeio", + " ĂĆĖÑ Ü", + "ACEN U", + "P ^ NP", + " $ 41 . 
07", + " [ a , b ] ", + " 丏 丟 ", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(TextNormalizeTest, SpecialTokens) +{ + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + auto input = + cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", + "[bos]these should[sep]work too[eos]", + "some[non]tokens[eol]too", + long_row, + long_row, + long_row}); + + auto sv = cudf::strings_column_view(input); + auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); + auto stv = cudf::strings_column_view(special_tokens); + + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = cudf::test::strings_column_wrapper( + {" [bos] some strings with [pad] special [sep] tokens [eos] ", + " [bos] these should [sep] work too [eos] ", + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false, stv); + results = nvtext::normalize_characters(sv, *normalizer); + expected = cudf::test::strings_column_wrapper( + {" [BOS] Some strings with [PAD] special [SEP] tokens [EOS] ", + " [ bos ] these should [ sep ] work too [ eos ] ", + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(TextNormalizeTest, 
NormalizeSlicedColumn) { cudf::test::strings_column_wrapper strings( @@ -151,10 +259,21 @@ TEST_F(TextNormalizeTest, NormalizeSlicedColumn) std::vector sliced = cudf::split(strings, {4}); auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), true); - cudf::test::strings_column_wrapper expected({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + auto expected = + cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); + expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); - cudf::test::strings_column_wrapper expected2({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); + normalizer = nvtext::create_character_normalizer(false); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), *normalizer); + expected = cudf::test::strings_column_wrapper({" $ 41 . 
07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } diff --git a/dependencies.yaml b/dependencies.yaml index e7840d56880..1578dadc793 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -723,7 +723,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - - &numpy numpy>=1.23,<3.0a0 + - &numpy numpy>=1.23,<2.1 - pandas>=2.0,<2.2.4dev0 run_pylibcudf: common: @@ -753,8 +753,8 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba-cuda-dep numba-cuda>=0.2.0,<0.3.0a0 - - &numba-dep numba>=0.59.1,<0.61.0a0 + - &numba-cuda-dep numba-cuda>=0.4.0,<0.5.0a0 + - &numba-dep numba>=0.59.1,<0.62.0a0 - nvtx>=0.2.1 - packaging - rich @@ -813,7 +813,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.20,<1.23 + - polars>=1.20,<1.24 run_cudf_polars_experimental: common: - output_types: [conda, requirements, pyproject] @@ -885,7 +885,8 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - - numba-cuda==0.2.0 + - numba-cuda==0.4.0 + - numba==0.59.1 - pandas==2.0.* - matrix: {dependencies: "latest"} packages: diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c74da8d0ca9..92b37c4b3f2 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -207,6 +207,7 @@ def clean_all_xml_files(path): exclude_patterns = [ "venv", "**/includes/**", + "narwhals_test_plugin", ] # The name of the Pygments (syntax highlighting) style to use. 
@@ -585,6 +586,7 @@ def on_missing_reference(app, env, node, contnode): ("py:class", "pd.DataFrame"), ("py:class", "pandas.core.indexes.frozen.FrozenList"), ("py:class", "pa.Array"), + ("py:class", "pa.Decimal128Type"), ("py:class", "ScalarLike"), ("py:class", "ParentType"), ("py:class", "pyarrow.lib.DataType"), diff --git a/docs/cudf/source/user_guide/cupy-interop.ipynb b/docs/cudf/source/user_guide/cupy-interop.ipynb index 112f0bcfca6..93e62d90c0f 100644 --- a/docs/cudf/source/user_guide/cupy-interop.ipynb +++ b/docs/cudf/source/user_guide/cupy-interop.ipynb @@ -566,7 +566,7 @@ "%%timeit\n", "\n", "fortran_arr = cp.asfortranarray(reshaped_arr)\n", - "reshaped_df = cudf.from_dlpack(fortran_arr.toDlpack())" + "reshaped_df = cudf.from_dlpack(fortran_arr.__dlpack__())" ] }, { @@ -1418,7 +1418,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.9" } }, "nbformat": 4, diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 2a17bc5dbb7..090e475471d 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -37,7 +37,3 @@ rapids_cython_init() add_subdirectory(cudf/_lib) add_subdirectory(udf_cpp) - -if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}") -endif() diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 37ef83c8820..8d7d64ab31e 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -73,19 +73,6 @@ def is_numeric_dtype(obj): return pd_types.is_numeric_dtype(obj) -# A version of numerical type check that does not include cudf decimals for -# places where we need to distinguish fixed and floating point numbers. 
-def _is_non_decimal_numeric_dtype(obj): - if isinstance(obj, _BaseDtype) or isinstance( - getattr(obj, "dtype", None), _BaseDtype - ): - return False - try: - return pd_types.is_numeric_dtype(obj) - except TypeError: - return False - - def is_integer(obj): """Return True if given object is integer. diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 73011d6ffe0..cf49dfb2194 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -7,9 +7,13 @@ import numpy as np import cudf -from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar +from cudf.api.types import is_scalar from cudf.core.dtypes import CategoricalDtype -from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype +from cudf.utils.dtypes import ( + find_common_type, + is_dtype_obj_numeric, + is_mixed_with_object_dtype, +) if TYPE_CHECKING: from cudf._typing import DtypeObj, ScalarLike @@ -18,7 +22,7 @@ def _normalize_categorical(input_col, other): if isinstance(input_col, cudf.core.column.CategoricalColumn): - if cudf.api.types.is_scalar(other): + if is_scalar(other): try: other = input_col._encode(other) except ValueError: @@ -81,7 +85,7 @@ def _check_and_cast_columns_with_other( ) return _normalize_categorical(source_col, other.astype(source_dtype)) - if _is_non_decimal_numeric_dtype(source_dtype) and as_column( + if is_dtype_obj_numeric(source_dtype, include_decimal=False) and as_column( other ).can_cast_safely(source_dtype): common_dtype = source_dtype diff --git a/python/cudf/cudf/core/character_normalizer.py b/python/cudf/cudf/core/character_normalizer.py new file mode 100644 index 00000000000..1240c0e1eb7 --- /dev/null +++ b/python/cudf/cudf/core/character_normalizer.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +from __future__ import annotations + +import pylibcudf as plc + +import cudf + + +class CharacterNormalizer: + """ + A normalizer object used to normalize input text. + + Parameters + ---------- + do_lower : bool + If True, the normalizer should also lower-case + while normalizing. + special_tokens : cudf.Series + Series of special tokens. + """ + + def __init__( + self, + do_lower: bool, + special_tokens: cudf.Series = cudf.Series([], dtype="object"), + ) -> None: + self.normalizer = plc.nvtext.normalize.CharacterNormalizer( + do_lower, special_tokens._column.to_pylibcudf(mode="read") + ) + + def normalize(self, text: cudf.Series) -> cudf.Series: + """ + Parameters + ---------- + text : cudf.Series + The strings to be normalized. + + Returns + ------- + cudf.Series + Normalized strings + """ + result = text._column.normalize_characters(self.normalizer) + + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a57ff9a7817..ed285934161 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -14,12 +14,14 @@ import pylibcudf as plc import cudf +from cudf.api.types import is_scalar from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype from cudf.core.scalar import pa_scalar_to_plc_scalar from cudf.utils.dtypes import ( SIZE_TYPE_DTYPE, + cudf_dtype_to_pa_type, find_common_type, is_mixed_with_object_dtype, min_signed_type, @@ -36,6 +38,7 @@ ColumnBinaryOperand, ColumnLike, Dtype, + DtypeObj, ScalarLike, SeriesOrIndex, SeriesOrSingleColumnIndex, @@ -621,12 +624,10 @@ def ordered(self) -> bool: return self.dtype.ordered def __setitem__(self, key, value): - if cudf.api.types.is_scalar( - value - ) and cudf.utils.utils._is_null_host_scalar(value): + if is_scalar(value) and cudf.utils.utils._is_null_host_scalar(value): 
to_add_categories = 0 else: - if cudf.api.types.is_scalar(value): + if is_scalar(value): arr = column.as_column(value, length=1, nan_as_null=False) else: arr = column.as_column(value, nan_as_null=False) @@ -642,7 +643,7 @@ def __setitem__(self, key, value): "category, set the categories first" ) - if cudf.api.types.is_scalar(value): + if is_scalar(value): value = self._encode(value) if value is not None else value else: value = cudf.core.column.as_column(value).astype(self.dtype) @@ -1041,9 +1042,9 @@ def notnull(self) -> ColumnBase: def _validate_fillna_value( self, fill_value: ScalarLike | ColumnLike - ) -> cudf.Scalar | ColumnBase: + ) -> plc.Scalar | ColumnBase: """Align fill_value for .fillna based on column type.""" - if cudf.api.types.is_scalar(fill_value): + if is_scalar(fill_value): if fill_value != _DEFAULT_CATEGORICAL_VALUE: try: fill_value = self._encode(fill_value) @@ -1051,7 +1052,11 @@ def _validate_fillna_value( raise ValueError( f"{fill_value=} must be in categories" ) from err - return cudf.Scalar(fill_value, dtype=self.codes.dtype) + return pa_scalar_to_plc_scalar( + pa.scalar( + fill_value, type=cudf_dtype_to_pa_type(self.codes.dtype) + ) + ) else: fill_value = column.as_column(fill_value, nan_as_null=False) if isinstance(fill_value.dtype, CategoricalDtype): @@ -1168,7 +1173,7 @@ def _mimic_inplace( self._codes = other_col.codes return out - def view(self, dtype: Dtype) -> ColumnBase: + def view(self, dtype: DtypeObj) -> ColumnBase: raise NotImplementedError( "Categorical column views are not currently supported" ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 06dc4058115..5a8064dc49d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,7 +2,6 @@ from __future__ import annotations -import warnings from collections import abc from collections.abc import MutableSequence, Sequence from functools import cached_property @@ -24,13 +23,10 @@ import 
cudf from cudf.api.types import ( - _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, - is_decimal_dtype, is_dtype_equal, is_scalar, - is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 from cudf.core._internals import ( @@ -70,6 +66,7 @@ find_common_type, get_time_unit, is_column_like, + is_dtype_obj_numeric, is_mixed_with_object_dtype, min_signed_type, min_unsigned_type, @@ -713,7 +710,7 @@ def all(self, skipna: bool = True) -> bool: # is empty. if self.null_count == self.size: return True - return self.reduce("all") + return bool(self.reduce("all")) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. @@ -859,7 +856,7 @@ def _fill( if end <= begin or begin >= self.size: return self if inplace else self.copy() - if not inplace or is_string_dtype(self.dtype): + if not inplace or self.dtype == CUDF_STRING_DTYPE: with acquire_spill_lock(): result = type(self).from_pylibcudf( plc.filling.fill( @@ -869,7 +866,7 @@ def _fill( fill_value, ) ) - if is_string_dtype(self.dtype): + if self.dtype == CUDF_STRING_DTYPE: return self._mimic_inplace(result, inplace=True) return result # type: ignore[return-value] @@ -892,12 +889,11 @@ def _fill( @acquire_spill_lock() def shift(self, offset: int, fill_value: ScalarLike) -> Self: - if not isinstance(fill_value, cudf.Scalar): - fill_value = cudf.Scalar(fill_value, dtype=self.dtype) + plc_fill_value = self._scalar_to_plc_scalar(fill_value) plc_col = plc.copying.shift( self.to_pylibcudf(mode="read"), offset, - fill_value.device_value, + plc_fill_value, ) return type(self).from_pylibcudf(plc_col) # type: ignore[return-value] @@ -951,7 +947,7 @@ def copy(self, deep: bool = True) -> Self: ), ) - def view(self, dtype: Dtype) -> ColumnBase: + def view(self, dtype: DtypeObj) -> ColumnBase: """ View the data underlying a column as different dtype. 
The source column must divide evenly into the size of @@ -960,13 +956,9 @@ def view(self, dtype: Dtype) -> ColumnBase: Parameters ---------- - dtype : NumPy dtype, string + dtype : Dtype object The dtype to view the data as - """ - - dtype = cudf.dtype(dtype) - if dtype.kind in ("o", "u", "s"): raise TypeError( "Bytes viewed as str without metadata is ambiguous" @@ -1193,13 +1185,21 @@ def _check_scatter_key_length( f"{num_keys}" ) + def _scalar_to_plc_scalar(self, scalar: ScalarLike) -> plc.Scalar: + """Return a pylibcudf.Scalar that matches the type of self.dtype""" + if not isinstance(scalar, pa.Scalar): + scalar = pa.scalar(scalar) + return pa_scalar_to_plc_scalar( + scalar.cast(cudf_dtype_to_pa_type(self.dtype)) + ) + def _validate_fillna_value( self, fill_value: ScalarLike | ColumnLike - ) -> cudf.Scalar | ColumnBase: + ) -> plc.Scalar | ColumnBase: """Align fill_value for .fillna based on column type.""" if is_scalar(fill_value): - return cudf.Scalar(fill_value, dtype=self.dtype) - return as_column(fill_value) + return self._scalar_to_plc_scalar(fill_value) + return as_column(fill_value).astype(self.dtype) @acquire_spill_lock() def replace( @@ -1245,8 +1245,8 @@ def fillna( if method == "ffill" else plc.replace.ReplacePolicy.FOLLOWING ) - elif is_scalar(fill_value): - plc_replace = cudf.Scalar(fill_value).device_value + elif isinstance(fill_value, plc.Scalar): + plc_replace = fill_value else: plc_replace = fill_value.to_pylibcudf(mode="read") plc_column = plc.replace.replace_nulls( @@ -1587,7 +1587,7 @@ def distinct_count(self, dropna: bool = True) -> int: self._distinct_count[dropna] = result return self._distinct_count[dropna] - def can_cast_safely(self, to_dtype: Dtype) -> bool: + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: raise NotImplementedError() @acquire_spill_lock() @@ -1597,7 +1597,10 @@ def cast(self, dtype: Dtype) -> ColumnBase: self.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype) ) ) - if is_decimal_dtype(result.dtype): + 
if isinstance( + result.dtype, + (cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype), + ): result.dtype.precision = dtype.precision # type: ignore[union-attr] return result @@ -1946,8 +1949,7 @@ def _reduce( skipna=skipna, min_count=min_count ) if isinstance(preprocessed, ColumnBase): - dtype = kwargs.pop("dtype", None) - return preprocessed.reduce(op, dtype, **kwargs) + return preprocessed.reduce(op, **kwargs) return preprocessed def _can_return_nan(self, skipna: bool | None = None) -> bool: @@ -2110,16 +2112,8 @@ def scan(self, scan_op: str, inclusive: bool, **kwargs) -> Self: ) ) - def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: - if dtype is not None: - warnings.warn( - "dtype is deprecated and will be remove in a future release. " - "Cast the result (e.g. .astype) after the operation instead.", - FutureWarning, - ) - col_dtype = dtype - else: - col_dtype = self._reduction_result_dtype(reduction_op) + def reduce(self, reduction_op: str, **kwargs) -> ScalarLike: + col_dtype = self._reduction_result_dtype(reduction_op) # check empty case if len(self) <= self.null_count: @@ -2148,7 +2142,7 @@ def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: }: scale = -plc_scalar.type().scale() # https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql - p = col_dtype.precision + p = col_dtype.precision # type: ignore[union-attr] nrows = len(self) if reduction_op in {"min", "max"}: new_p = p @@ -2162,7 +2156,7 @@ def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: raise NotImplementedError( f"{reduction_op} not implemented for decimal types." 
) - precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) + precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) # type: ignore[union-attr] new_dtype = type(col_dtype)(precision, scale) result_col = result_col.astype(new_dtype) elif isinstance(col_dtype, IntervalDtype): @@ -2322,13 +2316,14 @@ def build_column( offset=offset, null_count=null_count, ) - elif dtype.type in (np.object_, np.str_): + elif dtype == CUDF_STRING_DTYPE: return cudf.core.column.StringColumn( - data=data, - mask=mask, + data=data, # type: ignore[arg-type] size=size, + dtype=dtype, + mask=mask, offset=offset, - children=children, + children=children, # type: ignore[arg-type] null_count=null_count, ) elif isinstance(dtype, ListDtype): @@ -2999,7 +2994,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # Notice, we can always cast pure null columns not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] if len(not_null_col_dtypes) and all( - _is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M" + is_dtype_obj_numeric(dtype, include_decimal=False) + and dtype.kind == "M" for dtype in not_null_col_dtypes ): common_dtype = find_common_type(not_null_col_dtypes) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 92d5c39e69d..64ddcae72a7 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -45,8 +45,10 @@ from cudf._typing import ( ColumnBinaryOperand, + ColumnLike, DatetimeLikeScalar, Dtype, + DtypeObj, ScalarLike, ) from cudf.core.column.numerical import NumericalColumn @@ -268,6 +270,19 @@ def __contains__(self, item: ScalarLike) -> bool: "cudf.core.column.NumericalColumn", self.astype(np.dtype(np.int64)) ) + def _validate_fillna_value( + self, fill_value: ScalarLike | ColumnLike + ) -> plc.Scalar | ColumnBase: + """Align fill_value for .fillna based on column type.""" + if ( + isinstance(fill_value, np.datetime64) + and self.time_unit != 
np.datetime_data(fill_value)[0] + ): + fill_value = fill_value.astype(self.dtype) + elif isinstance(fill_value, str) and fill_value.lower() == "nat": + fill_value = np.datetime64(fill_value, self.time_unit) + return super()._validate_fillna_value(fill_value) + @functools.cached_property def time_unit(self) -> str: return np.datetime_data(self.dtype)[0] @@ -837,7 +852,7 @@ def is_unique(self) -> bool: def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) - def can_cast_safely(self, to_dtype: Dtype) -> bool: + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: if to_dtype.kind == "M": # type: ignore[union-attr] to_res, _ = np.datetime_data(to_dtype) self_res, _ = np.datetime_data(self.dtype) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 3c603c8e6ef..848faf6a9ee 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -13,7 +13,6 @@ import pylibcudf as plc import cudf -from cudf.api.types import is_scalar from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase @@ -25,7 +24,8 @@ DecimalDtype, ) from cudf.core.mixins import BinaryOperand -from cudf.utils.dtypes import CUDF_STRING_DTYPE +from cudf.core.scalar import pa_scalar_to_plc_scalar +from cudf.utils.dtypes import CUDF_STRING_DTYPE, cudf_dtype_to_pa_type from cudf.utils.utils import pa_mask_buffer_to_mask if TYPE_CHECKING: @@ -73,11 +73,8 @@ def __cuda_array_interface__(self): def as_decimal_column( self, dtype: Dtype, - ) -> "DecimalBaseColumn": - if ( - isinstance(dtype, cudf.core.dtypes.DecimalDtype) - and dtype.scale < self.dtype.scale - ): + ) -> DecimalBaseColumn: + if isinstance(dtype, DecimalDtype) and dtype.scale < self.dtype.scale: warnings.warn( "cuDF truncates when downcasting decimals to a lower scale. 
" "To round, use Series.round() or DataFrame.round()." @@ -169,16 +166,35 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): return result + def _scalar_to_plc_scalar(self, scalar: ScalarLike) -> plc.Scalar: + """Return a pylibcudf.Scalar that matches the type of self.dtype""" + if not isinstance(scalar, pa.Scalar): + # e.g casting int to decimal type isn't allow, but OK in the constructor? + pa_scalar = pa.scalar( + scalar, type=cudf_dtype_to_pa_type(self.dtype) + ) + else: + pa_scalar = scalar.cast(cudf_dtype_to_pa_type(self.dtype)) + plc_scalar = pa_scalar_to_plc_scalar(pa_scalar) + if isinstance(self.dtype, (Decimal32Dtype, Decimal64Dtype)): + # pyarrow.Scalar only supports Decimal128 so conversion + # from pyarrow would only return a pylibcudf.Scalar with Decimal128 + col = ColumnBase.from_pylibcudf( + plc.Column.from_scalar(plc_scalar, 1) + ).astype(self.dtype) + return plc.copying.get_element(col.to_pylibcudf(mode="read"), 0) + return plc_scalar + def _validate_fillna_value( self, fill_value: ScalarLike | ColumnLike - ) -> cudf.Scalar | ColumnBase: + ) -> plc.Scalar | ColumnBase: """Align fill_value for .fillna based on column type.""" if isinstance(fill_value, (int, Decimal)): - return cudf.Scalar(fill_value, dtype=self.dtype) + return super()._validate_fillna_value(fill_value) elif isinstance(fill_value, ColumnBase) and ( isinstance(self.dtype, DecimalDtype) or self.dtype.kind in "iu" ): - return fill_value.astype(self.dtype) + return super()._validate_fillna_value(fill_value) raise TypeError( "Decimal columns only support using fillna with decimal and " "integer values" @@ -204,22 +220,17 @@ def normalize_binop_value(self, other) -> Self | cudf.Scalar: other = other.astype(self.dtype) return other if isinstance(other, cudf.Scalar) and isinstance( - # TODO: Should it be possible to cast scalars of other numerical - # types to decimal? 
other.dtype, - cudf.core.dtypes.DecimalDtype, + DecimalDtype, ): + # TODO: Should it be possible to cast scalars of other numerical + # types to decimal? if _same_precision_and_scale(self.dtype, other.dtype): other = other.astype(self.dtype) return other - elif is_scalar(other) and isinstance(other, (int, Decimal)): - other = Decimal(other) - metadata = other.as_tuple() - precision = max(len(metadata.digits), metadata.exponent) - scale = -cast(int, metadata.exponent) - return cudf.Scalar( - other, dtype=self.dtype.__class__(precision, scale) - ) + elif isinstance(other, (int, Decimal)): + dtype = self.dtype._from_decimal(Decimal(other)) + return cudf.Scalar(other, dtype=dtype) return NotImplemented def as_numerical_column( @@ -373,11 +384,6 @@ def __init__( children=children, ) - def __setitem__(self, key, value): - if isinstance(value, np.integer): - value = int(value) - super().__setitem__(key, value) - @classmethod def from_arrow(cls, data: pa.Array): dtype = Decimal64Dtype.from_arrow(data.type) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 837763ee30c..ca29f83225b 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -14,7 +14,7 @@ import cudf import cudf.core.column.column as column -from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar +from cudf.api.types import is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.methods import ColumnMethods, ParentType @@ -22,7 +22,7 @@ from cudf.core.dtypes import ListDtype from cudf.core.missing import NA from cudf.core.scalar import pa_scalar_to_plc_scalar -from cudf.utils.dtypes import SIZE_TYPE_DTYPE +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_dtype_obj_numeric if TYPE_CHECKING: from collections.abc import Sequence @@ -718,8 +718,8 @@ def take(self, lists_indices: ColumnLike) -> ParentType: "lists_indices and list column 
is of different size." ) if ( - not _is_non_decimal_numeric_dtype( - lists_indices_col.children[1].dtype + not is_dtype_obj_numeric( + lists_indices_col.children[1].dtype, include_decimal=False ) or lists_indices_col.children[1].dtype.kind not in "iu" ): diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index b42e4419d72..e545bb4bc5e 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -5,8 +5,6 @@ from typing import Literal, Union, overload import cudf -import cudf.core.column -import cudf.core.column_accessor from cudf.utils.utils import NotIterable ParentType = Union["cudf.Series", "cudf.core.index.Index"] diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index eecb294acee..249afe9aba6 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -14,7 +14,7 @@ import cudf import cudf.core.column.column as column -from cudf.api.types import is_integer, is_scalar +from cudf.api.types import infer_dtype, is_integer, is_scalar from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase, as_column @@ -439,7 +439,7 @@ def _process_values_for_isin( except (MixedTypeError, TypeError) as e: # There is a corner where `values` can be of `object` dtype # but have values of homogeneous type. 
- inferred_dtype = cudf.api.types.infer_dtype(values) + inferred_dtype = infer_dtype(values) if ( self.dtype.kind in {"i", "u"} and inferred_dtype == "integer" ) or ( @@ -559,15 +559,20 @@ def find_and_replace( def _validate_fillna_value( self, fill_value: ScalarLike | ColumnLike - ) -> cudf.Scalar | ColumnBase: + ) -> plc.Scalar | ColumnBase: """Align fill_value for .fillna based on column type.""" if is_scalar(fill_value): - cudf_obj: cudf.Scalar | ColumnBase = cudf.Scalar(fill_value) - if not as_column(cudf_obj).can_cast_safely(self.dtype): + cudf_obj = ColumnBase.from_pylibcudf( + plc.Column.from_scalar( + pa_scalar_to_plc_scalar(pa.scalar(fill_value)), 1 + ) + ) + if not cudf_obj.can_cast_safely(self.dtype): raise TypeError( f"Cannot safely cast non-equivalent " f"{type(fill_value).__name__} to {self.dtype.name}" ) + return super()._validate_fillna_value(fill_value) else: cudf_obj = as_column(fill_value, nan_as_null=False) if not cudf_obj.can_cast_safely(self.dtype): # type: ignore[attr-defined] @@ -576,7 +581,7 @@ def _validate_fillna_value( f"{cudf_obj.dtype.type.__name__} to " f"{self.dtype.type.__name__}" ) - return cudf_obj.astype(self.dtype) + return cudf_obj.astype(self.dtype) def can_cast_safely(self, to_dtype: DtypeObj) -> bool: """ diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 04a72017c33..9f3512369a0 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -16,12 +16,11 @@ import pylibcudf as plc import cudf -import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime -from cudf.api.types import is_integer, is_scalar, is_string_dtype +from cudf.api.types import is_integer, is_scalar from cudf.core._internals import binaryop -from cudf.core.buffer import acquire_spill_lock +from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods 
import ColumnMethods from cudf.core.scalar import pa_scalar_to_plc_scalar @@ -43,10 +42,10 @@ ColumnBinaryOperand, ColumnLike, Dtype, + DtypeObj, ScalarLike, SeriesOrIndex, ) - from cudf.core.buffer import Buffer from cudf.core.column.lists import ListColumn from cudf.core.column.numerical import NumericalColumn @@ -76,7 +75,7 @@ def __init__(self, parent): if isinstance(parent.dtype, cudf.ListDtype) else parent.dtype ) - if not is_string_dtype(value_type): + if value_type != CUDF_STRING_DTYPE: raise AttributeError( "Can only use .str accessor with string values" ) @@ -4679,8 +4678,10 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" Normalizes strings characters for tokenizing. - This uses the normalizer that is built into the - subword_tokenize function which includes: + .. deprecated:: 25.04 + Use `CharacterNormalizer` instead. + + The normalizer function includes: - adding padding around punctuation (unicode category starts with "P") as well as certain ASCII symbols like "^" and "$" @@ -4720,8 +4721,13 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: 2 $ 99 dtype: object """ + warnings.warn( + "normalize_characters is deprecated and will be removed in a future " + "version. Use CharacterNormalizer instead.", + FutureWarning, + ) return self._return_or_inplace( - self._column.normalize_characters(do_lower) + self._column.characters_normalize(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -5526,6 +5532,120 @@ def minhash64( self._column.minhash64(seed, a_column, b_column, width) # type: ignore[arg-type] ) + def minhash_ngrams( + self, ngrams: int, seed: np.uint32, a: ColumnLike, b: ColumnLike + ) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + + This uses the MurmurHash3_x86_32 algorithm for the hash function. 
+ + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a ngrams of strings within each row, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + ngrams : int + Number of strings to hash within each row. + seed : uint32 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint32. + b : ColumnLike + Values for minhash calculation. + Must be of type uint32. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series([['this', 'is', 'my'], ['favorite', 'book']]) + >>> a = cudf.Series([1, 2, 3], dtype=np.uint32) + >>> b = cudf.Series([4, 5, 6], dtype=np.uint32) + >>> s.str.minhash_ngrams(ngrams=2, seed=0, a=a, b=b) + 0 [416367551, 832735099, 1249102647] + 1 [1906668704, 3813337405, 1425038810] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(b)}" + ) + plc_column = plc.nvtext.minhash.minhash_ngrams( + self._column.to_pylibcudf(mode="read"), + ngrams, + seed, + a._column.to_pylibcudf(mode="read"), + b._column.to_pylibcudf(mode="read"), + ) + result = ColumnBase.from_pylibcudf(plc_column) + return self._return_or_inplace(result) + + def minhash64_ngrams( + self, ngrams: int, seed: np.uint64, a: ColumnLike, b: ColumnLike + ) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + + This uses the MurmurHash3_x64_128 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a ngrams of strings within each row, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + ngrams : int + Number of strings to hash within each row. 
+ seed : uint64 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint64. + b : ColumnLike + Values for minhash calculation. + Must be of type uint64. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series([['this', 'is', 'my'], ['favorite', 'book']]) + >>> a = cudf.Series([2, 3], dtype=np.uint64) + >>> b = cudf.Series([5, 6], dtype=np.uint64) + >>> s.str.minhash64_ngrams(ngrams=2, seed=0, a=a, b=b) + 0 [1304293339825194559, 1956440009737791829] + 1 [472203876238918632, 1861227318965224922] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(b)}" + ) + plc_column = plc.nvtext.minhash.minhash64_ngrams( + self._column.to_pylibcudf(mode="read"), + ngrams, + seed, + a._column.to_pylibcudf(mode="read"), + b._column.to_pylibcudf(mode="read"), + ) + result = ColumnBase.from_pylibcudf(plc_column) + return self._return_or_inplace(result) + def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: """ Compute the Jaccard index between this column and the given @@ -5588,13 +5708,14 @@ class StringColumn(column.ColumnBase): Parameters ---------- + data : Buffer + Buffer of the string data mask : Buffer The validity mask offset : int Data offset children : Tuple[Column] - Two non-null columns containing the string data and offsets - respectively + Columns containing the offsets """ _start_offset: int | None @@ -5622,14 +5743,20 @@ class StringColumn(column.ColumnBase): def __init__( self, - data: Buffer | None = None, + data: Buffer, + size: int | None, + dtype: np.dtype, mask: Buffer | None = None, - size: int | None = None, # TODO: make non-optional offset: int = 0, null_count: int | None = None, - children: 
tuple["column.ColumnBase", ...] = (), + children: tuple[column.ColumnBase] = (), # type: ignore[assignment] ): - dtype = cudf.api.types.dtype("object") + if not isinstance(data, Buffer): + raise ValueError("data must be a Buffer") + if dtype != CUDF_STRING_DTYPE: + raise ValueError(f"dtype must be {CUDF_STRING_DTYPE}") + if len(children) > 1: + raise ValueError("StringColumn must have at most 1 offset column.") if size is None: for child in children: @@ -5724,8 +5851,6 @@ def base_size(self) -> int: # override for string column @property def data(self): - if self.base_data is None: - return None if self._data is None: if ( self.offset == 0 @@ -5815,23 +5940,22 @@ def __contains__(self, item: ScalarLike) -> bool: other = [item] if is_scalar(item) else item return self.contains(column.as_column(other, dtype=self.dtype)).any() - def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: - out_dtype = cudf.api.types.dtype(dtype) - if out_dtype.kind == "b": + def as_numerical_column(self, dtype: np.dtype) -> NumericalColumn: + if dtype.kind == "b": with acquire_spill_lock(): plc_column = plc.strings.attributes.count_characters( self.to_pylibcudf(mode="read") ) result = ColumnBase.from_pylibcudf(plc_column) return (result > np.int8(0)).fillna(False) - elif out_dtype.kind in {"i", "u"}: + elif dtype.kind in {"i", "u"}: if not self.is_integer().all(): raise ValueError( "Could not convert strings to integer " "type due to presence of non-integer values." 
) cast_func = plc.strings.convert.convert_integers.to_integers - elif out_dtype.kind == "f": + elif dtype.kind == "f": if not self.is_float().all(): raise ValueError( "Could not convert strings to float " @@ -5839,10 +5963,8 @@ def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: ) cast_func = plc.strings.convert.convert_floats.to_floats else: - raise ValueError( - f"dtype must be a numerical type, not {out_dtype}" - ) - plc_dtype = dtype_to_pylibcudf_type(out_dtype) + raise ValueError(f"dtype must be a numerical type, not {dtype}") + plc_dtype = dtype_to_pylibcudf_type(dtype) with acquire_spill_lock(): return type(self).from_pylibcudf( # type: ignore[return-value] cast_func(self.to_pylibcudf(mode="read"), plc_dtype) @@ -5962,17 +6084,15 @@ def to_pandas( else: return super().to_pandas(nullable=nullable, arrow_type=arrow_type) - def can_cast_safely(self, to_dtype: Dtype) -> bool: - to_dtype = cudf.api.types.dtype(to_dtype) - + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: if self.dtype == to_dtype: return True - elif to_dtype.kind in {"i", "u"} and not self.is_integer().all(): - return False - elif to_dtype.kind == "f" and not self.is_float().all(): - return False - else: + elif to_dtype.kind in {"i", "u"} and self.is_integer().all(): + return True + elif to_dtype.kind == "f" and self.is_float().all(): return True + else: + return False def find_and_replace( self, @@ -6111,12 +6231,11 @@ def _binaryop( return NotImplemented @copy_docstring(ColumnBase.view) - def view(self, dtype) -> ColumnBase: + def view(self, dtype: DtypeObj) -> ColumnBase: if self.null_count > 0: raise ValueError( "Can not produce a view of a string column with nulls" ) - dtype = cudf.api.types.dtype(dtype) str_byte_offset = self.base_children[0].element_indexing(self.offset) str_end_byte_offset = self.base_children[0].element_indexing( self.offset + self.size @@ -6256,14 +6375,25 @@ def normalize_spaces(self) -> Self: ) @acquire_spill_lock() - def normalize_characters(self, 
do_lower: bool = True) -> Self: + def characters_normalize(self, do_lower: bool = True) -> Self: return ColumnBase.from_pylibcudf( # type: ignore[return-value] - plc.nvtext.normalize.normalize_characters( + plc.nvtext.normalize.characters_normalize( self.to_pylibcudf(mode="read"), do_lower, ) ) + @acquire_spill_lock() + def normalize_characters( + self, normalizer: plc.nvtext.normalize.CharacterNormalizer + ) -> Self: + return ColumnBase.from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + normalizer, + ) + ) + @acquire_spill_lock() def replace_tokens( self, targets: Self, replacements: Self, delimiter: plc.Scalar diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 1cbbac0f8cc..654d2c2b800 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -28,7 +28,14 @@ if TYPE_CHECKING: from collections.abc import Sequence - from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype + from cudf._typing import ( + ColumnBinaryOperand, + ColumnLike, + DatetimeLikeScalar, + Dtype, + DtypeObj, + ScalarLike, + ) _unit_to_nanoseconds_conversion = { "ns": 1, @@ -137,6 +144,19 @@ def __contains__(self, item: DatetimeLikeScalar) -> bool: "cudf.core.column.NumericalColumn", self.astype(np.dtype(np.int64)) ) + def _validate_fillna_value( + self, fill_value: ScalarLike | ColumnLike + ) -> plc.Scalar | ColumnBase: + """Align fill_value for .fillna based on column type.""" + if ( + isinstance(fill_value, np.timedelta64) + and self.time_unit != np.datetime_data(fill_value)[0] + ): + fill_value = fill_value.astype(self.dtype) + elif isinstance(fill_value, str) and fill_value.lower() == "nat": + fill_value = np.timedelta64(fill_value, self.time_unit) + return super()._validate_fillna_value(fill_value) + @property def values(self): """ @@ -309,7 +329,9 @@ def total_seconds(self) -> ColumnBase: # 
https://github.com/rapidsai/cudf/issues/17664 return ( (self.astype(np.dtype(np.int64)) * conversion) - .astype(cudf.Decimal128Dtype(38, 9)) + .astype( + cudf.Decimal128Dtype(cudf.Decimal128Dtype.MAX_PRECISION, 9) + ) .round(decimals=abs(int(math.log10(conversion)))) .astype(np.dtype(np.float64)) ) @@ -378,10 +400,10 @@ def find_and_replace( ), ) - def can_cast_safely(self, to_dtype: Dtype) -> bool: - if to_dtype.kind == "m": # type: ignore[union-attr] + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: + if to_dtype.kind == "m": to_res, _ = np.datetime_data(to_dtype) - self_res, _ = np.datetime_data(self.dtype) + self_res = self.time_unit max_int = np.iinfo(np.int64).max @@ -452,14 +474,13 @@ def sum( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype | None = None, ) -> pd.Timedelta: return pd.Timedelta( # Since sum isn't overridden in Numerical[Base]Column, mypy only # sees the signature from Reducible (which doesn't have the extra # parameters from ColumnBase._reduce) so we have to ignore this. 
self.astype(np.dtype(np.int64)).sum( # type: ignore - skipna=skipna, min_count=min_count, dtype=dtype + skipna=skipna, min_count=min_count ), unit=self.time_unit, ).as_unit(self.time_unit) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 69db055fe87..eec0bacd5c8 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -35,17 +35,13 @@ import pylibcudf as plc import cudf -import cudf.core.common from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, is_dict_like, is_dtype_equal, is_list_like, - is_numeric_dtype, - is_object_dtype, is_scalar, - is_string_dtype, ) from cudf.core import column, indexing_utils, reshape from cudf.core._compat import PANDAS_LT_300 @@ -91,6 +87,7 @@ cudf_dtype_from_pydata_dtype, find_common_type, is_column_like, + is_dtype_obj_numeric, min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking @@ -146,7 +143,7 @@ def __setitem__(self, key, value): return self._setitem_tuple_arg(key, value) @_performance_tracking - def _can_downcast_to_series(self, df, arg): + def _can_downcast_to_series(self, df: DataFrame, arg): """ This method encapsulates the logic used to determine whether or not the result of a loc/iloc @@ -171,8 +168,8 @@ def _can_downcast_to_series(self, df, arg): arg[1], slice ): return True - dtypes = df.dtypes.values.tolist() - all_numeric = all(is_numeric_dtype(t) for t in dtypes) + dtypes = [dtype for _, dtype in df._dtypes] + all_numeric = all(is_dtype_obj_numeric(t) for t in dtypes) if all_numeric or ( len(dtypes) and all(t == dtypes[0] for t in dtypes) ): @@ -349,7 +346,7 @@ def _getitem_tuple_arg(self, arg): df.index.name = columns_df.index.name if not isinstance( df.index, MultiIndex - ) and is_numeric_dtype(df.index.dtype): + ) and is_dtype_obj_numeric(df.index.dtype): # Preserve the original index type. 
df.index = df.index.astype(self._frame.index.dtype) df = df.sort_values(by=[tmp_col_name, cantor_name]) @@ -2055,18 +2052,28 @@ def _make_operands_and_index_for_binop( dict[str | None, tuple[ColumnBase, Any, bool, Any]] | NotImplementedType, BaseIndex | None, - bool, + dict[str, Any], ]: lhs, rhs = self._data, other index = self.index fill_requires_key = False left_default: Any = False equal_columns = False - can_use_self_column_name = True + ca_attributes: dict[str, Any] = {} + + def _fill_same_ca_attributes( + attrs: dict[str, Any], ca: ColumnAccessor + ) -> dict[str, Any]: + attrs["rangeindex"] = ca.rangeindex + attrs["multiindex"] = ca.multiindex + attrs["label_dtype"] = ca.label_dtype + attrs["level_names"] = ca.level_names + return attrs if _is_scalar_or_zero_d_array(other): rhs = {name: other for name in self._data} equal_columns = True + ca_attributes = _fill_same_ca_attributes(ca_attributes, self._data) elif isinstance(other, Series): if ( not (self_pd_columns := self._data.to_pandas_index).equals( @@ -2085,9 +2092,12 @@ def _make_operands_and_index_for_binop( # NULL!) and the right value (result is NaN). left_default = as_column(np.nan, length=len(self)) equal_columns = other_pd_index.equals(self_pd_columns) - can_use_self_column_name = ( - equal_columns or other_pd_index.names == self_pd_columns.names - ) + if equal_columns: + ca_attributes = _fill_same_ca_attributes( + ca_attributes, self._data + ) + elif other_pd_index.names == self_pd_columns.names: + ca_attributes["level_names"] = self._data.level_names elif isinstance(other, DataFrame): if ( not can_reindex @@ -2110,17 +2120,19 @@ def _make_operands_and_index_for_binop( # the fill value. 
left_default = fill_value equal_columns = self._column_names == other._column_names - can_use_self_column_name = ( - equal_columns - or self._data._level_names == other._data._level_names - ) + if self._data.to_pandas_index.equals(other._data.to_pandas_index): + ca_attributes = _fill_same_ca_attributes( + ca_attributes, self._data + ) + elif self._data._level_names == other._data._level_names: + ca_attributes["level_names"] = self._data.level_names elif isinstance(other, (dict, abc.Mapping)): # Need to fail early on host mapping types because we ultimately # convert everything to a dict. - return NotImplemented, None, True + return NotImplemented, None, ca_attributes if not isinstance(rhs, (dict, abc.Mapping)): - return NotImplemented, None, True + return NotImplemented, None, ca_attributes operands = { k: ( @@ -2150,8 +2162,8 @@ def _make_operands_and_index_for_binop( raise ValueError("other must be a DataFrame or Series.") sorted_dict = {key: operands[key] for key in column_names_list} - return sorted_dict, index, can_use_self_column_name - return operands, index, can_use_self_column_name + return sorted_dict, index, ca_attributes + return operands, index, ca_attributes @classmethod @_performance_tracking @@ -3144,7 +3156,7 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): # If other was provided, process that next. 
if isinstance(other, DataFrame): other_cols = [other._data[col] for col in self._column_names] - elif cudf.api.types.is_scalar(other): + elif is_scalar(other): other_cols = [other] * len(self._column_names) elif isinstance(other, cudf.Series): other_cols = other.to_pandas() @@ -3774,14 +3786,14 @@ def agg(self, aggs, axis=None): * Not supporting: ``axis``, ``*args``, ``**kwargs`` """ - dtypes = [self[col].dtype for col in self._column_names] + dtypes = [dtype for _, dtype in self._dtypes] common_dtype = find_common_type(dtypes) if common_dtype.kind != "b" and any( dtype.kind == "b" for dtype in dtypes ): raise MixedTypeError("Cannot create a column with mixed types") - if any(is_string_dtype(dt) for dt in dtypes): + if any(dt == CUDF_STRING_DTYPE for dt in dtypes): raise NotImplementedError( "DataFrame.agg() is not supported for " "frames containing string columns" @@ -4920,7 +4932,7 @@ def apply_rows( """ for col in incols: current_col_dtype = self._data[col].dtype - if is_string_dtype(current_col_dtype) or isinstance( + if current_col_dtype == CUDF_STRING_DTYPE or isinstance( current_col_dtype, cudf.CategoricalDtype ): raise TypeError( @@ -6280,8 +6292,8 @@ def make_false_column_like_self(): else: # These checks must happen after the conversions above # since numpy can't handle categorical dtypes. - self_is_str = is_string_dtype(self_col.dtype) - other_is_str = is_string_dtype(other_col.dtype) + self_is_str = self_col.dtype == CUDF_STRING_DTYPE + other_is_str = other_col.dtype == CUDF_STRING_DTYPE if self_is_str != other_is_str: # Strings can't compare to anything else. 
@@ -6338,8 +6350,8 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): common_dtype = find_common_type(filtered.dtypes) if ( not numeric_only - and is_string_dtype(common_dtype) - and any(not is_string_dtype(dt) for dt in filtered.dtypes) + and common_dtype == CUDF_STRING_DTYPE + and any(dtype != CUDF_STRING_DTYPE for dtype in filtered._dtypes) ): raise TypeError( f"Cannot perform row-wise {method} across mixed-dtype columns," @@ -6462,7 +6474,9 @@ def _reduce( if numeric_only: numeric_cols = ( - name for name, dtype in self._dtypes if is_numeric_dtype(dtype) + name + for name, dtype in self._dtypes + if is_dtype_obj_numeric(dtype) ) source = self._get_columns_by_label(numeric_cols) if source.empty: @@ -6493,7 +6507,7 @@ def _reduce( raise NotImplementedError( f"Column {col_label} with type {col.dtype} does not support {op}" ) from err - elif not is_numeric_dtype(col.dtype): + elif not is_dtype_obj_numeric(col.dtype): raise TypeError( "Non numeric columns passed with " "`numeric_only=False`, pass `numeric_only=True` " @@ -6509,9 +6523,9 @@ def _reduce( source_dtypes = [dtype for _, dtype in source._dtypes] common_dtype = find_common_type(source_dtypes) if ( - is_object_dtype(common_dtype) + common_dtype == CUDF_STRING_DTYPE and any( - not is_object_dtype(dtype) for dtype in source_dtypes + dtype != CUDF_STRING_DTYPE for dtype in source_dtypes ) or common_dtype.kind != "b" and any(dtype.kind == "b" for dtype in source_dtypes) @@ -8589,7 +8603,7 @@ def _find_common_dtypes_and_categories( # default to the first non-null dtype dtypes[idx] = cols[0].dtype # If all the non-null dtypes are int/float, find a common dtype - if all(is_numeric_dtype(col.dtype) for col in cols): + if all(is_dtype_obj_numeric(col.dtype) for col in cols): dtypes[idx] = find_common_type([col.dtype for col in cols]) # If all categorical dtypes, combine the categories elif all( diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 
977208f5eb4..ac9c4d23cc2 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -776,35 +776,36 @@ def _recursively_replace_fields(self, result: dict) -> dict: class DecimalDtype(_BaseDtype): _metadata = ("precision", "scale") - def __init__(self, precision, scale=0): + def __init__(self, precision: int, scale: int = 0) -> None: self._validate(precision, scale) - self._typ = pa.decimal128(precision, scale) + self._precision = precision + self._scale = scale @property - def str(self): + def str(self) -> str: return f"{self.name!s}({self.precision}, {self.scale})" @property - def precision(self): + def precision(self) -> int: """ The decimal precision, in number of decimal digits (an integer). """ - return self._typ.precision + return self._precision @precision.setter - def precision(self, value): + def precision(self, value: int) -> None: self._validate(value, self.scale) - self._typ = pa.decimal128(precision=value, scale=self.scale) + self._precision = value @property - def scale(self): + def scale(self) -> int: """ The decimal scale (an integer). """ - return self._typ.scale + return self._scale @property - def itemsize(self): + def itemsize(self) -> int: """ Length of one column element in bytes. """ @@ -815,14 +816,14 @@ def type(self): # might need to account for precision and scale here return decimal.Decimal - def to_arrow(self): + def to_arrow(self) -> pa.Decimal128Type: """ Return the equivalent ``pyarrow`` dtype. 
""" - return self._typ + return pa.decimal128(self.precision, self.scale) @classmethod - def from_arrow(cls, typ): + def from_arrow(cls, typ: pa.Decimal128Type) -> Self: """ Construct a cudf decimal dtype from a ``pyarrow`` dtype @@ -856,23 +857,23 @@ def __repr__(self): ) @classmethod - def _validate(cls, precision, scale=0): + def _validate(cls, precision: int, scale: int) -> None: if precision > cls.MAX_PRECISION: raise ValueError( f"Cannot construct a {cls.__name__}" f" with precision > {cls.MAX_PRECISION}" ) if abs(scale) > precision: - raise ValueError(f"scale={scale} exceeds precision={precision}") + raise ValueError(f"{scale=} cannot exceed {precision=}") @classmethod - def _from_decimal(cls, decimal): + def _from_decimal(cls, decimal: decimal.Decimal) -> Self: """ Create a cudf.DecimalDtype from a decimal.Decimal object """ metadata = decimal.as_tuple() - precision = max(len(metadata.digits), -metadata.exponent) - return cls(precision, -metadata.exponent) + precision = max(len(metadata.digits), -metadata.exponent) # type: ignore[operator] + return cls(precision, -metadata.exponent) # type: ignore[operator] def serialize(self) -> tuple[dict, list]: return ( @@ -885,7 +886,7 @@ def serialize(self) -> tuple[dict, list]: ) @classmethod - def deserialize(cls, header: dict, frames: list): + def deserialize(cls, header: dict, frames: list) -> Self: _check_type(cls, header, frames, is_valid_class=issubclass) return cls(header["precision"], header["scale"]) @@ -896,8 +897,8 @@ def __eq__(self, other: Dtype) -> bool: return False return self.precision == other.precision and self.scale == other.scale - def __hash__(self): - return hash(self._typ) + def __hash__(self) -> int: + return hash(self.to_arrow()) @doc_apply( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 38b519c6d5f..df11ebd4f94 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -20,11 +20,7 @@ import 
cudf from cudf.api.extensions import no_default -from cudf.api.types import ( - is_list_like, - is_numeric_dtype, - is_string_dtype, -) +from cudf.api.types import is_list_like, is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core._internals import aggregation, sorting, stream_compaction from cudf.core.abc import Serializable @@ -44,7 +40,12 @@ from cudf.core.multiindex import MultiIndex from cudf.core.scalar import pa_scalar_to_plc_scalar from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply -from cudf.utils.dtypes import SIZE_TYPE_DTYPE, cudf_dtype_to_pa_type +from cudf.utils.dtypes import ( + CUDF_STRING_DTYPE, + SIZE_TYPE_DTYPE, + cudf_dtype_to_pa_type, + is_dtype_obj_numeric, +) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin @@ -91,7 +92,7 @@ @singledispatch def get_valid_aggregation(dtype): - if is_string_dtype(dtype): + if dtype == CUDF_STRING_DTYPE: return _STRING_AGGS return "ALL" @@ -1788,7 +1789,7 @@ def _post_process_chunk_results( ): if not len(chunk_results): return self.obj.head(0) - if isinstance(chunk_results, ColumnBase) or cudf.api.types.is_scalar( + if isinstance(chunk_results, ColumnBase) or is_scalar( chunk_results[0] ): data = ColumnAccessor( @@ -3077,7 +3078,9 @@ def _reduce_numeric_only(self, op: str): columns = list( name for name, dtype in self.obj._dtypes - if (is_numeric_dtype(dtype) and name not in self.grouping.names) + if ( + is_dtype_obj_numeric(dtype) and name not in self.grouping.names + ) ) return self[columns].agg(op) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1730a692dc1..05a2a46c051 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -20,12 +20,11 @@ import cudf from cudf.api.extensions import no_default from cudf.api.types import ( - _is_non_decimal_numeric_dtype, is_dtype_equal, + is_hashable, is_integer, is_list_like, is_scalar, - is_string_dtype, ) 
from cudf.core._base_index import BaseIndex, _return_get_indexer_result from cudf.core._compat import PANDAS_LT_300 @@ -57,6 +56,7 @@ cudf_dtype_from_pa_type, cudf_dtype_to_pa_type, find_common_type, + is_dtype_obj_numeric, is_mixed_with_object_dtype, ) from cudf.utils.performance_tracking import _performance_tracking @@ -232,7 +232,7 @@ class RangeIndex(BaseIndex, BinaryOperand): def __init__( self, start, stop=None, step=1, dtype=None, copy=False, name=None ): - if not cudf.api.types.is_hashable(name): + if not is_hashable(name): raise ValueError("Name must be a hashable value.") self._name = name if dtype is not None and cudf.dtype(dtype).kind != "i": @@ -1286,6 +1286,15 @@ def equals(self, other) -> bool: elif other_is_categorical and not self_is_categorical: self = self.astype(other.dtype) check_dtypes = True + elif ( + not self_is_categorical + and not other_is_categorical + and not isinstance(other, RangeIndex) + and not isinstance(self, type(other)) + ): + # Can compare Index to CategoricalIndex or RangeIndex + # Other comparisons are invalid + return False try: return self._column.equals( @@ -1777,7 +1786,7 @@ def isin(self, values, level=None) -> cupy.ndarray: @property @_performance_tracking def str(self): - if is_string_dtype(self.dtype): + if self.dtype == CUDF_STRING_DTYPE: return StringMethods(parent=self) else: raise AttributeError( @@ -3357,7 +3366,7 @@ def interval_range( "freq, exactly three must be specified" ) - if periods is not None and not cudf.api.types.is_integer(periods): + if periods is not None and not is_integer(periods): warnings.warn( "Non-integer 'periods' in cudf.date_range, and cudf.interval_range" " are deprecated and will raise in a future version.", @@ -3381,7 +3390,9 @@ def interval_range( pa_freq = pa.scalar(freq) if any( - not _is_non_decimal_numeric_dtype(cudf_dtype_from_pa_type(x.type)) + not is_dtype_obj_numeric( + cudf_dtype_from_pa_type(x.type), include_decimal=False + ) for x in (pa_start, pa.scalar(periods), pa_freq, 
pa_end) ): raise ValueError("start, end, periods, freq must be numeric values.") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9c48b31a309..2f33a860608 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -26,11 +26,10 @@ import pylibcudf as plc import cudf -import cudf.core import cudf.core.algorithms +import cudf.core.common from cudf.api.extensions import no_default from cudf.api.types import ( - _is_non_decimal_numeric_dtype, is_dict_like, is_list_like, is_scalar, @@ -60,7 +59,11 @@ from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import SIZE_TYPE_DTYPE +from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, + is_column_like, + is_dtype_obj_numeric, +) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf @@ -71,6 +74,7 @@ ColumnLike, DataFrameOrSeries, Dtype, + DtypeObj, NotImplementedType, ) @@ -1328,7 +1332,6 @@ def sum( self, axis=no_default, skipna=True, - dtype=None, numeric_only=False, min_count=0, **kwargs, @@ -1342,8 +1345,6 @@ def sum( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. numeric_only : bool, default False If True, includes only float, int, boolean columns. If False, will raise error in-case there are @@ -1373,7 +1374,6 @@ def sum( "sum", axis=axis, skipna=skipna, - dtype=dtype, numeric_only=numeric_only, min_count=min_count, **kwargs, @@ -1384,7 +1384,6 @@ def product( self, axis=no_default, skipna=True, - dtype=None, numeric_only=False, min_count=0, **kwargs, @@ -1398,8 +1397,6 @@ def product( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. 
- dtype: data type - Data type to cast the result to. numeric_only : bool, default False If True, includes only float, int, boolean columns. If False, will raise error in-case there are @@ -1432,7 +1429,6 @@ def product( "prod" if axis in {1, "columns"} else "product", axis=axis, skipna=skipna, - dtype=dtype, numeric_only=numeric_only, min_count=min_count, **kwargs, @@ -3308,9 +3304,13 @@ def _split(self, splits, keep_index: bool = True) -> list[Self]: splits, ) + @acquire_spill_lock() + def split_from_pylibcudf(split: list[plc.Column]) -> list[ColumnBase]: + return [ColumnBase.from_pylibcudf(col) for col in split] + return [ self._from_columns_like_self( - [ColumnBase.from_pylibcudf(col) for col in split], + split_from_pylibcudf(split), self._column_names, self.index.names if keep_index else None, ) @@ -3912,7 +3912,7 @@ def _reindex( } result = self.__class__._from_data( - data=cudf.core.column_accessor.ColumnAccessor( + data=ColumnAccessor( cols, multiindex=multiindex, level_names=level_names, @@ -4892,20 +4892,16 @@ def _binaryop( ( operands, out_index, - can_use_self_column_name, + ca_attributes, ) = self._make_operands_and_index_for_binop( other, op, fill_value, reflect, can_reindex ) if operands is NotImplemented: return NotImplemented - - level_names = ( - self._data._level_names if can_use_self_column_name else None - ) return self._from_data( ColumnAccessor( type(self)._colwise_binop(operands, op), - level_names=level_names, + **ca_attributes, ), index=out_index, ) @@ -4921,7 +4917,7 @@ def _make_operands_and_index_for_binop( dict[str | None, tuple[ColumnBase, Any, bool, Any]] | NotImplementedType, cudf.BaseIndex | None, - bool, + dict[str, Any], ]: raise NotImplementedError( f"Binary operations are not supported for {self.__class__}" @@ -6410,9 +6406,9 @@ def rank( dropped_cols = False source = self if numeric_only: - if isinstance( - source, cudf.Series - ) and not _is_non_decimal_numeric_dtype(self.dtype): # type: ignore[attr-defined] + if 
isinstance(source, cudf.Series) and not is_dtype_obj_numeric( + source.dtype, include_decimal=False + ): # type: ignore[attr-defined] raise TypeError( "Series.rank does not allow numeric_only=True with " "non-numeric dtype." @@ -6420,7 +6416,7 @@ def rank( numeric_cols = ( name for name, dtype in self._dtypes - if _is_non_decimal_numeric_dtype(dtype) + if is_dtype_obj_numeric(dtype, include_decimal=False) ) source = self._get_columns_by_label(numeric_cols) if source.empty: @@ -6562,7 +6558,7 @@ def _check_duplicate_level_names(specified, level_names): @_performance_tracking def _get_replacement_values_for_columns( - to_replace: Any, value: Any, columns_dtype_map: dict[Any, Any] + to_replace: Any, value: Any, columns_dtype_map: dict[Any, DtypeObj] ) -> tuple[dict[Any, bool], dict[Any, Any], dict[Any, Any]]: """ Returns a per column mapping for the values to be replaced, new @@ -6595,24 +6591,22 @@ def _get_replacement_values_for_columns( if is_scalar(to_replace) and is_scalar(value): to_replace_columns = {col: [to_replace] for col in columns_dtype_map} values_columns = {col: [value] for col in columns_dtype_map} - elif cudf.api.types.is_list_like(to_replace) or isinstance( + elif is_list_like(to_replace) or isinstance( to_replace, (ColumnBase, BaseIndex) ): if is_scalar(value): to_replace_columns = {col: to_replace for col in columns_dtype_map} values_columns = { col: [value] - if _is_non_decimal_numeric_dtype(columns_dtype_map[col]) + if is_dtype_obj_numeric(dtype, include_decimal=False) else as_column( value, length=len(to_replace), dtype=cudf.dtype(type(value)), ) - for col in columns_dtype_map + for col, dtype in columns_dtype_map.items() } - elif cudf.api.types.is_list_like( - value - ) or cudf.utils.dtypes.is_column_like(value): + elif is_list_like(value) or is_column_like(value): if len(to_replace) != len(value): raise ValueError( f"Replacement lists must be " diff --git a/python/cudf/cudf/core/join/_join_helpers.py 
b/python/cudf/cudf/core/join/_join_helpers.py index c329bf11d97..331aa57fca4 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -9,9 +9,15 @@ import numpy as np import cudf -from cudf.api.types import is_decimal_dtype, is_dtype_equal, is_numeric_dtype +from cudf.api.types import is_dtype_equal from cudf.core.column import CategoricalColumn -from cudf.core.dtypes import CategoricalDtype +from cudf.core.dtypes import ( + CategoricalDtype, + Decimal32Dtype, + Decimal64Dtype, + Decimal128Dtype, +) +from cudf.utils.dtypes import is_dtype_obj_numeric if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -81,15 +87,17 @@ def _match_join_keys( if is_dtype_equal(ltype, rtype): return lcol, rcol - if is_decimal_dtype(ltype) or is_decimal_dtype(rtype): + if isinstance( + ltype, (Decimal32Dtype, Decimal64Dtype, Decimal128Dtype) + ) or isinstance(rtype, (Decimal32Dtype, Decimal64Dtype, Decimal128Dtype)): raise TypeError( "Decimal columns can only be merged with decimal columns " "of the same precision and scale" ) if ( - is_numeric_dtype(ltype) - and is_numeric_dtype(rtype) + is_dtype_obj_numeric(ltype) + and is_dtype_obj_numeric(rtype) and not (ltype.kind == "m" or rtype.kind == "m") ): common_type = ( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 87a8849a260..f681c043186 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -17,7 +17,7 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar +from cudf.api.types import is_integer, is_list_like, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result from cudf.core._internals import sorting @@ -33,7 +33,11 @@ ensure_index, ) from cudf.core.join._join_helpers import _match_join_keys -from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_column_like +from 
cudf.utils.dtypes import ( + CUDF_STRING_DTYPE, + SIZE_TYPE_DTYPE, + is_column_like, +) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -42,7 +46,7 @@ from typing_extensions import Self - from cudf._typing import DataFrameOrSeries + from cudf._typing import DataFrameOrSeries, Dtype def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray: @@ -233,8 +237,8 @@ def to_series(self, index=None, name=None): ) @_performance_tracking - def astype(self, dtype, copy: bool = True) -> Self: - if not is_object_dtype(dtype): + def astype(self, dtype: Dtype, copy: bool = True) -> Self: + if cudf.dtype(dtype) != CUDF_STRING_DTYPE: raise TypeError( "Setting a MultiIndex dtype to anything other than object is " "not supported" @@ -1699,16 +1703,12 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool: Returns True, if sorted as expected by ``ascending`` and ``null_position``, False otherwise. 
""" - if ascending is not None and not cudf.api.types.is_list_like( - ascending - ): + if ascending is not None and not is_list_like(ascending): raise TypeError( f"Expected a list-like or None for `ascending`, got " f"{type(ascending)}" ) - if null_position is not None and not cudf.api.types.is_list_like( - null_position - ): + if null_position is not None and not is_list_like(null_position): raise TypeError( f"Expected a list-like or None for `null_position`, got " f"{type(null_position)}" diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index c5d2fd349e9..b7412f2cc85 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -12,7 +12,7 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import is_scalar +from cudf.api.types import is_list_like, is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ( ColumnBase, @@ -1362,7 +1362,7 @@ def _one_hot_encode_column( def _length_check_params(obj, columns, name): - if cudf.api.types.is_list_like(obj): + if is_list_like(obj): if len(obj) != len(columns): raise ValueError( f"Length of '{name}' ({len(obj)}) did not match the " @@ -1526,9 +1526,9 @@ def pivot_table( ---------- data : DataFrame values : column name or list of column names to aggregate, optional - index : list of column names + index : scalar or list of column names Values to group by in the rows. - columns : list of column names + columns : scalar or list of column names Values to group by in the columns. 
aggfunc : str or dict, default "mean" If dict is passed, the key is column to aggregate @@ -1562,6 +1562,11 @@ def pivot_table( if sort is not True: raise NotImplementedError("sort is not supported yet") + if is_scalar(index): + index = [index] + if is_scalar(columns): + columns = [columns] + keys = index + columns values_passed = values is not None @@ -1620,15 +1625,8 @@ def pivot_table( table = table.fillna(fill_value) # discard the top level - if values_passed and not values_multi and table._data.multiindex: - column_names = table._data.level_names[1:] - table_columns = tuple( - map(lambda column: column[1:], table._column_names) - ) - table.columns = pd.MultiIndex.from_tuples( - tuples=table_columns, names=column_names - ) - + if values_passed and not values_multi and table._data.nlevels > 1: + table.columns = table._data.to_pandas_index.droplevel(0) if len(index) == 0 and len(columns) > 0: table = table.T diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index cf85282cccb..8579b7398f0 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -9,7 +9,6 @@ from typing import TYPE_CHECKING, Any import numpy as np -import pandas as pd import pyarrow as pa import pylibcudf as plc @@ -25,6 +24,7 @@ from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand from cudf.utils.dtypes import ( + CUDF_STRING_DTYPE, cudf_dtype_from_pa_type, get_allowed_combinations_for_operator, to_cudf_compatible_scalar, @@ -85,9 +85,9 @@ def _preprocess_host_value(value, dtype) -> tuple[ScalarLike, Dtype]: return value.as_py(), dtype if isinstance(dtype, cudf.core.dtypes.DecimalDtype): - value = pa.scalar( - value, type=pa.decimal128(dtype.precision, dtype.scale) - ).as_py() + if isinstance(value, np.integer): + value = int(value) + value = pa.scalar(value, type=dtype.to_arrow()).as_py() if isinstance(value, decimal.Decimal) and dtype is None: dtype = cudf.Decimal128Dtype._from_decimal(value) @@ -191,7 +191,7 
@@ def _to_plc_scalar(value: ScalarLike, dtype: Dtype) -> plc.Scalar: if isinstance(dtype, cudf.core.dtypes._BaseDtype): pa_type = dtype.to_arrow() - elif pd.api.types.is_string_dtype(dtype): + elif dtype == CUDF_STRING_DTYPE: # Have to manually convert object types, which we use internally # for strings but pyarrow only supports as unicode 'U' pa_type = pa.string() diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f6f1b31dc43..42247ce689e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -20,7 +20,6 @@ import cudf from cudf.api.extensions import no_default from cudf.api.types import ( - _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, is_dict_like, is_integer, @@ -64,6 +63,7 @@ from cudf.utils.dtypes import ( can_convert_to_column, find_common_type, + is_dtype_obj_numeric, is_mixed_with_object_dtype, to_cudf_compatible_scalar, ) @@ -357,7 +357,9 @@ def _loc_to_iloc(self, arg): "as labels (consistent with DataFrame behavior). To access " "a value by position, use `ser.iloc[pos]`" ) - if not _is_non_decimal_numeric_dtype(index_dtype) and not ( + if not is_dtype_obj_numeric( + index_dtype, include_decimal=False + ) and not ( isinstance(index_dtype, cudf.CategoricalDtype) and index_dtype.categories.dtype.kind in "iu" ): @@ -1531,7 +1533,7 @@ def _make_operands_and_index_for_binop( dict[str | None, tuple[ColumnBase, Any, bool, Any]] | NotImplementedType, BaseIndex | None, - bool, + dict[str, Any], ]: # Specialize binops to align indices. 
if isinstance(other, Series): @@ -1547,15 +1549,14 @@ def _make_operands_and_index_for_binop( else: lhs = self - try: - can_use_self_column_name = cudf.utils.utils._is_same_name( - self.name, other.name - ) - except AttributeError: - can_use_self_column_name = False + ca_attributes = {} + if hasattr(other, "name") and cudf.utils.utils._is_same_name( + self.name, other.name + ): + ca_attributes["level_names"] = self._data._level_names operands = lhs._make_operands_for_binop(other, fill_value, reflect) - return operands, lhs.index, can_use_self_column_name + return operands, lhs.index, ca_attributes @copy_docstring(CategoricalAccessor) # type: ignore @property diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index f9713ca62d1..aa59d3af640 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -12,12 +12,12 @@ from cudf.api.types import ( _is_scalar_or_zero_d_array, is_integer, - is_numeric_dtype, + is_scalar, ) from cudf.core.column import ColumnBase, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame -from cudf.utils.dtypes import SIZE_TYPE_DTYPE +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_dtype_obj_numeric from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable @@ -54,7 +54,7 @@ def _reduce( if axis not in (None, 0, no_default): raise NotImplementedError("axis parameter is not implemented yet") - if numeric_only and not is_numeric_dtype(self.dtype): + if numeric_only and not is_dtype_obj_numeric(self.dtype): raise TypeError( f"Series.{op} does not allow numeric_only={numeric_only} " "with non-numeric dtypes." 
@@ -374,7 +374,7 @@ def where(self, cond, other=None, inplace=False): """Array conditional must be same shape as self""" ) - if not cudf.api.types.is_scalar(other): + if not is_scalar(other): other = cudf.core.column.as_column(other) input_col, other = _check_and_cast_columns_with_other( diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index c59a16f99f5..36f9eea0619 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -20,7 +20,7 @@ def _cast_to_appropriate_type(ar, cast_type): elif cast_type == "tf": from tensorflow.experimental.dlpack import from_dlpack - return from_dlpack(ar.astype(np.dtype(np.int32)).toDlpack()) + return from_dlpack(ar.astype(np.dtype(np.int32)).__dlpack__()) class SubwordTokenizer: diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 4478be2fd04..89abc120de9 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -882,7 +882,7 @@ def date_range( "three must be specified" ) - if periods is not None and not cudf.api.types.is_integer(periods): + if periods is not None and not is_integer(periods): warnings.warn( "Non-integer 'periods' in cudf.date_range, and cudf.interval_range" " are deprecated and will raise in a future version.", diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 9746234cfb1..18e96ee4a68 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -8,11 +8,14 @@ import pandas as pd import cudf -from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.core.index import ensure_index -from cudf.utils.dtypes import can_convert_to_column +from cudf.utils.dtypes import ( + CUDF_STRING_DTYPE, + can_convert_to_column, + 
is_dtype_obj_numeric, +) if TYPE_CHECKING: from cudf.core.column.numerical import NumericalColumn @@ -142,7 +145,7 @@ def to_numeric( return arg else: raise e - elif is_string_dtype(dtype): + elif dtype == CUDF_STRING_DTYPE: try: col = _convert_str_col(col, errors, downcast) # type: ignore[arg-type] except ValueError as e: @@ -152,7 +155,7 @@ def to_numeric( raise e elif isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)): raise ValueError("Input does not support nested datatypes") - elif _is_non_decimal_numeric_dtype(dtype): + elif is_dtype_obj_numeric(dtype, include_decimal=False): pass else: raise ValueError("Unrecognized datatype") @@ -218,7 +221,7 @@ def _convert_str_col( ------- Converted numeric column """ - if not is_string_dtype(col): + if col.dtype != CUDF_STRING_DTYPE: raise TypeError("col must be string dtype.") if col.is_integer().all(): diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 814d3e9fc85..943b6ebfd1c 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
import cupy as cp @@ -8,7 +8,7 @@ from numba.cuda.cudadrv.devices import get_context from numba.np import numpy_support -import cudf.core.udf.utils +from cudf.core.column import column_empty from cudf.core.udf.groupby_typing import ( SUPPORTED_GROUPBY_NUMPY_TYPES, Group, @@ -154,9 +154,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): offsets = cp.asarray(offsets) ngroups = len(offsets) - 1 - output = cudf.core.column.column_empty( - ngroups, dtype=return_type, for_numba=True - ) + output = column_empty(ngroups, dtype=return_type, for_numba=True) launch_args = [ offsets, output, diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index 3e8a6ab400c..4b94e3e52b1 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -6,8 +6,8 @@ import numpy as np -from cudf.api.types import is_numeric_dtype from cudf.core.window.rolling import _RollingBase +from cudf.utils.dtypes import is_dtype_obj_numeric if TYPE_CHECKING: from cudf.core.column.column import ColumnBase @@ -184,7 +184,7 @@ def cov( def _apply_agg_column( self, source_column: ColumnBase, agg_name: str ) -> ColumnBase: - if not is_numeric_dtype(source_column.dtype): + if not is_dtype_obj_numeric(source_column.dtype): raise TypeError("No numeric types to aggregate") # libcudf ewm has special casing for nulls only diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 3b3fd5f7c56..e7b224a40e7 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
from __future__ import annotations import pylibcudf as plc @@ -6,6 +6,7 @@ import cudf from cudf.core.column import ColumnBase from cudf.utils import ioutils +from cudf.utils.dtypes import find_common_type, is_dtype_obj_numeric def from_dlpack(pycapsule_obj) -> cudf.Series | cudf.DataFrame: @@ -83,12 +84,12 @@ def to_dlpack(cudf_obj: cudf.Series | cudf.DataFrame | cudf.BaseIndex): ) if any( - not cudf.api.types._is_non_decimal_numeric_dtype(dtype) + not is_dtype_obj_numeric(dtype, include_decimal=False) for _, dtype in gdf._dtypes # type: ignore[union-attr] ): raise TypeError("non-numeric data not yet supported") - dtype = cudf.utils.dtypes.find_common_type( + dtype = find_common_type( [dtype for _, dtype in gdf._dtypes] # type: ignore[union-attr] ) gdf = gdf.astype(dtype) diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index 52fc945709e..742a6b57e59 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -8,12 +8,17 @@ import pylibcudf import rmm.mr -from .fast_slow_proxy import is_proxy_instance, is_proxy_object +from .fast_slow_proxy import ( + as_proxy_object, + is_proxy_instance, + is_proxy_object, +) from .magics import load_ipython_extension from .profiler import Profiler __all__ = [ "Profiler", + "as_proxy_object", "install", "is_proxy_instance", "is_proxy_object", diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 45944452c17..147971e8bee 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -151,7 +151,7 @@ def make_final_proxy_type( additional_attributes Mapping of additional attributes to add to the class (optional), these will override any defaulted attributes (e.g. - ``__init__`). If you want to remove a defaulted attribute + ``__init__``). If you want to remove a defaulted attribute completely, pass the special sentinel ``_DELETE`` as a value. 
postprocess Optional function called to allow the proxy to postprocess @@ -1335,6 +1335,31 @@ def _get_proxy_base_class(cls): return object +def as_proxy_object(obj: Any) -> Any: + """ + Wraps a cudf or pandas object in a proxy object if applicable. + + There will be no memory transfer, i.e., GPU objects stay on GPU and + CPU objects stay on CPU. The object will be wrapped in a + proxy object. This is useful for ensuring that the object is + compatible with the fast-slow proxy system. + + Parameters + ---------- + obj : Any + The object to wrap. + + Returns + ------- + Any + The wrapped proxy object if applicable, otherwise the original object. + """ + if _is_final_type(obj): + typ = get_final_type_map()[type(obj)] + return typ._fsproxy_wrap(obj, None) + return obj + + def is_proxy_instance(obj, type): return is_proxy_object(obj) and obj.__class__.__name__ == type.__name__ diff --git a/python/cudf/cudf/testing/__init__.py b/python/cudf/cudf/testing/__init__.py index 4e92b43b9f9..a4afa54f754 100644 --- a/python/cudf/cudf/testing/__init__.py +++ b/python/cudf/cudf/testing/__init__.py @@ -1,5 +1,6 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. +from cudf.testing import narwhals_test_plugin from cudf.testing.testing import ( assert_eq, assert_frame_equal, diff --git a/python/cudf/cudf/testing/narwhals_test_plugin.py b/python/cudf/cudf/testing/narwhals_test_plugin.py new file mode 100644 index 00000000000..d794bd0120a --- /dev/null +++ b/python/cudf/cudf/testing/narwhals_test_plugin.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Plugin for running narwhals test suite with cudf.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Mapping + +EXPECTED_FAILURES: Mapping[str, str] = { + "tests/frame/select_test.py::test_select_duplicates[cudf]": "cuDF doesn't support having multiple columns with same names", +} + + +def pytest_collection_modifyitems(session, config, items) -> None: + """Mark known failing tests.""" + import pytest + + for item in items: + if item.nodeid in EXPECTED_FAILURES: + exp_val = EXPECTED_FAILURES[item.nodeid] + item.add_marker(pytest.mark.xfail(reason=exp_val)) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 9c20a42d215..e1b0c17eb00 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -10,15 +10,15 @@ from pandas import testing as tm import cudf -from cudf.api.types import is_numeric_dtype, is_string_dtype from cudf.core.missing import NA, NaT +from cudf.utils.dtypes import CUDF_STRING_DTYPE, is_dtype_obj_numeric def dtype_can_compare_equal_to_other(dtype): # return True if values of this dtype can compare # as equal to equal values of a different dtype return not ( - is_string_dtype(dtype) + dtype == CUDF_STRING_DTYPE or isinstance( dtype, ( @@ -218,10 +218,10 @@ def assert_column_equal( elif not ( ( not dtype_can_compare_equal_to_other(left.dtype) - and is_numeric_dtype(right.dtype) + and is_dtype_obj_numeric(right.dtype) ) or ( - is_numeric_dtype(left.dtype) + is_dtype_obj_numeric(left.dtype) and not dtype_can_compare_equal_to_other(right.dtype) ) ): @@ -234,7 +234,7 @@ def assert_column_equal( if ( columns_equal and not check_exact - and is_numeric_dtype(left.dtype) + and is_dtype_obj_numeric(left.dtype) ): # non-null values must be the same columns_equal = cp.allclose( diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py 
index 2996a88c171..b7cd2388f30 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -290,6 +290,8 @@ def test_column_chunked_array_creation(): ], ) def test_column_view_valid_numeric_to_numeric(data, from_dtype, to_dtype): + from_dtype = np.dtype(from_dtype) + to_dtype = np.dtype(to_dtype) cpu_data = np.asarray(data, dtype=from_dtype) gpu_data = as_column(data, dtype=from_dtype) @@ -314,6 +316,8 @@ def test_column_view_valid_numeric_to_numeric(data, from_dtype, to_dtype): ], ) def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype): + from_dtype = np.dtype(from_dtype) + to_dtype = np.dtype(to_dtype) cpu_data = np.asarray(data, dtype=from_dtype) gpu_data = as_column(data, dtype=from_dtype) @@ -337,6 +341,7 @@ def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype): ], ) def test_column_view_valid_string_to_numeric(data, to_dtype): + to_dtype = np.dtype(to_dtype) expect = cudf.Series._from_column(cudf.Series(data)._column.view(to_dtype)) got = cudf.Series(str_host_view(data, to_dtype)) @@ -352,7 +357,7 @@ def test_column_view_nulls_widths_even(): sr = cudf.Series(data, dtype="int32") expect = cudf.Series(expect_data, dtype="float32") - got = cudf.Series._from_column(sr._column.view("float32")) + got = cudf.Series._from_column(sr._column.view(np.dtype(np.float32))) assert_eq(expect, got) @@ -364,7 +369,7 @@ def test_column_view_nulls_widths_even(): sr = cudf.Series(data, dtype="float64") expect = cudf.Series(expect_data, dtype="int64") - got = cudf.Series._from_column(sr._column.view("int64")) + got = cudf.Series._from_column(sr._column.view(np.dtype(np.int64))) assert_eq(expect, got) @@ -376,7 +381,7 @@ def test_column_view_numeric_slice(slc): expect = cudf.Series(data[slc].view("int64")) got = cudf.Series._from_column( - sr._column.slice(slc.start, slc.stop).view("int64") + sr._column.slice(slc.start, slc.stop).view(np.dtype(np.int64)) ) assert_eq(expect, got) @@ -389,7 +394,9 @@ 
def test_column_view_string_slice(slc): data = ["a", "bcde", "cd", "efg", "h"] expect = cudf.Series._from_column( - cudf.Series(data)._column.slice(slc.start, slc.stop).view("int8") + cudf.Series(data) + ._column.slice(slc.start, slc.stop) + .view(np.dtype(np.int8)) ) got = cudf.Series(str_host_view(data[slc], "int8")) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 15c11db5a84..d6bbbf601be 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11083,6 +11083,21 @@ def test_dataframe_columns_set_preserve_type(klass): pd.testing.assert_index_equal(result, expected) +@pytest.mark.parametrize( + "expected", + [ + pd.RangeIndex(1, 2, name="a"), + pd.Index([1], dtype=np.int8, name="a"), + pd.MultiIndex.from_arrays([[1]], names=["a"]), + ], +) +@pytest.mark.parametrize("binop", [lambda df: df == df, lambda df: df - 1]) +def test_dataframe_binop_preserves_column_metadata(expected, binop): + df = cudf.DataFrame([1], columns=expected) + result = binop(df).columns + pd.testing.assert_index_equal(result, expected, exact=True) + + @pytest.mark.parametrize( "scalar", [ diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index 20c24bd7564..187a5524e8e 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
import itertools from contextlib import ExitStack as does_not_raise @@ -140,7 +140,7 @@ def test_to_dlpack_cupy_2d(data_2d): def test_from_dlpack_cupy_1d(data_1d): cupy_array = cupy.array(data_1d) cupy_host_array = cupy_array.get() - dlt = cupy_array.toDlpack() + dlt = cupy_array.__dlpack__() gs = cudf.from_dlpack(dlt) cudf_host_array = gs.to_numpy(na_value=np.nan) @@ -151,7 +151,7 @@ def test_from_dlpack_cupy_1d(data_1d): def test_from_dlpack_cupy_2d(data_2d): cupy_array = cupy.array(data_2d, order="F") cupy_host_array = cupy_array.get().flatten() - dlt = cupy_array.toDlpack() + dlt = cupy_array.__dlpack__() gdf = cudf.from_dlpack(dlt) cudf_host_array = np.array(gdf.to_pandas()).flatten() diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 80ffce9e8be..75e38b9246a 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -512,14 +512,6 @@ def test_reduction_column_multiindex(): assert_eq(result, expected) -@pytest.mark.parametrize("op", ["sum", "product"]) -def test_dtype_deprecated(op): - ser = cudf.Series(range(5)) - with pytest.warns(FutureWarning): - result = getattr(ser, op)(dtype=np.dtype(np.int8)) - assert isinstance(result, np.int8) - - @pytest.mark.parametrize( "columns", [pd.RangeIndex(2), pd.Index([0, 1], dtype="int8")] ) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 7fbe072dde7..eae73e47955 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -798,6 +798,25 @@ def test_dataframe_pivot_table_simple(aggfunc, fill_value): assert_eq(expected, actual, check_dtype=False) +@pytest.mark.parametrize("index", ["A", ["A"]]) +@pytest.mark.parametrize("columns", ["C", ["C"]]) +def test_pivot_table_scalar_index_columns(index, columns): + data = { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 
4, + "D": range(24), + "E": range(24), + } + result = cudf.DataFrame(data).pivot_table( + values="D", index=index, columns=columns, aggfunc="sum" + ) + expected = pd.DataFrame(data).pivot_table( + values="D", index=index, columns=columns, aggfunc="sum" + ) + assert_eq(result, expected) + + def test_crosstab_simple(): a = np.array( [ diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 13d98e43ddc..08226dd7f6d 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. from __future__ import annotations import contextlib @@ -784,3 +784,12 @@ def test_spilling_and_copy_on_write(manager: SpillManager): assert not a.is_spilled assert a.owner.exposed assert not b.owner.exposed + + +def test_scatter_by_map(): + data = range(10) + with cudf.option_context("spill", True): + df = cudf.DataFrame(data) + result = df.scatter_by_map(data) + for i, res in zip(data, result): + assert_eq(res, cudf.DataFrame([i], index=[i])) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 164fcb06624..18aee0001c4 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -13,8 +13,11 @@ import pyarrow as pa import pytest +import rmm + import cudf from cudf import concat +from cudf.core.buffer import as_buffer from cudf.core.column.string import StringColumn from cudf.core.index import Index from cudf.testing import assert_eq @@ -1202,7 +1205,12 @@ def test_string_misc_name(ps_gs, name): def test_string_no_children_properties(): - empty_col = StringColumn(children=()) + empty_col = StringColumn( + as_buffer(rmm.DeviceBuffer(size=0)), + size=0, + dtype=np.dtype("object"), + children=(), + ) assert empty_col.base_children == () assert empty_col.base_size == 0 diff --git a/python/cudf/cudf/tests/text/test_text_methods.py 
b/python/cudf/cudf/tests/text/test_text_methods.py index 86e1e46c1a2..47b41bd1e39 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -8,6 +8,7 @@ import cudf from cudf.core.byte_pair_encoding import BytePairEncoder +from cudf.core.character_normalizer import CharacterNormalizer from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing import assert_eq @@ -251,7 +252,8 @@ def test_normalize_characters(): ] ) - actual = strings.str.normalize_characters() + normalizer_lower = CharacterNormalizer(True) + actual = normalizer_lower.normalize(strings.str) assert type(expected) is type(actual) assert_eq(expected, actual) @@ -265,7 +267,9 @@ def test_normalize_characters(): "Stock ^ $ 1", ] ) - actual = strings.str.normalize_characters(do_lower=False) + + normalizer = CharacterNormalizer(False) + actual = normalizer.normalize(strings.str) assert type(expected) is type(actual) assert_eq(expected, actual) @@ -926,6 +930,48 @@ def test_minhash(): strings.str.minhash64(1, a=params, b=params, width=8) +def test_minhash_ngrams(): + strings = cudf.Series( + [["this", "is", "my"], ["favorite", "book", "today"]] + ) + + params = cudf.Series([1, 2, 3], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([416367548, 832735096, 1249102644], dtype=np.uint32), + cudf.Series([1408797893, 2817595786, 4226393679], dtype=np.uint32), + ] + ) + actual = strings.str.minhash_ngrams(ngrams=2, seed=0, a=params, b=params) + assert_eq(expected, actual) + + params = cudf.Series([1, 2, 3], dtype=np.uint64) + expected = cudf.Series( + [ + cudf.Series( + [652146669912597278, 1304293339825194556, 1956440009737791826], + dtype=np.uint64, + ), + cudf.Series( + [1776622609581023632, 1247402209948353305, 718181810315682986], + dtype=np.uint64, + ), + ] + ) + actual = strings.str.minhash64_ngrams(ngrams=2, seed=0, a=params, b=params) + assert_eq(expected, actual) + + # test wrong input types + with 
pytest.raises(ValueError): + strings.str.minhash_ngrams(ngrams=7, seed=1, a="a", b="b") + with pytest.raises(ValueError): + params = cudf.Series([0, 1, 2], dtype=np.int32) + strings.str.minhash_ngrams(ngrams=6, seed=1, a=params, b=params) + with pytest.raises(ValueError): + params = cudf.Series([0, 1, 2], dtype=np.uint32) + strings.str.minhash64_ngrams(ngrams=8, seed=1, a=params, b=params) + + def test_jaccard_index(): str1 = cudf.Series(["the brown dog", "jumped about"]) str2 = cudf.Series(["the black cat", "jumped around"]) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 489b804583a..adee17e7bfb 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -612,6 +612,20 @@ def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: return dtype.base +def is_dtype_obj_numeric( + dtype: DtypeObj, include_decimal: bool = True +) -> bool: + """Like is_numeric_dtype but does not introspect argument.""" + is_non_decimal = dtype.kind in set("iufb") + if include_decimal: + return is_non_decimal or isinstance( + dtype, + (cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype), + ) + else: + return is_non_decimal + + def dtype_to_pylibcudf_type(dtype) -> plc.DataType: if isinstance(dtype, cudf.ListDtype): return plc.DataType(plc.TypeId.LIST) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index fd946937945..601a7a369e8 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -15,12 +15,12 @@ import rmm import cudf -import cudf.api.types from cudf.core import column from cudf.core.buffer import as_buffer +from cudf.utils.dtypes import SIZE_TYPE_DTYPE # The size of the mask in bytes -mask_dtype = cudf.api.types.dtype(np.int32) +mask_dtype = SIZE_TYPE_DTYPE mask_bitsize = mask_dtype.itemsize * 8 # Mapping from ufuncs to the corresponding binary operators. 
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 47de8fb1435..d3bfd9298c2 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -44,6 +44,7 @@ OOMFallbackError, TypeFallbackError, _Unusable, + as_proxy_object, is_proxy_object, ) from cudf.testing import assert_eq @@ -1979,6 +1980,93 @@ def test_numpy_data_access(): assert type(expected) is type(actual) +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame({"a": [1, 2, 3]}), + pd.Series([1, 2, 3]), + pd.Index([1, 2, 3]), + pd.Categorical([1, 2, 3]), + pd.to_datetime(["2021-01-01", "2021-01-02"]), + pd.to_timedelta(["1 days", "2 days"]), + xpd.DataFrame({"a": [1, 2, 3]}), + xpd.Series([1, 2, 3]), + xpd.Index([1, 2, 3]), + xpd.Categorical([1, 2, 3]), + xpd.to_datetime(["2021-01-01", "2021-01-02"]), + xpd.to_timedelta(["1 days", "2 days"]), + cudf.DataFrame({"a": [1, 2, 3]}), + cudf.Series([1, 2, 3]), + cudf.Index([1, 2, 3]), + cudf.Index([1, 2, 3], dtype="category"), + cudf.to_datetime(["2021-01-01", "2021-01-02"]), + cudf.Index([1, 2, 3], dtype="timedelta64[ns]"), + [1, 2, 3], + {"a": 1, "b": 2}, + (1, 2, 3), + ], +) +def test_as_proxy_object(obj): + proxy_obj = as_proxy_object(obj) + if isinstance( + obj, + ( + pd.DataFrame, + pd.Series, + pd.Index, + pd.Categorical, + xpd.DataFrame, + xpd.Series, + xpd.Index, + xpd.Categorical, + cudf.DataFrame, + cudf.Series, + cudf.Index, + ), + ): + assert is_proxy_object(proxy_obj) + if isinstance(proxy_obj, xpd.DataFrame): + tm.assert_frame_equal(proxy_obj, xpd.DataFrame(obj)) + elif isinstance(proxy_obj, xpd.Series): + tm.assert_series_equal(proxy_obj, xpd.Series(obj)) + elif isinstance(proxy_obj, xpd.Index): + tm.assert_index_equal(proxy_obj, xpd.Index(obj)) + else: + tm.assert_equal(proxy_obj, obj) + else: + assert not is_proxy_object(proxy_obj) + assert proxy_obj == obj + + +def test_as_proxy_object_doesnot_copy_series(): + s = 
pd.Series([1, 2, 3]) + proxy_obj = as_proxy_object(s) + s[0] = 10 + assert proxy_obj[0] == 10 + tm.assert_series_equal(s, proxy_obj) + + +def test_as_proxy_object_doesnot_copy_dataframe(): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + proxy_obj = as_proxy_object(df) + df.iloc[0, 0] = 10 + assert proxy_obj.iloc[0, 0] == 10 + tm.assert_frame_equal(df, proxy_obj) + + +def test_as_proxy_object_doesnot_copy_index(): + idx = pd.Index([1, 2, 3]) + proxy_obj = as_proxy_object(idx) + assert proxy_obj._fsproxy_wrapped is idx + + +def test_as_proxy_object_no_op_for_intermediates(): + s = pd.Series(["abc", "def", "ghi"]) + str_attr = s.str + proxy_obj = as_proxy_object(str_attr) + assert proxy_obj is str_attr + + def test_pickle_round_trip_proxy_numpy_array(array): arr, proxy_arr = array pickled_arr = BytesIO() diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 16cd97677ef..2ce5131ea8e 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -15,7 +15,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cachetools", @@ -24,9 +24,9 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.4.*,>=0.0.0a0", - "numba-cuda>=0.2.0,<0.3.0a0", - "numba>=0.59.1,<0.61.0a0", - "numpy>=1.23,<3.0a0", + "numba-cuda>=0.4.0,<0.5.0a0", + "numba>=0.59.1,<0.62.0a0", + "numpy>=1.23,<2.1", "nvtx>=0.2.1", "packaging", "pandas>=2.0,<2.2.4dev0", diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt index 3e12eb6aa41..13b859bc33b 100644 --- a/python/cudf_kafka/CMakeLists.txt +++ b/python/cudf_kafka/CMakeLists.txt @@ -35,7 +35,3 @@ include(rapids-cython-core) rapids_cython_init() add_subdirectory(cudf_kafka/_lib) - -if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET cudf_kafka PATHS "${cython_lib_dir}") -endif() diff 
--git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 424010e632c..764c8c64a7e 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -15,7 +15,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cudf==25.4.*,>=0.0.0a0", diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index a7b10a6e8fa..9b798688992 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -197,7 +197,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_multiscan.py::test_include_file_paths[scan_csv-write_csv]": "Need to expose include_file_paths xref: cudf#18012", "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics[False]": "Debug output on stderr doesn't match", # Maybe flaky, order-dependent? - "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", } diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 6bb5d78c488..85a4f007cf0 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 """Datatype utilities.""" @@ -71,7 +71,9 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: ------- True if casting is supported, False otherwise """ - has_empty = from_.id() == plc.TypeId.EMPTY or to.id() == plc.TypeId.EMPTY + to_is_empty = to.id() == plc.TypeId.EMPTY + from_is_empty = from_.id() == plc.TypeId.EMPTY + has_empty = to_is_empty or from_is_empty return ( ( from_ == to @@ -84,8 +86,16 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: ) ) ) - or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to)) - or (to.id() == plc.TypeId.STRING and is_numeric_not_bool(from_)) + or ( + from_.id() == plc.TypeId.STRING + and not to_is_empty + and is_numeric_not_bool(to) + ) + or ( + to.id() == plc.TypeId.STRING + and not from_is_empty + and is_numeric_not_bool(from_) + ) ) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 872c08a66f9..fb44caaa0c0 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -16,10 +16,10 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.20,<1.23", + "polars>=1.20,<1.24", "pylibcudf==25.4.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ @@ -35,7 +35,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "numpy>=1.23,<3.0a0", + "numpy>=1.23,<2.1", "pytest-cov", "pytest-xdist", "pytest<8", diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 9c58a24c065..8ff0db084b1 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -1,9 +1,7 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import os - import pytest import polars as pl @@ -203,8 +201,11 @@ def test_scan_csv_multi(tmp_path, filename, glob, nrows_skiprows): f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test*.csv").open("w") as f: f.write("""foo,bar,baz\n1,2,3\n3,4,5""") - os.chdir(tmp_path) - q = pl.scan_csv(filename, glob=glob, n_rows=n_rows, skip_rows=skiprows) + if isinstance(filename, list): + source = [tmp_path / fn for fn in filename] + else: + source = tmp_path / filename + q = pl.scan_csv(source, glob=glob, n_rows=n_rows, skip_rows=skiprows) assert_gpu_result_equal(q) diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 665b0a76ecf..b1fbe901189 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -16,7 +16,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "confluent-kafka>=2.5.0,<2.6.0a0", diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py index c0792663c7e..c0b9d71653c 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -434,18 +434,12 @@ def 
set_object_dtypes_from_pa_schema(df, schema): # pyarrow schema. if schema: for col_name, col in df._data.items(): - if col_name is None: - # Pyarrow cannot handle `None` as a field name. - # However, this should be a simple range index that - # we can ignore anyway - continue - typ = cudf_dtype_from_pa_type(schema.field(col_name).type) - if ( - col_name in schema.names - and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype)) - and isinstance(col, cudf.core.column.StringColumn) - ): - df._data[col_name] = col.astype(typ) + if col_name in schema.names: + typ = cudf_dtype_from_pa_type(schema.field(col_name).type) + if not isinstance( + typ, (cudf.ListDtype, cudf.StructDtype) + ) and isinstance(col, cudf.core.column.StringColumn): + df._data[col_name] = col.astype(typ) to_parquet = dd.to_parquet diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 9f7031f4d2a..3a88668e6d2 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import dask @@ -486,6 +487,52 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): dd.assert_eq(ddf1.compute(), ddf2.compute()) +@pytest.mark.parametrize("specify_schema", [True, False]) +def test_read_inconsistent_schema(tmpdir, specify_schema): + if specify_schema: + # If we specify the expected schema, + # we also need to specify the partitioning. 
+ kwargs = { + "dataset": { + "schema": pa.schema( + [ + ("id", pa.int64()), + ("text", pa.string()), + ("meta1", pa.struct([("field1", pa.string())])), + ] + ), + "partitioning": None, + }, + } + else: + kwargs = {} + + records = [ + {"id": 123, "text": "foo"}, + { + "text": "bar", + "meta1": [{"field1": "cat"}], + "id": 456, + }, + ] + columns = ["text", "id"] + pd.DataFrame(records[:1]).to_parquet(tmpdir / "part.0.parquet") + pd.DataFrame(records[1:]).to_parquet(tmpdir / "part.1.parquet") + # Check that cuDF and Dask cuDF match + dd.assert_eq( + cudf.read_parquet( + tmpdir, columns=columns, allow_mismatched_pq_schemas=True + ), + dask_cudf.read_parquet(tmpdir, columns=columns, **kwargs), + check_index=False, + ) + # Check that "pandas" and "cudf" backends match + dd.assert_eq( + dd.read_parquet(tmpdir, columns=columns), + dask_cudf.read_parquet(tmpdir, columns=columns, **kwargs), + ) + + @pytest.mark.parametrize( "data", [ @@ -526,7 +573,6 @@ def test_cudf_list_struct_write(tmpdir): def test_null_partition(tmpdir): - import pyarrow as pa from pyarrow.dataset import HivePartitioning ids = pd.Series([0, 1, None], dtype="Int64") diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 87bf282f376..fd2bac3c0d2 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -16,13 +16,13 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cudf==25.4.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "numpy>=1.23,<3.0a0", + "numpy>=1.23,<2.1", "pandas>=2.0,<2.2.4dev0", "pynvml>=12.0.0,<13.0.0a0", "rapids-dask-dependency==25.4.*,>=0.0.0a0", @@ -47,8 +47,8 @@ cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ "dask-cuda==25.4.*,>=0.0.0a0", - "numba-cuda>=0.2.0,<0.3.0a0", - "numba>=0.59.1,<0.61.0a0", + 
"numba-cuda>=0.4.0,<0.5.0a0", + "numba>=0.59.1,<0.62.0a0", "pytest-cov", "pytest-xdist", "pytest<8", diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 01fe6097936..784a0c49894 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -27,7 +27,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" classifiers = [ "Intended Audience :: Developers", diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt index fe6e73a3f14..153570a4a7e 100644 --- a/python/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/CMakeLists.txt @@ -37,7 +37,3 @@ include(rapids-cython-core) rapids_cython_init() add_subdirectory(pylibcudf) - -if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}") -endif() diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyi b/python/pylibcudf/pylibcudf/gpumemoryview.pyi index 50f1f39a515..236ff6e56a6 100644 --- a/python/pylibcudf/pylibcudf/gpumemoryview.pyi +++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyi @@ -7,3 +7,6 @@ class gpumemoryview: def __init__(self, data: Any): ... @property def __cuda_array_interface__(self) -> Mapping[str, Any]: ... + def __len__(self) -> int: ... + @property + def nbytes(self) -> int: ... diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyx b/python/pylibcudf/pylibcudf/gpumemoryview.pyx index 41316eddb60..954d35a6ce3 100644 --- a/python/pylibcudf/pylibcudf/gpumemoryview.pyx +++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyx @@ -1,4 +1,7 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
+ +import functools +import operator __all__ = ["gpumemoryview"] @@ -27,4 +30,19 @@ cdef class gpumemoryview: def __cuda_array_interface__(self): return self.obj.__cuda_array_interface__ + def __len__(self): + return self.obj.__cuda_array_interface__["shape"][0] + + @property + def nbytes(self): + cai = self.obj.__cuda_array_interface__ + shape, typestr = cai["shape"], cai["typestr"] + + # Get element size from typestr, format is two character specifying + # the type and the latter part is the number of bytes. E.g., ' Column: ... +def minhash_ngrams( + input: Column, ngrams: int, seed: int, a: Column, b: Column +) -> Column: ... +def minhash64_ngrams( + input: Column, ngrams: int, seed: int, a: Column, b: Column +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 84811cda867..cdc4a4f3ac8 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr @@ -8,12 +8,16 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + minhash_ngrams as cpp_minhash_ngrams, + minhash64_ngrams as cpp_minhash64_ngrams, ) from pylibcudf.libcudf.types cimport size_type __all__ = [ "minhash", "minhash64", + "minhash_ngrams", + "minhash64_ngrams", ] cpdef Column minhash( @@ -103,3 +107,93 @@ cpdef Column minhash64( ) return Column.from_libcudf(move(c_result)) + +cpdef Column minhash_ngrams( + Column input, + size_type ngrams, + uint32_t seed, + Column a, + Column b +): + """ + Returns the minhash values for each input row of strings. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`minhash_ngrams`. 
+ + Parameters + ---------- + input : Column + List column of strings to compute minhash + ngrams : size_type + Number of consecutive strings to hash in each row + seed : uint32_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + + Returns + ------- + Column + List column of minhash values for each row per + value in columns a and b. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash_ngrams( + input.view(), + ngrams, + seed, + a.view(), + b.view() + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column minhash64_ngrams( + Column input, + size_type ngrams, + uint64_t seed, + Column a, + Column b +): + """ + Returns the minhash values for each input row of strings. + This function uses MurmurHash3_x64_128 for the hash algorithm. + + For details, see :cpp:func:`minhash64_ngrams`. + + Parameters + ---------- + input : Column + List column of strings to compute minhash + ngrams : size_type + Number of consecutive strings to hash in each row + seed : uint64_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + + Returns + ------- + Column + List column of minhash values for each row per + value in columns a and b. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash64_ngrams( + input.view(), + ngrams, + seed, + a.view(), + b.view() + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd index 90676145afa..e6688e19762 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd @@ -1,9 +1,18 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from libcpp cimport bool +from libcpp.memory cimport unique_ptr from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.normalize cimport character_normalizer +cdef class CharacterNormalizer: + cdef unique_ptr[character_normalizer] c_obj cpdef Column normalize_spaces(Column input) -cpdef Column normalize_characters(Column input, bool do_lower_case) +cpdef Column characters_normalize(Column input, bool do_lower_case) + +cpdef Column normalize_characters( + Column input, + CharacterNormalizer normalizer +) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi index 1d90a5a8960..d722ef6c79e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi @@ -1,6 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.column import Column +class CharacterNormalizer: + def __init__(self, do_lower_case: bool, tokens: Column): ... + def normalize_spaces(input: Column) -> Column: ... -def normalize_characters(input: Column, do_lower_case: bool) -> Column: ... +def characters_normalize(input: Column, do_lower_case: bool) -> Column: ... +def normalize_characters( + input: Column, normalizer: CharacterNormalizer +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx index b259ccaefa6..6a18c205841 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -1,16 +1,37 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION.
+from cython.operator cimport dereference from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.nvtext.normalize cimport ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext cimport normalize as cpp_normalize -__all__ = ["normalize_characters", "normalize_spaces"] +__all__ = [ + "CharacterNormalizer", + "normalize_characters", + "normalize_spaces", + "characters_normalize" +] + +cdef class CharacterNormalizer: + """The normalizer object to be used with ``normalize_characters``. + + For details, see :cpp:class:`cudf::nvtext::character_normalizer`. + """ + def __cinit__(self, bool do_lower_case, Column tokens): + cdef column_view c_tokens = tokens.view() + with nogil: + self.c_obj = move( + cpp_normalize.create_character_normalizer( + do_lower_case, + c_tokens + ) + ) + + __hash__ = None cpdef Column normalize_spaces(Column input): """ @@ -32,12 +53,12 @@ cpdef Column normalize_spaces(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_normalize_spaces(input.view()) + c_result = cpp_normalize.normalize_spaces(input.view()) return Column.from_libcudf(move(c_result)) -cpdef Column normalize_characters(Column input, bool do_lower_case): +cpdef Column characters_normalize(Column input, bool do_lower_case): """ Normalizes strings characters for tokenizing.
@@ -60,6 +81,38 @@ cpdef Column normalize_characters(Column input, bool do_lower_case): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_normalize_characters(input.view(), do_lower_case) + c_result = cpp_normalize.normalize_characters( + input.view(), + do_lower_case + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column normalize_characters(Column input, CharacterNormalizer normalizer): + """ + Normalizes strings characters for tokenizing. + + For details, see :cpp:func:`normalize_characters` + + Parameters + ---------- + input : Column + Input strings + normalizer : CharacterNormalizer + Normalizer object used for modifying the input column text + + Returns + ------- + Column + Normalized strings column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize.normalize_characters( + input.view(), + dereference(normalizer.c_obj.get()) + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index 35abab7e838..e252d3072aa 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -2,7 +2,16 @@ from cpython cimport bool as py_bool, datetime from cython cimport no_gc_clear -from libc.stdint cimport int64_t +from libc.stdint cimport ( + int8_t, + int16_t, + int32_t, + int64_t, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) from libcpp cimport bool as cbool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -25,6 +34,13 @@ from .types cimport DataType from functools import singledispatch +try: + import numpy as np + np_error = None +except ImportError as err: + np = None + np_error = err + __all__ = ["Scalar"] @@ -111,6 +127,24 @@ cdef class Scalar: """ return _from_py(py_val) + @classmethod + def from_numpy(cls, np_val): + """ + Convert a NumPy scalar to a Scalar. 
+ + Parameters + ---------- + np_val: numpy.generic + Value to convert to a pylibcudf.Scalar + + Returns + ------- + Scalar + New pylibcudf.Scalar + """ + return _from_numpy(np_val) + + cdef Scalar _new_scalar(unique_ptr[scalar] c_obj, DataType dtype): cdef Scalar s = Scalar.__new__(Scalar) s.c_obj.swap(c_obj) @@ -166,3 +200,115 @@ def _(py_val): cdef unique_ptr[scalar] c_obj = make_string_scalar(py_val.encode()) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr + + +@singledispatch +def _from_numpy(np_val): + if np_error is not None: + raise np_error + raise TypeError(f"{type(np_val).__name__} cannot be converted to pylibcudf.Scalar") + + +if np is not None: + @_from_numpy.register(np.datetime64) + @_from_numpy.register(np.timedelta64) + def _(np_val): + raise NotImplementedError( + f"{type(np_val).__name__} is currently not supported." + ) + + @_from_numpy.register(np.bool_) + def _(np_val): + cdef DataType dtype = DataType(type_id.BOOL8) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + cdef cbool c_val = np_val + (c_obj.get()).set_value(c_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.str_) + def _(np_val): + cdef DataType dtype = DataType(type_id.STRING) + cdef unique_ptr[scalar] c_obj = make_string_scalar(np_val.item().encode()) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.int8) + def _(np_val): + dtype = DataType(type_id.INT8) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.int16) + def _(np_val): + dtype = DataType(type_id.INT16) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.int32) + def _(np_val): + dtype = DataType(type_id.INT32) + cdef 
unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.int64) + def _(np_val): + dtype = DataType(type_id.INT64) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.uint8) + def _(np_val): + dtype = DataType(type_id.UINT8) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.uint16) + def _(np_val): + dtype = DataType(type_id.UINT16) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.uint32) + def _(np_val): + dtype = DataType(type_id.UINT32) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.uint64) + def _(np_val): + dtype = DataType(type_id.UINT64) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.float32) + def _(np_val): + dtype = DataType(type_id.FLOAT32) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.float64) + def _(np_val): + dtype = DataType(type_id.FLOAT64) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr diff --git 
a/python/pylibcudf/pylibcudf/tests/test_gpumemoryview.py b/python/pylibcudf/pylibcudf/tests/test_gpumemoryview.py new file mode 100644 index 00000000000..187857c935a --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_gpumemoryview.py @@ -0,0 +1,58 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import itertools + +import numpy as np +import pytest + +import rmm + +import pylibcudf as plc + +DTYPES = [ + "u1", + "i2", + "f4", + "f8", + "f16", +] +SIZES = [ + 0, + 1, + 1000, + 1024, + 10000, +] + + +@pytest.fixture(params=tuple(itertools.product(SIZES, DTYPES)), ids=repr) +def np_array(request): + size, dtype = request.param + return np.empty((size,), dtype=dtype) + + +def test_cuda_array_interface(np_array): + buf = rmm.DeviceBuffer( + ptr=np_array.__array_interface__["data"][0], size=np_array.nbytes + ) + gpumemview = plc.gpumemoryview(buf) + + np_array_view = np_array.view("u1") + + ai = np_array_view.__array_interface__ + cai = gpumemview.__cuda_array_interface__ + assert cai["shape"] == ai["shape"] + assert cai["strides"] == ai["strides"] + assert cai["typestr"] == ai["typestr"] + + +def test_len(np_array): + buf = rmm.DeviceBuffer( + ptr=np_array.__array_interface__["data"][0], size=np_array.nbytes + ) + gpumemview = plc.gpumemoryview(buf) + + np_array_view = np_array.view("u1") + + assert len(gpumemview) == len(np_array_view) + assert gpumemview.nbytes == np_array.nbytes diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index ad7a6f7a762..ff8545f0617 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
import pyarrow as pa import pytest @@ -33,3 +33,49 @@ def test_minhash(minhash_input_data, width): assert pa_result.type == pa.list_( pa.field("element", seed_type, nullable=False) ) + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def minhash_ngrams_input_data(request): + input_arr = pa.array( + [ + ["foo", "bar", "foo foo", "bar bar", "foo bar", "bar foo"], + [ + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + "ten", + "eleven", + ], + ] + ) + ab = pa.array([2, 3, 4, 5], request.param) + return input_arr, ab, request.param + + +@pytest.mark.parametrize("ngrams", [5, 10]) +def test_minhash_ngrams(minhash_ngrams_input_data, ngrams): + input_arr, ab, seed_type = minhash_ngrams_input_data + minhash_func = ( + plc.nvtext.minhash.minhash_ngrams + if seed_type == pa.uint32() + else plc.nvtext.minhash.minhash64_ngrams + ) + result = minhash_func( + plc.interop.from_arrow(input_arr), + ngrams, + 0, + plc.interop.from_arrow(ab), + plc.interop.from_arrow(ab), + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(ab) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) + ) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py index 25b6d1389ec..47bbb191be6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
import pyarrow as pa import pytest @@ -15,7 +15,7 @@ def norm_spaces_input_data(): @pytest.fixture(scope="module") def norm_chars_input_data(): - arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]", "[pad]"] return pa.array(arr) @@ -29,15 +29,98 @@ def test_normalize_spaces(norm_spaces_input_data): @pytest.mark.parametrize("do_lower", [True, False]) def test_normalize_characters(norm_chars_input_data, do_lower): - result = plc.nvtext.normalize.normalize_characters( + result = plc.nvtext.normalize.characters_normalize( plc.interop.from_arrow(norm_chars_input_data), do_lower, ) - expected = pa.array( - ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + else: + expected = pa.array( + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalizer(norm_chars_input_data, do_lower): + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + plc.nvtext.normalize.CharacterNormalizer( + do_lower, + plc.column_factories.make_empty_column(plc.types.TypeId.STRING), + ), + ) + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + else: + expected = pa.array( + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 
08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalizer_with_special_tokens(norm_chars_input_data, do_lower): + special_tokens = pa.array(["[pad]"]) + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + plc.nvtext.normalize.CharacterNormalizer( + do_lower, plc.interop.from_arrow(special_tokens) + ), ) - if not do_lower: + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [pad] ", + ] + ) + else: expected = pa.array( - ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 08", + " [ a , bb ] ", + " [pad] ", + ] ) assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_scalar.py b/python/pylibcudf/pylibcudf/tests/test_scalar.py index 45afae91c9a..056fcd5f63c 100644 --- a/python/pylibcudf/pylibcudf/tests/test_scalar.py +++ b/python/pylibcudf/pylibcudf/tests/test_scalar.py @@ -7,6 +7,11 @@ import pylibcudf as plc +@pytest.fixture(scope="module") +def np(): + return pytest.importorskip("numpy") + + @pytest.mark.parametrize( "val", [True, False, -1, 0, 1 - 1.0, 0.0, 1.52, "", "a1!"] ) @@ -28,3 +33,40 @@ def test_from_py_notimplemented(val): def test_from_py_typeerror(val): with pytest.raises(TypeError): plc.Scalar.from_py(val) + + +@pytest.mark.parametrize( + "np_type", + [ + "bool_", + "str_", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float32", + "float64", + ], +) +def test_from_numpy(np, np_type): + np_klass = getattr(np, np_type) + np_val = np_klass("1" if np_type == "str_" else 1) + result = plc.Scalar.from_numpy(np_val) + expected = pa.scalar(np_val) + assert plc.interop.to_arrow(result).equals(expected) + + +@pytest.mark.parametrize("np_type", ["datetime64", "timedelta64"]) +def 
test_from_numpy_notimplemented(np, np_type): + np_val = getattr(np, np_type)(1, "ns") + with pytest.raises(NotImplementedError): + plc.Scalar.from_numpy(np_val) + + +def test_from_numpy_typeerror(np): + with pytest.raises(TypeError): + plc.Scalar.from_numpy(np.void(5)) diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 939da65c1ec..8ea6f0e94a4 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -15,7 +15,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cuda-python>=11.8.5,<12.0a0", @@ -42,7 +42,7 @@ classifiers = [ test = [ "fastavro>=0.22.9", "hypothesis", - "numpy>=1.23,<3.0a0", + "numpy>=1.23,<2.1", "pandas", "pytest-cov", "pytest-xdist",