Skip to content

Commit

Permalink
Merge branch 'branch-25.04' into serialize-columns
Browse files Browse the repository at this point in the history
  • Loading branch information
rjzamora authored Feb 13, 2025
2 parents 30793d1 + 7914858 commit 6226c34
Show file tree
Hide file tree
Showing 67 changed files with 911 additions and 1,384 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -326,13 +326,14 @@ jobs:
third-party-integration-tests-cudf-pandas:
needs: conda-python-build
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners
uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04
with:
build_type: pull-request
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
node_type: "gpu-l4-latest-1"
continue-on-error: true
container_image: "rapidsai/ci-conda:latest"
run_script: |
ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
Expand Down
14 changes: 1 addition & 13 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ option(CUDA_ENABLE_LINEINFO
option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON)
# cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking
option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
option(CUDA_STATIC_CUFILE "Statically link cuFile" OFF)

set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL ON)
if(CUDA_STATIC_RUNTIME OR NOT BUILD_SHARED_LIBS)
Expand Down Expand Up @@ -464,7 +463,6 @@ add_library(
src/hash/xxhash_64.cu
src/interop/dlpack.cpp
src/interop/arrow_utilities.cpp
src/interop/decimal_conversion_utilities.cu
src/interop/to_arrow_device.cu
src/interop/to_arrow_host.cu
src/interop/from_arrow_device.cu
Expand Down Expand Up @@ -547,7 +545,6 @@ add_library(
src/io/utilities/data_casting.cu
src/io/utilities/data_sink.cpp
src/io/utilities/datasource.cpp
src/io/utilities/file_io_utilities.cpp
src/io/utilities/row_selection.cpp
src/io/utilities/type_inference.cu
src/io/utilities/trie.cu
Expand Down Expand Up @@ -923,15 +920,6 @@ target_compile_definitions(
# Enable remote IO through KvikIO
target_compile_definitions(cudf PRIVATE $<$<BOOL:${CUDF_KVIKIO_REMOTE_IO}>:CUDF_KVIKIO_REMOTE_IO>)

# Enable cuFile support
set(_cufile_suffix)
if(CUDA_STATIC_CUFILE)
set(_cufile_suffix _static)
endif()
if(TARGET CUDA::cuFile${_cufile_suffix})
target_compile_definitions(cudf PRIVATE CUDF_CUFILE_FOUND)
endif()

# Remove this after upgrading to a CCCL that has a proper CMake option. See
# https://github.com/NVIDIA/cccl/pull/2844
target_compile_definitions(cudf PRIVATE THRUST_FORCE_32_BIT_OFFSET_TYPE=1)
Expand All @@ -944,7 +932,7 @@ target_link_libraries(
cudf
PUBLIC CCCL::CCCL rapids_logger::rapids_logger rmm::rmm $<BUILD_LOCAL_INTERFACE:BS::thread_pool>
PRIVATE $<BUILD_LOCAL_INTERFACE:nvtx3::nvtx3-cpp> cuco::cuco ZLIB::ZLIB nvcomp::nvcomp
kvikio::kvikio $<TARGET_NAME_IF_EXISTS:CUDA::cuFile${_cufile_suffix}> nanoarrow
kvikio::kvikio nanoarrow
)

# Add Conda library, and include paths if specified
Expand Down
21 changes: 3 additions & 18 deletions cpp/include/cudf/io/config_utils.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,22 +19,7 @@

namespace CUDF_EXPORT cudf {
namespace io {
namespace cufile_integration {

/**
* @brief Returns true if cuFile and its compatibility mode are enabled.
*/
bool is_always_enabled();

/**
* @brief Returns true if only direct IO through cuFile is enabled (compatibility mode is disabled).
*/
bool is_gds_enabled();

/**
* @brief Returns true if KvikIO is enabled.
*/
bool is_kvikio_enabled();
namespace kvikio_integration {

/**
* @brief Set KvikIO parameters, including:
Expand All @@ -45,7 +30,7 @@ bool is_kvikio_enabled();
*/
void set_up_kvikio();

} // namespace cufile_integration
} // namespace kvikio_integration

namespace nvcomp_integration {

Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/io/data_sink.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -122,7 +122,7 @@ class data_sink {
*
* In the case where the sink type is itself a memory buffered write, this ends up
* being effectively a second memcpy. So a useful optimization for a "smart"
* custom data_sink is to do it's own internal management of the movement
* custom data_sink is to do its own internal management of the movement
* of data between cpu and gpu; turning the internals of the writer into simply
*
* sink->device_write(device_buffer, size)
Expand Down
27 changes: 16 additions & 11 deletions cpp/include/cudf/transform.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,27 +32,32 @@ namespace CUDF_EXPORT cudf {
*/

/**
* @brief Creates a new column by applying a unary function against every
* element of an input column.
* @brief Creates a new column by applying a transform function against every
* element of the input columns.
*
* Computes:
* `out[i] = F(in[i])`
* `out[i] = F(inputs[i]...)`.
*
* The output null mask is the same is the input null mask so if input[i] is
* null then output[i] is also null
 * Note that for every scalar in `inputs` (a column of size 1), `inputs[i] == inputs[0]`
*
* @param input An immutable view of the input column to transform
* @param unary_udf The PTX/CUDA string of the unary function to apply
* The output null mask is the same as the null mask of the input columns, so if input[i] is
* null then output[i] is also null. The size of the resulting column is the size of the largest
* column.
* All input columns must have equivalent null masks.
*
*
* @param inputs Immutable views of the input columns to transform
* @param transform_udf The PTX/CUDA string of the transform function to apply
* @param output_type The output type that is compatible with the output type in the UDF
* @param is_ptx true: the UDF is treated as PTX code; false: the UDF is treated as CUDA code
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return The column resulting from applying the unary function to
* @return The column resulting from applying the transform function to
* every element of the input
*/
std::unique_ptr<column> transform(
column_view const& input,
std::string const& unary_udf,
std::vector<column_view> const& inputs,
std::string const& transform_udf,
data_type output_type,
bool is_ptx,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
Expand Down
70 changes: 0 additions & 70 deletions cpp/src/interop/decimal_conversion_utilities.cu

This file was deleted.

44 changes: 0 additions & 44 deletions cpp/src/interop/decimal_conversion_utilities.cuh

This file was deleted.

1 change: 0 additions & 1 deletion cpp/src/interop/to_arrow_device.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
*/

#include "arrow_utilities.hpp"
#include "decimal_conversion_utilities.cuh"

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
Expand Down
1 change: 0 additions & 1 deletion cpp/src/interop/to_arrow_host.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
*/

#include "arrow_utilities.hpp"
#include "decimal_conversion_utilities.cuh"

#include <cudf/column/column_view.hpp>
#include <cudf/detail/interop.hpp>
Expand Down
29 changes: 21 additions & 8 deletions cpp/src/io/parquet/arrow_schema_writer.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -250,13 +250,26 @@ struct dispatch_to_flatbuf {
std::enable_if_t<cudf::is_fixed_point<T>(), void> operator()()
{
field_type_id = flatbuf::Type_Decimal;
field_offset = flatbuf::CreateDecimal(fbb,
(col_meta.is_decimal_precision_set())
? col_meta.get_decimal_precision()
: MAX_DECIMAL128_PRECISION,
col->type().scale(),
128)
.Union();

auto const [max_precision, bitwidth] = []() constexpr -> std::pair<int32_t, int32_t> {
if constexpr (std::is_same_v<T, numeric::decimal32>) {
return {MAX_DECIMAL32_PRECISION, 32};
} else if constexpr (std::is_same_v<T, numeric::decimal64>) {
return {MAX_DECIMAL64_PRECISION, 64};
} else if constexpr (std::is_same_v<T, numeric::decimal128>) {
return {MAX_DECIMAL128_PRECISION, 128};
} else {
CUDF_FAIL("Unsupported fixed point type for arrow schema writer");
}
}();

field_offset =
flatbuf::CreateDecimal(
fbb,
(col_meta.is_decimal_precision_set()) ? col_meta.get_decimal_precision() : max_precision,
col->type().scale(),
bitwidth)
.Union();
}

template <typename T>
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/io/parquet/ipc/Schema_generated.h
Original file line number Diff line number Diff line change
Expand Up @@ -1393,9 +1393,9 @@ inline ::flatbuffers::Offset<RunEndEncoded> CreateRunEndEncoded(
}

/// Exact decimal value represented as an integer value in two's
/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers
/// are used. The representation uses the endianness indicated
/// in the Schema.
/// complement. Currently 32-bit (4-byte), 64-bit (8-byte),
/// 128-bit (16-byte) and 256-bit (32-byte) integers are used.
/// The representation uses the endianness indicated in the Schema.
struct Decimal FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
typedef DecimalBuilder Builder;
enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
Expand All @@ -1407,7 +1407,7 @@ struct Decimal FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
int32_t precision() const { return GetField<int32_t>(VT_PRECISION, 0); }
/// Number of digits after the decimal point "."
int32_t scale() const { return GetField<int32_t>(VT_SCALE, 0); }
/// Number of bits per value. The only accepted widths are 128 and 256.
/// Number of bits per value. The accepted widths are 32, 64, 128 and 256.
/// We use bitWidth for consistency with Int::bitWidth.
int32_t bitWidth() const { return GetField<int32_t>(VT_BITWIDTH, 128); }
bool Verify(::flatbuffers::Verifier& verifier) const
Expand Down
7 changes: 4 additions & 3 deletions cpp/src/io/parquet/ipc/schema/Schema.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
/// Version 1.3 - Add Run-End Encoded.
/// Version 1.4 - Add BinaryView, Utf8View, variadicBufferCounts, ListView, and
/// LargeListView.
/// Version 1.5 - Add 32-bit and 64-bit as allowed bit widths for Decimal

namespace cudf.io.parquet.flatbuf;

Expand Down Expand Up @@ -243,9 +244,9 @@ table RunEndEncoded {
}

/// Exact decimal value represented as an integer value in two's
/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers
/// are used. The representation uses the endianness indicated
/// in the Schema.
/// complement. Currently 32-bit (4-byte), 64-bit (8-byte),
/// 128-bit (16-byte) and 256-bit (32-byte) integers are used.
/// The representation uses the endianness indicated in the Schema.
table Decimal {
/// Total number of decimal digits
precision: int;
Expand Down
Loading

0 comments on commit 6226c34

Please sign in to comment.