Merge pull request #737 from NVIDIA/branch-24.08

release 24.08 [skip ci]

YanxuanLiu authored Sep 6, 2024
2 parents c7becc2 + bd106d3 commit 7f8e779

Showing 49 changed files with 1,536 additions and 352 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/auto-merge.yml
@@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE
on:
pull_request_target:
branches:
- - branch-24.06
+ - branch-24.08
types: [closed]

jobs:
@@ -29,14 +29,14 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
- ref: branch-24.06 # force to fetch from latest upstream instead of PR ref
+ ref: branch-24.08 # force to fetch from latest upstream instead of PR ref

- name: auto-merge job
uses: ./.github/workflows/auto-merge
env:
OWNER: NVIDIA
REPO_NAME: spark-rapids-ml
- HEAD: branch-24.06
- BASE: branch-24.08
+ HEAD: branch-24.08
+ BASE: branch-24.10
AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR

4 changes: 2 additions & 2 deletions .github/workflows/gcs-benchmark.yml
@@ -1,4 +1,4 @@
- # Copyright (c) 2023, NVIDIA CORPORATION.
+ # Copyright (c) 2023-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -32,7 +32,7 @@ jobs:
env:
PROJECT: rapids-spark
DATAPROC_REGION: us-central1
- COMPUTE_REGION: ${{inputs.computer_region}}
+ COMPUTE_REGION: ${{ inputs.computer_region || 'us-central1' }}
COMPUTE_ZONE: us-central1-a
GCS_BUCKET: spark-rapids-ml-benchmarking
KEY_FILE_CONTENT: ${{ secrets.GCLOUD_PRIVATE_KEY }}
19 changes: 8 additions & 11 deletions .github/workflows/signoff-check.yml
@@ -1,4 +1,4 @@
- # Copyright (c) 2021, NVIDIA CORPORATION.
+ # Copyright (c) 2021-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -23,13 +23,10 @@ jobs:
signoff-check:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
-
- - name: sigoff-check job
- uses: ./.github/workflows/signoff-check
- env:
- OWNER: NVIDIA
- REPO_NAME: spark-rapids-ml
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- PULL_NUMBER: ${{ github.event.number }}
-
+ - name: signoff
+ uses: NVIDIA/spark-rapids-common/signoff-check@main
+ with:
+ owner: ${{ github.repository_owner }}
+ repo: spark-rapids-ml
+ pull_number: ${{ github.event.number }}
+ token: ${{ secrets.GITHUB_TOKEN }}
23 changes: 0 additions & 23 deletions .github/workflows/signoff-check/Dockerfile

This file was deleted.

72 changes: 0 additions & 72 deletions .github/workflows/signoff-check/signoff-check

This file was deleted.

2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@

Spark Rapids ML enables GPU accelerated distributed machine learning on [Apache Spark](https://spark.apache.org/). It provides several PySpark ML compatible algorithms powered by the [RAPIDS cuML](https://docs.rapids.ai/api/cuml/stable/) library, along with a compatible Scala API for the PCA algorithm.

- These APIs seek to minimize any code changes to end user Spark code. After your environment is configured to support GPUs (with drivers, CUDA toolkit, and RAPIDS dependencies), you should be able to just change an import statement or class name to take advantage of GPU acceleration.
+ These APIs seek to minimize any code changes to end user Spark code. After your environment is configured to support GPUs (with drivers, CUDA toolkit, and RAPIDS dependencies), you should be able to just change an import statement or class name to take advantage of GPU acceleration. See [here](./python/README.md#clis-enabling-no-package-import-change) for experimental CLIs that enable GPU acceleration without the need for changing the `pyspark.ml` package names in an existing pyspark ml application.

**Python**
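The README's Python snippet is collapsed in this view. As a hedged sketch of the import-only change described above (the collapsed snippet itself may differ; assumes a GPU-enabled Spark cluster with spark-rapids-ml and RAPIDS dependencies installed):

```python
# Hedged sketch of the import-only swap described above; the collapsed README
# snippet may differ.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

# CPU baseline would be:
# from pyspark.ml.classification import LogisticRegression
# GPU-accelerated drop-in:
from spark_rapids_ml.classification import LogisticRegression

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(Vectors.dense([1.0, 2.0]), 1.0), (Vectors.dense([3.0, 4.0]), 0.0)],
    ["features", "label"],
)
# Same PySpark ML-style API; only the import changed.
model = LogisticRegression().setFeaturesCol("features").setLabelCol("label").fit(df)
```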
4 changes: 2 additions & 2 deletions ci/Dockerfile
@@ -37,6 +37,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
&& conda config --set solver libmamba

# install cuML
- ARG CUML_VER=24.06
- RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER python=3.9 cuda-version=11.8 \
+ ARG CUML_VER=24.08
+ RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER cuvs=$CUML_VER python=3.9 cuda-version=11.8 \
&& conda clean --all -f -y
3 changes: 2 additions & 1 deletion ci/Jenkinsfile.premerge
@@ -53,7 +53,8 @@ pipeline {
}

parameters {
- string(name: 'REF', defaultValue: '',
+ // Put a default value for REF to avoid error when running the pipeline manually
+ string(name: 'REF', defaultValue: 'main',
description: 'Merged commit of specific PR')
string(name: 'GITHUB_DATA', defaultValue: '',
description: 'Json-formatted github data from upstream blossom-ci')
2 changes: 2 additions & 0 deletions ci/lint_python.py
@@ -16,7 +16,9 @@
SRC_PATHS = [
"src/spark_rapids_ml",
"tests",
"tests_large",
"benchmark",
"tests_no_import_change",
]


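A hypothetical illustration (not the actual ci/lint_python.py) of how a path list like SRC_PATHS is typically fanned out to formatters and type checkers; the tool names and flags here are assumptions:

```python
# Hypothetical lint driver sketch; the real ci/lint_python.py may differ.
import subprocess
import sys

SRC_PATHS = [
    "src/spark_rapids_ml",
    "tests",
    "tests_large",
    "benchmark",
    "tests_no_import_change",
]

def run_checks(paths):
    # Run each tool over every path; accumulate a nonzero exit code on failure.
    rc = 0
    for tool in (["black", "--check"], ["mypy"]):
        rc |= subprocess.call(tool + list(paths))
    return rc

if __name__ == "__main__":
    sys.exit(run_checks(SRC_PATHS))
```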
3 changes: 2 additions & 1 deletion docker/Dockerfile.pip
@@ -18,7 +18,7 @@ ARG CUDA_VERSION=11.8.0
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

ARG PYSPARK_VERSION=3.3.1
- ARG RAPIDS_VERSION=24.6.0
+ ARG RAPIDS_VERSION=24.8.0
ARG ARCH=amd64
#ARG ARCH=arm64
# Install packages to build spark-rapids-ml
@@ -40,6 +40,7 @@ RUN apt-get update -y \
RUN pip install --no-cache-dir \
cudf-cu11~=${RAPIDS_VERSION} \
cuml-cu11~=${RAPIDS_VERSION} \
+ cuvs-cu11~=${RAPIDS_VERSION} \
--extra-index-url=https://pypi.nvidia.com

# install python dependencies
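Not part of the diff: a hedged post-install sanity check that the cu11 wheels, including the newly added cuvs-cu11 dependency, import and report the intended release line:

```python
# Hedged sanity check after building the image; exact version strings may vary.
import cudf
import cuml
import cuvs  # newly added dependency in this release

print("cudf", cudf.__version__)
print("cuml", cuml.__version__)  # should report a 24.08.x version for this release
```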
2 changes: 1 addition & 1 deletion docker/Dockerfile.python
Expand Up @@ -17,7 +17,7 @@
ARG CUDA_VERSION=11.8.0
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04

- ARG CUML_VERSION=24.06
+ ARG CUML_VERSION=24.08

# Install packages to build spark-rapids-ml
RUN apt update -y \
2 changes: 1 addition & 1 deletion docs/source/conf.py
Expand Up @@ -9,7 +9,7 @@
project = 'spark-rapids-ml'
copyright = '2024, NVIDIA'
author = 'NVIDIA'
- release = '24.06.0'
+ release = '24.08.0'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
3 changes: 2 additions & 1 deletion notebooks/aws-emr/init-bootstrap-action.sh
@@ -8,14 +8,15 @@ sudo chmod a+rwx -R /sys/fs/cgroup/devices
sudo yum install -y gcc openssl-devel bzip2-devel libffi-devel tar gzip wget make mysql-devel
sudo bash -c "wget https://www.python.org/ftp/python/3.9.9/Python-3.9.9.tgz && tar xzf Python-3.9.9.tgz && cd Python-3.9.9 && ./configure --enable-optimizations && make altinstall"

- RAPIDS_VERSION=24.6.0
+ RAPIDS_VERSION=24.8.0

# install scikit-learn
sudo /usr/local/bin/pip3.9 install scikit-learn

# install cudf and cuml
sudo /usr/local/bin/pip3.9 install --no-cache-dir cudf-cu11==${RAPIDS_VERSION} \
cuml-cu11==${RAPIDS_VERSION} \
+ cuvs-cu11==${RAPIDS_VERSION} \
pylibraft-cu11==${RAPIDS_VERSION} \
rmm-cu11==${RAPIDS_VERSION} \
--extra-index-url=https://pypi.nvidia.com
2 changes: 1 addition & 1 deletion notebooks/databricks/README.md
@@ -51,7 +51,7 @@ If you already have a Databricks account, you can run the example notebooks on a
spark.task.resource.gpu.amount 1
spark.databricks.delta.preview.enabled true
spark.python.worker.reuse true
- spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-24.04.1.jar:/databricks/spark/python
+ spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-24.06.1.jar:/databricks/spark/python
spark.sql.execution.arrow.maxRecordsPerBatch 100000
spark.rapids.memory.gpu.minAllocFraction 0.0001
spark.plugins com.nvidia.spark.SQLPlugin
5 changes: 3 additions & 2 deletions notebooks/databricks/init-pip-cuda-11.8.sh
@@ -4,8 +4,8 @@ SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file
# IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10
# also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0)
# while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2)
- RAPIDS_VERSION=24.6.0
- SPARK_RAPIDS_VERSION=24.04.1
+ RAPIDS_VERSION=24.8.0
+ SPARK_RAPIDS_VERSION=24.06.1

curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar

@@ -24,6 +24,7 @@ ln -s /usr/local/cuda-11.8 /usr/local/cuda
# using ~= pulls in latest micro version patches
/databricks/python/bin/pip install cudf-cu11~=${RAPIDS_VERSION} \
cuml-cu11~=${RAPIDS_VERSION} \
+ cuvs-cu11~=${RAPIDS_VERSION} \
pylibraft-cu11~=${RAPIDS_VERSION} \
rmm-cu11~=${RAPIDS_VERSION} \
--extra-index-url=https://pypi.nvidia.com
2 changes: 1 addition & 1 deletion notebooks/dataproc/README.md
@@ -29,7 +29,7 @@ If you already have a Dataproc account, you can run the example notebooks on a D
- Create a cluster with at least two single-gpu workers. **Note**: in addition to the initialization script from above, this also uses the standard [initialization actions](https://github.com/GoogleCloudDataproc/initialization-actions) for installing the GPU drivers and RAPIDS:
```
export CUDA_VERSION=11.8
- export RAPIDS_VERSION=24.6.0
+ export RAPIDS_VERSION=24.8.0
gcloud dataproc clusters create $USER-spark-rapids-ml \
--image-version=2.1-ubuntu \
4 changes: 2 additions & 2 deletions notebooks/dataproc/spark_rapids_ml.sh
@@ -1,6 +1,6 @@
#!/bin/bash

- RAPIDS_VERSION=24.6.0
+ RAPIDS_VERSION=24.8.0

# patch existing packages
mamba install "llvmlite<0.40,>=0.39.0dev0" "numba>=0.56.2"
@@ -10,7 +10,7 @@ mamba uninstall -y pyarrow arrow

# install cudf and cuml
pip install --upgrade pip
- pip install cudf-cu11~=${RAPIDS_VERSION} cuml-cu11~=${RAPIDS_VERSION} \
+ pip install cudf-cu11~=${RAPIDS_VERSION} cuml-cu11~=${RAPIDS_VERSION} cuvs-cu11~=${RAPIDS_VERSION} \
pylibraft-cu11~=${RAPIDS_VERSION} \
rmm-cu11~=${RAPIDS_VERSION} \
--extra-index-url=https://pypi.nvidia.com
12 changes: 2 additions & 10 deletions notebooks/logistic-regression.ipynb
@@ -124,7 +124,7 @@
"metadata": {},
"source": [
"### Classifier builder\n",
"We will use this function to build both the Spark RAPIDS ML (GPU) and Spark ML (CPU) logistic regression classifier objects, demonstrating the common API, and verify they yield similar performance on our synthetic dataset. NOTE: GPU LogisisticRegression does not yet support `standardization=True`"
"We will use this function to build both the Spark RAPIDS ML (GPU) and Spark ML (CPU) logistic regression classifier objects, demonstrating the common API, and verify they yield similar performance on our synthetic dataset."
]
},
{
@@ -134,7 +134,7 @@
"outputs": [],
"source": [
"def build_lr_classifier(estimator_class):\n",
" return ( estimator_class(standardization=False)\n",
" return ( estimator_class()\n",
" .setFeaturesCol(\"features\")\n",
" .setLabelCol(\"label\")\n",
" .setRegParam(0.001)\n",
@@ -655,13 +655,6 @@
"## Sparse Vectors"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Standardization needs to be false for now. Will be fixed in 24.02."
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -715,7 +708,6 @@
" regParam=0.01,\n",
" maxIter=100,\n",
" fitIntercept=True,\n",
" standardization=False,\n",
" featuresCol=\"features\",\n",
" labelCol=\"label\",\n",
" )\n",
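A hedged usage sketch of the notebook's builder after this change: the `standardization=False` workaround is removed, implying the GPU estimator now handles standardization like the CPU one. `train_df` is a hypothetical labeled Spark DataFrame, not a name from the notebook:

```python
# Hedged sketch of the common builder used for both CPU and GPU estimators,
# mirroring the notebook cell shown in the hunk above.
from pyspark.ml.classification import LogisticRegression as CPULogisticRegression
from spark_rapids_ml.classification import LogisticRegression as GPULogisticRegression

def build_lr_classifier(estimator_class):
    # Same PySpark ML-style setters work for both classes.
    return (
        estimator_class()
        .setFeaturesCol("features")
        .setLabelCol("label")
        .setRegParam(0.001)
    )

gpu_lr = build_lr_classifier(GPULogisticRegression)
cpu_lr = build_lr_classifier(CPULogisticRegression)
# gpu_model = gpu_lr.fit(train_df)  # train_df: hypothetical labeled DataFrame
# cpu_model = cpu_lr.fit(train_df)
```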
4 changes: 2 additions & 2 deletions notebooks/umap.ipynb
@@ -652,8 +652,8 @@
"import os\n",
"import requests\n",
"\n",
"SPARK_RAPIDS_VERSION = \"23.12.1\"\n",
"cuda_version = \"12\"\n",
"SPARK_RAPIDS_VERSION = \"24.06.1\"\n",
"cuda_version = \"11\"\n",
"rapids_jar = f\"rapids-4-spark_2.12-{SPARK_RAPIDS_VERSION}.jar\"\n",
"\n",
"if not os.path.exists(rapids_jar):\n",
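For reference, a hedged reconstruction of the collapsed download logic in this notebook cell, using the Maven URL pattern from the Databricks init script above; the notebook's actual code may differ:

```python
# Hedged sketch: fetch the RAPIDS Accelerator jar from Maven Central if it is
# not already present locally. URL pattern taken from the init script earlier
# in this commit.
import os
import requests

SPARK_RAPIDS_VERSION = "24.06.1"
cuda_version = "11"
rapids_jar = f"rapids-4-spark_2.12-{SPARK_RAPIDS_VERSION}.jar"

if not os.path.exists(rapids_jar):
    url = (
        "https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/"
        f"{SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-{SPARK_RAPIDS_VERSION}-cuda{cuda_version}.jar"
    )
    with open(rapids_jar, "wb") as f:
        f.write(requests.get(url).content)
```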