Merge pull request #512 from NVIDIA/branch-23.10

release 23.10 [skip ci]

pxLi authored Nov 8, 2023
2 parents 5dab107 + 83c5f20 commit f6fc5b8

Showing 52 changed files with 2,339 additions and 804 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/auto-merge.yml
@@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE
on:
pull_request_target:
branches:
- branch-23.08
- branch-23.10
types: [closed]

jobs:
@@ -29,14 +29,14 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: branch-23.08 # force to fetch from latest upstream instead of PR ref
ref: branch-23.10 # force to fetch from latest upstream instead of PR ref

- name: auto-merge job
uses: ./.github/workflows/auto-merge
env:
OWNER: NVIDIA
REPO_NAME: spark-rapids-ml
HEAD: branch-23.08
BASE: branch-23.10
HEAD: branch-23.10
BASE: branch-23.12
AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR

2 changes: 1 addition & 1 deletion ci/Dockerfile
@@ -37,6 +37,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
&& conda config --set solver libmamba

# install cuML
ARG CUML_VER=23.08
ARG CUML_VER=23.10
RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER python=3.9 cuda-version=11.8 \
&& conda clean --all -f -y
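
A quick way to confirm the bumped cuML pin actually resolves in a freshly built CI image — the image tag is hypothetical and the build is assumed to run from the repo root:

```bash
# Build the CI image and print the cuML version it installed (hypothetical tag name).
docker build -f ci/Dockerfile -t spark-rapids-ml-ci .
docker run --rm --gpus all spark-rapids-ml-ci \
    python -c "import cuml; print(cuml.__version__)"   # expect a 23.10.x version
```
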
42 changes: 32 additions & 10 deletions ci/docs.sh
@@ -15,25 +15,47 @@
# limitations under the License.
#

# get version tag
TAG=$(git describe --tag)
if [[ $? != 0 ]]; then
echo "Can only deploy from a version tag."
exit 1
if [[ $1 == "nightly" ]]; then
TAG=$(git log -1 --format="%h")
BRANCH=$(git branch --show-current)
else
# get version tag
TAG="v$VERSION"
fi

set -ex

# build and publish docs
pushd docs
make clean
make html
git worktree add --track -b gh-pages _site origin/gh-pages
cp -r build/html/* _site/api/python
cp -r site/* _site

pushd _site
if [[ $1 == "nightly" ]]; then
# draft copy
api_dest=api/python-draft
else
# release copy
api_dest=api/python
# also copy site wide changes for release
cp -r ../site/* .
fi

# in _site
mkdir -p $api_dest
cp -r ../build/html/* $api_dest/

git add --all
git commit -m "${TAG}"
git push origin gh-pages
dff=$(git diff --staged --stat)
repo_url=$(git config --get remote.origin.url)
url=${repo_url#https://}
github_account=${GITHUB_ACCOUNT:-nvauto}
if [[ -n $dff ]]; then
git commit -m "Update draft api docs to commit ${TAG} on ${BRANCH}"
git push -f https://${github_account}:${GITHUB_TOKEN}@${url} gh-pages
fi

popd #_site
git worktree remove _site
git worktree remove _site --force
popd
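
The reworked script now covers two flows: nightly runs publish a draft copy under `api/python-draft` on `gh-pages`, while release runs publish to `api/python` and also refresh the site-wide pages. A hedged sketch of both invocations — the token and version values are placeholders, and the release call assumes `VERSION` is exported by the surrounding release tooling:

```bash
# Nightly (draft) docs, as ci/test.sh invokes it from the repo root:
GITHUB_TOKEN=<token> ci/docs.sh nightly

# Release docs: TAG becomes "v$VERSION" and the build lands in api/python on gh-pages.
VERSION=23.10.0 GITHUB_TOKEN=<token> GITHUB_ACCOUNT=nvauto ci/docs.sh
```
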
18 changes: 11 additions & 7 deletions ci/lint_python.py
@@ -1,11 +1,13 @@
from typing import Dict, List, Tuple

import argparse
from io import StringIO
import os
import subprocess
import sys
from multiprocessing import Pool, cpu_count
from pylint import epylint
from pylint.lint import Run
from pylint.reporters.text import TextReporter

# This script is copied from dmlc/xgboost

@@ -52,14 +52,16 @@ def __init__(self) -> None:
]

def run(self, path: str) -> Tuple[Dict, str, str]:
(pylint_stdout, pylint_stderr) = epylint.py_run(
" ".join([str(path)] + self.pylint_opts), return_std=True
)
emap = {}
err = pylint_stderr.read()

pylint_output = StringIO()
reporter = TextReporter(pylint_output)
Run([str(path)] + self.pylint_opts, reporter=reporter, exit=False)

emap = {}
err = ""

out = []
for line in pylint_stdout:
for line in pylint_output:
out.append(line)
key = line.split(":")[-1].split("(")[0].strip()
if key not in self.pylint_cats:
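
This change swaps the removed `pylint.epylint.py_run` helper for `pylint.lint.Run` with a `TextReporter` writing into an in-memory buffer, so the script keeps parsing plain-text messages. A rough command-line equivalent of what the runner does per path — the target directory and flags are illustrative, not the script's exact `pylint_opts`:

```bash
# Newer pylint releases drop the epylint shim, so use a current version...
pip install --upgrade pylint
# ...and lint the package while collecting every message (roughly mirrors Run(..., exit=False)).
pylint --exit-zero python/src/spark_rapids_ml
```
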
7 changes: 6 additions & 1 deletion ci/test.sh
@@ -22,7 +22,7 @@ case $type in
"pre-merge" | "")
ut_args=""
;;
"nightly")
"nightly" | "release")
ut_args="--runslow"
;;
*)
@@ -45,8 +45,13 @@ pip install -r requirements_dev.txt && pip install -e .
./run_benchmark.sh $bench_args

# check compatibility with Spark 3.3 in nightly run
# also push draft release docs to gh-pages
if [[ $type == "nightly" ]]; then
pip uninstall pyspark -y
pip install pyspark~=3.3.0
./run_benchmark.sh $bench_args
# if everything passed till now update draft release docs in gh-pages
# need to invoke docs.sh from top level of repo
cd .. # top level of repo
ci/docs.sh nightly
fi
2 changes: 1 addition & 1 deletion docker/Dockerfile.pip
@@ -18,7 +18,7 @@ ARG CUDA_VERSION=11.8.0
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

ARG PYSPARK_VERSION=3.3.1
ARG RAPIDS_VERSION=23.8.0
ARG RAPIDS_VERSION=23.10.0
ARG ARCH=amd64
#ARG ARCH=arm64
# Install packages to build spark-rapids-ml
2 changes: 1 addition & 1 deletion docker/Dockerfile.python
@@ -17,7 +17,7 @@
ARG CUDA_VERSION=11.8.0
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04

ARG CUML_VERSION=23.08
ARG CUML_VERSION=23.10

# Install packages to build spark-rapids-ml
RUN apt update -y \
4 changes: 3 additions & 1 deletion docs/site/_config.yml
@@ -51,4 +51,6 @@ exclude:
- vendor/ruby/

include:
- _static
- _static
- _sphinx*

4 changes: 3 additions & 1 deletion docs/site/api/index.md
@@ -4,4 +4,6 @@ nav_order: 5
---
# API Documentation

- [Python API](python)
- Python API
- [Stable Release](python)
- [Draft](python-draft)
4 changes: 3 additions & 1 deletion docs/source/conf.py
@@ -9,7 +9,7 @@
project = 'spark-rapids-ml'
copyright = '2023, NVIDIA'
author = 'NVIDIA'
release = '23.8.0'
release = '23.10.0'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
@@ -41,6 +41,8 @@

html_theme = 'pydata_sphinx_theme'

html_show_sourcelink = False

import inspect
from spark_rapids_ml.utils import _unsupported_methods_attributes

2 changes: 1 addition & 1 deletion jvm/README.md
@@ -95,7 +95,7 @@ repository, usually in your `~/.m2/repository`.
Add the artifact jar to the Spark, for example:
```bash
ML_JAR="target/rapids-4-spark-ml_2.12-23.08.0-SNAPSHOT.jar"
PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.08.1-SNAPSHOT/rapids-4-spark_2.12-23.08.1-SNAPSHOT.jar"
PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.08.2-SNAPSHOT/rapids-4-spark_2.12-23.08.2-SNAPSHOT.jar"
$SPARK_HOME/bin/spark-shell --master $SPARK_MASTER \
--driver-memory 20G \
2 changes: 1 addition & 1 deletion notebooks/aws-emr/init-bootstrap-action.sh
@@ -8,7 +8,7 @@ sudo chmod a+rwx -R /sys/fs/cgroup/devices
sudo yum install -y gcc openssl-devel bzip2-devel libffi-devel tar gzip wget make mysql-devel
sudo bash -c "wget https://www.python.org/ftp/python/3.9.9/Python-3.9.9.tgz && tar xzf Python-3.9.9.tgz && cd Python-3.9.9 && ./configure --enable-optimizations && make altinstall"

RAPIDS_VERSION=23.8.0
RAPIDS_VERSION=23.10.0

# install scikit-learn
sudo /usr/local/bin/pip3.9 install scikit-learn
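
This bootstrap script is the one handed to EMR at cluster creation. A hedged sketch of wiring it in — the bucket name and release label are placeholders, not values taken from this change:

```bash
# Stage the script in S3 and pass it as a bootstrap action (hypothetical bucket/label).
aws s3 cp notebooks/aws-emr/init-bootstrap-action.sh s3://<your-bucket>/init-bootstrap-action.sh
aws emr create-cluster \
    --name spark-rapids-ml \
    --release-label <emr-release> \
    --bootstrap-actions Name="spark-rapids-ml setup",Path="s3://<your-bucket>/init-bootstrap-action.sh"
    # GPU instance groups and Spark configuration omitted from this sketch
```
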
32 changes: 20 additions & 12 deletions notebooks/databricks/README.md
@@ -1,24 +1,24 @@
## Running notebooks on Databricks

If you already have a Databricks account, you can run the example notebooks on a Databricks cluster, as follows:
- Install the [databricks-cli](https://docs.databricks.com/dev-tools/cli/index.html).
- Install the latest [databricks-cli](https://docs.databricks.com/dev-tools/cli/index.html). Note that Databricks has deprecated the legacy Python-based CLI in favor of a self-contained executable. Make sure the new version is first on the executable search PATH after installation (see the PATH check sketched after these instructions).
- Configure it with your workspace URL and an [access token](https://docs.databricks.com/dev-tools/api/latest/authentication.html). For demonstration purposes, we will configure a new [connection profile](https://docs.databricks.com/dev-tools/cli/index.html#connection-profiles) named `spark-rapids-ml`. If you already have a connection profile, just set the `PROFILE` environment variable accordingly and skip the configure step.
```
```bash
export PROFILE=spark-rapids-ml
databricks configure --token --profile ${PROFILE}
```
- Create a zip file for the `spark-rapids-ml` package.
```
```bash
cd spark-rapids-ml/python/src
zip -r spark_rapids_ml.zip spark_rapids_ml
```
- Copy the zip file to DBFS, setting `SAVE_DIR` to the directory of your choice.
```"
```bash
export SAVE_DIR="/path/to/save/artifacts"
databricks fs cp spark_rapids_ml.zip dbfs:${SAVE_DIR}/spark_rapids_ml.zip --profile ${PROFILE}
```
- Edit the [init-pip-cuda-11.8.sh](init-pip-cuda-11.8.sh) init script to set the `SPARK_RAPIDS_ML_ZIP` variable to the DBFS location used above.
```
```bash
cd spark-rapids-ml/notebooks/databricks
sed -i"" -e "s;/path/to/zip/file;${SAVE_DIR}/spark_rapids_ml.zip;" init-pip-cuda-11.8.sh
```
@@ -28,20 +28,25 @@ If you already have a Databricks account, you can run the example notebooks on a
- updates the CUDA runtime to 11.8 (required for Spark Rapids ML dependencies).
- downloads and installs the [Spark-Rapids](https://github.com/NVIDIA/spark-rapids) plugin for accelerating data loading and Spark SQL.
- installs various `cuXX` dependencies via pip.
- Copy the modified `init-pip-cuda-11.8.sh` init script to DBFS.
```
databricks fs cp init-pip-cuda-11.8.sh dbfs:${SAVE_DIR}/init-pip-cuda-11.8.sh --profile ${PROFILE}

**Note**: as of the last update of this README, Azure Databricks requires a CUDA driver forward-compatibility package; uncomment the designated lines for it in the init script. AWS Databricks does not need the package, so leave those lines commented out in that case.

- Copy the modified `init-pip-cuda-11.8.sh` init script to your *workspace* (not DBFS), e.g. to a workspace directory like `/Users/<databricks-user-name>/init_scripts`.
```bash
export WS_SAVE_DIR="/path/to/directory/in/workspace"
databricks workspace mkdirs ${WS_SAVE_DIR} --profile ${PROFILE}
databricks workspace import --format AUTO --content $(base64 -i init-pip-cuda-11.8.sh) ${WS_SAVE_DIR}/init-pip-cuda-11.8.sh --profile ${PROFILE}
```
- Create a cluster using **Databricks 11.3 LTS ML GPU Runtime** using at least two single-gpu workers and add the following configurations to the **Advanced options**.
- Create a cluster using the **Databricks 12.2 LTS ML GPU Runtime** with at least two single-gpu workers, and add the following configurations under **Advanced options**.
- **Init Scripts**
- add the DBFS path to the uploaded init script, e.g. `dbfs:/path/to/save/artifacts/init-pip-cuda-11.8.sh`.
- add the workspace path to the uploaded init script, e.g. `${WS_SAVE_DIR}/init-pip-cuda-11.8.sh`.
- **Spark**
- **Spark config**
```
spark.task.resource.gpu.amount 1
spark.databricks.delta.preview.enabled true
spark.python.worker.reuse true
spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.08.1.jar:/databricks/spark/python
spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.08.2.jar:/databricks/spark/python
spark.sql.execution.arrow.maxRecordsPerBatch 100000
spark.rapids.memory.gpu.minAllocFraction 0.0001
spark.plugins com.nvidia.spark.SQLPlugin
@@ -63,8 +68,11 @@ If you already have a Databricks account, you can run the example notebooks on a
- **Environment variables**
```
LIBCUDF_CUFILE_POLICY=OFF
LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/local/cuda/lib64
NCCL_DEBUG=INFO
```
- **Additional Environment variable for Azure Databricks**
```
LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/local/cuda/lib64
```
- Start the configured cluster.
- Select your workspace and upload the desired [notebook](../) via `Import` in the drop down menu for your workspace.
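
Related to the CLI note at the top of this file: a quick sanity check that the new self-contained CLI, and not the legacy pip-installed one, is what resolves on the PATH — the exact version output format is an assumption:

```bash
type -a databricks     # the new standalone executable should be listed first
databricks --version   # confirms which CLI the shell actually picks up
```
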
29 changes: 14 additions & 15 deletions notebooks/databricks/init-pip-cuda-11.8.sh
@@ -1,11 +1,11 @@
#!/bin/bash
# set portion of path below after /dbfs/ to dbfs zip file location
SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file
# IMPORTANT: specify RAPIDS_VERSION fully 23.8.0 and not 23.8
# IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10
# also RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0)
# while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.1 and not 23.8.1)
RAPIDS_VERSION=23.8.0
SPARK_RAPIDS_VERSION=23.08.1
# while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2)
RAPIDS_VERSION=23.10.0
SPARK_RAPIDS_VERSION=23.08.2

curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar

@@ -14,21 +14,20 @@ wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/
sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit

# install forward compatibility package due to old driver
distro=ubuntu2004
arch=x86_64
apt-key del 7fa2af80
wget https://developer.download.nvidia.com/compute/cuda/repos/$distro/$arch/cuda-keyring_1.0-1_all.deb
dpkg -i cuda-keyring_1.0-1_all.deb
apt-get update
apt-get install -y cuda-compat-11-8

# uncomment below lines on Azure Databricks
# distro=ubuntu2004
# arch=x86_64
# apt-key del 7fa2af80
# wget https://developer.download.nvidia.com/compute/cuda/repos/$distro/$arch/cuda-keyring_1.0-1_all.deb
# dpkg -i cuda-keyring_1.0-1_all.deb
# apt-get update
# apt-get install -y cuda-compat-11-8
# export LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/local/cuda/lib64
# ldconfig

# reset symlink and update library loading paths
# **** set LD_LIBRARY_PATH as below in env var section of cluster config in DB cluster UI ****
rm /usr/local/cuda
ln -s /usr/local/cuda-11.8 /usr/local/cuda
export LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/local/cuda/lib64
ldconfig

# upgrade pip
/databricks/python/bin/pip install --upgrade pip
2 changes: 1 addition & 1 deletion notebooks/dataproc/README.md
@@ -29,7 +29,7 @@ If you already have a Dataproc account, you can run the example notebooks on a D
- Create a cluster with at least two single-gpu workers. **Note**: in addition to the initialization script from above, this also uses the standard [initialization actions](https://github.com/GoogleCloudDataproc/initialization-actions) for installing the GPU drivers and RAPIDS:
```
export CUDA_VERSION=11.8
export RAPIDS_VERSION=23.8.0
export RAPIDS_VERSION=23.10.0
gcloud dataproc clusters create $USER-spark-rapids-ml \
--image-version=2.1-ubuntu \
2 changes: 1 addition & 1 deletion notebooks/dataproc/spark_rapids_ml.sh
@@ -1,6 +1,6 @@
#!/bin/bash

RAPIDS_VERSION=23.8.0
RAPIDS_VERSION=23.10.0

# patch existing packages
mamba install "llvmlite<0.40,>=0.39.0dev0" "numba>=0.56.2"