Main v23.10 release #326

Merged: 8 commits, Nov 2, 2023
8 changes: 4 additions & 4 deletions .github/workflows/auto-merge.yml
@@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE
on:
pull_request_target:
branches:
- branch-23.08
- branch-23.10
types: [closed]

jobs:
@@ -29,14 +29,14 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: branch-23.08 # force to fetch from latest upstream instead of PR ref
ref: branch-23.10 # force to fetch from latest upstream instead of PR ref

- name: auto-merge job
uses: ./.github/workflows/auto-merge
env:
OWNER: NVIDIA
REPO_NAME: spark-rapids-examples
HEAD: branch-23.08
BASE: branch-23.10
HEAD: branch-23.10
BASE: branch-23.12
AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR

@@ -21,7 +21,7 @@ Navigate to your home directory in the UI and select **Create** > **File** from
create an `init.sh` script with the following contents:
```bash
#!/bin/bash
sudo wget -O /databricks/jars/rapids-4-spark_2.12-23.08.1.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.08.1/rapids-4-spark_2.12-23.08.1.jar
sudo wget -O /databricks/jars/rapids-4-spark_2.12-23.10.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.10.0/rapids-4-spark_2.12-23.10.0.jar
```
1. Select the Databricks Runtime Version from one of the supported runtimes specified in the
Prerequisites section.
@@ -68,7 +68,7 @@ create an `init.sh` script with the following contents:
```bash
spark.rapids.sql.python.gpu.enabled true
spark.python.daemon.module rapids.daemon_databricks
spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.08.1.jar:/databricks/spark/python
spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.10.0.jar:/databricks/spark/python
```
Note that the Python memory pool requires the cudf library, so you need to install cudf on each
worker node (`pip install cudf-cu11 --extra-index-url=https://pypi.nvidia.com`) or disable the Python memory pool.
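If you choose to disable the pool instead, a minimal sketch of the extra cluster Spark config, assuming the `spark.rapids.python.memory.gpu.pooling.enabled` key (verify the exact key against the spark-rapids configuration docs for your release):

```bash
# Assumption: this key disables the RAPIDS Python GPU memory pool
spark.rapids.python.memory.gpu.pooling.enabled false
```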
2 changes: 1 addition & 1 deletion docs/get-started/xgboost-examples/csp/databricks/init.sh
@@ -1,7 +1,7 @@
sudo rm -f /databricks/jars/spark--maven-trees--ml--10.x--xgboost-gpu--ml.dmlc--xgboost4j-gpu_2.12--ml.dmlc__xgboost4j-gpu_2.12__1.5.2.jar
sudo rm -f /databricks/jars/spark--maven-trees--ml--10.x--xgboost-gpu--ml.dmlc--xgboost4j-spark-gpu_2.12--ml.dmlc__xgboost4j-spark-gpu_2.12__1.5.2.jar

sudo wget -O /databricks/jars/rapids-4-spark_2.12-23.08.1.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.08.1/rapids-4-spark_2.12-23.08.1.jar
sudo wget -O /databricks/jars/rapids-4-spark_2.12-23.10.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.10.0/rapids-4-spark_2.12-23.10.0.jar
sudo wget -O /databricks/jars/xgboost4j-gpu_2.12-1.7.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.7.1/xgboost4j-gpu_2.12-1.7.1.jar
sudo wget -O /databricks/jars/xgboost4j-spark-gpu_2.12-1.7.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.7.1/xgboost4j-spark-gpu_2.12-1.7.1.jar
ls -ltr
@@ -40,7 +40,7 @@ export SPARK_DOCKER_IMAGE=<gpu spark docker image repo and name>
export SPARK_DOCKER_TAG=<spark docker image tag>

pushd ${SPARK_HOME}
wget https://github.com/NVIDIA/spark-rapids-examples/raw/branch-23.08/dockerfile/Dockerfile
wget https://github.com/NVIDIA/spark-rapids-examples/raw/branch-23.10/dockerfile/Dockerfile

# Optionally install additional jars into ${SPARK_HOME}/jars/

@@ -5,7 +5,7 @@ For simplicity export the location to these jars. All examples assume the packag
### Download the jars

Download the RAPIDS Accelerator for Apache Spark plugin jar
* [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.08.1/rapids-4-spark_2.12-23.08.1.jar)
* [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.10.0/rapids-4-spark_2.12-23.10.0.jar)

### Build XGBoost Python Examples

@@ -5,7 +5,7 @@ For simplicity export the location to these jars. All examples assume the packag
### Download the jars

1. Download the RAPIDS Accelerator for Apache Spark plugin jar
* [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.08.1/rapids-4-spark_2.12-23.08.1.jar)
* [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.10.0/rapids-4-spark_2.12-23.10.0.jar)

### Build XGBoost Scala Examples

3 changes: 2 additions & 1 deletion examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile
@@ -17,7 +17,8 @@

ARG CUDA_VER=11.8.0
FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04
ARG BRANCH_VER=23.08
# Please do not update the BRANCH_VER version
ARG BRANCH_VER=23.10

RUN apt-get update
RUN apt-get install -y wget ninja-build git
6 changes: 3 additions & 3 deletions examples/ML+DL-Examples/Spark-cuML/pca/README.md
@@ -12,9 +12,9 @@ User can also download the release jar from Maven central:

[rapids-4-spark-ml_2.12-22.02.0-cuda11.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-ml_2.12/22.02.0/rapids-4-spark-ml_2.12-22.02.0-cuda11.jar)

[rapids-4-spark_2.12-23.08.1.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.08.1/rapids-4-spark_2.12-23.08.1.jar)
[rapids-4-spark_2.12-23.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.10.0/rapids-4-spark_2.12-23.10.0.jar)

Note: This demo could only work with v22.02.0 version.
Note: This demo only works with the v22.02.0 spark-ml version and is only compatible with spark-rapids versions prior to 23.10.0. Please do not update the version in the release.

## Sample code

@@ -49,7 +49,7 @@ It is assumed that a Standalone Spark cluster has been set up, the `SPARK_MASTER

``` bash
RAPIDS_ML_JAR=PATH_TO_rapids-4-spark-ml_2.12-22.02.0-cuda11.jar
PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-23.08.1.jar
PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-23.10.0.jar

jupyter toree install \
--spark_home=${SPARK_HOME} \
4 changes: 2 additions & 2 deletions examples/ML+DL-Examples/Spark-cuML/pca/pom.xml
@@ -20,8 +20,7 @@

<groupId>com.nvidia</groupId>
<artifactId>PCAExample</artifactId>
<packaging>jar</packaging>
<version>23.08.0</version>
<version>23.10.0</version>

<properties>
<maven.compiler.source>8</maven.compiler.source>
@@ -51,6 +50,7 @@
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-ml_2.12</artifactId>
<!--The last rapids-4-spark-ml release version is 22.02.0 and the snapshot version is 23.04.0-SNAPSHOT. Please do not update the version-->
<version>23.02.0</version>
</dependency>
</dependencies>
5 changes: 3 additions & 2 deletions examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh
@@ -15,8 +15,9 @@
# limitations under the License.
#

# Note that the last rapids-4-spark-ml release version is 22.02.0 and the snapshot version is 23.04.0-SNAPSHOT; please do not update the version in the release
ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/23.04.0-SNAPSHOT/rapids-4-spark-ml_2.12-23.04.0-SNAPSHOT.jar
PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.08.0-SNAPSHOT/rapids-4-spark_2.12-23.08.0-SNAPSHOT.jar
PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.10.0-SNAPSHOT/rapids-4-spark_2.12-23.10.0-SNAPSHOT.jar

$SPARK_HOME/bin/spark-submit \
@@ -39,4 +40,4 @@ $SPARK_HOME/bin/spark-submit \
--conf spark.network.timeout=1000s \
--jars $ML_JAR,$PLUGIN_JAR \
--class com.nvidia.spark.examples.pca.Main \
/workspace/target/PCAExample-23.08.0-SNAPSHOT.jar
/workspace/target/PCAExample-23.10.0-SNAPSHOT.jar
@@ -22,7 +22,7 @@
"import os\n",
"# Change to your cluster ip:port and directories\n",
"SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"spark:your-ip:port\")\n",
"RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-path/rapids-4-spark_2.12-23.08.1.jar\")\n"
"RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-path/rapids-4-spark_2.12-23.10.0.jar\")\n"
]
},
{
112 changes: 83 additions & 29 deletions examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md
@@ -1,8 +1,30 @@
# RAPIDS Accelerated UDF Examples

This project contains sample implementations of RAPIDS accelerated user-defined functions.

The ideal solution would be to replace the UDF with a series of DataFrame or SQL operations. If that
is not possible, we also provide
a [UDF compiler extension](https://nvidia.github.io/spark-rapids/docs/additional-functionality/udf-to-catalyst-expressions.html)
to translate UDFs to Catalyst expressions. The extension supports compiling only simple
operations, like the example sketched below; for more complicated cases, you can implement a RAPIDS accelerated UDF.
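
For instance, a UDF along the following lines is simple enough for the compiler extension to
translate into a Catalyst expression (a hypothetical example; see the extension docs for the exact
set of supported operations):

```scala
import org.apache.spark.sql.functions.{col, udf}

// A trivial arithmetic lambda: the kind of UDF the compiler extension can translate.
// `df` is assumed to be an existing DataFrame with an integer column "x".
val plusOne = udf((x: Int) => x + 1)
val result = df.withColumn("y", plusOne(col("x")))
```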

## Spark Scala UDF Examples

[URLDecode](src/main/scala/com/nvidia/spark/rapids/udf/scala/URLDecode.scala)
is the simplest demo for getting started. As the code shows, there is an original CPU
implementation provided by the `apply` method, and we only need to implement the `RapidsUDF`
interface, which provides a single method to override:
`evaluateColumnar`. The CPU URLDecode function processes the input row by row, but the GPU
`evaluateColumnar` returns a cudf `ColumnVector`, because the GPU gets its speed by performing
operations on many rows at a time. In the `evaluateColumnar` function we leverage an existing cudf
implementation of URL decode, so we don't need to write any native C++ code. This is all done
through the [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable). The benefit of
implementing via the Java API is ease of development, but the memory model is not friendly for GPU
operations, because the JVM assumes that everything we're trying to do is in heap memory, so we
need to free GPU resources in a timely manner with try-finally blocks. Note that we need to
implement both CPU and GPU functions so the UDF will still work if a higher-level operation
involving the RAPIDS accelerated UDF falls back to the CPU.
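
To make the pattern concrete, here is a minimal sketch of a columnar string UDF in the same style,
assuming the 23.x `RapidsUDF` signature that takes a leading row count (the class and the
`strip`/`lower` cudf calls are illustrative, not part of this repo):

```scala
import ai.rapids.cudf.ColumnVector
import com.nvidia.spark.RapidsUDF

/** Hypothetical UDF: trims whitespace and lower-cases strings. */
class TrimLower extends Function1[String, String] with RapidsUDF with Serializable {
  // CPU row-by-row implementation, used when the plan falls back to the CPU
  override def apply(s: String): String = if (s == null) null else s.trim.toLowerCase

  // GPU columnar implementation: one call processes every row in the batch
  override def evaluateColumnar(numRows: Int, args: ColumnVector*): ColumnVector = {
    require(args.length == 1, s"Unexpected argument count: ${args.length}")
    val strs = args.head
    val trimmed = strs.strip() // intermediate column holding device memory
    try {
      trimmed.lower()          // result column; ownership passes to the caller
    } finally {
      trimmed.close()          // free the intermediate promptly; the JVM GC will not
    }
  }
}
```

The try-finally around the intermediate column is the key point from the paragraph above: every
cudf column owns device memory that must be closed explicitly.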

- [URLDecode](src/main/scala/com/nvidia/spark/rapids/udf/scala/URLDecode.scala)
decodes URL-encoded strings using the
[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
@@ -12,6 +34,23 @@ This project contains sample implementations of RAPIDS accelerated user-defined

## Spark Java UDF Examples

Below are some examples of implementing a RAPIDS accelerated Java UDF via JNI and native code. If
there is no existing Java API we can leverage, we can write custom native code.
Take [CosineSimilarity](src/main/java/com/nvidia/spark/rapids/udf/java/CosineSimilarity.java) as an
example: the Java class for the UDF is similar to the previous URLDecode/URLEncode demos. We
implement a cosineSimilarity function in C++ and cross into the native code as quickly as possible,
because it is easier to write the code safely there. The native code `reinterpret_cast`s the input
to a column view, does some sanity checking, converts to list column views, computes the cosine
similarity, and finally returns a unique pointer to a column, releasing the underlying resources.
On the Java side we wrap the result in a column vector and own that resource.
In `cosine_similarity.cu` we implement the computation as the actual CUDA kernel, where we can
leverage the [Thrust template library](https://docs.nvidia.com/cuda/thrust/index.html) to write
standard algorithms as GPU-parallelized code. The benefit of implementing the UDF in native code is
maximum control over GPU memory utilization and performance. However, the trade-off is a more
complicated build environment, as we need to build against libcudf, with significantly longer build
times. Implementing a RAPIDS accelerated UDF in native code is a significant effort.

- [URLDecode](src/main/java/com/nvidia/spark/rapids/udf/java/URLDecode.java)
decodes URL-encoded strings using the
[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
@@ -24,6 +63,8 @@ This project contains sample implementations of RAPIDS accelerated user-defined

## Hive UDF Examples

Below are some examples of implementing a RAPIDS accelerated Hive UDF via JNI and native code.

- [URLDecode](src/main/java/com/nvidia/spark/rapids/udf/hive/URLDecode.java)
implements a Hive simple UDF using the
[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
@@ -37,78 +78,93 @@ This project contains sample implementations of RAPIDS accelerated user-defined
[native code](src/main/cpp/src) to count words in strings

## Building and running the tests without Native Code Examples

Some UDF examples use native code in their implementation. Building the native code requires a
libcudf build environment, so these examples do not build by default.

### Prerequisites

Download [Apache Spark](https://spark.apache.org/downloads.html) and set the `SPARK_HOME` environment
variable. Install Python 3.8+, then install `pytest` and `sre_yield` using pip or conda. For
example:

```
export SPARK_HOME=path-to-spark
pip install pytest
pip install pyspark
pip install sre_yield
pip install findspark
```

Run the following commands to build and run the tests:

```bash
cd spark-rapids-examples/examples/UDF-Examples/RAPIDS-accelerated-UDFs
mvn clean package
./run_pyspark_from_build.sh -m "not rapids_udf_example_native"
```

## Building with Native Code Examples and run test cases

The `udf-native-examples` Maven profile can be used to include the native UDF examples in the build,
i.e.: specify
`-Pudf-native-examples` on the `mvn` command-line.

### Creating a libcudf Build Environment

The `Dockerfile` in this directory can be used to set up a Docker image that provides a libcudf build
environment. This repository will either need to be cloned or mounted into a container using that
Docker image. The `Dockerfile` contains build arguments to control the Linux version, CUDA version,
and other settings. See the top of the `Dockerfile` for details.

First install Docker and [nvidia-docker](https://github.com/NVIDIA/nvidia-docker).

Run the following commands to build and start a Docker container:

```bash
cd spark-rapids-examples/examples/UDF-Examples/RAPIDS-accelerated-UDFs
docker build -t my-local:my-udf-example-ubuntu .
nvidia-docker run -it my-local:my-udf-example-ubuntu
```
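
If you need a different base image, the `Dockerfile` build arguments can be overridden on the
command line; a sketch, assuming a `CUDA_VER` build argument (check the top of the `Dockerfile` for
the actual argument names):

```bash
# Override the CUDA version baked into the build image (argument name is an assumption)
docker build -t my-local:my-udf-example-ubuntu --build-arg CUDA_VER=11.8.0 .
```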

### Build the udf-examples jar

In the Docker container, clone the code and compile.

```bash
git clone https://github.com/NVIDIA/spark-rapids-examples.git
cd spark-rapids-examples/examples/UDF-Examples/RAPIDS-accelerated-UDFs
mvn clean package -Pudf-native-examples
```

The build can take a long time (e.g. 1.5 hours), after which rapids-4-spark-udf-examples*.jar is
generated under the RAPIDS-accelerated-UDFs/target directory.

### Run all the examples including native examples in the Docker container
Set the `SPARK_HOME` environment variable and install the Python dependencies; see the
[Prerequisites section](#prerequisites) above.

```
export SPARK_HOME=path-to-spark
pip install pytest
pip install sre_yield
```

Run the following command to run the tests:

```
./run_pyspark_from_build.sh
```

## How to run the Native UDFs in Spark local mode
First finish the steps in "Building with Native Code Examples and run test cases" section, then do the following in the docker.

First finish the steps in the
[Building with Native Code Examples and run test cases](#building-with-native-code-examples-and-run-test-cases) section,
then do the following inside the Docker container.

### Get jars from Maven Central
[rapids-4-spark_2.12-23.08.1.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.08.1/rapids-4-spark_2.12-23.08.1.jar)

[rapids-4-spark_2.12-23.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.10.0/rapids-4-spark_2.12-23.10.0.jar)


### Launch a local mode Spark
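
A minimal sketch of what the launch can look like, assuming the plugin jar downloaded above and an
illustrative name for the udf-examples jar built earlier (adjust paths and versions to your
environment):

```bash
# Local-mode Spark with the RAPIDS plugin and the UDF examples jar on the classpath
$SPARK_HOME/bin/spark-shell --master local[*] \
  --conf spark.plugins=com.nvidia.spark.SQLPlugin \
  --jars rapids-4-spark_2.12-23.10.0.jar,target/rapids-4-spark-udf-examples_2.12-23.10.0.jar
```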

Expand Down Expand Up @@ -151,5 +207,3 @@ spark.sql("CREATE TEMPORARY FUNCTION {} AS '{}'".format("wordcount", "com.nvidia
spark.sql("select wordcount(c1) from tab group by c1").show()
spark.sql("select wordcount(c1) from tab group by c1").explain()
```

4 changes: 2 additions & 2 deletions examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml
@@ -25,7 +25,7 @@
user defined functions for use with the RAPIDS Accelerator
for Apache Spark
</description>
<version>23.08.0</version>
<version>23.10.0</version>

<properties>
<maven.compiler.source>1.8</maven.compiler.source>
@@ -37,7 +37,7 @@
<cuda.version>cuda11</cuda.version>
<scala.binary.version>2.12</scala.binary.version>
<!-- Depends on the release version; snapshot versions are not published to Maven Central -->
<rapids4spark.version>23.08.1</rapids4spark.version>
<rapids4spark.version>23.10.0</rapids4spark.version>
<spark.version>3.1.1</spark.version>
<scala.version>2.12.15</scala.version>
<udf.native.build.path>${project.build.directory}/cpp-build</udf.native.build.path>
@@ -49,7 +49,7 @@ else
"$SCRIPTPATH"
"$SCRIPTPATH"/src/main/python)

# --ignore=target is used to exclude the target directory whihch contains unrelated python files.
# --ignore=target is used to exclude the target directory which contains unrelated python files.
TEST_COMMON_OPTS=(-v
-rfExXs
"$TEST_ARGS"