diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index 33bf4c5c6..035c57a9c 100644 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE on: pull_request_target: branches: - - branch-22.10 + - branch-22.12 types: [closed] jobs: @@ -27,15 +27,15 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: - ref: branch-22.10 # force to fetch from latest upstream instead of PR ref + ref: branch-22.12 # force to fetch from latest upstream instead of PR ref - name: auto-merge job uses: ./.github/workflows/auto-merge env: OWNER: NVIDIA REPO_NAME: spark-rapids-examples - HEAD: branch-22.10 - BASE: branch-22.12 + HEAD: branch-22.12 + BASE: branch-23.02 AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR diff --git a/README.md b/README.md index a84a738eb..6c4df4ca5 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ There are broadly four categories of examples in this repo: 2. [Spark XGBoost](./examples/XGBoost-Examples) 3. [Deep Learning/Machine Learning](./examples/ML+DL-Examples) 4. [RAPIDS UDF](./examples/UDF-Examples) +5. [Databricks Tools demo notebooks](./tools/databricks) For more information on each of the examples please look into respective categories. diff --git a/docs/get-started/xgboost-examples/csp/aws/ec2.md b/docs/get-started/xgboost-examples/csp/aws/ec2.md index b64fa7a77..0565ce601 100644 --- a/docs/get-started/xgboost-examples/csp/aws/ec2.md +++ b/docs/get-started/xgboost-examples/csp/aws/ec2.md @@ -177,8 +177,8 @@ spark-submit --master spark://$HOSTNAME:7077 \ ${SAMPLE_JAR} \ -num_workers=${NUM_EXECUTORS} \ -format=csv \ - -dataPath="train::s3a://spark-xgboost-mortgage-dataset/csv/train/2000Q1" \ - -dataPath="trans::s3a://spark-xgboost-mortgage-dataset/csv/eval/2000Q1" \ + -dataPath="train::your-train-data-path" \ + -dataPath="trans::your-eval-data-path" \ -numRound=100 -max_depth=8 -nthread=$NUM_EXECUTOR_CORES -showFeatures=0 \ -tree_method=gpu_hist ``` diff --git a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb index f056dfdf9..09033b8e0 100644 --- a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb +++ b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb @@ -24,9 +24,9 @@ "source": [ "%sh\n", "cd ../../dbfs/FileStore/jars/\n", - "sudo wget -O rapids-4-spark_2.12-22.10.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar\n", - "sudo wget -O xgboost4j-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.6.1/xgboost4j-gpu_2.12-1.6.1.jar\n", - "sudo wget -O xgboost4j-spark-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.6.1/xgboost4j-spark-gpu_2.12-1.6.1.jar\n", + "sudo wget -O rapids-4-spark_2.12-22.12.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar\n", + "sudo wget -O xgboost4j-gpu_2.12-1.7.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.7.1/xgboost4j-gpu_2.12-1.7.1.jar\n", + "sudo wget -O xgboost4j-spark-gpu_2.12-1.7.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.7.1/xgboost4j-spark-gpu_2.12-1.7.1.jar\n", "ls -ltr\n", "\n", "# Your Jars are downloaded in dbfs:/FileStore/jars directory" @@ -59,9 +59,9 @@ "sudo rm -f 
/databricks/jars/spark--maven-trees--ml--10.x--xgboost-gpu--ml.dmlc--xgboost4j-gpu_2.12--ml.dmlc__xgboost4j-gpu_2.12__1.5.2.jar\n", "sudo rm -f /databricks/jars/spark--maven-trees--ml--10.x--xgboost-gpu--ml.dmlc--xgboost4j-spark-gpu_2.12--ml.dmlc__xgboost4j-spark-gpu_2.12__1.5.2.jar\n", "\n", - "sudo cp /dbfs/FileStore/jars/xgboost4j-gpu_2.12-1.6.1.jar /databricks/jars/\n", - "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.10.0.jar /databricks/jars/\n", - "sudo cp /dbfs/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar /databricks/jars/\"\"\", True)" + "sudo cp /dbfs/FileStore/jars/xgboost4j-gpu_2.12-1.7.1.jar /databricks/jars/\n", + "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.12.0.jar /databricks/jars/\n", + "sudo cp /dbfs/FileStore/jars/xgboost4j-spark-gpu_2.12-1.7.1.jar /databricks/jars/\"\"\", True)" ] }, { @@ -132,8 +132,8 @@ "\n", "1. Edit your cluster, adding an initialization script from `dbfs:/databricks/init_scripts/init.sh` in the \"Advanced Options\" under \"Init Scripts\" tab\n", "2. Reboot the cluster\n", - "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", - "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", + "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark-gpu_2.12-1.7.1.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", + "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.12/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", "5. 
Inside the mortgage example notebook, update the data paths\n", " `train_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-train.csv')`\n", " `trans_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-trans.csv')`" diff --git a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb index 772453e39..b0799d5c1 100644 --- a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb +++ b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb @@ -24,9 +24,9 @@ "source": [ "%sh\n", "cd ../../dbfs/FileStore/jars/\n", - "sudo wget -O rapids-4-spark_2.12-22.10.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar\n", - "sudo wget -O xgboost4j-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.6.1/xgboost4j-gpu_2.12-1.6.1.jar\n", - "sudo wget -O xgboost4j-spark-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.6.1/xgboost4j-spark-gpu_2.12-1.6.1.jar\n", + "sudo wget -O rapids-4-spark_2.12-22.12.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar\n", + "sudo wget -O xgboost4j-gpu_2.12-1.7.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.7.1/xgboost4j-gpu_2.12-1.7.1.jar\n", + "sudo wget -O xgboost4j-spark-gpu_2.12-1.7.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.7.1/xgboost4j-spark-gpu_2.12-1.7.1.jar\n", "ls -ltr\n", "\n", "# Your Jars are downloaded in dbfs:/FileStore/jars directory" @@ -59,9 +59,9 @@ "sudo rm -f /databricks/jars/spark--maven-trees--ml--9.x--xgboost-gpu--ml.dmlc--xgboost4j-gpu_2.12--ml.dmlc__xgboost4j-gpu_2.12__1.4.1.jar\n", "sudo rm -f /databricks/jars/spark--maven-trees--ml--9.x--xgboost-gpu--ml.dmlc--xgboost4j-spark-gpu_2.12--ml.dmlc__xgboost4j-spark-gpu_2.12__1.4.1.jar\n", "\n", - "sudo cp /dbfs/FileStore/jars/xgboost4j-gpu_2.12-1.6.1.jar /databricks/jars/\n", - "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.10.0.jar /databricks/jars/\n", - "sudo cp /dbfs/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar /databricks/jars/\"\"\", True)" + "sudo cp /dbfs/FileStore/jars/xgboost4j-gpu_2.12-1.7.1.jar /databricks/jars/\n", + "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.12.0.jar /databricks/jars/\n", + "sudo cp /dbfs/FileStore/jars/xgboost4j-spark-gpu_2.12-1.7.1.jar /databricks/jars/\"\"\", True)" ] }, { @@ -132,8 +132,8 @@ "\n", "1. Edit your cluster, adding an initialization script from `dbfs:/databricks/init_scripts/init.sh` in the \"Advanced Options\" under \"Init Scripts\" tab\n", "2. Reboot the cluster\n", - "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", - "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", + "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark-gpu_2.12-1.7.1.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", + "4. 
Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.12/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n",
    "5. Inside the mortgage example notebook, update the data paths\n",
    "   `train_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-train.csv')`\n",
    "   `trans_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-trans.csv')`"
diff --git a/docs/get-started/xgboost-examples/notebook/python-notebook.md b/docs/get-started/xgboost-examples/notebook/python-notebook.md
index 3bfd71174..c8cf57c3c 100644
--- a/docs/get-started/xgboost-examples/notebook/python-notebook.md
+++ b/docs/get-started/xgboost-examples/notebook/python-notebook.md
@@ -67,7 +67,3 @@ and the home directory for Apache Spark respectively.
 - Mortgage ETL Notebook: [Python](../../../../examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb)
 - Taxi ETL Notebook: [Python](../../../../examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb)
 - Note: Agaricus does not have ETL part.
-
-For PySpark based XGBoost, please refer to the
-[Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.04/docs/get-started/xgboost-examples/notebook/python-notebook.md)
-that uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/).
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md b/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md
index 54a251fd1..9a869d59e 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md
@@ -40,7 +40,7 @@ export SPARK_DOCKER_IMAGE=
 export SPARK_DOCKER_TAG=
 
 pushd ${SPARK_HOME}
-wget https://github.com/NVIDIA/spark-rapids-examples/raw/branch-22.10/dockerfile/Dockerfile
+wget https://github.com/NVIDIA/spark-rapids-examples/raw/branch-22.12/dockerfile/Dockerfile
 
 # Optionally install additional jars into ${SPARK_HOME}/jars/
 
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md
index 6132a7563..b41824fe2 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md
@@ -12,11 +12,13 @@ Prerequisites
   * Multi-node clusters with homogenous GPU configuration
   * Software Requirements
     * Ubuntu 18.04, 20.04/CentOS7, CentOS8
-    * CUDA 11.0+
+    * CUDA 11.5+
     * NVIDIA driver compatible with your CUDA
     * NCCL 2.7.8+
-    * Python 3.6+
+    * Python 3.8 or 3.9
     * NumPy
+    * XGBoost 1.7.0+
+    * cudf-cu11
 
 The number of GPUs in each host dictates the number of Spark executors that can run there.
 Additionally, cores per Spark executor and cores per Spark task must match, such that each executor can run 1 task at any given time.
 
@@ -47,6 +49,14 @@ And here are the steps to enable the GPU resources discovery for Spark 3.1+.
    spark.worker.resource.gpu.amount 1
    spark.worker.resource.gpu.discoveryScript ${SPARK_HOME}/examples/src/main/scripts/getGpusResources.sh
    ```
+3. Install the XGBoost, cudf-cu11, numpy libraries on all nodes before running the XGBoost application.
+
+``` bash
+pip install xgboost
+pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+pip install numpy
+pip install scikit-learn
+```
 
 Get Application Files, Jar and Dataset
 -------------------------------
@@ -182,6 +192,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.gpu_main
 
 # tree construction algorithm
 export TREE_METHOD=gpu_hist
+
+# if you enabled the archived python environment
+export PYSPARK_DRIVER_PYTHON=python
+export PYSPARK_PYTHON=./environment/bin/python
 ```
 
 Run spark-submit:
 
@@ -197,8 +211,9 @@ ${SPARK_HOME}/bin/spark-submit
     --driver-memory ${SPARK_DRIVER_MEMORY} \
     --executor-memory ${SPARK_EXECUTOR_MEMORY} \
     --conf spark.cores.max=${TOTAL_CORES} \
-    --jars ${RAPIDS_JAR},${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR} \
-    --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
+    --archives your_pyspark_venv.tar.gz#environment \
+    --jars ${RAPIDS_JAR} \
+    --py-files ${SAMPLE_ZIP} \
     ${MAIN_PY} \
     --mainClass=${EXAMPLE_CLASS} \
     --dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \
@@ -261,6 +276,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.cpu_main
 
 # tree construction algorithm
 export TREE_METHOD=hist
+
+# if you enabled the archived python environment
+export PYSPARK_DRIVER_PYTHON=python
+export PYSPARK_PYTHON=./environment/bin/python
 ```
 
 This is the same command as for the GPU example, repeated for convenience:
 
@@ -271,8 +290,9 @@ ${SPARK_HOME}/bin/spark-submit
     --driver-memory ${SPARK_DRIVER_MEMORY} \
     --executor-memory ${SPARK_EXECUTOR_MEMORY} \
     --conf spark.cores.max=${TOTAL_CORES} \
-    --jars ${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR} \
-    --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
+    --archives your_pyspark_venv.tar.gz#environment \
+    --jars ${RAPIDS_JAR} \
+    --py-files ${SAMPLE_ZIP} \
     ${SPARK_PYTHON_ENTRYPOINT} \
     --mainClass=${EXAMPLE_CLASS} \
     --dataPath=train::${DATA_PATH}/mortgage/output/train/ \
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md
index 9d92da01a..f2bff0fdd 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md
@@ -12,12 +12,14 @@ Prerequisites
   * Multi-node clusters with homogenous GPU configuration
   * Software Requirements
     * Ubuntu 18.04, 20.04/CentOS7, CentOS8
-    * CUDA 11.0+
+    * CUDA 11.5+
     * NVIDIA driver compatible with your CUDA
     * NCCL 2.7.8+
-    * Python 3.6+
+    * Python 3.8 or 3.9
     * NumPy
-
+    * XGBoost 1.7.0+
+    * cudf-cu11
+
 The number of GPUs per NodeManager dictates the number of Spark executors that can run in that NodeManager.
 Additionally, cores per Spark executor and cores per Spark task must match, such that each executor can run 1 task at any given time.
 
@@ -32,6 +34,32 @@ We use `SPARK_HOME` environment variable to point to the Apache Spark cluster.
 
 And as to how to enable GPU scheduling and isolation for Yarn,
 please refer to [here](https://hadoop.apache.org/docs/r3.1.0/hadoop-yarn/hadoop-yarn-site/UsingGpus.html).
 
+Please make sure to install the XGBoost, cudf-cu11, numpy libraries on all nodes before running the XGBoost application.
+``` bash
+pip install xgboost
+pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+pip install numpy
+pip install scikit-learn
+```
+You can also create an isolated python environment by using [Virtualenv](https://virtualenv.pypa.io/en/latest/),
+and then ship the archive file and unpack/enable the environment on the executors
+by leveraging the --archives option or the spark.archives configuration.
+``` bash
+# create an isolated python environment and install libraries
+python -m venv pyspark_venv
+source pyspark_venv/bin/activate
+pip install xgboost
+pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+pip install numpy
+pip install scikit-learn
+# venv-pack packages the environment into an archive
+pip install venv-pack
+venv-pack -o pyspark_venv.tar.gz
+
+# enable archive python environment on executors
+export PYSPARK_DRIVER_PYTHON=python # Do not set in cluster modes.
+export PYSPARK_PYTHON=./environment/bin/python
+spark-submit --archives pyspark_venv.tar.gz#environment app.py
+```
+
 Get Application Files, Jar and Dataset
 -------------------------------
@@ -114,6 +142,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.gpu_main
 
 # tree construction algorithm
 export TREE_METHOD=gpu_hist
+
+# if you enabled the archived python environment
+export PYSPARK_DRIVER_PYTHON=python
+export PYSPARK_PYTHON=./environment/bin/python
 ```
 
 Run spark-submit:
 
@@ -129,11 +161,12 @@ ${SPARK_HOME}/bin/spark-submit
     --files ${SPARK_HOME}/examples/src/main/scripts/getGpusResources.sh \
     --master yarn \
     --deploy-mode ${SPARK_DEPLOY_MODE} \
+    --archives your_pyspark_venv.tar.gz#environment \
     --num-executors ${SPARK_NUM_EXECUTORS} \
     --driver-memory ${SPARK_DRIVER_MEMORY} \
     --executor-memory ${SPARK_EXECUTOR_MEMORY} \
-    --jars ${RAPIDS_JAR},${XGBOOST4J_JAR} \
-    --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
+    --jars ${RAPIDS_JAR} \
+    --py-files ${SAMPLE_ZIP} \
     ${MAIN_PY} \
     --mainClass=${EXAMPLE_CLASS} \
     --dataPath=train::${DATA_PATH}/mortgage/out/train/ \
@@ -190,6 +223,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.cpu_main
 
 # tree construction algorithm
 export TREE_METHOD=hist
+
+# if you enabled the archived python environment
+export PYSPARK_DRIVER_PYTHON=python
+export PYSPARK_PYTHON=./environment/bin/python
 ```
 
 This is the same command as for the GPU example, repeated for convenience:
 
@@ -197,12 +234,13 @@ This is the same command as for the GPU example, repeated for convenience:
 ``` bash
 ${SPARK_HOME}/bin/spark-submit \
     --master yarn \
+    --archives your_pyspark_venv.tar.gz#environment \
     --deploy-mode ${SPARK_DEPLOY_MODE} \
     --num-executors ${SPARK_NUM_EXECUTORS} \
     --driver-memory ${SPARK_DRIVER_MEMORY} \
     --executor-memory ${SPARK_EXECUTOR_MEMORY} \
-    --jars ${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR} \
-    --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
+    --jars ${RAPIDS_JAR} \
+    --py-files ${SAMPLE_ZIP} \
     ${MAIN_PY} \
     --mainClass=${EXAMPLE_CLASS} \
     --dataPath=train::${DATA_PATH}/mortgage/output/train/ \
diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md
index ca9442f44..2178d6d75 100644
--- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md
+++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md
@@ -9,7 +9,7 @@ For simplicity export the location to these jars.
All examples assume the packag * [XGBoost4j-Spark Package](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/) 2. Download the RAPIDS Accelerator for Apache Spark plugin jar - * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) + * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar) ### Build XGBoost Python Examples @@ -21,14 +21,3 @@ You need to copy the dataset to `/opt/xgboost`. Use the following links to downl 1. [Mortgage dataset](/docs/get-started/xgboost-examples/dataset/mortgage.md) 2. [Taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) 3. [Agaricus dataset](https://gust.dev/r/xgboost-agaricus) - -### Setup environments - -``` bash -export SPARK_XGBOOST_DIR=/opt/xgboost -export RAPIDS_JAR=${SPARK_XGBOOST_DIR}/rapids-4-spark_2.12-22.10.0.jar -export XGBOOST4J_JAR=${SPARK_XGBOOST_DIR}/xgboost4j_3.0-1.4.2-0.3.0.jar -export XGBOOST4J_SPARK_JAR=${SPARK_XGBOOST_DIR}/xgboost4j-spark_3.0-1.4.2-0.3.0.jar -export SAMPLE_ZIP=${SPARK_XGBOOST_DIR}/samples.zip -export MAIN_PY=${SPARK_XGBOOST_DIR}/main.py -``` diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md index 5bdc4f7cc..2303fdfe0 100644 --- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md +++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md @@ -5,7 +5,7 @@ For simplicity export the location to these jars. All examples assume the packag ### Download the jars 1. Download the RAPIDS Accelerator for Apache Spark plugin jar - * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) + * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar) ### Build XGBoost Scala Examples @@ -17,11 +17,3 @@ You need to copy the dataset to `/opt/xgboost`. Use the following links to downl 1. [Mortgage dataset](/docs/get-started/xgboost-examples/dataset/mortgage.md) 2. [Taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) 3. 
[Agaricus dataset](https://gust.dev/r/xgboost-agaricus) - -### Setup environments - -``` bash -export SPARK_XGBOOST_DIR=/opt/xgboost -export RAPIDS_JAR=${SPARK_XGBOOST_DIR}/rapids-4-spark_2.12-22.10.0.jar -export SAMPLE_JAR=${SPARK_XGBOOST_DIR}/sample_xgboost_apps-0.2.3-jar-with-dependencies.jar -``` diff --git a/docs/img/guides/mortgage-perf.png b/docs/img/guides/mortgage-perf.png index 23715ce9a..11c94865a 100644 Binary files a/docs/img/guides/mortgage-perf.png and b/docs/img/guides/mortgage-perf.png differ diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile b/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile index ba511c45f..9b9c6fd58 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile +++ b/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile @@ -17,7 +17,7 @@ ARG CUDA_VER=11.5.1 FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04 -ARG BRANCH_VER=22.10 +ARG BRANCH_VER=22.12 RUN apt-get update RUN apt-get install -y wget ninja-build git diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/README.md b/examples/ML+DL-Examples/Spark-cuML/pca/README.md index 1086c6907..ca573aaf1 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/README.md +++ b/examples/ML+DL-Examples/Spark-cuML/pca/README.md @@ -12,7 +12,7 @@ User can also download the release jar from Maven central: [rapids-4-spark-ml_2.12-22.02.0-cuda11.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-ml_2.12/22.02.0/rapids-4-spark-ml_2.12-22.02.0-cuda11.jar) -[rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) +[rapids-4-spark_2.12-22.12.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar) ## Sample code @@ -48,7 +48,7 @@ It is assumed that a Standalone Spark cluster has been set up, the `SPARK_MASTER ``` bash RAPIDS_ML_JAR=PATH_TO_rapids-4-spark-ml_2.12-22.02.0-cuda11.jar - PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-22.10.0.jar + PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-22.12.0.jar jupyter toree install \ --spark_home=${SPARK_HOME} \ diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml index 875ada38a..9cc790476 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml +++ b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml @@ -21,7 +21,7 @@ com.nvidia PCAExample jar - 22.10.0-SNAPSHOT + 22.12.0-SNAPSHOT 8 @@ -51,7 +51,7 @@ com.nvidia rapids-4-spark-ml_2.12 - 22.10.0-SNAPSHOT + 22.12.0-SNAPSHOT diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh index a167ad0cc..f5b287351 100755 --- a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh +++ b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh @@ -15,8 +15,8 @@ # limitations under the License. 
# -ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/22.10.0-SNAPSHOT/rapids-4-spark-ml_2.12-22.10.0-SNAPSHOT.jar -PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/22.10.0-SNAPSHOT/rapids-4-spark_2.12-22.10.0-SNAPSHOT.jar +ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/22.12.0-SNAPSHOT/rapids-4-spark-ml_2.12-22.12.0-SNAPSHOT.jar +PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/22.12.0-SNAPSHOT/rapids-4-spark_2.12-22.12.0-SNAPSHOT.jar $SPARK_HOME/bin/spark-submit \ --master spark://127.0.0.1:7077 \ @@ -38,4 +38,4 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.network.timeout=1000s \ --jars $ML_JAR,$PLUGIN_JAR \ --class com.nvidia.spark.examples.pca.Main \ -/workspace/target/PCAExample-22.10.0-SNAPSHOT.jar +/workspace/target/PCAExample-22.12.0-SNAPSHOT.jar diff --git a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb index d5249e8fd..9e854115b 100644 --- a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb +++ b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb @@ -22,7 +22,7 @@ "import os\n", "# Change to your cluster ip:port and directories\n", "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"spark:your-ip:port\")\n", - "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-path/rapids-4-spark_2.12-22.10.0.jar\")\n" + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-path/rapids-4-spark_2.12-22.12.0.jar\")\n" ] }, { diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md index 40d3d6cdd..922bd5fd6 100644 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md @@ -108,7 +108,7 @@ See above Prerequisites section First finish the steps in "Building with Native Code Examples and run test cases" section, then do the following in the docker. 
### Get jars from Maven Central -[rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) +[rapids-4-spark_2.12-22.12.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar) ### Launch a local mode Spark diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml index 252d3abc3..812bbc778 100644 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml @@ -25,7 +25,7 @@ user defined functions for use with the RAPIDS Accelerator for Apache Spark - 22.10.0-SNAPSHOT + 22.12.0-SNAPSHOT 1.8 @@ -37,7 +37,7 @@ cuda11 2.12 - 22.10.0 + 22.12.0-SNAPSHOT 3.1.1 2.12.15 ${project.build.directory}/cpp-build diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt index 6ec503c13..593312611 100755 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.12/RAPIDS.cmake ${CMAKE_BINARY_DIR}/RAPIDS.cmake) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) @@ -32,7 +32,7 @@ if(DEFINED GPU_ARCHS) endif() rapids_cuda_init_architectures(UDFEXAMPLESJNI) -project(UDFEXAMPLESJNI VERSION 22.10.0 LANGUAGES C CXX CUDA) +project(UDFEXAMPLESJNI VERSION 22.12.0 LANGUAGES C CXX CUDA) option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) option(BUILD_UDF_BENCHMARKS "Build the benchmarks" OFF) @@ -84,10 +84,10 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w --expt-extended-lambda --expt-relax set(CUDA_USE_STATIC_CUDA_RUNTIME OFF) rapids_cpm_init() -rapids_cpm_find(cudf 22.10.00 +rapids_cpm_find(cudf 22.12.00 CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/cudf.git - GIT_TAG branch-22.10 + GIT_TAG branch-22.12 GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "BUILD_TESTS OFF" diff --git a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile index f9bbff653..6f3f7852c 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile +++ b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile @@ -38,9 +38,6 @@ RUN conda --version RUN conda install -c conda-forge openjdk=8 maven=3.8.1 -y -# install cuDF dependency. 
-RUN conda install -c rapidsai -c nvidia -c conda-forge -c defaults libcuspatial=22.10 python=3.8 -y
-
 RUN wget --quiet \
     https://github.com/Kitware/CMake/releases/download/v3.21.3/cmake-3.21.3-linux-x86_64.tar.gz \
     && tar -xzf cmake-3.21.3-linux-x86_64.tar.gz \
diff --git a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb
index a054441a4..d16d68975 100644
--- a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb
+++ b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb
@@ -48,7 +48,7 @@ RUN wget -q https://repo.continuum.io/miniconda/Miniconda3-py38_4.9.2-Linux-x86_
     conda config --system --set always_yes True && \
     conda clean --all
-RUN conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.10
+RUN conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.12
 RUN conda install -c conda-forge libgdal==3.3.1
 RUN pip install jupyter
 ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64
diff --git a/examples/UDF-Examples/Spark-cuSpatial/README.md b/examples/UDF-Examples/Spark-cuSpatial/README.md
index 3828a8177..6a4e84ff2 100644
--- a/examples/UDF-Examples/Spark-cuSpatial/README.md
+++ b/examples/UDF-Examples/Spark-cuSpatial/README.md
@@ -45,13 +45,17 @@ or [in local machine](#build-in-local-machine) after prerequisites.
    docker build -f Dockerfile . -t build-spark-cuspatial
    docker run -it build-spark-cuspatial bash
    ```
-2. Get the code, then run `mvn package`.
+2. Open a bash shell in the Docker container and install libcuspatial
+   ```Bash
+   conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.12
+   ```
+3. Get the code, then run `mvn package`.
    ```Bash
    git clone https://github.com/NVIDIA/spark-rapids-examples.git
    cd spark-rapids-examples/examples/UDF-Examples/Spark-cuSpatial/
    mvn package
    ```
-3. You'll get the jar named `spark-cuspatial-.jar` in the target folder.
+4. You'll get the jar named `spark-cuspatial-.jar` in the target folder.
 
 Note: The docker env is just for building the jar, not for running the application.
@@ -65,9 +69,7 @@ Note: The docker env is just for building the jar, not for running the applicati
 4. [cuspatial](https://github.com/rapidsai/cuspatial): install libcuspatial
    ```Bash
    # Install libcuspatial from conda
-   conda install -c rapidsai -c nvidia -c conda-forge -c defaults libcuspatial=22.10
-   # or below command for the nightly (aka SNAPSHOT) version.
-   conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.10
+   conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.12
    ```
 5. Build the JAR using `mvn package`.
    ```Bash
@@ -79,22 +81,18 @@ Note: The docker env is just for building the jar, not for running the applicati
 
 ## Run
 ### GPU Demo on Spark Standalone on-premises cluster
-1. Install necessary libraries. Besides `cudf` and `cuspatial`, the `gdal` library that is compatible with the installed `cuspatial` may also be needed.
-   ```
-   conda install -c conda-forge libgdal=3.3.1
-   ```
-2. Set up [a standalone cluster](/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md) of Spark. Make sure the conda/lib is included in LD_LIBRARY_PATH, so that spark executors can load libcuspatial.so.
+1. Set up [a standalone cluster](/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md) of Spark. Make sure the conda/lib is included in LD_LIBRARY_PATH, so that spark executors can load libcuspatial.so.
 
-3. 
Download Spark RAPIDS JAR - * [Spark RAPIDS JAR v22.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) or above -4. Prepare sample dataset and JARs. Copy the [sample dataset](../../../datasets/cuspatial_data.tar.gz) to `/data/cuspatial_data/`. +2. Download Spark RAPIDS JAR + * [Spark RAPIDS JAR v22.12.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar) or above +3. Prepare sample dataset and JARs. Copy the [sample dataset](../../../datasets/cuspatial_data.tar.gz) to `/data/cuspatial_data/`. Copy Spark RAPIDS JAR and `spark-cuspatial-.jar` to `/data/cuspatial_data/jars/`. If you build the `spark-cuspatial-.jar` in docker, please copy the jar from docker to local: ``` docker cp YOUR_DOCKER_CONTAINER:/PATH/TO/spark-cuspatial-.jar ./YOUR_LOCAL_PATH ``` Note: update the paths in `gpu-run.sh` accordingly. -5. Run `gpu-run.sh` +4. Run `gpu-run.sh` ```Bash ./gpu-run.sh ``` diff --git a/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh b/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh index fead762aa..c98b916ff 100755 --- a/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh +++ b/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh @@ -31,7 +31,7 @@ rm -rf $DATA_OUT_PATH # the path to keep the jars of spark-rapids & spark-cuspatial JARS=$ROOT_PATH/jars -JARS_PATH=${JARS_PATH:-$JARS/rapids-4-spark_2.12-22.10.0.jar,$JARS/spark-cuspatial-22.10.0-SNAPSHOT.jar} +JARS_PATH=${JARS_PATH:-$JARS/rapids-4-spark_2.12-22.12.0-SNAPSHOT.jar,$JARS/spark-cuspatial-22.12.0-SNAPSHOT.jar} $SPARK_HOME/bin/spark-submit --master spark://$HOSTNAME:7077 \ --name "Gpu Spatial Join UDF" \ diff --git a/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb b/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb index 04f77452f..3fa3744a3 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb +++ b/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb @@ -9,7 +9,7 @@ "source": [ "from pyspark.sql import SparkSession\n", "import os\n", - "jarsPath = os.getenv(\"JARS_PATH\", \"/data/cuspatial_data/jars/rapids-4-spark_2.12-22.10.0.jar,/data/cuspatial_data/jars/spark-cuspatial-22.10.0-SNAPSHOT.jar\")\n", + "jarsPath = os.getenv(\"JARS_PATH\", \"/data/cuspatial_data/jars/rapids-4-spark_2.12-22.12.0.jar,/data/cuspatial_data/jars/spark-cuspatial-22.12.0-SNAPSHOT.jar\")\n", "spark = SparkSession.builder \\\n", " .config(\"spark.jars\", jarsPath) \\\n", " .config(\"spark.sql.adaptive.enabled\", \"false\") \\\n", diff --git a/examples/UDF-Examples/Spark-cuSpatial/pom.xml b/examples/UDF-Examples/Spark-cuSpatial/pom.xml index 100cc3f1d..1f609009f 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/pom.xml +++ b/examples/UDF-Examples/Spark-cuSpatial/pom.xml @@ -24,13 +24,13 @@ UDF of the cuSpatial case for the RAPIDS Accelerator The RAPIDS accelerated user defined function of the cuSpatial case for use with the RAPIDS Accelerator for Apache Spark - 22.10.0-SNAPSHOT + 22.12.0-SNAPSHOT 1.8 1.8 8 - 22.10.0 + 22.12.0-SNAPSHOT 2.12 3.2.0 ${project.build.directory}/cpp-build diff --git a/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt b/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt index 50675a42a..506b1697a 100755 --- a/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt +++ b/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt @@ -16,7 +16,7 
@@
 
cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR)
 
-project(SPATIALUDJNI VERSION 22.10.0 LANGUAGES C CXX CUDA)
+project(SPATIALUDJNI VERSION 22.12.0 LANGUAGES C CXX CUDA)
 
 ###################################################################################################
 # - build type ------------------------------------------------------------------------------------
diff --git a/examples/UDF-Examples/Spark-cuSpatial/src/main/native/src/PointInPolygonJni.cpp b/examples/UDF-Examples/Spark-cuSpatial/src/main/native/src/PointInPolygonJni.cpp
index dd15cc78d..ecbc0b6b2 100644
--- a/examples/UDF-Examples/Spark-cuSpatial/src/main/native/src/PointInPolygonJni.cpp
+++ b/examples/UDF-Examples/Spark-cuSpatial/src/main/native/src/PointInPolygonJni.cpp
@@ -132,7 +132,7 @@ inline bool is_invalid_column(cudf::column_view const& col) {
  * double type, and have at least one valid row. Otherwise, the behavior is undefined.
  */
 inline double reduce_as_double(cudf::column_view const& col,
-                               std::unique_ptr const& agg) {
+                               cudf::reduce_aggregation const& agg) {
   auto s = cudf::reduce(col, agg, col.type());
   // s is always valid
   auto p_num_scalar = reinterpret_cast*>(s.get());
@@ -279,10 +279,10 @@ Java_com_nvidia_spark_rapids_udf_PointInPolygon_pointInPolygon(JNIEnv* env, jcla
   auto min_agg = cudf::make_min_aggregation();
   auto max_agg = cudf::make_max_aggregation();
 
-  auto x_min = reduce_as_double(*ply_x, min_agg);
-  auto x_max = reduce_as_double(*ply_x, max_agg);
-  auto y_min = reduce_as_double(*ply_y, min_agg);
-  auto y_max = reduce_as_double(*ply_y, max_agg);
+  auto x_min = reduce_as_double(*ply_x, *min_agg);
+  auto x_max = reduce_as_double(*ply_x, *max_agg);
+  auto y_min = reduce_as_double(*ply_y, *min_agg);
+  auto y_max = reduce_as_double(*ply_y, *max_agg);
 
   // 2) quadtree construction
   cudf::size_type min_size = 512;
diff --git a/examples/XGBoost-Examples/.gitignore b/examples/XGBoost-Examples/.gitignore
new file mode 100644
index 000000000..dadfea074
--- /dev/null
+++ b/examples/XGBoost-Examples/.gitignore
@@ -0,0 +1 @@
+samples.zip
diff --git a/examples/XGBoost-Examples/README.md b/examples/XGBoost-Examples/README.md
index 69a831af0..5d38f816f 100644
--- a/examples/XGBoost-Examples/README.md
+++ b/examples/XGBoost-Examples/README.md
@@ -1,19 +1,18 @@
 # Spark XGBoost Examples
 
-Spark XGBoost examples here showcase the need for end-to-end GPU acceleration.
+Spark XGBoost examples here showcase the need for ETL+Training pipeline GPU acceleration.
 The Scala based XGBoost examples here use [DMLC’s version](https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark_2.12/).
-For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that
-uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/).
+The PySpark based XGBoost examples require [installing RAPIDS via pip](https://rapids.ai/pip.html#install).
 Most data scientists spend a lot of time not only on
 Training models but also processing the large amounts of data needed to train these models.
-As you can see below, XGBoost training on GPUs can be up to 10X and data processing using
-RAPIDS Accelerator can also be accelerated with an end-to-end speed-up of 7X on GPU compared to CPU.
+As you can see below, PySpark+XGBoost training on GPUs can be up to 13X faster, and data processing using
+RAPIDS Accelerator can also be accelerated, with an end-to-end speed-up of 11X on GPU compared to CPU.
In the public cloud, better performance can lead to significantly lower costs as demonstrated in this [blog](https://developer.nvidia.com/blog/gpu-accelerated-spark-xgboost/).
 
 ![mortgage-speedup](/docs/img/guides/mortgage-perf.png)
 
-Note that the test result is based on 21 years [Fannie Mea Single-Family Loan Performance Data](https://capitalmarkets.fanniemae.com/credit-risk-transfer/single-family-credit-risk-transfer/fannie-mae-single-family-loan-performance-data)
-with a 4 A100 GPU and 512 CPU vcores cluster, the performance is affected by many aspects,
+Note that the Training test result is based on 4 years of [Fannie Mae Single-Family Loan Performance Data](https://capitalmarkets.fanniemae.com/credit-risk-transfer/single-family-credit-risk-transfer/fannie-mae-single-family-loan-performance-data)
+with an 8 A100 GPU and 1024 CPU vcore cluster; the performance is affected by many aspects,
 including data size and type of GPU.
 
 In this folder, there are three blue prints for users to learn about using
@@ -94,6 +93,9 @@ Please follow below steps to run the example notebooks in different notebook env
 - [Jupyter Notebook for Python](/docs/get-started/xgboost-examples/notebook/python-notebook.md)
 
 Note:
+Updating the default value of `spark.sql.execution.arrow.maxRecordsPerBatch` to a larger number (such as 200000) will
+significantly improve performance by accelerating data transfer between the JVM and the Python process.
+
 For the CrossValidator job, we need to set `spark.task.resource.gpu.amount=1` to allow only 1 training task running on 1 GPU(executor),
 otherwise the customized CrossValidator may schedule more than 1 xgboost training tasks into one executor simultaneously and trigger
 [issue-131](https://github.com/NVIDIA/spark-rapids-examples/issues/131).
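For reference, the two notes above are ordinary Spark settings, and the CPU-versus-GPU vectorization point made in the agaricus notebook below comes down to a single `VectorAssembler` call. The following is a minimal PySpark sketch, not code from the examples: the app name, batch size, and toy columns are illustrative, and it assumes a cluster whose executors actually advertise GPU resources (see the discovery-script setup in the on-prem guides).

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = (
    SparkSession.builder
    .appName("xgboost-example-sketch")  # illustrative name
    # larger Arrow batches speed up JVM <-> Python data transfer
    .config("spark.sql.execution.arrow.maxRecordsPerBatch", "200000")
    # one XGBoost training task per GPU, as the CrossValidator note requires
    .config("spark.executor.resource.gpu.amount", "1")
    .config("spark.task.resource.gpu.amount", "1")
    .getOrCreate()
)

# Toy stand-in for the notebooks' training data. GPU training can consume the
# feature columns directly, while CPU training needs one assembled vector column.
features = ["feature_0", "feature_1"]
train_data = spark.createDataFrame(
    [(1.0, 0.2, 0.4), (0.0, 0.1, 0.9)], ["label"] + features)
train_vec = VectorAssembler(inputCols=features, outputCol="features").transform(train_data)
```

With the `xgboost.spark` estimators, the assembled column is then passed as `features_col="features"` for CPU training, while the GPU path can take the list of feature column names directly.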
diff --git a/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb b/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb
index 9d1b1e311..a49b8eca0 100644
--- a/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb
+++ b/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb
@@ -9,16 +9,12 @@
    "Agaricus is an example of xgboost classifier for multiple classification. This notebook will show you how to load data, train the xgboost model.\n",
    "\n",
    "A few libraries required for this notebook:\n",
-    "  1. NumPy\n",
-    "  2. cudf jar\n",
-    "  3. xgboost4j jar\n",
-    "  4. xgboost4j-spark jar\n",
-    "  5. rapids-4-spark.jar\n",
+    "  1. cudf-cu11\n",
+    "  2. xgboost\n",
+    "  3. scikit-learn\n",
+    "  4. numpy\n",
    "  \n",
-    "This notebook also illustrates the ease of porting a sample CPU based Spark xgboost4j code into GPU. There is only one change required for running Spark XGBoost on GPU. That is replacing the API `setFeaturesCol(feature)` on CPU with the new API `setFeaturesCols(features)`. This also eliminates the need for vectorization (assembling multiple feature columns in to one column) since we can read multiple columns.\n",
-    "\n",
-    "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n",
-    "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)."
+    "This notebook also illustrates the ease of porting sample CPU based Spark XGBoost code to GPU. There is no change required for running Spark XGBoost on GPU because both CPU and GPU call the same API. For CPU runs, we need to vectorize the training dataset before fitting it to the classifier."
   ]
  },
 {
@@ -34,12 +30,16 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier\n",
+    "from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel\n",
    "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.types import FloatType, StructField, StructType\n",
    "from time import time\n",
-    "import os"
+    "from pyspark.conf import SparkConf\n",
+    "import os\n",
+    "# if you pass/unpack the archive file and enable the environment\n",
+    "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
+    "# os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\""
   ]
  },
 {
@@ -64,9 +64,66 @@
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 06:57:40,306 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
+      "Setting default log level to \"WARN\".\n",
+      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
+      "2022-11-30 06:57:40,550 WARN resource.ResourceUtils: The configuration of cores (exec = 2 task = 1, runnable tasks = 2) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n",
+      "2022-11-30 06:57:54,195 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 22.12.0 using cudf 22.12.0.\n",
+      "2022-11-30 06:57:54,210 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n",
+      "2022-11-30 06:57:54,214 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n",
+      "2022-11-30 06:57:54,214 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. 
Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n", + "2022-11-30 06:57:54,685 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n" + ] + } + ], "source": [ - "spark = SparkSession.builder.getOrCreate()\n", + "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n", + "\n", + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n", + "\n", + "# You need to update with your real hardware resource \n", + "driverMem = os.getenv(\"DRIVER_MEM\", \"2g\")\n", + "executorMem = os.getenv(\"EXECUTOR_MEM\", \"2g\")\n", + "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n", + "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n", + "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"2\"))\n", + "# Common spark settings\n", + "conf = SparkConf()\n", + "conf.setMaster(SPARK_MASTER_URL)\n", + "conf.setAppName(\"Microbenchmark on GPU\")\n", + "conf.set(\"spark.executor.instances\",\"1\")\n", + "conf.set(\"spark.driver.memory\", driverMem)\n", + "## The tasks will run on GPU memory, so there is no need to set a high host memory\n", + "conf.set(\"spark.executor.memory\", executorMem)\n", + "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n", + "conf.set(\"spark.executor.cores\", executorCores)\n", + "\n", + "\n", + "# Plugin settings\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n", + "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n", + "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n", + "# since pyspark and xgboost share the same GPU, we need to allocate some memory to xgboost to avoid GPU OOM while training \n", + "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.7\")\n", + "conf.set(\"spark.locality.wait\",\"0\")\n", + "##############note: only support value=1 https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n", + "conf.set(\"spark.task.resource.gpu.amount\", 1) \n", + "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n", + "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n", + "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n", + "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n", + "\n", + "# if you pass/unpack the archive file and enable the environment\n", + "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n", + "# Create spark session\n", + "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", + "\n", "reader = spark.read" ] }, @@ -89,8 +146,17 @@ "\n", "# You need to update them to your real paths!\n", "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n", - "train_data = reader.schema(schema).option('header', True).csv(dataRoot + '/agaricus/csv/train')\n", - "trans_data = reader.schema(schema).option('header', True).csv(dataRoot + '/agaricus/csv/test')" + "train_path = dataRoot + \"/agaricus/csv/train\"\n", + "eval_path = dataRoot + \"/agaricus/csv/eval\"\n", + "\n", + "data_format = 'csv'\n", + "has_header = 'true'\n", + "if data_format == 'csv':\n", + " train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n", + " trans_data = reader.schema(schema).option('header',has_header).csv(eval_path)\n", + "else :\n", + " train_data = reader.load(train_path)\n", + " trans_data = reader.load(eval_path)" ] }, { @@ 
-127,28 +193,34 @@
   "outputs": [],
   "source": [
    "params = { \n",
-    "    'eta': 0.1,\n",
-    "    'missing': 0.0,\n",
-    "    'treeMethod': 'gpu_hist',\n",
-    "    'maxDepth': 2,\n",
-    "    'numWorkers': 1,\n",
-    "    'numRound' : 100,\n",
+    "    \"tree_method\": \"gpu_hist\",\n",
+    "    \"grow_policy\": \"depthwise\",\n",
+    "    \"num_workers\": 1,\n",
+    "    \"use_gpu\": \"true\",\n",
    "}\n",
-    "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)"
+    "params['features_col'] = features\n",
+    "params['label_col'] = label\n",
+    "    \n",
+    "classifier = SparkXGBClassifier(**params)"
   ]
  },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "The CPU version classifier provides the API `setFeaturesCol` which only accepts a single column name, so vectorization for multiple feature columns is required.\n",
-    "```Python\n",
-    "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCol('features')\n",
-    "```\n",
-    "\n",
    "The parameter `num_workers` should be set to the number of GPUs in Spark cluster for GPU version, while for CPU version it is usually equal to the number of the CPU cores.\n",
    "\n",
-    "Concerning the tree method, GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training."
+    "Concerning the tree method, the GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training.\n",
+    "\n",
+    "An example of CPU classifier:\n",
+    "```\n",
+    "classifier = SparkXGBClassifier(\n",
+    "  features_col=features,\n",
+    "  label_col=label, \n",
+    "  num_workers=1024,\n",
+    "  use_gpu=False,\n",
+    ")\n",
+    "```"
   ]
  },
 {
@@ -163,11 +235,30 @@
   "execution_count": 5,
   "metadata": {},
   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-30 07:00:45,526 WARN util.package: Truncated the string representation of a plan since it was too large. 
This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", + "[Stage 5:> (0 + 1) / 1]\r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Training takes 27.95 seconds\n" + "Training takes 13.92 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r", + "/data/home/yuanli/work/reviews/pr252/pyspark_venv_20221125/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n", + " warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n" ] } ], @@ -192,10 +283,26 @@ "cell_type": "code", "execution_count": 6, "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "If features_cols param set, then features_col param is ignored.\n" + ] + } + ], + "source": [ + "model.write().overwrite().save(dataRoot + '/model/agaricus')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, "outputs": [], "source": [ - "model.write().overwrite().save(dataRoot + '/new-model-path')\n", - "loaded_model = XGBoostClassificationModel().load(dataRoot + '/new-model-path')" + "loaded_model = SparkXGBClassifierModel().load(dataRoot + '/model/agaricus')" ] }, { @@ -207,22 +314,330 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 07:01:07,030 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#798, probability#1062]\n", + " @Expression label#254 could run on GPU\n", + " @Expression feature_0#255 could run on GPU\n", + " @Expression feature_1#256 could run on GPU\n", + " @Expression feature_2#257 could run on GPU\n", + " @Expression feature_3#258 could run on GPU\n", + " @Expression feature_4#259 could run on GPU\n", + " @Expression feature_5#260 could run on GPU\n", + " @Expression feature_6#261 could run on GPU\n", + " @Expression feature_7#262 could run on GPU\n", + " @Expression feature_8#263 could run on GPU\n", + " @Expression feature_9#264 could run on GPU\n", + " @Expression feature_10#265 could run on GPU\n", + " @Expression feature_11#266 could run on GPU\n", + " @Expression feature_12#267 could run on GPU\n", + " @Expression feature_13#268 could run on GPU\n", + " @Expression feature_14#269 could run on GPU\n", + " @Expression feature_15#270 could run on GPU\n", + " @Expression feature_16#271 could run on GPU\n", + " @Expression feature_17#272 could run on GPU\n", + " @Expression feature_18#273 could run on GPU\n", + " @Expression feature_19#274 could run on GPU\n", + " @Expression feature_20#275 could run on GPU\n", + " @Expression feature_21#276 could run on GPU\n", + " @Expression feature_22#277 could run on GPU\n", + " @Expression feature_23#278 could run on GPU\n", + " @Expression feature_24#279 could run on GPU\n", + " @Expression feature_25#280 could run on GPU\n", + " @Expression feature_26#281 could run on GPU\n", + " @Expression feature_27#282 could run on GPU\n", + " @Expression feature_28#283 could run on GPU\n", + " @Expression feature_29#284 could run on GPU\n", + " @Expression feature_30#285 could run on GPU\n", + " @Expression feature_31#286 could run on GPU\n", + " @Expression feature_32#287 could run on GPU\n", + " @Expression feature_33#288 could run on GPU\n", + " @Expression 
feature_34#289 could run on GPU\n", + " @Expression feature_35#290 could run on GPU\n", + " @Expression feature_36#291 could run on GPU\n", + " @Expression feature_37#292 could run on GPU\n", + " @Expression feature_38#293 could run on GPU\n", + " @Expression feature_39#294 could run on GPU\n", + " @Expression feature_40#295 could run on GPU\n", + " @Expression feature_41#296 could run on GPU\n", + " @Expression feature_42#297 could run on GPU\n", + " @Expression feature_43#298 could run on GPU\n", + " @Expression feature_44#299 could run on GPU\n", + " @Expression feature_45#300 could run on GPU\n", + " @Expression feature_46#301 could run on GPU\n", + " @Expression feature_47#302 could run on GPU\n", + " @Expression feature_48#303 could run on GPU\n", + " @Expression feature_49#304 could run on GPU\n", + " @Expression feature_50#305 could run on GPU\n", + " @Expression feature_51#306 could run on GPU\n", + " @Expression feature_52#307 could run on GPU\n", + " @Expression feature_53#308 could run on GPU\n", + " @Expression feature_54#309 could run on GPU\n", + " @Expression feature_55#310 could run on GPU\n", + " @Expression feature_56#311 could run on GPU\n", + " @Expression feature_57#312 could run on GPU\n", + " @Expression feature_58#313 could run on GPU\n", + " @Expression feature_59#314 could run on GPU\n", + " @Expression feature_60#315 could run on GPU\n", + " @Expression feature_61#316 could run on GPU\n", + " @Expression feature_62#317 could run on GPU\n", + " @Expression feature_63#318 could run on GPU\n", + " @Expression feature_64#319 could run on GPU\n", + " @Expression feature_65#320 could run on GPU\n", + " @Expression feature_66#321 could run on GPU\n", + " @Expression feature_67#322 could run on GPU\n", + " @Expression feature_68#323 could run on GPU\n", + " @Expression feature_69#324 could run on GPU\n", + " @Expression feature_70#325 could run on GPU\n", + " @Expression feature_71#326 could run on GPU\n", + " @Expression feature_72#327 could run on GPU\n", + " @Expression feature_73#328 could run on GPU\n", + " @Expression feature_74#329 could run on GPU\n", + " @Expression feature_75#330 could run on GPU\n", + " @Expression feature_76#331 could run on GPU\n", + " @Expression feature_77#332 could run on GPU\n", + " @Expression feature_78#333 could run on GPU\n", + " @Expression feature_79#334 could run on GPU\n", + " @Expression feature_80#335 could run on GPU\n", + " @Expression feature_81#336 could run on GPU\n", + " @Expression feature_82#337 could run on GPU\n", + " @Expression feature_83#338 could run on GPU\n", + " @Expression feature_84#339 could run on GPU\n", + " @Expression feature_85#340 could run on GPU\n", + " @Expression feature_86#341 could run on GPU\n", + " @Expression feature_87#342 could run on GPU\n", + " @Expression feature_88#343 could run on GPU\n", + " @Expression feature_89#344 could run on GPU\n", + " @Expression feature_90#345 could run on GPU\n", + " @Expression feature_91#346 could run on GPU\n", + " @Expression feature_92#347 could run on GPU\n", + " @Expression feature_93#348 could run on GPU\n", + " @Expression feature_94#349 could run on GPU\n", + " @Expression feature_95#350 could run on GPU\n", + " @Expression feature_96#351 could run on GPU\n", + " @Expression feature_97#352 could run on GPU\n", + " @Expression feature_98#353 could run on GPU\n", + " @Expression feature_99#354 could run on GPU\n", + " @Expression feature_100#355 could run on GPU\n", + " @Expression feature_101#356 could run on GPU\n", + " @Expression feature_102#357 
could run on GPU\n", + " @Expression feature_103#358 could run on GPU\n", + " @Expression feature_104#359 could run on GPU\n", + " @Expression feature_105#360 could run on GPU\n", + " @Expression feature_106#361 could run on GPU\n", + " @Expression feature_107#362 could run on GPU\n", + " @Expression feature_108#363 could run on GPU\n", + " @Expression feature_109#364 could run on GPU\n", + " @Expression feature_110#365 could run on GPU\n", + " @Expression feature_111#366 could run on GPU\n", + " @Expression feature_112#367 could run on GPU\n", + " @Expression feature_113#368 could run on GPU\n", + " @Expression feature_114#369 could run on GPU\n", + " @Expression feature_115#370 could run on GPU\n", + " @Expression feature_116#371 could run on GPU\n", + " @Expression feature_117#372 could run on GPU\n", + " @Expression feature_118#373 could run on GPU\n", + " @Expression feature_119#374 could run on GPU\n", + " @Expression feature_120#375 could run on GPU\n", + " @Expression feature_121#376 could run on GPU\n", + " @Expression feature_122#377 could run on GPU\n", + " @Expression feature_123#378 could run on GPU\n", + " @Expression feature_124#379 could run on GPU\n", + " @Expression feature_125#380 could run on GPU\n", + " !Expression UDF(pythonUDF0#1327.rawPrediction) AS rawPrediction#798 cannot run on GPU because expression Alias UDF(pythonUDF0#1327.rawPrediction) AS rawPrediction#798 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#1327.rawPrediction) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#1327.rawPrediction) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3659/488666387 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#1327.rawPrediction) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#1327.rawPrediction could run on GPU\n", + " @Expression pythonUDF0#1327 could run on GPU\n", + " @Expression pythonUDF0#1327.prediction AS prediction#931 could run on GPU\n", + " @Expression pythonUDF0#1327.prediction could run on GPU\n", + " @Expression pythonUDF0#1327 could run on GPU\n", + " !Expression UDF(pythonUDF0#1327.probability) AS probability#1062 cannot run on GPU because expression Alias UDF(pythonUDF0#1327.probability) AS probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#1327.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#1327.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3659/488666387 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#1327.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#1327.probability could run on GPU\n", + " @Expression pythonUDF0#1327 could run on GPU\n", + "\n", + "2022-11-30 07:01:07,071 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#798, 
probability#1062]\n", + " @Expression label#254 could run on GPU\n", + " @Expression feature_0#255 could run on GPU\n", + " @Expression feature_1#256 could run on GPU\n", + " @Expression feature_2#257 could run on GPU\n", + " @Expression feature_3#258 could run on GPU\n", + " @Expression feature_4#259 could run on GPU\n", + " @Expression feature_5#260 could run on GPU\n", + " @Expression feature_6#261 could run on GPU\n", + " @Expression feature_7#262 could run on GPU\n", + " @Expression feature_8#263 could run on GPU\n", + " @Expression feature_9#264 could run on GPU\n", + " @Expression feature_10#265 could run on GPU\n", + " @Expression feature_11#266 could run on GPU\n", + " @Expression feature_12#267 could run on GPU\n", + " @Expression feature_13#268 could run on GPU\n", + " @Expression feature_14#269 could run on GPU\n", + " @Expression feature_15#270 could run on GPU\n", + " @Expression feature_16#271 could run on GPU\n", + " @Expression feature_17#272 could run on GPU\n", + " @Expression feature_18#273 could run on GPU\n", + " @Expression feature_19#274 could run on GPU\n", + " @Expression feature_20#275 could run on GPU\n", + " @Expression feature_21#276 could run on GPU\n", + " @Expression feature_22#277 could run on GPU\n", + " @Expression feature_23#278 could run on GPU\n", + " @Expression feature_24#279 could run on GPU\n", + " @Expression feature_25#280 could run on GPU\n", + " @Expression feature_26#281 could run on GPU\n", + " @Expression feature_27#282 could run on GPU\n", + " @Expression feature_28#283 could run on GPU\n", + " @Expression feature_29#284 could run on GPU\n", + " @Expression feature_30#285 could run on GPU\n", + " @Expression feature_31#286 could run on GPU\n", + " @Expression feature_32#287 could run on GPU\n", + " @Expression feature_33#288 could run on GPU\n", + " @Expression feature_34#289 could run on GPU\n", + " @Expression feature_35#290 could run on GPU\n", + " @Expression feature_36#291 could run on GPU\n", + " @Expression feature_37#292 could run on GPU\n", + " @Expression feature_38#293 could run on GPU\n", + " @Expression feature_39#294 could run on GPU\n", + " @Expression feature_40#295 could run on GPU\n", + " @Expression feature_41#296 could run on GPU\n", + " @Expression feature_42#297 could run on GPU\n", + " @Expression feature_43#298 could run on GPU\n", + " @Expression feature_44#299 could run on GPU\n", + " @Expression feature_45#300 could run on GPU\n", + " @Expression feature_46#301 could run on GPU\n", + " @Expression feature_47#302 could run on GPU\n", + " @Expression feature_48#303 could run on GPU\n", + " @Expression feature_49#304 could run on GPU\n", + " @Expression feature_50#305 could run on GPU\n", + " @Expression feature_51#306 could run on GPU\n", + " @Expression feature_52#307 could run on GPU\n", + " @Expression feature_53#308 could run on GPU\n", + " @Expression feature_54#309 could run on GPU\n", + " @Expression feature_55#310 could run on GPU\n", + " @Expression feature_56#311 could run on GPU\n", + " @Expression feature_57#312 could run on GPU\n", + " @Expression feature_58#313 could run on GPU\n", + " @Expression feature_59#314 could run on GPU\n", + " @Expression feature_60#315 could run on GPU\n", + " @Expression feature_61#316 could run on GPU\n", + " @Expression feature_62#317 could run on GPU\n", + " @Expression feature_63#318 could run on GPU\n", + " @Expression feature_64#319 could run on GPU\n", + " @Expression feature_65#320 could run on GPU\n", + " @Expression feature_66#321 could run on GPU\n", + " 
@Expression feature_67#322 could run on GPU\n", + " @Expression feature_68#323 could run on GPU\n", + " @Expression feature_69#324 could run on GPU\n", + " @Expression feature_70#325 could run on GPU\n", + " @Expression feature_71#326 could run on GPU\n", + " @Expression feature_72#327 could run on GPU\n", + " @Expression feature_73#328 could run on GPU\n", + " @Expression feature_74#329 could run on GPU\n", + " @Expression feature_75#330 could run on GPU\n", + " @Expression feature_76#331 could run on GPU\n", + " @Expression feature_77#332 could run on GPU\n", + " @Expression feature_78#333 could run on GPU\n", + " @Expression feature_79#334 could run on GPU\n", + " @Expression feature_80#335 could run on GPU\n", + " @Expression feature_81#336 could run on GPU\n", + " @Expression feature_82#337 could run on GPU\n", + " @Expression feature_83#338 could run on GPU\n", + " @Expression feature_84#339 could run on GPU\n", + " @Expression feature_85#340 could run on GPU\n", + " @Expression feature_86#341 could run on GPU\n", + " @Expression feature_87#342 could run on GPU\n", + " @Expression feature_88#343 could run on GPU\n", + " @Expression feature_89#344 could run on GPU\n", + " @Expression feature_90#345 could run on GPU\n", + " @Expression feature_91#346 could run on GPU\n", + " @Expression feature_92#347 could run on GPU\n", + " @Expression feature_93#348 could run on GPU\n", + " @Expression feature_94#349 could run on GPU\n", + " @Expression feature_95#350 could run on GPU\n", + " @Expression feature_96#351 could run on GPU\n", + " @Expression feature_97#352 could run on GPU\n", + " @Expression feature_98#353 could run on GPU\n", + " @Expression feature_99#354 could run on GPU\n", + " @Expression feature_100#355 could run on GPU\n", + " @Expression feature_101#356 could run on GPU\n", + " @Expression feature_102#357 could run on GPU\n", + " @Expression feature_103#358 could run on GPU\n", + " @Expression feature_104#359 could run on GPU\n", + " @Expression feature_105#360 could run on GPU\n", + " @Expression feature_106#361 could run on GPU\n", + " @Expression feature_107#362 could run on GPU\n", + " @Expression feature_108#363 could run on GPU\n", + " @Expression feature_109#364 could run on GPU\n", + " @Expression feature_110#365 could run on GPU\n", + " @Expression feature_111#366 could run on GPU\n", + " @Expression feature_112#367 could run on GPU\n", + " @Expression feature_113#368 could run on GPU\n", + " @Expression feature_114#369 could run on GPU\n", + " @Expression feature_115#370 could run on GPU\n", + " @Expression feature_116#371 could run on GPU\n", + " @Expression feature_117#372 could run on GPU\n", + " @Expression feature_118#373 could run on GPU\n", + " @Expression feature_119#374 could run on GPU\n", + " @Expression feature_120#375 could run on GPU\n", + " @Expression feature_121#376 could run on GPU\n", + " @Expression feature_122#377 could run on GPU\n", + " @Expression feature_123#378 could run on GPU\n", + " @Expression feature_124#379 could run on GPU\n", + " @Expression feature_125#380 could run on GPU\n", + " !Expression rawPrediction#798 cannot run on GPU because expression AttributeReference rawPrediction#798 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression prediction#931 could run on GPU\n", + " !Expression probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n" + ] + }, + { + "name": "stderr", 
+ "output_type": "stream", + "text": [ + "2022-11-30 07:01:09,857 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + " !Exec cannot run on GPU because unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062, rawPrediction#798]; not all expressions can be replaced\n", + " @Expression cast(label#254 as string) AS label#3936 could run on GPU\n", + " @Expression cast(label#254 as string) could run on GPU\n", + " @Expression label#254 could run on GPU\n", + " @Expression cast(rawPrediction#798 as string) AS rawPrediction#3937 could run on GPU\n", + " !Expression cast(rawPrediction#798 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n", + " !Expression rawPrediction#798 cannot run on GPU because expression AttributeReference rawPrediction#798 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression cast(probability#1062 as string) AS probability#3938 could run on GPU\n", + " !Expression cast(probability#1062 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n", + " !Expression probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression cast(prediction#931 as string) AS prediction#3939 could run on GPU\n", + " @Expression cast(prediction#931 as string) could run on GPU\n", + " @Expression prediction#931 could run on GPU\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062, rawPrediction#798]; not all expressions can be replaced\n", + " @Expression label#254 could run on GPU\n", + " @Expression prediction#931 could run on GPU\n", + " !Expression probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression rawPrediction#798 cannot run on GPU because expression AttributeReference rawPrediction#798 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Transformation takes 2.63 seconds\n", + "Transformation takes 3.26 seconds\n", "+-----+--------------------+--------------------+----------+\n", "|label| rawPrediction| probability|prediction|\n", "+-----+--------------------+--------------------+----------+\n", - "| 1.0|[-0.9667757749557...|[0.03322422504425...| 1.0|\n", - "| 0.0|[-0.0080436170101...|[0.99195638298988...| 0.0|\n", - "| 0.0|[-0.0080436170101...|[0.99195638298988...| 0.0|\n", - "| 0.0|[-0.1416745483875...|[0.85832545161247...| 0.0|\n", - "| 0.0|[-0.0747678577899...|[0.92523214221000...| 0.0|\n", + "| 1.0|[-9.6646747589111...|[6.35385513305664...| 1.0|\n", + "| 0.0|[-8.3923015594482...|[2.26557254791259...| 1.0|\n", + "| 
0.0|[-8.0568389892578...|[3.16858291625976...| 1.0|\n", + "| 0.0|[1.91234850883483...|[0.87128275632858...| 0.0|\n", + "| 0.0|[-8.5582475662231...|[1.91867351531982...| 1.0|\n", "+-----+--------------------+--------------------+----------+\n", "only showing top 5 rows\n", "\n" @@ -247,15 +662,54 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 07:01:10,292 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#931, label#5899, 1.0#5900, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(label,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#931 could run on GPU\n", + " @Expression label#5899 could run on GPU\n", + " @Expression 1.0#5900 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#5905 cannot run on GPU because expression AttributeReference obj#5905 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062]; unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062]\n", + " @Expression prediction#931 could run on GPU\n", + " @Expression cast(label#254 as double) AS label#5899 could run on GPU\n", + " @Expression cast(label#254 as double) could run on GPU\n", + " @Expression label#254 could run on GPU\n", + " @Expression 1.0 AS 1.0#5900 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062]\n", + " @Expression label#254 could run on GPU\n", + " @Expression prediction#931 could run on GPU\n", + " !Expression probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Evaluation takes 0.29 
seconds\n", - "Accuracy is 0.9987577063864658\n" + "Evaluation takes 1.0 seconds\n", + "Accuracy is 0.9069677632722861\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + "[Stage 12:> (0 + 1) / 1]\r", + "\r", + " \r" ] } ], @@ -275,7 +729,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ diff --git a/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/consts.py b/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/consts.py deleted file mode 100644 index 045bce986..000000000 --- a/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/consts.py +++ /dev/null @@ -1,28 +0,0 @@ -# -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from pyspark.sql.types import * - -label = 'label' -features = [ 'feature_' + str(i) for i in range(0, 126) ] -schema = StructType([ StructField(x, FloatType()) for x in [label] + features ]) - -default_params = { - 'eta': 0.1, - 'missing': 0.0, - 'maxDepth': 2, - 'numWorkers': 1, -} diff --git a/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/cpu_main.py b/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/cpu_main.py deleted file mode 100644 index bbc35e617..000000000 --- a/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/cpu_main.py +++ /dev/null @@ -1,64 +0,0 @@ -# -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from com.nvidia.spark.examples.agaricus.consts import * -from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from pyspark.sql import SparkSession - -def main(args, xgboost_args): - spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) - - train_data, eval_data, trans_data = valid_input_data(spark, args, '', schema) - - if args.mode in [ 'all', 'train' ]: - classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCol('features')) - if eval_data: - eval_data = vectorize_data_frame(eval_data, label) - classifier.setEvalSets({ 'test': eval_data }) - if not train_data: - print('-' * 80) - print('Usage: train data path required when mode is all or train') - exit(1) - train_data = vectorize_data_frame(train_data, label) - model = with_benchmark('Training', lambda: classifier.fit(train_data)) - - if args.modelPath: - writer = model.write().overwrite() if args.overwrite else model - writer.save(args.modelPath) - else: - model = XGBoostClassificationModel().load(args.modelPath) - - if args.mode in [ 'all', 'transform' ]: - def transform(): - cv_trans_data = vectorize_data_frame(trans_data, label) - result = model.transform(cv_trans_data).cache() - result.foreachPartition(lambda _: None) - return result - - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) - result = with_benchmark('Transformation', transform) - show_sample(args, result, label) - with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) - - spark.stop() diff --git a/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/gpu_main.py b/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/main.py similarity index 62% rename from examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/gpu_main.py rename to examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/main.py index 3f466dc18..03a41e91d 100644 --- a/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/gpu_main.py +++ b/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/main.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,47 +13,64 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -from com.nvidia.spark.examples.agaricus.consts import * +from pyspark.sql.types import * + from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * from pyspark.sql import SparkSession +from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel + +label = 'label' +feature_names = ['feature_' + str(i) for i in range(0, 126)] +schema = StructType([StructField(x, FloatType()) for x in [label] + feature_names]) + + def main(args, xgboost_args): spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) + .builder + .appName(args.mainClass) + .getOrCreate()) train_data, eval_data, trans_data = valid_input_data(spark, args, '', schema) - if args.mode in [ 'all', 'train' ]: - classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCols(features)) - if eval_data: - classifier.setEvalSets({ 'test': eval_data }) - if not train_data: + if args.mode in ['all', 'train']: + if train_data is None: print('-' * 80) print('Usage: train data path required when mode is all or train') + print('-' * 80) exit(1) + + train_data, features = transform_data(train_data, label, args.use_gpu) + xgboost_args['features_col'] = features + xgboost_args['label_col'] = label + classifier = SparkXGBClassifier(**xgboost_args) + + if eval_data: + # TODO + pass + model = with_benchmark('Training', lambda: classifier.fit(train_data)) if args.modelPath: writer = model.write().overwrite() if args.overwrite else model writer.save(args.modelPath) else: - model = XGBoostClassificationModel().load(args.modelPath) + model = SparkXGBClassifierModel.load(args.modelPath) + + if args.mode in ['all', 'transform']: + if trans_data is None: + print('-' * 80) + print('Usage: trans data path required when mode is all or transform') + print('-' * 80) + exit(1) + + trans_data, _ = transform_data(trans_data, label, args.use_gpu) - if args.mode in [ 'all', 'transform' ]: def transform(): result = model.transform(trans_data).cache() result.foreachPartition(lambda _: None) return result - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) result = with_benchmark('Transformation', transform) show_sample(args, result, label) with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb index 974b6094d..a3b93140a 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb @@ -6,20 +6,17 @@ "source": [ "# Dataset\n", "\n", - "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", + "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. 
Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.12/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", "\n", "# ETL + XGBoost train & transform\n", "\n", - "This notebook is an end-to-end example of ETL + XGBoost Train & Transform by using [Spark-Rapids](https://github.com/NVIDIA/spark-rapids) and [XGBoost](https://github.com/nvidia/spark-xgboost) with GPU accelerated.\n", + "This notebook is an end-to-end example of ETL + XGBoost Train & Transform using [Spark-Rapids](https://github.com/NVIDIA/spark-rapids) and [XGBoost](https://github.com/dmlc/xgboost) with GPU acceleration.\n", "
The main steps:\n", "1. Run ETL to generate 2 datasets for train and test
\n", " You can choose to save the datasets or not by setting \"is_save_dataset\" to True or False.
\n", " It means you don't need to save the dataset to disk after ETL and directly feed the dataframe to XGBoost train or transform.\n", "2. Run XGBoost train with the train dataset\n", - "3. Run XGBoost transform with the test dataset\n", - "\n", - "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n", - "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)." + "3. Run XGBoost transform with the test dataset" ] }, { @@ -31,10 +28,13 @@ "import time\n", "import os\n", "from pyspark import broadcast\n", + "from pyspark.conf import SparkConf\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.functions import *\n", "from pyspark.sql.types import *\n", - "from pyspark.sql.window import Window" + "from pyspark.sql.window import Window\n", + "# if you pass/unpack the archive file and enable the environment\n", + "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"" ] }, { @@ -54,7 +54,7 @@ "source": [ "# The input path of dataset\n", "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n", - "orig_raw_path = dataRoot + \"/mortgage/input/\"", + "orig_raw_path = dataRoot + \"/mortgage/input/\"\n", "orig_raw_path_csv2parquet = dataRoot + \"/mortgage/output/csv2parquet/\"" ] }, @@ -64,10 +64,47 @@ "metadata": {}, "outputs": [], "source": [ - "spark = (SparkSession\n", - " .builder\n", - " .appName(\"MortgageETL+XGBoost\")\n", - " .getOrCreate())" + "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n", + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n", + "\n", + "# You need to update with your real hardware resource \n", + "driverMem = os.getenv(\"DRIVER_MEM\", \"10g\")\n", + "executorMem = os.getenv(\"EXECUTOR_MEM\", \"10g\")\n", + "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n", + "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n", + "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"4\"))\n", + "\n", + "# Common spark settings\n", + "conf = SparkConf()\n", + "conf.setMaster(SPARK_MASTER_URL)\n", + "conf.setAppName(\"Microbenchmark on GPU\")\n", + "conf.set(\"spark.driver.memory\", driverMem)\n", + "## The tasks will run on GPU memory, so there is no need to set a high host memory\n", + "conf.set(\"spark.executor.memory\", executorMem)\n", + "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n", + "conf.set(\"spark.executor.cores\", executorCores)\n", + "\n", + "# Plugin settings\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n", + "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n", + "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n", + "##############note: only support value=1 see https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n", + "conf.set(\"spark.task.resource.gpu.amount\", 1) \n", + "# since pyspark and xgboost share the same GPU, we need to allocate some memory to xgboost to avoid GPU OOM while training \n", + "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.6\")\n", + "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n", + "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n", + "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n", + "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n", 
+ "conf.set(\"spark.jars\", RAPIDS_JAR)\n", + "\n", + "# if you pass/unpack the archive file and enable the environment\n", + "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n", + "\n", + "# Create spark session\n", + "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", + "reader = spark.read" ] }, { @@ -737,9 +774,7 @@ "spark.conf.set(\"spark.rapids.sql.explain\", \"ALL\")\n", "spark.conf.set(\"spark.rapids.sql.batchSizeBytes\", \"512M\")\n", "spark.conf.set(\"spark.rapids.sql.reader.batchSizeBytes\", \"768M\")\n", - "spark.conf.set(\"spark.rapids.sql.hasNans\", \"false\")\n", - "# use GPU to read CSV\n", - "spark.conf.set(\"spark.rapids.sql.csv.read.double.enabled\", \"true\")" + "spark.conf.set(\"spark.rapids.sql.hasNans\", \"false\")" ] }, { @@ -805,7 +840,7 @@ "metadata": {}, "outputs": [], "source": [ - "from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier\n", + "from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel\n", "from pyspark.ml.evaluation import MulticlassClassificationEvaluator" ] }, @@ -893,16 +928,17 @@ "metadata": {}, "outputs": [], "source": [ - "# This sample uses 2 workers(GPUs) to run XGBoost training \n", + "# This sample uses 1 worker(GPU) to run XGBoost training, you can change according to your GPU resources\n", "params = { \n", - " \"treeMethod\": \"gpu_hist\",\n", - " \"objective\":\"binary:logistic\",\n", - " \"growPolicy\": \"depthwise\",\n", - " \"nthread\": 1,\n", - " \"numRound\": 100,\n", - " \"numWorkers\": 1,\n", + " \"tree_method\": \"gpu_hist\",\n", + " \"grow_policy\": \"depthwise\",\n", + " \"num_workers\": 1,\n", + " \"use_gpu\": \"true\",\n", "}\n", - "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)" + "params['features_col'] = features\n", + "params['label_col'] = label\n", + " \n", + "classifier = SparkXGBClassifier(**params)" ] }, { @@ -934,8 +970,16 @@ "metadata": {}, "outputs": [], "source": [ - "model.write().overwrite().save(output_path_model)\n", - "loaded_model = XGBoostClassificationModel().load(output_path_model)" + "model.write().overwrite().save(output_path_model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_model = SparkXGBClassifierModel().load(output_path_model)" ] }, { diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb index 93dd98866..4551654f5 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb @@ -6,10 +6,10 @@ "source": [ "## Prerequirement\n", "### 1. Download data\n", - "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", + "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. 
Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.12/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", "\n", "### 2. Download needed jars\n", - "* [rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar)\n", + "* [rapids-4-spark_2.12-22.12.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar)\n", "\n", "\n", "### 3. Start Spark Standalone\n", @@ -17,7 +17,7 @@ "\n", "### 4. Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-22.10.0.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-22.12.0.jar\n", "$ export PYSPARK_DRIVER_PYTHON=jupyter \n", "$ export PYSPARK_DRIVER_PYTHON_OPTS=notebook\n", "```\n", diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb index 94a682cef..ea128ef9c 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb @@ -11,13 +11,10 @@ "Here we take the 'Mortgage' application as an example.\n", "\n", "A few libraries are required for this notebook:\n", - " 1. NumPy\n", - " 2. cudf jar\n", - " 2. xgboost4j jar\n", - " 3. xgboost4j-spark jar\n", - " \n", - "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n", - "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)." + " 1. cudf-cu11\n", + " 2. xgboost\n", + " 3. scikit-learn\n", + " 4. numpy" ] }, { @@ -33,21 +30,17 @@ "metadata": {}, "outputs": [], "source": [ - "from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier\n", - "from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator\n", + "from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel\n", + "from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n", "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", - "from pyspark.ml.tuning import ParamGridBuilder\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.types import FloatType, IntegerType, StructField, StructType, DoubleType\n", + "from pyspark.conf import SparkConf\n", "from time import time\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As shown above, here `CrossValidator` is imported from package `ml.dmlc.xgboost4j.scala.spark.rapids`, not the spark's `tuning.CrossValidator`." + "import os\n", + "# if you ship a packed Python environment as an archive, unpack it and enable it here\n", + "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n", + "# os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\"" ] }, { @@ -61,9 +54,62 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-25 09:34:43,524 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel).\n", + "2022-11-25 09:34:43,952 WARN resource.ResourceUtils: The configuration of cores (exec = 4 task = 1, runnable tasks = 4) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n", + "2022-11-25 09:34:58,155 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 22.12.0-SNAPSHOT using cudf 22.12.0-SNAPSHOT.\n", + "2022-11-25 09:34:58,171 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n", + "2022-11-25 09:34:58,175 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n", + "2022-11-25 09:34:58,175 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n" + ] + } + ], "source": [ + "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n", + "\n", + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n", + "\n", + "# Update these values to match your real hardware resources\n", + "driverMem = os.getenv(\"DRIVER_MEM\", \"2g\")\n", + "executorMem = os.getenv(\"EXECUTOR_MEM\", \"2g\")\n", + "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n", + "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n", + "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"4\"))\n", + "# Common spark settings\n", + "conf = SparkConf()\n", + "conf.setMaster(SPARK_MASTER_URL)\n", + "conf.setAppName(\"Microbenchmark on GPU\")\n", + "conf.set(\"spark.driver.memory\", driverMem)\n", + "## The tasks mostly run in GPU memory, so there is no need for a large host memory\n", + "conf.set(\"spark.executor.memory\", executorMem)\n", + "## The tasks run on the GPU, so there is no need for many CPU cores\n", + "conf.set(\"spark.executor.cores\", executorCores)\n", + "\n", + "# Plugin settings\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n", + "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n", + "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n", + "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.7\")\n", + "conf.set(\"spark.locality.wait\",\"0\")\n", + "# note: xgboost.spark only supports spark.task.resource.gpu.amount=1, see https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n", + "conf.set(\"spark.task.resource.gpu.amount\", 1) \n", + "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n", + "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n", + "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n", + "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n", + "# if you ship a packed Python environment as an archive, unpack it and enable it here\n", + "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n", + "# Create spark session\n", + "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", + "\n", + "reader = spark.read" ] }, { @@ -117,8 +163,14 @@ "train_path = dataRoot + \"/mortgage/output/train\"\n", "eval_path = dataRoot + \"/mortgage/output/eval\"\n", "\n", - "train_data = spark.read.parquet(train_path)\n", - "trans_data = spark.read.parquet(eval_path)" + "data_format = 'parquet'\n", + "has_header = 'true'\n",
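+ "# parquet files carry their own schema; the CSV branch below needs the explicit schema and the header option\n",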
== 'csv':\n", + " train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n", + " trans_data = reader.schema(schema).option('header',has_header).csv(eval_path)\n", + "else :\n", + " train_data = reader.load(train_path)\n", + " trans_data = reader.load(eval_path)" ] }, { @@ -134,38 +186,31 @@ "metadata": {}, "outputs": [], "source": [ - "# First build a classifier of GPU version using *setFeaturesCols* to set feature columns\n", "params = { \n", - " 'eta': 0.1,\n", - " 'gamma': 0.1,\n", - " 'missing': 0.0,\n", - " 'treeMethod': 'gpu_hist',\n", - " 'maxDepth': 10, \n", - " 'maxLeaves': 256,\n", - " 'growPolicy': 'depthwise',\n", - " 'objective': 'binary:logistic',\n", - " 'minChildWeight': 30.0,\n", - " 'lambda_': 1.0,\n", - " 'scalePosWeight': 2.0,\n", - " 'subsample': 1.0,\n", - " 'nthread': 1,\n", - " 'numRound': 100,\n", - " 'numWorkers': 1,\n", + " \"tree_method\": \"gpu_hist\",\n", + " \"grow_policy\": \"depthwise\",\n", + " \"num_workers\": 1,\n", + " \"use_gpu\": \"true\",\n", "}\n", - "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)\n", + "\n", + "params['features_col'] = features\n", + "params['label_col'] = label\n", + " \n", + "classifier = SparkXGBClassifier(**params)\n", + "\n", "# Then build the evaluator and the hyperparameters\n", "evaluator = (MulticlassClassificationEvaluator()\n", " .setLabelCol(label))\n", "param_grid = (ParamGridBuilder()\n", - " .addGrid(classifier.maxDepth, [3, 6])\n", - " .addGrid(classifier.numRound, [100, 200])\n", + " .addGrid(classifier.max_depth, [3, 6])\n", + " .addGrid(classifier.n_estimators, [100, 200])\n", " .build())\n", "# Finally the corss validator\n", "cross_validator = (CrossValidator()\n", " .setEstimator(classifier)\n", " .setEvaluator(evaluator)\n", " .setEstimatorParamMaps(param_grid)\n", - " .setNumFolds(3))" + " .setNumFolds(2))" ] }, { @@ -180,11 +225,242 @@ "execution_count": 5, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-25 09:35:01,049 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", + "If features_cols param set, then features_col param is ignored.\n", + "/data/home/yuanli/work/reviews/pr252/pyspark_venv_20221125/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n", + " warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n", + "2022-11-25 09:35:26,758 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#2153, delinquency_12#2255, 1.0#2256, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#2153 could run on GPU\n", + " @Expression delinquency_12#2255 could run on GPU\n", + " @Expression 1.0#2256 could run on GPU\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#2186 cannot run on GPU because expression AttributeReference probability#2186 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#2261 cannot run on GPU because expression AttributeReference obj#2261 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#2186]\n", + " @Expression pythonUDF0#2552.prediction AS prediction#2153 could run on GPU\n", + " @Expression pythonUDF0#2552.prediction could run on GPU\n", + " @Expression pythonUDF0#2552 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#2255 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#2256 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#2552.probability) AS probability#2186 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#2552.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#2552.probability) AS probability#2186 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression UDF(pythonUDF0#2552.probability) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#2552.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n", + " @Expression pythonUDF0#2552.probability could run on GPU\n", + " @Expression pythonUDF0#2552 could run on GPU\n", + "\n", + "If features_cols param set, then features_col param is ignored. \n", + "2022-11-25 09:35:34,074 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#4415, delinquency_12#4517, 1.0#4518, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#4415 could run on GPU\n", + " @Expression delinquency_12#4517 could run on GPU\n", + " @Expression 1.0#4518 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#4448 cannot run on GPU because expression AttributeReference probability#4448 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#4523 cannot run on GPU because expression AttributeReference obj#4523 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#4448]; not all expressions can be replaced\n", + " @Expression pythonUDF0#4814.prediction AS prediction#4415 could run on GPU\n", + " @Expression pythonUDF0#4814.prediction could run on GPU\n", + " @Expression pythonUDF0#4814 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#4517 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#4518 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#4814.probability) AS probability#4448 cannot run on GPU because expression Alias UDF(pythonUDF0#4814.probability) AS probability#4448 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#4814.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#4814.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#4814.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#4814.probability could run on GPU\n", + " @Expression pythonUDF0#4814 could run on GPU\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-25 09:35:37,859 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#6677, delinquency_12#6779, 1.0#6780, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#6677 could run on GPU\n", + " @Expression delinquency_12#6779 could run on GPU\n", + " @Expression 1.0#6780 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#6710 cannot run on GPU because expression AttributeReference probability#6710 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#6785 cannot run on GPU because expression AttributeReference obj#6785 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#6710]; not all expressions can be replaced\n", + " @Expression pythonUDF0#7076.prediction AS prediction#6677 could run on GPU\n", + " @Expression pythonUDF0#7076.prediction could run on GPU\n", + " @Expression pythonUDF0#7076 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#6779 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#6780 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#7076.probability) AS probability#6710 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#7076.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#7076.probability) AS probability#6710 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression UDF(pythonUDF0#7076.probability) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#7076.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n", + " @Expression pythonUDF0#7076.probability could run on GPU\n", + " @Expression pythonUDF0#7076 could run on GPU\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-25 09:35:41,551 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#8939, delinquency_12#9041, 1.0#9042, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#8939 could run on GPU\n", + " @Expression delinquency_12#9041 could run on GPU\n", + " @Expression 1.0#9042 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#8972 cannot run on GPU because expression AttributeReference probability#8972 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#9047 cannot run on GPU because expression AttributeReference obj#9047 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#8972]; not all expressions can be replaced\n", + " @Expression pythonUDF0#9338.prediction AS prediction#8939 could run on GPU\n", + " @Expression pythonUDF0#9338.prediction could run on GPU\n", + " @Expression pythonUDF0#9338 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#9041 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#9042 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#9338.probability) AS probability#8972 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#9338.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#9338.probability) AS probability#8972 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression UDF(pythonUDF0#9338.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#9338.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#9338.probability could run on GPU\n", + " @Expression pythonUDF0#9338 could run on GPU\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-25 09:35:45,231 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#11491, delinquency_12#11593, 1.0#11594, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#11491 could run on GPU\n", + " @Expression delinquency_12#11593 could run on GPU\n", + " @Expression 1.0#11594 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#11524 cannot run on GPU because expression AttributeReference probability#11524 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#11599 cannot run on GPU because expression AttributeReference obj#11599 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#11524]\n", + " @Expression pythonUDF0#11890.prediction AS prediction#11491 could run on GPU\n", + " @Expression pythonUDF0#11890.prediction could run on GPU\n", + " @Expression pythonUDF0#11890 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#11593 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#11594 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#11890.probability) AS probability#11524 cannot run on GPU because expression Alias UDF(pythonUDF0#11890.probability) AS probability#11524 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#11890.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#11890.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#11890.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#11890.probability could run on GPU\n", + " @Expression pythonUDF0#11890 could run on GPU\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-25 09:35:49,003 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#13753, delinquency_12#13855, 1.0#13856, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#13753 could run on GPU\n", + " @Expression delinquency_12#13855 could run on GPU\n", + " @Expression 1.0#13856 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#13786 cannot run on GPU because expression AttributeReference probability#13786 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#13861 cannot run on GPU because expression AttributeReference obj#13861 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#13786]; not all expressions can be replaced\n", + " @Expression pythonUDF0#14152.prediction AS prediction#13753 could run on GPU\n", + " @Expression pythonUDF0#14152.prediction could run on GPU\n", + " @Expression pythonUDF0#14152 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#13855 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#13856 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#14152.probability) AS probability#13786 cannot run on GPU because expression Alias UDF(pythonUDF0#14152.probability) AS probability#13786 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#14152.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#14152.probability) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#14152.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n", + " @Expression pythonUDF0#14152.probability could run on GPU\n", + " @Expression pythonUDF0#14152 could run on GPU\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-25 09:35:52,578 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#16015, delinquency_12#16117, 1.0#16118, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#16015 could run on GPU\n", + " @Expression delinquency_12#16117 could run on GPU\n", + " @Expression 1.0#16118 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#16048 cannot run on GPU because expression AttributeReference probability#16048 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#16123 cannot run on GPU because expression AttributeReference obj#16123 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#16048]; not all expressions can be replaced\n", + " @Expression pythonUDF0#16414.prediction AS prediction#16015 could run on GPU\n", + " @Expression pythonUDF0#16414.prediction could run on GPU\n", + " @Expression pythonUDF0#16414 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#16117 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#16118 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#16414.probability) AS probability#16048 cannot run on GPU because expression Alias UDF(pythonUDF0#16414.probability) AS probability#16048 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#16414.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#16414.probability) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#16414.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n", + " @Expression pythonUDF0#16414.probability could run on GPU\n", + " @Expression pythonUDF0#16414 could run on GPU\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-25 09:35:56,267 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#18277, delinquency_12#18379, 1.0#18380, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#18277 could run on GPU\n", + " @Expression delinquency_12#18379 could run on GPU\n", + " @Expression 1.0#18380 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#18310 cannot run on GPU because expression AttributeReference probability#18310 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#18385 cannot run on GPU because expression AttributeReference obj#18385 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18310]; not all expressions can be replaced\n", + " @Expression pythonUDF0#18676.prediction AS prediction#18277 could run on GPU\n", + " @Expression pythonUDF0#18676.prediction could run on GPU\n", + " @Expression pythonUDF0#18676 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#18379 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#18380 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#18676.probability) AS probability#18310 cannot run on GPU because expression Alias UDF(pythonUDF0#18676.probability) AS probability#18310 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#18676.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#18676.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#18676.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#18676.probability could run on GPU\n", + " @Expression pythonUDF0#18676 could run on GPU\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "[Stage 69:> (0 + 1) / 1]\r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Cross-Validation takes 88.53 seconds\n" + "Cross-Validation takes 59.46 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" ] } ], @@ -207,22 +483,126 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-25 09:35:59,886 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#18908, probability#18974]; not all expressions can be replaced\n", + " @Expression orig_channel#56 could run on GPU\n", + " @Expression first_home_buyer#57 could run on GPU\n", + " @Expression loan_purpose#58 could run on GPU\n", + " @Expression property_type#59 could run on GPU\n", + " @Expression occupancy_status#60 could run on GPU\n", + " @Expression property_state#61 could run on GPU\n", + " @Expression product_type#62 could run on GPU\n", + " @Expression relocation_mortgage_indicator#63 could run on GPU\n", + " @Expression seller_name#64 could run on GPU\n", + " @Expression mod_flag#65 could run on GPU\n", + " @Expression orig_interest_rate#66 could run on GPU\n", + " 
@Expression orig_upb#67 could run on GPU\n", + " @Expression orig_loan_term#68 could run on GPU\n", + " @Expression orig_ltv#69 could run on GPU\n", + " @Expression orig_cltv#70 could run on GPU\n", + " @Expression num_borrowers#71 could run on GPU\n", + " @Expression dti#72 could run on GPU\n", + " @Expression borrower_credit_score#73 could run on GPU\n", + " @Expression num_units#74 could run on GPU\n", + " @Expression zip#75 could run on GPU\n", + " @Expression mortgage_insurance_percent#76 could run on GPU\n", + " @Expression current_loan_delinquency_status#77 could run on GPU\n", + " @Expression current_actual_upb#78 could run on GPU\n", + " @Expression interest_rate#79 could run on GPU\n", + " @Expression loan_age#80 could run on GPU\n", + " @Expression msa#81 could run on GPU\n", + " @Expression non_interest_bearing_upb#82 could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " !Expression UDF(pythonUDF0#19041.rawPrediction) AS rawPrediction#18908 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#19041.rawPrediction) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#19041.rawPrediction) AS rawPrediction#18908 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression UDF(pythonUDF0#19041.rawPrediction) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#19041.rawPrediction) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#19041.rawPrediction could run on GPU\n", + " @Expression pythonUDF0#19041 could run on GPU\n", + " @Expression pythonUDF0#19041.prediction AS prediction#18942 could run on GPU\n", + " @Expression pythonUDF0#19041.prediction could run on GPU\n", + " @Expression pythonUDF0#19041 could run on GPU\n", + " !Expression UDF(pythonUDF0#19041.probability) AS probability#18974 cannot run on GPU because expression Alias UDF(pythonUDF0#19041.probability) AS probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#19041.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#19041.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#19041.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#19041.probability could run on GPU\n", + " @Expression pythonUDF0#19041 could run on GPU\n", + "\n", + "2022-11-25 09:35:59,893 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#18908, probability#18974]; not all expressions can be replaced\n", + " @Expression orig_channel#56 could run on GPU\n", + " @Expression first_home_buyer#57 could run on GPU\n", + " @Expression loan_purpose#58 could run on GPU\n", + " @Expression property_type#59 could run on GPU\n", + " @Expression occupancy_status#60 could run on 
GPU\n", + " @Expression property_state#61 could run on GPU\n", + " @Expression product_type#62 could run on GPU\n", + " @Expression relocation_mortgage_indicator#63 could run on GPU\n", + " @Expression seller_name#64 could run on GPU\n", + " @Expression mod_flag#65 could run on GPU\n", + " @Expression orig_interest_rate#66 could run on GPU\n", + " @Expression orig_upb#67 could run on GPU\n", + " @Expression orig_loan_term#68 could run on GPU\n", + " @Expression orig_ltv#69 could run on GPU\n", + " @Expression orig_cltv#70 could run on GPU\n", + " @Expression num_borrowers#71 could run on GPU\n", + " @Expression dti#72 could run on GPU\n", + " @Expression borrower_credit_score#73 could run on GPU\n", + " @Expression num_units#74 could run on GPU\n", + " @Expression zip#75 could run on GPU\n", + " @Expression mortgage_insurance_percent#76 could run on GPU\n", + " @Expression current_loan_delinquency_status#77 could run on GPU\n", + " @Expression current_actual_upb#78 could run on GPU\n", + " @Expression interest_rate#79 could run on GPU\n", + " @Expression loan_age#80 could run on GPU\n", + " @Expression msa#81 could run on GPU\n", + " @Expression non_interest_bearing_upb#82 could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " !Expression rawPrediction#18908 cannot run on GPU because expression AttributeReference rawPrediction#18908 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression prediction#18942 could run on GPU\n", + " !Expression probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n", + "2022-11-25 09:36:00,975 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. 
Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + " !Exec cannot run on GPU because unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974, rawPrediction#18908]; not all expressions can be replaced\n", + " @Expression cast(delinquency_12#83 as string) AS delinquency_12#19670 could run on GPU\n", + " @Expression cast(delinquency_12#83 as string) could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression cast(rawPrediction#18908 as string) AS rawPrediction#19671 could run on GPU\n", + " !Expression cast(rawPrediction#18908 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n", + " !Expression rawPrediction#18908 cannot run on GPU because expression AttributeReference rawPrediction#18908 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression cast(probability#18974 as string) AS probability#19672 could run on GPU\n", + " !Expression cast(probability#18974 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n", + " !Expression probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression cast(prediction#18942 as string) AS prediction#19673 could run on GPU\n", + " @Expression cast(prediction#18942 as string) could run on GPU\n", + " @Expression prediction#18942 could run on GPU\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974, rawPrediction#18908]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression prediction#18942 could run on GPU\n", + " !Expression probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression rawPrediction#18908 cannot run on GPU because expression AttributeReference rawPrediction#18908 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Transforming takes 3.13 seconds\n", + "Transforming takes 1.15 seconds\n", "+--------------+--------------------+--------------------+----------+\n", "|delinquency_12| rawPrediction| probability|prediction|\n", "+--------------+--------------------+--------------------+----------+\n", - "| 0|[2.57163572311401...|[0.92901364713907...| 0.0|\n", - "| 0|[2.63977861404418...|[0.93337820470333...| 0.0|\n", - "| 0|[2.50156974792480...|[0.92425179481506...| 0.0|\n", - "| 0|[2.63977861404418...|[0.93337820470333...| 0.0|\n", - "| 0|[2.09173870086669...|[0.89009761810302...| 0.0|\n", + "| 0|[10.2152490615844...|[0.99996340274810...| 0.0|\n", + "| 0|[8.85215473175048...|[0.99985694885253...| 0.0|\n", + "| 0|[8.85215473175048...|[0.99985694885253...| 0.0|\n", + "| 0|[8.85215473175048...|[0.99985694885253...| 0.0|\n", + "| 0|[10.2152490615844...|[0.99996340274810...| 0.0|\n", "+--------------+--------------------+--------------------+----------+\n", "only showing top 5 rows\n", "\n" @@ -247,15 +627,53 @@ }, { "cell_type": "code", 
- "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-25 09:36:01,155 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#18942, delinquency_12#20148, 1.0#20149, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#18942 could run on GPU\n", + " @Expression delinquency_12#20148 could run on GPU\n", + " @Expression 1.0#20149 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#20154 cannot run on GPU because expression AttributeReference obj#20154 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974]\n", + " @Expression prediction#18942 could run on GPU\n", + " @Expression cast(delinquency_12#83 as double) AS delinquency_12#20148 could run on GPU\n", + " @Expression cast(delinquency_12#83 as double) could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression 1.0 AS 1.0#20149 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974]\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression prediction#18942 could run on GPU\n", + " !Expression probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n", + "[Stage 72:> (0 + 1) / 1]\r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Evaluation takes 0.29 seconds\n", - "Accuracy is 0.9868033296704449\n" + "Evaluation takes 1.41 seconds\n", + "Accuracy is 1.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" ] } ], @@ 
-268,7 +686,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb index e2c64c15e..e103567b4 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb @@ -9,16 +9,12 @@ "The goal of this notebook is to show how to train a XGBoost Model with Spark RAPIDS XGBoost library on GPUs. The dataset used with this notebook is derived from Fannie Mae’s Single-Family Loan Performance Data with all rights reserved by Fannie Mae. This processed dataset is redistributed with permission and consent from Fannie Mae. This notebook uses XGBoost to train 12-month mortgage loan delinquency prediction model .\n", "\n", "A few libraries required for this notebook:\n", - " 1. NumPy\n", - " 2. cudf jar\n", - " 3. xgboost4j jar\n", - " 4. xgboost4j-spark jar\n", - " 5. rapids-4-spark.jar\n", + " 1. cudf-cu11\n", + " 2. xgboost\n", + " 3. scikit-learn\n", + " 4. numpy\n", "\n", - "This notebook also illustrates the ease of porting a sample CPU based Spark xgboost4j code into GPU. There is only one change required for running Spark XGBoost on GPU. That is replacing the API `setFeaturesCol(feature)` on CPU with the new API `setFeaturesCols(features)`. This also eliminates the need for vectorization (assembling multiple feature columns in to one column) since we can read multiple columns.\n", - "\n", - "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n", - "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)." + "This notebook also illustrates the ease of porting a sample CPU-based Spark xgboost4j application to GPU. No code change is required to run Spark XGBoost on GPU, because the CPU and GPU versions call the same API. For a CPU run, the training dataset must be vectorized before fitting the classifier." ] }, { @@ -34,12 +30,24 @@ "metadata": {}, "outputs": [], "source": [ - "from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier\n", + "import os\n", + "\n", + "# if you pass/unpack the archive file and enable the environment\n", + "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel\n", "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.types import FloatType, IntegerType, StructField, StructType, DoubleType\n", - "from time import time\n", - "import os" + "from pyspark.conf import SparkConf\n", + "from time import time" ] }, { @@ -62,11 +70,68 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel).\n", + "22/11/24 06:14:05 WARN org.apache.spark.resource.ResourceUtils: The configuration of cores (exec = 4 task = 1, runnable tasks = 4) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n", + "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker\n", + "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster\n", + "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat\n", + "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator\n", + "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: RAPIDS Accelerator 22.12.0 using cudf 22.12.0.\n", + "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n", + "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n", + "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n" + ] + } + ], "source": [ - "spark = SparkSession.builder.getOrCreate()\n", + "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n", + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n", + "\n", + "# You need to update with your real hardware resource \n", + "driverMem = os.getenv(\"DRIVER_MEM\", \"10g\")\n", + "executorMem = os.getenv(\"EXECUTOR_MEM\", \"10g\")\n", + "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n", + "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n", + "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"4\"))\n", + "\n", + "# Common spark settings\n", + "conf = SparkConf()\n", + "conf.setMaster(SPARK_MASTER_URL)\n", + "conf.setAppName(\"Microbenchmark on GPU\")\n", + "conf.set(\"spark.driver.memory\", driverMem)\n", + "## The tasks will run on GPU memory, so there is no need to set a high host memory\n", + "conf.set(\"spark.executor.memory\", executorMem)\n", + "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n", + "conf.set(\"spark.executor.cores\", executorCores)\n", + "\n", + "# Plugin settings\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n", + "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n", + "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n", + "##############note: only support value=1 see https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n", + "conf.set(\"spark.task.resource.gpu.amount\", 1) \n", + "# since pyspark and xgboost share the same GPU, we need to allocate some memory to xgboost to avoid GPU OOM while training \n", + "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.6\")\n", + "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n", + "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n", + "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n", + "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.jars\", RAPIDS_JAR)\n", + "\n", + "# if you pass/unpack the archive file and enable the environment\n", + "# 
conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n", + "\n", + "# Create spark session\n", + "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", "reader = spark.read" ] }, @@ -79,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -121,8 +186,15 @@ "train_path = dataRoot + \"/mortgage/output/train\"\n", "eval_path = dataRoot + \"/mortgage/output/eval\"\n", "\n", - "train_data = reader.parquet(train_path)\n", - "trans_data = reader.parquet(eval_path)" + "data_format = 'parquet'\n", + "has_header = 'true'\n", + "if data_format == 'csv':\n", + " train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n", + " trans_data = reader.schema(schema).option('header',has_header).csv(eval_path)\n", + "else :\n", + " train_data = reader.load(train_path)\n", + " trans_data = reader.load(eval_path)\n", + " " ] }, { @@ -154,42 +226,39 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "params = { \n", - " 'eta': 0.1,\n", - " 'gamma': 0.1,\n", - " 'missing': 0.0,\n", - " 'treeMethod': 'gpu_hist',\n", - " 'maxDepth': 10, \n", - " 'maxLeaves': 256,\n", - " 'objective':'binary:logistic',\n", - " 'growPolicy': 'depthwise',\n", - " 'minChildWeight': 30.0,\n", - " 'lambda_': 1.0,\n", - " 'scalePosWeight': 2.0,\n", - " 'subsample': 1.0,\n", - " 'nthread': 1,\n", - " 'numRound': 100,\n", - " 'numWorkers': 1,\n", + " \"tree_method\": \"gpu_hist\",\n", + " \"grow_policy\": \"depthwise\",\n", + " \"num_workers\": 1,\n", + " \"use_gpu\": \"true\",\n", "}\n", - "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)" + "params['features_col'] = features\n", + "params['label_col'] = label\n", + " \n", + "classifier = SparkXGBClassifier(**params)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The CPU version classifier provides the API `setFeaturesCol` which only accepts a single column name, so vectorization for multiple feature columns is required.\n", - "```Python\n", - "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCol('features')\n", - "```\n", - "\n", "The parameter `num_workers` should be set to the number of GPUs in Spark cluster for GPU version, while for CPU version it is usually equal to the number of the CPU cores.\n", "\n", - "Concerning the tree method, GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training." + "Concerning the tree method, GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training.\n", + "\n", + "An example of CPU classifier:\n", + "```\n", + "classifier = SparkXGBClassifier(\n", + " feature_col=features,\n", + " label_col=label, \n", + " num_workers=1024,\n", + " use_gpu=False,\n", + ")\n", + "```" ] }, { @@ -201,14 +270,42 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "If features_cols param set, then features_col param is ignored.\n", + "22/11/24 06:14:44 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. 
This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", + "[Stage 12:> (0 + 1) / 1]\r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Training takes 25.67 seconds\n" + "[06:15:10] WARNING: ../src/learner.cc:553: \n", + " If you are loading a serialized model (like pickle in Python, RDS in R) generated by\n", + " older XGBoost, please export the model by calling `Booster.save_model` from that version\n", + " first, then load it back in current version. See:\n", + "\n", + " https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html\n", + "\n", + " for more details about differences between saving model and serializing.\n", + "\n", + "Training takes 28.6 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r", + "/home/yuali_nvidia_com/.local/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n", + " warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n" ] } ], @@ -231,12 +328,29 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "If features_cols param set, then features_col param is ignored.\n", + " \r" + ] + } + ], + "source": [ + "model.write().overwrite().save(dataRoot + '/model/mortgage')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "model.write().overwrite().save(dataRoot + '/mortgage/model')\n", - "loaded_model = XGBoostClassificationModel().load(dataRoot + '/mortgage/model')" + "loaded_model = SparkXGBClassifierModel().load(dataRoot + '/model/mortgage')" ] }, { @@ -248,22 +362,126 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22/11/24 06:15:13 WARN com.nvidia.spark.rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#209, probability#275]\n", + " @Expression orig_channel#56 could run on GPU\n", + " @Expression first_home_buyer#57 could run on GPU\n", + " @Expression loan_purpose#58 could run on GPU\n", + " @Expression property_type#59 could run on GPU\n", + " @Expression occupancy_status#60 could run on GPU\n", + " @Expression property_state#61 could run on GPU\n", + " @Expression product_type#62 could run on GPU\n", + " @Expression relocation_mortgage_indicator#63 could run on GPU\n", + " @Expression seller_name#64 could run on GPU\n", + " @Expression mod_flag#65 could run on GPU\n", + " @Expression orig_interest_rate#66 could run on GPU\n", + " @Expression orig_upb#67 could run on GPU\n", + " @Expression orig_loan_term#68 could run on GPU\n", + " @Expression orig_ltv#69 could run on GPU\n", + " @Expression orig_cltv#70 could run on GPU\n", + " @Expression num_borrowers#71 could run on GPU\n", + " @Expression dti#72 could run on GPU\n", + " @Expression borrower_credit_score#73 could run on GPU\n", + " @Expression num_units#74 could run on GPU\n", + " @Expression zip#75 could run on GPU\n", + " @Expression mortgage_insurance_percent#76 could run on GPU\n", + " @Expression current_loan_delinquency_status#77 could run on GPU\n", + " @Expression current_actual_upb#78 could run on GPU\n", + " @Expression interest_rate#79 could run on GPU\n", + " 
@Expression loan_age#80 could run on GPU\n", + " @Expression msa#81 could run on GPU\n", + " @Expression non_interest_bearing_upb#82 could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " !Expression UDF(pythonUDF0#342.rawPrediction) AS rawPrediction#209 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#342.rawPrediction) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#342.rawPrediction) AS rawPrediction#209 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression UDF(pythonUDF0#342.rawPrediction) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#342.rawPrediction) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3898/645590696 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n", + " @Expression pythonUDF0#342.rawPrediction could run on GPU\n", + " @Expression pythonUDF0#342 could run on GPU\n", + " @Expression pythonUDF0#342.prediction AS prediction#243 could run on GPU\n", + " @Expression pythonUDF0#342.prediction could run on GPU\n", + " @Expression pythonUDF0#342 could run on GPU\n", + " !Expression UDF(pythonUDF0#342.probability) AS probability#275 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#342.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#342.probability) AS probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression UDF(pythonUDF0#342.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3898/645590696 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#342.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#342.probability could run on GPU\n", + " @Expression pythonUDF0#342 could run on GPU\n", + "\n", + "22/11/24 06:15:13 WARN com.nvidia.spark.rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#209, probability#275]\n", + " @Expression orig_channel#56 could run on GPU\n", + " @Expression first_home_buyer#57 could run on GPU\n", + " @Expression loan_purpose#58 could run on GPU\n", + " @Expression property_type#59 could run on GPU\n", + " @Expression occupancy_status#60 could run on GPU\n", + " @Expression property_state#61 could run on GPU\n", + " @Expression product_type#62 could run on GPU\n", + " @Expression relocation_mortgage_indicator#63 could run on GPU\n", + " @Expression seller_name#64 could run on GPU\n", + " @Expression mod_flag#65 could run on GPU\n", + " @Expression orig_interest_rate#66 could run on GPU\n", + " @Expression orig_upb#67 could run on GPU\n", + " @Expression orig_loan_term#68 could run on GPU\n", + " @Expression orig_ltv#69 could run on GPU\n", + " @Expression orig_cltv#70 could run on GPU\n", + " @Expression num_borrowers#71 could run on GPU\n", + " @Expression dti#72 could run on GPU\n", + " @Expression borrower_credit_score#73 could run on GPU\n", + " @Expression num_units#74 could run on 
GPU\n", + " @Expression zip#75 could run on GPU\n", + " @Expression mortgage_insurance_percent#76 could run on GPU\n", + " @Expression current_loan_delinquency_status#77 could run on GPU\n", + " @Expression current_actual_upb#78 could run on GPU\n", + " @Expression interest_rate#79 could run on GPU\n", + " @Expression loan_age#80 could run on GPU\n", + " @Expression msa#81 could run on GPU\n", + " @Expression non_interest_bearing_upb#82 could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " !Expression rawPrediction#209 cannot run on GPU because expression AttributeReference rawPrediction#209 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression prediction#243 could run on GPU\n", + " !Expression probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n", + "22/11/24 06:15:28 WARN com.nvidia.spark.rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + " !Exec cannot run on GPU because not all expressions can be replaced; unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275, rawPrediction#209]\n", + " @Expression cast(delinquency_12#83 as string) AS delinquency_12#971 could run on GPU\n", + " @Expression cast(delinquency_12#83 as string) could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression cast(rawPrediction#209 as string) AS rawPrediction#972 could run on GPU\n", + " !Expression cast(rawPrediction#209 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n", + " !Expression rawPrediction#209 cannot run on GPU because expression AttributeReference rawPrediction#209 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression cast(probability#275 as string) AS probability#973 could run on GPU\n", + " !Expression cast(probability#275 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n", + " !Expression probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression cast(prediction#243 as string) AS prediction#974 could run on GPU\n", + " @Expression cast(prediction#243 as string) could run on GPU\n", + " @Expression prediction#243 could run on GPU\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275, rawPrediction#209]; not all expressions can be replaced\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression prediction#243 could run on GPU\n", + " !Expression probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression rawPrediction#209 cannot run on 
GPU because expression AttributeReference rawPrediction#209 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Transformation takes 11.39 seconds\n", + "Transformation takes 15.62 seconds\n", "+--------------+--------------------+--------------------+----------+\n", "|delinquency_12| rawPrediction| probability|prediction|\n", "+--------------+--------------------+--------------------+----------+\n", - "| 0|[7.76566505432128...|[0.99957613222068...| 0.0|\n", - "| 0|[4.50240230560302...|[0.98903913144022...| 0.0|\n", - "| 0|[4.50240230560302...|[0.98903913144022...| 0.0|\n", - "| 0|[4.50240230560302...|[0.98903913144022...| 0.0|\n", - "| 0|[4.50240230560302...|[0.98903913144022...| 0.0|\n", + "| 0|[8.84631538391113...|[0.99985611438751...| 0.0|\n", + "| 0|[9.41864871978759...|[0.99991881847381...| 0.0|\n", + "| 0|[9.41864871978759...|[0.99991881847381...| 0.0|\n", + "| 0|[9.41864871978759...|[0.99991881847381...| 0.0|\n", + "| 0|[8.84631538391113...|[0.99985611438751...| 0.0|\n", "+--------------+--------------------+--------------------+----------+\n", "only showing top 5 rows\n", "\n" @@ -288,40 +506,83 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def check_classification_accuracy(data_frame, label):\n", + " accuracy = (MulticlassClassificationEvaluator()\n", + " .setLabelCol(label)\n", + " .evaluate(data_frame))\n", + " print('-' * 100)\n", + " print('Accuracy is ' + str(accuracy))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22/11/24 06:15:28 WARN com.nvidia.spark.rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#243, delinquency_12#1450, 1.0#1449, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#243 could run on GPU\n", + " @Expression delinquency_12#1450 could run on GPU\n", + " @Expression 1.0#1449 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#1455 cannot run on GPU because expression AttributeReference obj#1455 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275]; unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275]; not all expressions can be replaced\n", + " @Expression prediction#243 could run on GPU\n", + " @Expression cast(delinquency_12#83 as double) AS delinquency_12#1450 could run on GPU\n", + " @Expression cast(delinquency_12#83 as double) could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression 1.0 AS 1.0#1449 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275]; not all expressions can be replaced\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression prediction#243 could run on GPU\n", + " !Expression probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n", + "[Stage 19:> (0 + 1) / 1]\r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Evaluation takes 1.03 seconds\n", - "Accuracy is 0.9876786703104035\n" + "----------------------------------------------------------------------------------------------------\n", + "Accuracy is 1.0\n", + "Evaluation takes 2.29 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" ] } ], "source": [ - "accuracy = with_benchmark(\n", - " 'Evaluation',\n", - " lambda: MulticlassClassificationEvaluator().setLabelCol(label).evaluate(result))\n", - "print('Accuracy is ' + str(accuracy))" + "with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label))" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "spark.stop()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb index 794a0fa36..fb4128d35 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb @@ -16,18 +16,18 @@ "source": [ "## Prerequirement\n", "### 1. 
Download data\n", - "\n", - "Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", + "\n", + "Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.12/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", "\n", "### 2. Download needed jars\n", - "* [rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar)\n", + "* [rapids-4-spark_2.12-22.12.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar)\n", "\n", "### 3. Start Spark Standalone\n", "Before Running the script, please setup Spark standalone mode\n", "\n", "### 4. Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-22.10.0.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-22.12.0.jar\n", "\n", "```\n", "\n", diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py index 1cca6e6d8..eefa7358c 100644 --- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py +++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -278,19 +278,3 @@ 'non_interest_bearing_upb', 'delinquency_12', ] - -default_params = { - 'eta': 0.1, - 'gamma': 0.1, - 'missing': 0.0, - 'maxDepth': 10, - 'maxLeaves': 256, - 'growPolicy': 'depthwise', - 'minChildWeight': 30.0, - 'lambda_': 1.0, - 'scalePosWeight': 2.0, - 'subsample': 1.0, - 'nthread': 1, - 'numRound': 100, - 'numWorkers': 1, -} diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cpu_cross_validator_main.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cpu_cross_validator_main.py deleted file mode 100644 index f5924957e..000000000 --- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cpu_cross_validator_main.py +++ /dev/null @@ -1,77 +0,0 @@ -# -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from com.nvidia.spark.examples.mortgage.consts import * -from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator -from pyspark.ml.tuning import ParamGridBuilder -from pyspark.sql import SparkSession - -def main(args, xgboost_args): - spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) - - train_data, eval_data, trans_data = valid_input_data(spark, args, '', schema) - features = [x.name for x in schema if x.name != label] - - if args.mode in [ 'all', 'train' ]: - classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCol('features')) - evaluator = (MulticlassClassificationEvaluator() - .setLabelCol(label)) - param_grid = (ParamGridBuilder() - .addGrid(classifier.maxDepth, [5, 10]) - .addGrid(classifier.numRound, [100, 200]) - .build()) - cross_validator = (CrossValidator() - .setEstimator(classifier) - .setEvaluator(evaluator) - .setEstimatorParamMaps(param_grid) - .setNumFolds(3)) - if not train_data: - print('-' * 80) - print('Usage: training data path required when mode is all or train') - exit(1) - - train_data = vectorize_data_frame(train_data, label) - model = with_benchmark('Training', lambda: cross_validator.fit(train_data)) - # get the best model to do transform - model = model.bestModel - if args.modelPath: - writer = model.write().overwrite() if args.overwrite else model - writer.save(args.modelPath) - else: - model = XGBoostClassificationModel().load(args.modelPath) - - if args.mode in [ 'all', 'transform' ]: - def transform(): - vec_df = vectorize_data_frame(trans_data, label) - result = model.transform(vec_df).cache() - result.foreachPartition(lambda _: None) - return result - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) - result = with_benchmark('Transformation', transform) - show_sample(args, result, label) - with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) - - spark.stop() diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cpu_main.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cpu_main.py deleted file mode 100644 index e2edd80e0..000000000 --- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cpu_main.py +++ /dev/null @@ -1,64 +0,0 @@ -# -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from com.nvidia.spark.examples.mortgage.consts import * -from com.nvidia.spark.examples.mortgage.etl import etl -from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from pyspark.sql import SparkSession - -def main(args, xgboost_args): - spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) - - train_data, eval_data, trans_data = valid_input_data(spark, args, '', schema) - - if args.mode in [ 'all', 'train' ]: - classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCol('features')) - if eval_data: - eval_data = vectorize_data_frame(eval_data, label) - classifier.setEvalSets({ 'test': eval_data }) - if not train_data: - print('-' * 80) - print('Usage: training data path required when mode is all or train') - exit(1) - train_data = vectorize_data_frame(train_data, label) - model = with_benchmark('Training', lambda: classifier.fit(train_data)) - - if args.modelPath: - writer = model.write().overwrite() if args.overwrite else model - writer.save(args.modelPath) - else: - model = XGBoostClassificationModel().load(args.modelPath) - - if args.mode in [ 'all', 'transform' ]: - def transform(): - vec_df = vectorize_data_frame(trans_data, label) - result = model.transform(vec_df).cache() - result.foreachPartition(lambda _: None) - return result - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) - result = with_benchmark('Transformation', transform) - show_sample(args, result, label) - with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) - - spark.stop() diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/gpu_cross_validator_main.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cross_validator_main.py similarity index 68% rename from examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/gpu_cross_validator_main.py rename to examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cross_validator_main.py index c717d3d59..b6305a893 100644 --- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/gpu_cross_validator_main.py +++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cross_validator_main.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,32 +13,41 @@ # See the License for the specific language governing permissions and # limitations under the License. 
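The rewritten entry points below call a `transform_data(df, label, use_gpu)` helper from the shared utility module; its body is not part of this diff. A rough sketch of what such a helper could look like, assuming it assembles a vector column for CPU runs and passes the raw feature columns through for GPU runs:

```python
from pyspark.ml.feature import VectorAssembler

def transform_data(df, label, use_gpu):
    # Hypothetical reimplementation; the real helper lives in
    # com.nvidia.spark.examples.utility.utils and may differ.
    features = [c for c in df.columns if c != label]
    if use_gpu:
        # GPU estimators accept the feature columns directly
        return df, features
    # CPU path: assemble the features into a single vector column
    df = VectorAssembler(inputCols=features, outputCol='features').transform(df)
    return df.select('features', label), 'features'
```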
# -from pyspark.ml.tuning import ParamGridBuilder +from pyspark.ml.tuning import ParamGridBuilder, CrossValidator -from com.nvidia.spark.examples.mortgage.consts import * +from .consts import * from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator from pyspark.sql import SparkSession +from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel + + def main(args, xgboost_args): spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) + .builder + .appName(args.mainClass) + .getOrCreate()) train_data, eval_data, trans_data = valid_input_data(spark, args, '', schema) - features = [x.name for x in schema if x.name != label] - if args.mode in [ 'all', 'train' ]: - classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCols(features)) + if args.mode in ['all', 'train']: + if train_data is None: + print('-' * 80) + print('Usage: training data path required when mode is all or train') + exit(1) + + train_data, features = transform_data(train_data, label, args.use_gpu) + xgboost_args['features_col'] = features + xgboost_args['label_col'] = label + + classifier = SparkXGBClassifier(**xgboost_args) + evaluator = (MulticlassClassificationEvaluator() .setLabelCol(label)) + param_grid = (ParamGridBuilder() - .addGrid(classifier.maxDepth, [5, 10]) - .addGrid(classifier.numRound, [100, 200]) + .addGrid(classifier.max_depth, [6, 8]) + .addGrid(classifier.n_estimators, [20, 40]) .build()) cross_validator = (CrossValidator() .setEstimator(classifier) @@ -57,17 +66,21 @@ def main(args, xgboost_args): writer = model.write().overwrite() if args.overwrite else model writer.save(args.modelPath) else: - model = XGBoostClassificationModel().load(args.modelPath) + model = SparkXGBClassifierModel.load(args.modelPath) - if args.mode in [ 'all', 'transform' ]: - def transform(): - result = model.transform(trans_data).cache() - result.foreachPartition(lambda _: None) - return result + if args.mode in ['all', 'transform']: if not trans_data: print('-' * 80) print('Usage: trans data path required when mode is all or transform') exit(1) + + trans_data, _ = transform_data(trans_data, label, args.use_gpu) + + def transform(): + result = model.transform(trans_data).cache() + result.foreachPartition(lambda _: None) + return result + result = with_benchmark('Transformation', transform) show_sample(args, result, label) with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py index 47052737c..d59279d67 100644 --- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py +++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
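The cross-validator hunk above also shows the parameter renaming that runs through the whole migration: the camelCase names of the old xgboost4j-spark estimators become the sklearn-style names of `xgboost.spark`. The pairs that actually appear in this diff (illustrative, not exhaustive):

```python
# old xgboost4j-spark name -> new xgboost.spark (sklearn-style) name
param_name_map = {
    'maxDepth':   'max_depth',
    'numRound':   'n_estimators',
    'numWorkers': 'num_workers',
    'treeMethod': 'tree_method',
    'growPolicy': 'grow_policy',
}
```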
 #
-from com.nvidia.spark.examples.mortgage.consts import *
+from .consts import *
 from pyspark.sql.functions import *
 from pyspark.sql.types import *
 from pyspark.sql.window import Window
diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py
index 55f5df5fc..ee09604ba 100644
--- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py
+++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py
@@ -13,16 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from com.nvidia.spark.examples.mortgage.consts import *
-from com.nvidia.spark.examples.mortgage.etl import etl, extract_paths
+from .etl import etl, extract_paths
 from com.nvidia.spark.examples.utility.utils import *
 from pyspark.sql import SparkSession
+
 def main(args, xgboost_args):
     spark = (SparkSession
-        .builder
-        .appName(args.mainClass)
-        .getOrCreate())
+             .builder
+             .appName(args.mainClass)
+             .getOrCreate())
     etled_df = etl(spark, args)
     # there should be exactly one 'out::' output path
     outPath = extract_paths(args.dataPaths, 'out::')[0]
diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/gpu_main.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/main.py
similarity index 69%
rename from examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/gpu_main.py
rename to examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/main.py
index 7a4b2e06f..021887e4f 100644
--- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/gpu_main.py
+++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/main.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,48 +13,58 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
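`etl_main.py` above relies on the convention that every path passed via `dataPaths` is tagged with a role prefix such as `train::`, `trans::`, `raw::`, or `out::`. The real `extract_paths` lives in the utility module; the snippet below is a minimal reimplementation consistent with how it is used here, with hypothetical paths:

```python
def extract_paths(paths, prefix):
    # keep only the paths tagged with the given role prefix, stripping the tag
    return [p[len(prefix):] for p in paths if p.startswith(prefix)]

data_paths = ['train::/data/mortgage/train', 'out::/data/mortgage/out']
assert extract_paths(data_paths, 'out::') == ['/data/mortgage/out']
```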
# -from com.nvidia.spark.examples.mortgage.consts import * -from com.nvidia.spark.examples.mortgage.etl import etl + +from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel + +from .consts import * from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * from pyspark.sql import SparkSession + def main(args, xgboost_args): spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) + .builder + .appName(args.mainClass) + .getOrCreate()) train_data, eval_data, trans_data = valid_input_data(spark, args, '', schema) - features = [x.name for x in schema if x.name != label] - if args.mode in [ 'all', 'train' ]: - classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCols(features)) - if eval_data: - classifier.setEvalSets({ 'test': eval_data }) - if not train_data: + if args.mode in ['all', 'train']: + if train_data is None: print('-' * 80) print('Usage: training data path required when mode is all or train') exit(1) + + train_data, features = transform_data(train_data, label, args.use_gpu) + xgboost_args['features_col'] = features + xgboost_args['label_col'] = label + classifier = SparkXGBClassifier(**xgboost_args) + + if eval_data: + # TODO + pass + model = with_benchmark('Training', lambda: classifier.fit(train_data)) if args.modelPath: writer = model.write().overwrite() if args.overwrite else model writer.save(args.modelPath) else: - model = XGBoostClassificationModel().load(args.modelPath) + model = SparkXGBClassifierModel.load(args.modelPath) + + if args.mode in ['all', 'transform']: + trans_data, _ = transform_data(trans_data, label, args.use_gpu) - if args.mode in [ 'all', 'transform' ]: def transform(): result = model.transform(trans_data).cache() result.foreachPartition(lambda _: None) return result + if not trans_data: print('-' * 80) print('Usage: trans data path required when mode is all or transform') exit(1) + result = with_benchmark('Transformation', transform) show_sample(args, result, label) with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) diff --git a/examples/XGBoost-Examples/pack_pyspark_example.sh b/examples/XGBoost-Examples/pack_pyspark_example.sh new file mode 100755 index 000000000..e446d27da --- /dev/null +++ b/examples/XGBoost-Examples/pack_pyspark_example.sh @@ -0,0 +1,6 @@ +# Follow these steps to package the Python zip file +rm -fr samples.zip +cd agaricus/python ; zip -r ../../samples.zip com ; cd ../.. +cd mortgage/python ; zip -r ../../samples.zip com ; cd ../.. +cd taxi/python ; zip -r ../../samples.zip com ; cd ../.. +cd utility/python ; zip -r ../../samples.zip com ; cd ../.. diff --git a/examples/XGBoost-Examples/pom.xml b/examples/XGBoost-Examples/pom.xml index 9d040878d..d6977f8c5 100644 --- a/examples/XGBoost-Examples/pom.xml +++ b/examples/XGBoost-Examples/pom.xml @@ -38,7 +38,7 @@ UTF-8 - 1.6.1 + 1.7.1 3.1.1 2.12.8 2.12 diff --git a/examples/XGBoost-Examples/taxi/notebooks/python/cv-taxi-gpu.ipynb b/examples/XGBoost-Examples/taxi/notebooks/python/cv-taxi-gpu.ipynb index f7530c133..829d3c541 100644 --- a/examples/XGBoost-Examples/taxi/notebooks/python/cv-taxi-gpu.ipynb +++ b/examples/XGBoost-Examples/taxi/notebooks/python/cv-taxi-gpu.ipynb @@ -11,13 +11,10 @@ "Here takes the application 'Taxi' as an example.\n", "\n", "A few libraries are required for this notebook:\n", - " 1. NumPy\n", - " 2. cudf jar\n", - " 2. xgboost4j jar\n", - " 3. 
xgboost4j-spark jar\n", - " \n", - "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n", - "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)." + " 1. cudf-cu11\n", + " 2. xgboost\n", + " 3. scikit-learn\n", + " 4. numpy" ] }, { @@ -33,21 +30,16 @@ "metadata": {}, "outputs": [], "source": [ - "from ml.dmlc.xgboost4j.scala.spark import XGBoostRegressionModel, XGBoostRegressor\n", - "from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator\n", + "from xgboost.spark import SparkXGBRegressor, SparkXGBRegressorModel\n", + "from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n", "from pyspark.ml.evaluation import RegressionEvaluator\n", - "from pyspark.ml.tuning import ParamGridBuilder\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.types import FloatType, IntegerType, StructField, StructType\n", "from time import time\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As shown above, here `CrossValidator` is imported from package `ml.dmlc.xgboost4j.scala.spark.rapids`, not the spark's `tuning.CrossValidator`." + "from pyspark.conf import SparkConf\n", + "import os\n", + "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n", + "# os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\"" ] }, { @@ -61,9 +53,64 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 08:02:09,748 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "2022-11-30 08:02:10,103 WARN resource.ResourceUtils: The configuration of cores (exec = 2 task = 1, runnable tasks = 2) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n", + "2022-11-30 08:02:23,737 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 22.12.0 using cudf 22.12.0.\n", + "2022-11-30 08:02:23,752 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n", + "2022-11-30 08:02:23,756 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n", + "2022-11-30 08:02:23,757 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. 
Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n",
+      "2022-11-30 08:02:24,226 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n"
+     ]
+    }
+   ],
    "source": [
-    "spark = SparkSession.builder.appName(\"taxi-cv-gpu-python\").getOrCreate()"
+    "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n",
+    "\n",
+    "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n",
+    "\n",
+    "# You need to update these to match your real hardware resources\n",
+    "driverMem = os.getenv(\"DRIVER_MEM\", \"2g\")\n",
+    "executorMem = os.getenv(\"EXECUTOR_MEM\", \"2g\")\n",
+    "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n",
+    "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n",
+    "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"2\"))\n",
+    "# Common spark settings\n",
+    "conf = SparkConf()\n",
+    "conf.setMaster(SPARK_MASTER_URL)\n",
+    "conf.setAppName(\"Microbenchmark on GPU\")\n",
+    "conf.set(\"spark.executor.instances\",\"1\")\n",
+    "conf.set(\"spark.driver.memory\", driverMem)\n",
+    "## The tasks will run on GPU memory, so there is no need to set a high host memory\n",
+    "conf.set(\"spark.executor.memory\", executorMem)\n",
+    "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n",
+    "conf.set(\"spark.executor.cores\", executorCores)\n",
+    "\n",
+    "# Plugin settings\n",
+    "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n",
+    "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n",
+    "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n",
+    "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.7\")\n",
+    "conf.set(\"spark.locality.wait\",\"0\")\n",
+    "# note: only value=1 is supported, see https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n",
+    "conf.set(\"spark.task.resource.gpu.amount\", 1) \n",
+    "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n",
+    "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n",
+    "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n",
+    "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n",
+    "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n",
+    "# if you pass/unpack the archive file and enable the environment\n",
+    "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n",
+    "# Create spark session\n",
+    "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
+    "\n",
+    "reader = spark.read"
   ]
  },
@@ -103,8 +150,17 @@
     "\n",
     "# You need to update them to your real paths!\n",
     "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
-    "train_data = spark.read.parquet(dataRoot + '/taxi/parquet/train')\n",
-    "trans_data = spark.read.parquet(dataRoot + '/taxi/parquet/eval')"
+    "train_path = dataRoot + \"/taxi/csv/train\"\n",
+    "eval_path = dataRoot + \"/taxi/csv/test\"\n",
+    "\n",
+    "data_format = 'csv'\n",
+    "has_header = 'true'\n",
+    "if data_format == 'csv':\n",
+    "    train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n",
+    "    trans_data = reader.schema(schema).option('header',has_header).csv(eval_path)\n",
+    "else:\n",
+    "    train_data = reader.load(train_path)\n",
+    "    trans_data = reader.load(eval_path)"
   ]
  },
@@ -121,29 +177,29 @@
    "outputs": [],
    "source": [
     "# First build a regressor of the GPU version; the feature columns are set through the *features_col* parameter\n",
-    "params = {\n",
-    "    'eta': 0.05,\n",
-    " 
'maxDepth': 8,\n",
-    "    'subsample': 0.8,\n",
-    "    'gamma': 1.0,\n",
-    "    'numRound': 100,\n",
-    "    'numWorkers': 1,\n",
-    "    'treeMethod': 'gpu_hist',\n",
+    "params = { \n",
+    "    \"tree_method\": \"gpu_hist\",\n",
+    "    \"grow_policy\": \"depthwise\",\n",
+    "    \"num_workers\": 1,\n",
+    "    \"use_gpu\": \"true\",\n",
     "}\n",
-    "regressor = XGBoostRegressor(**params).setLabelCol(label).setFeaturesCols(features)\n",
+    "params['features_col'] = features\n",
+    "params['label_col'] = label\n",
+    "\n",
+    "regressor = SparkXGBRegressor(**params)\n",
     "# Then build the evaluator and the hyperparameters\n",
     "evaluator = (RegressionEvaluator()\n",
     "             .setLabelCol(label))\n",
     "param_grid = (ParamGridBuilder()\n",
-    "              .addGrid(regressor.maxDepth, [3, 6])\n",
-    "              .addGrid(regressor.numRound, [100, 200])\n",
+    "              .addGrid(regressor.max_depth, [3, 6])\n",
+    "              .addGrid(regressor.n_estimators, [100, 200])\n",
     "              .build())\n",
     "# Finally the cross validator\n",
     "cross_validator = (CrossValidator()\n",
     "                   .setEstimator(regressor)\n",
     "                   .setEvaluator(evaluator)\n",
     "                   .setEstimatorParamMaps(param_grid)\n",
-    "                   .setNumFolds(3))"
+    "                   .setNumFolds(2))"
   ]
  },
@@ -158,11 +214,108 @@
    "execution_count": 5,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "/data/home/yuanli/work/reviews/pr252/pyspark_venv_20221125/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n",
+      "  warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n",
+      "2022-11-30 08:03:14,308 WARN rapids.GpuOverrides: \n",
+      "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! createexternalrow(prediction#889, fare_amount#890, 1.0#891, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression prediction#889 could run on GPU\n",
+      "    @Expression fare_amount#890 could run on GPU\n",
+      "    @Expression 1.0#891 could run on GPU\n",
+      "    !Expression obj#895 cannot run on GPU because expression AttributeReference obj#895 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n",
+      "2022-11-30 08:03:14,317 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-30 08:03:20,073 WARN rapids.GpuOverrides: \n",
+      "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! 
createexternalrow(prediction#1789, fare_amount#1790, 1.0#1791, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#1789 could run on GPU\n", + " @Expression fare_amount#1790 could run on GPU\n", + " @Expression 1.0#1791 could run on GPU\n", + " !Expression obj#1795 cannot run on GPU because expression AttributeReference obj#1795 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-30 08:03:23,687 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#2689, fare_amount#2690, 1.0#2691, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#2689 could run on GPU\n", + " @Expression fare_amount#2690 could run on GPU\n", + " @Expression 1.0#2691 could run on GPU\n", + " !Expression obj#2695 cannot run on GPU because expression AttributeReference obj#2695 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-30 08:03:27,457 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#3589, fare_amount#3590, 1.0#3591, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#3589 could run on GPU\n", + " @Expression fare_amount#3590 could run on GPU\n", + " @Expression 1.0#3591 could run on GPU\n", + " !Expression obj#3595 cannot run on GPU because expression AttributeReference obj#3595 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-30 08:03:30,964 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! 
createexternalrow(prediction#4659, fare_amount#4660, 1.0#4661, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#4659 could run on GPU\n", + " @Expression fare_amount#4660 could run on GPU\n", + " @Expression 1.0#4661 could run on GPU\n", + " !Expression obj#4665 cannot run on GPU because expression AttributeReference obj#4665 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-30 08:03:34,524 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#5559, fare_amount#5560, 1.0#5561, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#5559 could run on GPU\n", + " @Expression fare_amount#5560 could run on GPU\n", + " @Expression 1.0#5561 could run on GPU\n", + " !Expression obj#5565 cannot run on GPU because expression AttributeReference obj#5565 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-30 08:03:38,067 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#6459, fare_amount#6460, 1.0#6461, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#6459 could run on GPU\n", + " @Expression fare_amount#6460 could run on GPU\n", + " @Expression 1.0#6461 could run on GPU\n", + " !Expression obj#6465 cannot run on GPU because expression AttributeReference obj#6465 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-30 08:03:41,793 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! 
createexternalrow(prediction#7359, fare_amount#7360, 1.0#7361, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#7359 could run on GPU\n", + " @Expression fare_amount#7360 could run on GPU\n", + " @Expression 1.0#7361 could run on GPU\n", + " !Expression obj#7365 cannot run on GPU because expression AttributeReference obj#7365 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "If features_cols param set, then features_col param is ignored.\n", + "[Stage 34:> (0 + 1) / 1]\r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Cross-Validation takes 73.77 seconds\n" + "Cross-Validation takes 55.19 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" ] } ], @@ -192,16 +345,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "Transforming takes 1.33 seconds\n", - "+-----------+-----------------+\n", - "|fare_amount| prediction|\n", - "+-----------+-----------------+\n", - "| 2.5|34.38509750366211|\n", - "| 45.0|37.97528839111328|\n", - "| 2.5|28.55727195739746|\n", - "| 45.0|40.39316177368164|\n", - "| 45.0|36.12188720703125|\n", - "+-----------+-----------------+\n", + "Transforming takes 0.23 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 08:03:45,503 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+-----------+\n", + "|fare_amount| prediction|\n", + "+-----------+-----------+\n", + "| 5.0| 5.01032114|\n", + "| 34.0| 31.134758|\n", + "| 10.0|9.288980484|\n", + "| 16.5|15.33446312|\n", + "| 7.0|8.197098732|\n", + "+-----------+-----------+\n", "only showing top 5 rows\n", "\n" ] @@ -232,8 +401,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Evaluation takes 0.26 seconds\n", - "RMSE is 3.5167114187894883\n" + "Evaluation takes 0.05 seconds\n", + "RMSE is 2.055690464034438\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 08:03:45,728 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! 
createexternalrow(prediction#7645, fare_amount#8271, 1.0#8272, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression prediction#7645 could run on GPU\n",
+      "    @Expression fare_amount#8271 could run on GPU\n",
+      "    @Expression 1.0#8272 could run on GPU\n",
+      "    !Expression obj#8276 cannot run on GPU because expression AttributeReference obj#8276 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n"
+     ]
+    }
+   ],
diff --git a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb
index 54b181513..c41e3dd72 100644
--- a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb
+++ b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb
@@ -19,14 +19,14 @@
     "All data could be found at https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page\n",
     "\n",
     "### 2. Download needed jars\n",
-    "* [rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar)\n",
+    "* [rapids-4-spark_2.12-22.12.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar)\n",
     "\n",
     "### 3. Start Spark Standalone\n",
     "Before running the script, please set up Spark standalone mode\n",
     "\n",
     "### 4. Add ENV\n",
     "```\n",
-    "$ export SPARK_JARS=rapids-4-spark_2.12-22.10.0.jar\n",
+    "$ export SPARK_JARS=rapids-4-spark_2.12-22.12.0.jar\n",
     "$ export PYSPARK_DRIVER_PYTHON=jupyter \n",
     "$ export PYSPARK_DRIVER_PYTHON_OPTS=notebook\n",
     "```\n",
diff --git a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-gpu.ipynb b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-gpu.ipynb
index 3fdfa540a..593d381d2 100644
--- a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-gpu.ipynb
+++ b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-gpu.ipynb
@@ -4,21 +4,17 @@
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "# Introduction to XGBoost Spark3.0 with GPU\n",
+    "# Introduction to XGBoost Spark3.1 with GPU\n",
     "\n",
     "Taxi is an example of an xgboost regressor. This notebook will show you how to load data, train the xgboost model and use this model to predict the \"fare_amount\" of your taxi trip.\n",
     "\n",
     "A few libraries are required for this notebook:\n",
-    " 1. NumPy\n",
-    " 2. cudf jar\n",
-    " 3. xgboost4j jar\n",
-    " 4. xgboost4j-spark jar\n",
-    " 5. rapids-4-spark.jar \n",
+    " 1. cudf-cu11\n",
+    " 2. xgboost\n",
+    " 3. scikit-learn\n",
+    " 4. numpy\n",
     "\n",
-    "This notebook also illustrates the ease of porting a sample CPU based Spark xgboost4j code into GPU. There is only one change required for running Spark XGBoost on GPU. That is replacing the API `setFeaturesCol(feature)` on CPU with the new API `setFeaturesCols(features)`. This also eliminates the need for vectorization (assembling multiple feature columns in to one column) since we can read multiple columns.\n",
-    "\n",
-    "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n",
-    "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)."
+    "This notebook also illustrates the ease of porting a sample CPU based Spark xgboost4j code into GPU. 
There is no change required for running Spark XGBoost on GPU, because the CPU and GPU versions call the same API. For a CPU run, we need to vectorize the training dataset before fitting the regressor."
   ]
  },
  {
@@ -34,12 +30,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from ml.dmlc.xgboost4j.scala.spark import XGBoostRegressionModel, XGBoostRegressor\n",
+    "from xgboost.spark import SparkXGBRegressor, SparkXGBRegressorModel\n",
     "from pyspark.ml.evaluation import RegressionEvaluator\n",
     "from pyspark.sql import SparkSession\n",
     "from pyspark.sql.types import FloatType, IntegerType, StructField, StructType\n",
     "from time import time\n",
-    "import os"
+    "from pyspark.conf import SparkConf\n",
+    "import os\n",
+    "# if you pass/unpack the archive file and enable the environment\n",
+    "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
+    "# os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\""
   ]
  },
@@ -62,11 +62,67 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 07:51:19,104 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
+      "Setting default log level to \"WARN\".\n",
+      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
+      "2022-11-30 07:51:19,480 WARN resource.ResourceUtils: The configuration of cores (exec = 2 task = 1, runnable tasks = 2) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n",
+      "2022-11-30 07:51:33,277 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 22.12.0 using cudf 22.12.0.\n",
+      "2022-11-30 07:51:33,292 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n",
+      "2022-11-30 07:51:33,295 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n",
+      "2022-11-30 07:51:33,295 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. 
Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n", + "2022-11-30 07:51:33,798 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n" + ] + } + ], "source": [ - "spark = SparkSession.builder.getOrCreate()\n", + "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n", + "\n", + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n", + "\n", + "# You need to update with your real hardware resource \n", + "driverMem = os.getenv(\"DRIVER_MEM\", \"2g\")\n", + "executorMem = os.getenv(\"EXECUTOR_MEM\", \"2g\")\n", + "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n", + "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n", + "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"2\"))\n", + "# Common spark settings\n", + "conf = SparkConf()\n", + "conf.setMaster(SPARK_MASTER_URL)\n", + "conf.setAppName(\"Microbenchmark on GPU\")\n", + "conf.set(\"spark.executor.instances\",\"1\")\n", + "conf.set(\"spark.driver.memory\", driverMem)\n", + "## The tasks will run on GPU memory, so there is no need to set a high host memory\n", + "conf.set(\"spark.executor.memory\", executorMem)\n", + "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n", + "conf.set(\"spark.executor.cores\", executorCores)\n", + "\n", + "# Plugin settings\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n", + "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n", + "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n", + "# since pyspark and xgboost share the same GPU, we need to allocate some memory to xgboost to avoid GPU OOM while training \n", + "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.7\")\n", + "conf.set(\"spark.locality.wait\",\"0\")\n", + "##############note: only support value=1 https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n", + "conf.set(\"spark.task.resource.gpu.amount\", 1) \n", + "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n", + "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n", + "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n", + "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n", + "\n", + "# if you pass/unpack the archive file and enable the environment\n", + "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n", + "# Create spark session\n", + "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", + "\n", "reader = spark.read" ] }, @@ -79,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -106,8 +162,17 @@ "\n", "# You need to update them to your real paths!\n", "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n", - "train_data = reader.schema(schema).option('header', True).csv(dataRoot + '/taxi/csv/train')\n", - "trans_data = reader.schema(schema).option('header', True).csv(dataRoot + '/taxi/csv/test')" + "train_path = dataRoot + \"/taxi/csv/train\"\n", + "eval_path = dataRoot + \"/taxi/csv/test\"\n", + "\n", + "data_format = 'csv'\n", + "has_header = 'true'\n", + "if data_format == 'csv':\n", + " train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n", + " trans_data = 
reader.schema(schema).option('header',has_header).csv(eval_path)\n",
+    "else:\n",
+    "    train_data = reader.load(train_path)\n",
+    "    trans_data = reader.load(eval_path)"
   ]
  },
@@ -139,34 +204,39 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = { \n",
-    "    'eta': 0.05,\n",
-    "    'treeMethod': 'gpu_hist',\n",
-    "    'maxDepth': 8,\n",
-    "    'subsample': 0.8,\n",
-    "    'gamma': 1.0,\n",
-    "    'numRound': 100,\n",
-    "    'numWorkers': 1,\n",
+    "    \"tree_method\": \"gpu_hist\",\n",
+    "    \"grow_policy\": \"depthwise\",\n",
+    "    \"num_workers\": 1,\n",
+    "    \"use_gpu\": \"true\",\n",
    "}\n",
-    "regressor = XGBoostRegressor(**params).setLabelCol(label).setFeaturesCols(features)"
+    "params['features_col'] = features\n",
+    "params['label_col'] = label\n",
+    "    \n",
+    "regressor = SparkXGBRegressor(**params)"
   ]
  },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "The CPU version regressor provides the API `setFeaturesCol` which only accepts a single column name, so vectorization for multiple feature columns is required.\n",
-    "```Python\n",
-    "regressor = XGBoostRegressor(**params).setLabelCol(label).setFeaturesCol('features')\n",
-    "```\n",
-    "\n",
    "The parameter `num_workers` should be set to the number of GPUs in the Spark cluster for the GPU version, while for the CPU version it is usually equal to the number of CPU cores.\n",
    "\n",
-    "Concerning the tree method, GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training.\n"
+    "Concerning the tree method, the GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training.\n",
+    "\n",
+    "An example of a CPU classifier:\n",
+    "```\n",
+    "classifier = SparkXGBClassifier(\n",
+    "  features_col=features,\n",
+    "  label_col=label, \n",
+    "  num_workers=1024,\n",
+    "  use_gpu=False,\n",
+    ")\n",
+    "```"
   ]
  },
@@ -178,16 +248,34 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "[Stage 2:>                                                          (0 + 1) / 1]\r"
+     ]
+    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Training takes 17.73 seconds\n"
+      "Training takes 24.08 seconds\n"
     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\r",
+      "                                                                                \r",
+      "/data/home/yuanli/work/reviews/pr252/pyspark_venv_20221125/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n",
+      "  warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n"
     ]
    }
   ],
@@ -210,12 +298,28 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n"
+     ]
+    }
+   ],
+   "source": [
+    "model.write().overwrite().save(dataRoot + '/model/taxi')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
-    "model.write().overwrite().save(dataRoot + '/new-model-path')\n",
-    "loaded_model = XGBoostRegressionModel().load(dataRoot + '/new-model-path')"
+    "loaded_model = SparkXGBRegressorModel().load(dataRoot + '/model/taxi')"
   ]
  },
@@ -227,25 +331,48 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
   "metadata": {
    "scrolled": false
   },
  "outputs": [
{ + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 07:52:27,357 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transformation takes 0.93 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 07:52:28,189 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + "\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Transformation takes 2.55 seconds\n", - "+------------+---------------+-------------+-----------+------------------+\n", - "| vendor_id|passenger_count|trip_distance|fare_amount| prediction|\n", - "+------------+---------------+-------------+-----------+------------------+\n", - "|1.55973043E9| 1.0| 1.1| 6.2| 5.670516490936279|\n", - "|1.55973043E9| 4.0| 2.7| 9.4|10.054250717163086|\n", - "|1.55973043E9| 1.0| 1.5| 6.1| 7.01417350769043|\n", - "|1.55973043E9| 1.0| 4.1| 12.6|14.309316635131836|\n", - "|1.55973043E9| 1.0| 4.6| 13.4|13.990922927856445|\n", - "+------------+---------------+-------------+-----------+------------------+\n", + "+--------------+---------------+-------------+-----------+-----------+\n", + "| vendor_id|passenger_count|trip_distance|fare_amount| prediction|\n", + "+--------------+---------------+-------------+-----------+-----------+\n", + "|1.559730432E09| 2.0| 0.699999988| 5.0|5.046935558|\n", + "|1.559730432E09| 3.0| 10.69999981| 34.0|31.72706413|\n", + "|1.559730432E09| 1.0| 2.299999952| 10.0|9.294451714|\n", + "|1.559730432E09| 1.0| 4.400000095| 16.5|15.05233097|\n", + "|1.559730432E09| 1.0| 1.5| 7.0|8.995832443|\n", + "+--------------+---------------+-------------+-----------+-----------+\n", "only showing top 5 rows\n", "\n" ] @@ -276,7 +403,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": { "scrolled": true }, @@ -285,8 +412,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Evaluation takes 0.45 seconds\n", - "RMSE is 3.3195416959403032\n" + "Evaluation takes 0.22 seconds\n", + "RMSE is 1.9141528471228921\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 07:52:28,580 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! 
createexternalrow(prediction#87, fare_amount#728, 1.0#729, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#87 could run on GPU\n", + " @Expression fare_amount#728 could run on GPU\n", + " @Expression 1.0#729 could run on GPU\n", + " !Expression obj#733 cannot run on GPU because expression AttributeReference obj#733 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n" ] } ], @@ -306,7 +447,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ diff --git a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb index 9b1d891ce..485518326 100644 --- a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb +++ b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb @@ -19,14 +19,14 @@ "All data could be found at https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page\n", "\n", "### 2. Download needed jar\n", - "* [rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar)\n", + "* [rapids-4-spark_2.12-22.12.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar)\n", "\n", "### 3. Start Spark Standalone\n", "Before running the script, please setup Spark standalone mode\n", "\n", "### 4. Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-22.10.0.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-22.12.0.jar\n", "\n", "```\n", "\n", diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/consts.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/consts.py index 90915619a..578d23183 100644 --- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/consts.py +++ b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/consts.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -57,12 +57,3 @@ StructField('day_of_week', FloatType()), StructField('is_weekend', FloatType()), ]) - -default_params = { - 'eta': 0.05, - 'maxDepth': 8, - 'subsample': 0.8, - 'gamma': 1.0, - 'numRound': 100, - 'numWorkers': 1, -} diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cpu_cross_validator_main.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cpu_cross_validator_main.py deleted file mode 100644 index 69ffc53bb..000000000 --- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cpu_cross_validator_main.py +++ /dev/null @@ -1,76 +0,0 @@ -# -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
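The `RMSE is ...` lines in the taxi notebook output above come from a standard Spark ML evaluation step. A sketch of how such a figure is produced, assuming `predictions` is the DataFrame returned by `model.transform(trans_data)`:

```python
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol='fare_amount',   # the taxi label column
                                predictionCol='prediction',
                                metricName='rmse')
rmse = evaluator.evaluate(predictions)  # predictions: output of model.transform()
print('RMSE is', rmse)
```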
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from com.nvidia.spark.examples.taxi.consts import * -from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from pyspark.sql import SparkSession -from pyspark.ml.tuning import ParamGridBuilder -from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator - -def main(args, xgboost_args): - spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) - - train_data, eval_data, trans_data = valid_input_data(spark, args, raw_schema, final_schema) - - if args.mode in [ 'all', 'train' ]: - regressor = (XGBoostRegressor(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCol('features')) - param_grid = (ParamGridBuilder() - .addGrid(regressor.maxDepth, [5, 10]) - .addGrid(regressor.numRound, [100, 200]) - .build()) - evaluator = (RegressionEvaluator() - .setLabelCol(label)) - - cross_validator = (CrossValidator() - .setEstimator(regressor) - .setEvaluator(evaluator) - .setEstimatorParamMaps(param_grid) - .setNumFolds(3)) - if not train_data: - print('-' * 80) - print('Usage: training data path required when mode is all or train') - exit(1) - train_data = vectorize_data_frame(train_data, label) - model = with_benchmark('Training', lambda: cross_validator.fit(train_data)) - - # get the best model to do transform - model = model.bestModel - if args.modelPath: - writer = model.write().overwrite() if args.overwrite else model - writer.save(args.modelPath) - else: - model = XGBoostRegressionModel().load(args.modelPath) - - if args.mode in [ 'all', 'transform' ]: - def transform(): - vec_df = vectorize_data_frame(trans_data, label) - result = model.transform(vec_df).cache() - result.foreachPartition(lambda _: None) - return result - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) - result = with_benchmark('Transformation', transform) - show_sample(args, result, label) - with_benchmark('Evaluation', lambda: check_regression_accuracy(result, label)) - - spark.stop() diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cpu_main.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cpu_main.py deleted file mode 100644 index e31241926..000000000 --- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cpu_main.py +++ /dev/null @@ -1,64 +0,0 @@ -# -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from com.nvidia.spark.examples.taxi.consts import * -from com.nvidia.spark.examples.taxi.pre_process import pre_process -from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from pyspark.sql import SparkSession - -def main(args, xgboost_args): - spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) - - train_data, eval_data, trans_data = valid_input_data(spark, args, raw_schema, final_schema) - - if args.mode in [ 'all', 'train' ]: - regressor = (XGBoostRegressor(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCol('features')) - if eval_data: - train_eval_data = vectorize_data_frame(eval_data, label) - regressor.setEvalSets({ 'test': train_eval_data }) - if not train_data: - print('-' * 80) - print('Usage: training data path required when mode is all or train') - exit(1) - train_data = vectorize_data_frame(train_data, label) - model = with_benchmark('Training', lambda: regressor.fit(train_data)) - - if args.modelPath: - writer = model.write().overwrite() if args.overwrite else model - writer.save(args.modelPath) - else: - model = XGBoostRegressionModel().load(args.modelPath) - - if args.mode in [ 'all', 'transform' ]: - def transform(): - vec_df = vectorize_data_frame(trans_data, label) - result = model.transform(vec_df).cache() - result.foreachPartition(lambda _: None) - return result - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) - result = with_benchmark('Transformation', transform) - show_sample(args, result, label) - with_benchmark('Evaluation', lambda: check_regression_accuracy(result, label)) - - spark.stop() diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/gpu_cross_validator_main.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cross_validator_main.py similarity index 70% rename from examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/gpu_cross_validator_main.py rename to examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cross_validator_main.py index b3b34c083..956c8d2ce 100644 --- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/gpu_cross_validator_main.py +++ b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cross_validator_main.py @@ -13,42 +13,48 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -from com.nvidia.spark.examples.taxi.consts import * +from .consts import * from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from pyspark.ml.tuning import ParamGridBuilder -from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator +from pyspark.ml.tuning import ParamGridBuilder, CrossValidator from pyspark.sql import SparkSession +from xgboost.spark import SparkXGBRegressor, SparkXGBRegressorModel + + def main(args, xgboost_args): spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) + .builder + .appName(args.mainClass) + .getOrCreate()) train_data, eval_data, trans_data = valid_input_data(spark, args, raw_schema, final_schema) - features = [x.name for x in final_schema if x.name != label] + if args.mode in ['all', 'train']: + if train_data is None: + print('-' * 80) + print('Usage: training data path required when mode is all or train') + print('-' * 80) + exit(1) + + train_data, features = transform_data(train_data, label, args.use_gpu) + xgboost_args['features_col'] = features + xgboost_args['label_col'] = label + + regressor = SparkXGBRegressor(**xgboost_args) - if args.mode in [ 'all', 'train' ]: - regressor = (XGBoostRegressor(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCols(features)) param_grid = (ParamGridBuilder() - .addGrid(regressor.maxDepth, [5, 10]) - .addGrid(regressor.numRound, [100, 200]) + .addGrid(regressor.max_depth, [6, 8]) + .addGrid(regressor.n_estimators, [20, 40]) .build()) + evaluator = (RegressionEvaluator() - .setLabelCol(label)) + .setLabelCol(label)) + cross_validator = (CrossValidator() .setEstimator(regressor) .setEvaluator(evaluator) .setEstimatorParamMaps(param_grid) .setNumFolds(3)) - if not train_data: - print('-' * 80) - print('Usage: training data path required when mode is all or train') - exit(1) model = with_benchmark('Training', lambda: cross_validator.fit(train_data)) # get the best model to do transform @@ -57,17 +63,22 @@ def main(args, xgboost_args): writer = model.write().overwrite() if args.overwrite else model writer.save(args.modelPath) else: - model = XGBoostRegressionModel().load(args.modelPath) + model = SparkXGBRegressorModel.load(args.modelPath) + + if args.mode in ['all', 'transform']: + if trans_data is None: + print('-' * 80) + print('Usage: trans data path required when mode is all or transform') + print('-' * 80) + exit(1) + + trans_data, _ = transform_data(trans_data, label, args.use_gpu) - if args.mode in [ 'all', 'transform' ]: def transform(): result = model.transform(trans_data).cache() result.foreachPartition(lambda _: None) return result - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) + result = with_benchmark('Transformation', transform) show_sample(args, result, label) with_benchmark('Evaluation', lambda: check_regression_accuracy(result, label)) diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py index e5f409c1c..18d12faf7 100644 --- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py +++ b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
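In `cross_validator_main.py` above, `cross_validator.fit(...)` now returns a standard `pyspark.ml.tuning.CrossValidatorModel`, and the winning estimator is picked off as `bestModel` before saving. A short sketch of that flow, assuming `cross_validator`, `train_data`, and `trans_data` are built as in the file:

```python
cv_model = cross_validator.fit(train_data)
print(cv_model.avgMetrics)         # average metric for each param-grid entry
best_model = cv_model.bestModel    # here a SparkXGBRegressorModel
predictions = best_model.transform(trans_data)
```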
diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py
index e5f409c1c..18d12faf7 100644
--- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py
+++ b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,17 +13,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from com.nvidia.spark.examples.taxi.consts import *
-from com.nvidia.spark.examples.taxi.pre_process import pre_process
+from .consts import *
+from .pre_process import pre_process
 from com.nvidia.spark.examples.utility.utils import *
-from ml.dmlc.xgboost4j.scala.spark import *
 from pyspark.sql import SparkSession
+
 def main(args, xgboost_args):
     spark = (SparkSession
-        .builder
-        .appName(args.mainClass)
-        .getOrCreate())
+             .builder
+             .appName(args.mainClass)
+             .getOrCreate())
 
     raw_data_path = extract_paths(args.dataPaths, 'raw::')
     output_path = extract_paths(args.dataPaths, 'out::')[0]
     if not raw_data_path:
@@ -36,6 +36,6 @@
         exit(1)
     raw_data = prepare_data(spark, args, raw_schema, raw_data_path)
     etled_train, etled_eval, etled_trans = pre_process(raw_data).randomSplit(list(map(float, args.splitRatios)))
-    etled_train.write.mode("overwrite").parquet(output_path+'/train')
-    etled_eval.write.mode("overwrite").parquet(output_path+'/eval')
-    etled_trans.write.mode("overwrite").parquet(output_path+'/trans')
+    etled_train.write.mode("overwrite").parquet(output_path + '/train')
+    etled_eval.write.mode("overwrite").parquet(output_path + '/eval')
+    etled_trans.write.mode("overwrite").parquet(output_path + '/trans')
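For context on the three-way `randomSplit` above: `args.splitRatios` is derived from the `--dataRatios` pair (see the `_attach_derived_args` change later in this patch). A worked sketch, assuming a hypothetical `80:20` ratio pair and a placeholder DataFrame `raw_df`:

```python
# With dataRatios of 80:20 (an illustrative choice, not a documented default):
train_ratio, eval_ratio = 80, 20
train_eval_ratio = 100 - train_ratio - eval_ratio           # 0
split_ratios = [train_ratio, train_eval_ratio, eval_ratio]  # [80, 0, 20]

# DataFrame.randomSplit normalizes the weights, so [80.0, 0.0, 20.0] yields
# roughly 80% / 0% / 20% of the pre-processed rows, written under
# output_path as /train, /eval, and /trans respectively.
df_train, df_eval, df_trans = raw_df.randomSplit([float(x) for x in split_ratios])
```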
diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/gpu_main.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/main.py
similarity index 70%
rename from examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/gpu_main.py
rename to examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/main.py
index c9316d99e..2281e3e95 100644
--- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/gpu_main.py
+++ b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/main.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,49 +13,59 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from com.nvidia.spark.examples.taxi.consts import *
-from com.nvidia.spark.examples.taxi.pre_process import pre_process
+from .consts import *
 from com.nvidia.spark.examples.utility.utils import *
-from ml.dmlc.xgboost4j.scala.spark import *
 from pyspark.sql import SparkSession
+from xgboost.spark import SparkXGBRegressor, SparkXGBRegressorModel
+
+
 def main(args, xgboost_args):
     spark = (SparkSession
-        .builder
-        .appName(args.mainClass)
-        .getOrCreate())
+             .builder
+             .appName(args.mainClass)
+             .getOrCreate())
 
     train_data, eval_data, trans_data = valid_input_data(spark, args, raw_schema, final_schema)
-    features = [x.name for x in final_schema if x.name != label]
-
-    if args.mode in [ 'all', 'train' ]:
-        regressor = (XGBoostRegressor(**merge_dicts(default_params, xgboost_args))
-            .setLabelCol(label)
-            .setFeaturesCols(features))
-        if eval_data:
-            regressor.setEvalSets({ 'test': eval_data })
+    if args.mode in ['all', 'train']:
         if not train_data:
             print('-' * 80)
             print('Usage: training data path required when mode is all or train')
+            print('-' * 80)
             exit(1)
+
+        train_data, features = transform_data(train_data, label, args.use_gpu)
+        xgboost_args['features_col'] = features
+        xgboost_args['label_col'] = label
+        regressor = SparkXGBRegressor(**xgboost_args)
+
+        if eval_data:
+            # evaluation sets are not wired up for SparkXGBRegressor in this example
+            pass
+
         model = with_benchmark('Training', lambda: regressor.fit(train_data))
 
         if args.modelPath:
             writer = model.write().overwrite() if args.overwrite else model
             writer.save(args.modelPath)
     else:
-        model = XGBoostRegressionModel().load(args.modelPath)
+        model = SparkXGBRegressorModel.load(args.modelPath)
 
-    if args.mode in [ 'all', 'transform' ]:
-        def transform():
-            result = model.transform(trans_data).cache()
-            result.foreachPartition(lambda _: None)
-            return result
+    if args.mode in ['all', 'transform']:
         if not trans_data:
             print('-' * 80)
             print('Usage: trans data path required when mode is all or transform')
+            print('-' * 80)
             exit(1)
+
+        trans_data, _ = transform_data(trans_data, label, args.use_gpu)
+
+        def transform():
+            result = model.transform(trans_data).cache()
+            result.foreachPartition(lambda _: None)
+            return result
+
        result = with_benchmark('Transformation', transform)
        show_sample(args, result, label)
        with_benchmark('Evaluation', lambda: check_regression_accuracy(result, label))
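The migration above replaces the Scala-backed `XGBoostRegressor` setters with plain constructor parameters on `xgboost.spark.SparkXGBRegressor`, and model loading becomes a classmethod on the model class. A minimal round-trip sketch, assuming a prepared `train_df`/`trans_df` plus the `features` and `label` values produced by `transform_data`; the model path and the `use_gpu`/`num_workers` values are illustrative only:

```python
from xgboost.spark import SparkXGBRegressor, SparkXGBRegressorModel

# features is a list of column names on GPU, or the single 'features'
# vector column on CPU, exactly as returned by transform_data above.
regressor = SparkXGBRegressor(features_col=features, label_col=label,
                              use_gpu=True, num_workers=2)
model = regressor.fit(train_df)

model.write().overwrite().save('/tmp/taxi-model')        # same MLWriter API as before
model = SparkXGBRegressorModel.load('/tmp/taxi-model')   # classmethod: no () after the class
predictions = model.transform(trans_df)
```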
diff --git a/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/main.py b/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/main.py
index a06000d59..d997454bf 100644
--- a/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/main.py
+++ b/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/main.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,9 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from com.nvidia.spark.examples.utility.args import parse_arguments
+from .utility.args import parse_arguments
 from importlib import import_module
 
+
 def main():
     args, xgboost_args = parse_arguments()
     getattr(import_module(args.mainClass), 'main')(args, xgboost_args)
diff --git a/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/args.py b/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/args.py
index fe30ea68f..6318a1c2d 100644
--- a/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/args.py
+++ b/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/args.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,94 +13,33 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import typing
 from argparse import ArgumentParser
 from distutils.util import strtobool
 from re import match
 from sys import exit
 
+
 def _to_bool(literal):
     return bool(strtobool(literal))
 
-def _to_ratio_pair(literal): # e.g., '80:20'
+
+def _to_ratio_pair(literal):  # e.g., '80:20'
     return match(r'^\d+:\d+$', literal) and [int(x) for x in literal.split(':')]
 
+
 MAX_CHUNK_SIZE = 2 ** 31 - 1
 
 _examples = [
-    'com.nvidia.spark.examples.agaricus.cpu_main',
-    'com.nvidia.spark.examples.agaricus.gpu_main',
-    'com.nvidia.spark.examples.mortgage.cpu_main',
-    'com.nvidia.spark.examples.mortgage.gpu_main',
-    'com.nvidia.spark.examples.mortgage.gpu_cross_validator_main',
-    'com.nvidia.spark.examples.mortgage.cpu_cross_validator_main',
-    'com.nvidia.spark.examples.taxi.cpu_main',
-    'com.nvidia.spark.examples.taxi.gpu_main',
-    'com.nvidia.spark.examples.taxi.gpu_cross_validator_main',
-    'com.nvidia.spark.examples.taxi.cpu_cross_validator_main',
+    'com.nvidia.spark.examples.agaricus.main',
+    'com.nvidia.spark.examples.mortgage.main',
     'com.nvidia.spark.examples.mortgage.etl_main',
-    'com.nvidia.spark.examples.taxi.etl_main'
+    'com.nvidia.spark.examples.mortgage.cross_validator_main',
+    'com.nvidia.spark.examples.taxi.main',
+    'com.nvidia.spark.examples.taxi.etl_main',
+    'com.nvidia.spark.examples.taxi.cross_validator_main',
 ]
 
-_xgboost_simple_args = [
-    ('cacheTrainingSet', _to_bool),
-    ('maximizeEvaluationMetrics', _to_bool),
-    ('useExternalMemory', _to_bool),
-    ('checkpointInterval', int),
-    ('maxBins', int),
-    ('maxDepth', int),
-    ('maxLeaves', int),
-    ('nthread', int),
-    ('numClass', int),
-    ('numEarlyStoppingRounds', int),
-    ('numRound', int),
-    ('numWorkers', int),
-    ('seed', int),
-    ('silent', int),
-    ('timeoutRequestWorkers', int),
-    ('treeLimit', int),
-    ('verbosity', int),
-    ('alpha', float),
-    ('baseScore', float),
-    ('colsampleBylevel', float),
-    ('colsampleBytree', float),
-    ('eta', float),
-    ('gamma', float),
-    ('lambda_', float),
-    ('lambdaBias', float),
-    ('maxDeltaStep', float),
-    ('minChildWeight', float),
-    ('missing', float),
-    ('rateDrop', float),
-    ('scalePosWeight', float),
-    ('sketchEps', float),
-    ('skipDrop', float),
-    ('subsample', float),
-    ('trainTestRatio', float),
-    ('baseMarginCol', str),
-    ('checkpointPath', str),
-    ('contribPredictionCol', str),
-    ('evalMetric', str),
-    ('featuresCol', str),
-    ('groupCol', str),
-    ('growPolicy', str),
-    ('interactionConstraints', str),
-    ('labelCol', str),
-    ('leafPredictionCol', str),
-    ('monotoneConstraints', str),
-    ('normalizeType', str),
-    ('objective', str),
-    ('objectiveType', str),
-    ('predictionCol', str),
-    ('probabilityCol', str),
-    ('rawPredictionCol', str),
-    ('sampleType', str),
-    ('treeMethod', str),
-    ('weightCol', str),
-]
-
-_xgboost_array_args = [
-    ('thresholds', float),
-]
 
 def _validate_args(args):
     usage = ''
@@ -119,12 +58,36 @@
         print('Usage:\n' + usage)
         exit(1)
 
+
 def _attach_derived_args(args):
     args.trainRatio = args.dataRatios[0]
     args.evalRatio = args.dataRatios[1]
     args.trainEvalRatio = 100 - args.trainRatio - args.evalRatio
     args.splitRatios = [args.trainRatio, args.trainEvalRatio, args.evalRatio]
 
+
+def _inspect_xgb_parameters() -> typing.Dict[str, type]:
+    """inspect XGBModel parameters from __init__"""
+    from xgboost import XGBModel
+    from typing import get_type_hints, get_origin
+    xgb_parameters = {}
+    xgb_model_sig = get_type_hints(XGBModel.__init__)
+    for k, v in xgb_model_sig.items():
+        if k != "kwargs" and k != "return":
+            if get_origin(v) == typing.Union:
+                xgb_parameters[k] = v.__args__[0]
+            else:
+                xgb_parameters[k] = v
+
+    # some extra parameters used by xgboost pyspark
+    xgb_parameters['objective'] = str
+    xgb_parameters['force_repartition'] = _to_bool
+    xgb_parameters['use_gpu'] = _to_bool
+    xgb_parameters['num_workers'] = int
+    xgb_parameters['enable_sparse_data_optim'] = _to_bool
+    return xgb_parameters
+
+
 def parse_arguments():
     parser = ArgumentParser()
@@ -142,23 +105,18 @@
     parser.add_argument('--numRows', type=int, default=5)
     parser.add_argument('--showFeatures', type=_to_bool, default=True)
 
-    # xgboost simple args
-    for arg, arg_type in _xgboost_simple_args:
-        parser.add_argument('--' + arg, type=arg_type)
-
-    # xgboost array args
-    for arg, arg_type in _xgboost_array_args:
-        parser.add_argument('--' + arg, type=arg_type, action='append')
+    xgboost_all_args = _inspect_xgb_parameters()
+    for arg, tp in xgboost_all_args.items():
+        parser.add_argument('--' + arg, type=tp)
 
     parsed_all = parser.parse_args()
     _validate_args(parsed_all)
     _attach_derived_args(parsed_all)
 
-    xgboost_args = [ arg for (arg, _) in _xgboost_simple_args + _xgboost_array_args ]
     parsed_xgboost = {
         k: v
         for k, v in vars(parsed_all).items()
-        if k in xgboost_args and v is not None
+        if k in xgboost_all_args and v is not None
     }
 
     return parsed_all, parsed_xgboost
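The net effect of `_inspect_xgb_parameters` above is that the hand-maintained flag tables are replaced by reflection over `XGBModel.__init__` type hints, so every XGBoost constructor parameter automatically becomes a CLI flag whose parsed value is forwarded untouched to `SparkXGBRegressor(**xgboost_args)`. A rough self-contained sketch of the reflection step, assuming xgboost >= 1.7 is installed:

```python
import typing
from typing import get_type_hints, get_origin
from xgboost import XGBModel

# Mirror the patch's reflection: Optional[T] hints collapse to T, and the
# 'kwargs'/'return' entries are skipped.
hints = get_type_hints(XGBModel.__init__)
params = {k: (v.__args__[0] if get_origin(v) == typing.Union else v)
          for k, v in hints.items() if k not in ('kwargs', 'return')}
print(params['max_depth'], params['learning_rate'])  # <class 'int'> <class 'float'>
```

Each entry then yields an `argparse` flag, so a command line such as `--max_depth=8 --n_estimators=100 --use_gpu=1` parses into `{'max_depth': 8, 'n_estimators': 100, 'use_gpu': True}`.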
diff --git a/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/utils.py b/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/utils.py
index 1b2818f3d..4b4037869 100644
--- a/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/utils.py
+++ b/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/utils.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,33 +13,41 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import typing
+
 from pyspark.ml.evaluation import *
 from pyspark.ml.feature import VectorAssembler
+from pyspark.sql import DataFrame
 from pyspark.sql.functions import col
 from pyspark.sql.types import FloatType
 from com.nvidia.spark.examples.taxi.pre_process import pre_process
 from time import time
 
+
 def merge_dicts(dict_x, dict_y):
     result = dict_x.copy()
     result.update(dict_y)
     return result
 
+
 def show_sample(args, data_frame, label):
     data_frame = data_frame if args.showFeatures else data_frame.select(label, 'prediction')
     data_frame.show(args.numRows)
 
+
 def vectorize_data_frame(data_frame, label):
-    features = [ x.name for x in data_frame.schema if x.name != label ]
-    to_floats = [ col(x.name).cast(FloatType()) for x in data_frame.schema ]
+    features = [x.name for x in data_frame.schema if x.name != label]
+    to_floats = [col(x.name).cast(FloatType()) for x in data_frame.schema]
     return (VectorAssembler()
-        .setInputCols(features)
-        .setOutputCol('features')
-        .transform(data_frame.select(to_floats))
-        .select(col('features'), col(label)))
+            .setInputCols(features)
+            .setOutputCol('features')
+            .transform(data_frame.select(to_floats))
+            .select(col('features'), col(label)))
 
+
 def vectorize_data_frames(data_frames, label):
-    return [ vectorize_data_frame(x, label) for x in data_frames ]
+    return [vectorize_data_frame(x, label) for x in data_frames]
 
+
 def with_benchmark(phrase, action):
     start = time()
@@ -49,33 +57,50 @@
     print('{} takes {} seconds'.format(phrase, round(end - start, 2)))
     return result
 
+
 def check_classification_accuracy(data_frame, label):
     accuracy = (MulticlassClassificationEvaluator()
-        .setLabelCol(label)
-        .evaluate(data_frame))
+                .setLabelCol(label)
+                .evaluate(data_frame))
     print('-' * 100)
     print('Accuracy is ' + str(accuracy))
 
+
 def check_regression_accuracy(data_frame, label):
     accuracy = (RegressionEvaluator()
-        .setLabelCol(label)
-        .evaluate(data_frame))
+                .setLabelCol(label)
+                .evaluate(data_frame))
     print('-' * 100)
     print('RMSE is ' + str(accuracy))
 
+
 def prepare_data(spark, args, schema, dataPath):
     reader = (spark
-        .read
-        .format(args.format))
+              .read
+              .format(args.format))
     if args.format == 'csv':
         reader.schema(schema).option('header', args.hasHeader)
     return reader.load(dataPath)
 
+
 def extract_paths(paths, prefix):
-    results = [ path[len(prefix):] for path in paths if path.startswith(prefix) ]
+    results = [path[len(prefix):] for path in paths if path.startswith(prefix)]
     return results
 
+
+def transform_data(
+    df: DataFrame,
+    label: str,
+    use_gpu: typing.Optional[bool],
+) -> typing.Tuple[DataFrame, typing.Union[str, typing.List[str]]]:
+    if use_gpu:
+        features = [x.name for x in df.schema if x.name != label]
+    else:
+        df = vectorize_data_frame(df, label)
+        features = 'features'
+    return df, features
+
+
 def valid_input_data(spark, args, raw_schema, final_schema):
     e2e = False
     for path in args.dataPaths:
@@ -88,9 +113,9 @@
     eval_path = ''
 
     if e2e:
-        raw_train_path = extract_paths(args.dataPaths,'rawTrain::')
-        raw_eval_path = extract_paths(args.dataPaths,'rawEval::')
-        raw_trans_path = extract_paths(args.dataPaths,'rawTrans::')
+        raw_train_path = extract_paths(args.dataPaths, 'rawTrain::')
+        raw_eval_path = extract_paths(args.dataPaths, 'rawEval::')
+        raw_trans_path = extract_paths(args.dataPaths, 'rawTrans::')
 
         train_data = ''
         eval_data = ''
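The new `transform_data` helper encodes the key CPU/GPU difference in one place: on GPU the raw numeric columns are passed through unchanged and their names are returned (xgboost.spark accepts a list of column names for `features_col`), while on CPU the columns are first cast to float and packed into a single vector column by `VectorAssembler`. A small sketch, assuming a placeholder DataFrame `df` with columns `f0`, `f1`, and `label`:

```python
# GPU path: df is returned unchanged; features is a list of column names.
gpu_df, gpu_features = transform_data(df, 'label', use_gpu=True)
print(gpu_features)   # ['f0', 'f1']

# CPU path: columns are assembled into one vector column;
# features is the name of that single column.
cpu_df, cpu_features = transform_data(df, 'label', use_gpu=False)
print(cpu_features)   # 'features'
```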
diff --git a/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala b/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala
index d8cca3fcd..a77a60add 100644
--- a/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala
+++ b/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala
@@ -64,7 +64,6 @@ object XGBoostArgs {
     "overwrite" -> XGBoostArg(parse = stringToBool, message = booleanMessage),
     "hasHeader" -> XGBoostArg(parse = stringToBool, message = booleanMessage),
     "saveDict" -> XGBoostArg(parse = stringToBool, message = booleanMessage),
-    "rabitTrackerHost" -> XGBoostArg(),
   )
 
   private def help: Unit = {
diff --git a/tools/databricks/README.md b/tools/databricks/README.md
new file mode 100644
index 000000000..467c5d277
--- /dev/null
+++ b/tools/databricks/README.md
@@ -0,0 +1,12 @@
+# Databricks Tools Demo Notebooks
+
+The RAPIDS Accelerator for Apache Spark includes two key tools for understanding the benefits of
+GPU acceleration and for analyzing GPU Spark jobs. For customers on Databricks, the demo
+notebooks offer a simple interface for running the tools against a set of Spark event logs from
+CPU (qualification) or GPU (profiling) application runs.
+
+To use a demo notebook, import it in the Databricks Notebook UI via File->Import Notebook.
+
+Once the demo notebook is imported, attach it to an available compute cluster. Once the notebook
+is attached, enter the log path location in the text widget at the top of the notebook. After
+that, select *Run all* to execute the tools for the specific logs in the log path.
diff --git a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb
new file mode 100644
index 000000000..6d7d66d22
--- /dev/null
+++ b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb
@@ -0,0 +1 @@
+{"cells":[{"cell_type":"markdown","source":["# Welcome to the Profiling Tool for the RAPIDS Accelerator for Apache Spark\nTo run the tool, you need to enter a log path that represents the DBFS location for your Spark GPU event logs. Then you can select \"Run all\" to execute the notebook. 
After the notebook completes, you will see various output tables show up below.\n\n## GPU Job Tuning Recommendations\nThis has general suggestions for tuning your applications to run optimally on GPUs.\n\n## Per-Job Profile\nThe profiler output includes information about the application, data sources, executors, SQL stages, Spark properties, and key application metrics at the job and stage levels."],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"5156a76c-7af7-465d-aff4-41a2e54e3595","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["import json\nimport requests\nimport base64\nimport shlex\nimport subprocess\nimport pandas as pd\n\nTOOL_JAR_URL = 'https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/22.12.0/rapids-4-spark-tools_2.12-22.12.0.jar'\nTOOL_JAR_LOCAL_PATH = '/tmp/rapids-4-spark-tools.jar'\n\n# Profiling tool output directory.\nOUTPUT_DIR = '/tmp' \n\nresponse = requests.get(TOOL_JAR_URL)\nopen(TOOL_JAR_LOCAL_PATH, \"wb\").write(response.content)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"53b4d770-9db6-4bd7-9b93-d036d375eac5","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dbutils.widgets.text(\"log_path\", \"\")"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"f0e4371a-d2d9-4449-81ed-8f6c61ae8f80","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["eventlog_string=dbutils.widgets.get(\"log_path\") \n\nq_command_string=\"java -Xmx10g -cp /tmp/rapids-4-spark-tools.jar:/databricks/jars/* com.nvidia.spark.rapids.tool.profiling.ProfileMain --csv --auto-tuner -o {} \".format(OUTPUT_DIR) + eventlog_string\nargs = shlex.split(q_command_string)\ncmd_out = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)\n\nif cmd_out.returncode != 0:\n dbutils.notebook.exit(\"Profiling Tool failed with stderr:\" + cmd_out.stderr)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"e9e7cecf-c2dc-4a0f-aea1-61a323e4ccc4","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["import os\n\napp_df = pd.DataFrame(columns = ['appId', 'appName'])\n\nfor x in os.scandir(OUTPUT_DIR + \"/rapids_4_spark_profile/\"):\n tmp_df = pd.read_csv(x.path + \"/application_information.csv\")\n app_df = app_df.append(tmp_df[['appId', 'appName']])"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"be0a2da7-1ee3-475e-96f9-303779edfd85","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## GPU Job Tuning Recommendations"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"a1e326ec-5701-4b08-ae0f-7df0c8440038","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["app_list = app_df[\"appId\"].tolist()\napp_recommendations = pd.DataFrame(columns=['app', 'recommendations'])\n\nfor app in app_list:\n app_file = open(OUTPUT_DIR + \"/rapids_4_spark_profile/\" + app + \"/profile.log\")\n recommendations_start = 0\n recommendations_str = \"\"\n for line in app_file:\n if recommendations_start == 1:\n recommendations_str = recommendations_str + line\n if \"### D. 
Recommended Configuration ###\" in line:\n recommendations_start = 1\n app_recommendations = app_recommendations.append({'app': app, 'recommendations': recommendations_str}, ignore_index=True)\n \ndisplay(app_recommendations)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"4979f78c-44a0-4e54-b803-e5e194b71104","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Per-App Profile"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"1d4f9927-e9d8-4897-b604-f7832dc634aa","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["for x in os.scandir(OUTPUT_DIR + \"/rapids_4_spark_profile/\"):\n print(\"APPLICATION ID = \" + str(x))\n log = open(x.path + \"/profile.log\")\n print(log.read())"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"9a8f1a58-e86f-4bd0-a245-878186feb8b9","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template","dashboards":[{"elements":[{"elementNUID":"be0a2da7-1ee3-475e-96f9-303779edfd85","dashboardResultIndex":0,"guid":"05eef9d3-7c55-4e26-8d1f-fa80338359e6","resultIndex":null,"options":null,"position":{"x":0,"y":0,"height":6,"width":24,"z":null},"elementType":"command"}],"guid":"a9ea7799-040a-484e-a59d-c3cdf5072953","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"91c1bfb2-695a-4e5c-8a25-848a433108dc","origId":2690941040041430,"title":"Executive View","width":1600,"globalVars":{}},{"elements":[],"guid":"0896a45f-af1b-4849-b6c2-2b6abcb8b97b","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"62243296-4562-4f06-90ac-d7a609f19c16","origId":2690941040041431,"title":"App View","width":1920,"globalVars":{}}],"notebookMetadata":{"pythonIndentUnit":2,"widgetLayout":[{"name":"log_path","width":576,"breakBefore":false},{"name":"Apps","width":494,"breakBefore":false}]},"language":"python","widgets":{"log_path":{"nuid":"c7ce3870-db19-4813-b1cb-cead3f4c36f1","currentValue":"/dbfs/","widgetInfo":{"widgetType":"text","name":"log_path","defaultValue":"","label":null,"options":{"widgetType":"text","validationRegex":null}}}},"notebookOrigID":2690941040041407}},"nbformat":4,"nbformat_minor":0} diff --git a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb new file mode 100644 index 000000000..db4f756fb --- /dev/null +++ b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","source":["# Welcome to the Qualification Tool for the RAPIDS Accelerator for Apache Spark\nTo run the tool, you need to enter a log path that represents the DBFS location for your Spark CPU event logs. Then you can select \"Run all\" to execute the notebook. After the notebook completes, you will see various output tables show up below.\n\n## Summary Output\nThe report represents the entire app execution, including unsupported operators and non-SQL operations. 
By default, the applications and queries are sorted in descending order by the following fields:\n- Recommendation;\n- Estimated GPU Speed-up;\n- Estimated GPU Time Saved; and\n- End Time.\n\n## Stages Output\nFor each stage used in SQL operations, the Qualification tool generates the following information:\n1. App ID\n1. Stage ID\n1. Average Speedup Factor: the average estimated speed-up of all the operators in the given stage.\n1. Stage Task Duration: amount of time spent in tasks of SQL Dataframe operations for the given stage.\n1. Unsupported Task Duration: sum of task durations for the unsupported operators. For more details, see Supported Operators.\n1. Stage Estimated: True or False indicates if we had to estimate the stage duration.\n\n## Execs Output\nThe Qualification tool generates a report of the “Exec” in the “SparkPlan” or “Executor Nodes” along with the estimated acceleration on the GPU. Please refer to the Supported Operators guide for more details on limitations on UDFs and unsupported operators.\n1. App ID\n1. SQL ID\n1. Exec Name: example Filter, HashAggregate\n1. Expression Name\n1. Task Speedup Factor: it is simply the average acceleration of the operators based on the original CPU duration of the operator divided by the GPU duration. The tool uses historical queries and benchmarks to estimate a speed-up at an individual operator level to calculate how much a specific operator would accelerate on GPU.\n1. Exec Duration: wall-Clock time measured since the operator starts till it is completed.\n1. SQL Node Id\n1. Exec Is Supported: whether the Exec is supported by RAPIDS or not. Please refer to the Supported Operators section.\n1. Exec Stages: an array of stage IDs\n1. Exec Children\n1. Exec Children Node Ids\n1. Exec Should Remove: whether the Op is removed from the migrated plan."],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"df33c614-2ecc-47a0-8600-bc891681997f","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["import json\nimport requests\nimport base64\nimport shlex\nimport subprocess\nimport pandas as pd\n\nTOOL_JAR_URL = 'https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/22.12.0/rapids-4-spark-tools_2.12-22.12.0.jar'\nTOOL_JAR_LOCAL_PATH = '/tmp/rapids-4-spark-tools.jar'\n\n# Qualification tool output directory.\nOUTPUT_DIR = '/tmp/'\n\nresponse = requests.get(TOOL_JAR_URL)\nopen(TOOL_JAR_LOCAL_PATH, \"wb\").write(response.content)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"53b4d770-9db6-4bd7-9b93-d036d375eac5","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dbutils.widgets.text(\"log_path\", \"\")\neventlog_string=dbutils.widgets.get(\"log_path\")\n\nq_command_string=\"java -Xmx10g -cp /tmp/rapids-4-spark-tools.jar:/databricks/jars/* com.nvidia.spark.rapids.tool.qualification.QualificationMain -o {} \".format(OUTPUT_DIR) + eventlog_string\nargs = shlex.split(q_command_string)\ncmd_out = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)\n\n\nif cmd_out.returncode != 0:\n dbutils.notebook.exit(\"Qualification Tool failed with stderr:\" + cmd_out.stderr)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"e9e7cecf-c2dc-4a0f-aea1-61a323e4ccc4","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Summary 
Output"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"bbe50fde-0bd6-4281-95fd-6a1ec6f17ab2","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["summary_output=pd.read_csv(OUTPUT_DIR + \"rapids_4_spark_qualification_output/rapids_4_spark_qualification_output.csv\")\ndisplay(summary_output)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"fb8edb26-e173-47ff-92a1-463baec7c06b","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Stages Output"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"6756159b-30ca-407a-ab6b-9c29ced01ea6","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["stages_output=pd.read_csv(OUTPUT_DIR + \"rapids_4_spark_qualification_output/rapids_4_spark_qualification_output_stages.csv\")\ndisplay(stages_output)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"cdde6177-db5f-434a-995b-776678a64a3a","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Execs Output"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"4d7ce219-ae75-4a0c-a78c-4e7f25b8cd6f","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["execs_output=pd.read_csv(OUTPUT_DIR + \"rapids_4_spark_qualification_output/rapids_4_spark_qualification_output_execs.csv\")\ndisplay(execs_output)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"998b0c51-0cb6-408e-a01a-d1f5b1a61e1f","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template","dashboards":[{"elements":[],"guid":"0ed3c80b-b2f6-4c89-9a92-1af2f168d5ea","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"91c1bfb2-695a-4e5c-8a25-848a433108dc","origId":2721260844584915,"title":"Executive View","width":1600,"globalVars":{}},{"elements":[],"guid":"ab4cecf9-0471-4fee-aa33-8927bb7e1bb1","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"62243296-4562-4f06-90ac-d7a609f19c16","origId":2721260844584916,"title":"App View","width":1920,"globalVars":{}}],"notebookMetadata":{"pythonIndentUnit":2,"widgetLayout":[{"name":"log_path","width":1152,"breakBefore":false}]},"language":"python","widgets":{"log_path":{"nuid":"88986aa6-6e67-4d09-aeeb-7c96ea1ea8f1","currentValue":"/dbfs/","widgetInfo":{"widgetType":"text","name":"log_path","defaultValue":"","label":null,"options":{"widgetType":"text","validationRegex":null}}}},"notebookOrigID":2721260844584890}},"nbformat":4,"nbformat_minor":0}