diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index 33bf4c5c6..035c57a9c 100644 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE on: pull_request_target: branches: - - branch-22.10 + - branch-22.12 types: [closed] jobs: @@ -27,15 +27,15 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: - ref: branch-22.10 # force to fetch from latest upstream instead of PR ref + ref: branch-22.12 # force to fetch from latest upstream instead of PR ref - name: auto-merge job uses: ./.github/workflows/auto-merge env: OWNER: NVIDIA REPO_NAME: spark-rapids-examples - HEAD: branch-22.10 - BASE: branch-22.12 + HEAD: branch-22.12 + BASE: branch-23.02 AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR diff --git a/README.md b/README.md index a84a738eb..6c4df4ca5 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ There are broadly four categories of examples in this repo: 2. [Spark XGBoost](./examples/XGBoost-Examples) 3. [Deep Learning/Machine Learning](./examples/ML+DL-Examples) 4. [RAPIDS UDF](./examples/UDF-Examples) +5. [Databricks Tools demo notebooks](./tools/databricks) For more information on each of the examples please look into respective categories. diff --git a/docs/get-started/xgboost-examples/csp/aws/ec2.md b/docs/get-started/xgboost-examples/csp/aws/ec2.md index b64fa7a77..0565ce601 100644 --- a/docs/get-started/xgboost-examples/csp/aws/ec2.md +++ b/docs/get-started/xgboost-examples/csp/aws/ec2.md @@ -177,8 +177,8 @@ spark-submit --master spark://$HOSTNAME:7077 \ ${SAMPLE_JAR} \ -num_workers=${NUM_EXECUTORS} \ -format=csv \ - -dataPath="train::s3a://spark-xgboost-mortgage-dataset/csv/train/2000Q1" \ - -dataPath="trans::s3a://spark-xgboost-mortgage-dataset/csv/eval/2000Q1" \ + -dataPath="train::your-train-data-path" \ + -dataPath="trans::your-eval-data-path" \ -numRound=100 -max_depth=8 -nthread=$NUM_EXECUTOR_CORES -showFeatures=0 \ -tree_method=gpu_hist ``` diff --git a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb index f056dfdf9..09033b8e0 100644 --- a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb +++ b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb @@ -24,9 +24,9 @@ "source": [ "%sh\n", "cd ../../dbfs/FileStore/jars/\n", - "sudo wget -O rapids-4-spark_2.12-22.10.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar\n", - "sudo wget -O xgboost4j-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.6.1/xgboost4j-gpu_2.12-1.6.1.jar\n", - "sudo wget -O xgboost4j-spark-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.6.1/xgboost4j-spark-gpu_2.12-1.6.1.jar\n", + "sudo wget -O rapids-4-spark_2.12-22.12.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar\n", + "sudo wget -O xgboost4j-gpu_2.12-1.7.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.7.1/xgboost4j-gpu_2.12-1.7.1.jar\n", + "sudo wget -O xgboost4j-spark-gpu_2.12-1.7.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.7.1/xgboost4j-spark-gpu_2.12-1.7.1.jar\n", "ls -ltr\n", "\n", "# Your Jars are downloaded in dbfs:/FileStore/jars directory" @@ -59,9 +59,9 @@ "sudo rm -f 
/databricks/jars/spark--maven-trees--ml--10.x--xgboost-gpu--ml.dmlc--xgboost4j-gpu_2.12--ml.dmlc__xgboost4j-gpu_2.12__1.5.2.jar\n", "sudo rm -f /databricks/jars/spark--maven-trees--ml--10.x--xgboost-gpu--ml.dmlc--xgboost4j-spark-gpu_2.12--ml.dmlc__xgboost4j-spark-gpu_2.12__1.5.2.jar\n", "\n", - "sudo cp /dbfs/FileStore/jars/xgboost4j-gpu_2.12-1.6.1.jar /databricks/jars/\n", - "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.10.0.jar /databricks/jars/\n", - "sudo cp /dbfs/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar /databricks/jars/\"\"\", True)" + "sudo cp /dbfs/FileStore/jars/xgboost4j-gpu_2.12-1.7.1.jar /databricks/jars/\n", + "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.12.0.jar /databricks/jars/\n", + "sudo cp /dbfs/FileStore/jars/xgboost4j-spark-gpu_2.12-1.7.1.jar /databricks/jars/\"\"\", True)" ] }, { @@ -132,8 +132,8 @@ "\n", "1. Edit your cluster, adding an initialization script from `dbfs:/databricks/init_scripts/init.sh` in the \"Advanced Options\" under \"Init Scripts\" tab\n", "2. Reboot the cluster\n", - "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", - "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", + "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark-gpu_2.12-1.7.1.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", + "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.12/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", "5. 
Inside the mortgage example notebook, update the data paths\n", " `train_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-train.csv')`\n", " `trans_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-trans.csv')`" diff --git a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb index 772453e39..b0799d5c1 100644 --- a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb +++ b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb @@ -24,9 +24,9 @@ "source": [ "%sh\n", "cd ../../dbfs/FileStore/jars/\n", - "sudo wget -O rapids-4-spark_2.12-22.10.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar\n", - "sudo wget -O xgboost4j-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.6.1/xgboost4j-gpu_2.12-1.6.1.jar\n", - "sudo wget -O xgboost4j-spark-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.6.1/xgboost4j-spark-gpu_2.12-1.6.1.jar\n", + "sudo wget -O rapids-4-spark_2.12-22.12.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar\n", + "sudo wget -O xgboost4j-gpu_2.12-1.7.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.7.1/xgboost4j-gpu_2.12-1.7.1.jar\n", + "sudo wget -O xgboost4j-spark-gpu_2.12-1.7.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.7.1/xgboost4j-spark-gpu_2.12-1.7.1.jar\n", "ls -ltr\n", "\n", "# Your Jars are downloaded in dbfs:/FileStore/jars directory" @@ -59,9 +59,9 @@ "sudo rm -f /databricks/jars/spark--maven-trees--ml--9.x--xgboost-gpu--ml.dmlc--xgboost4j-gpu_2.12--ml.dmlc__xgboost4j-gpu_2.12__1.4.1.jar\n", "sudo rm -f /databricks/jars/spark--maven-trees--ml--9.x--xgboost-gpu--ml.dmlc--xgboost4j-spark-gpu_2.12--ml.dmlc__xgboost4j-spark-gpu_2.12__1.4.1.jar\n", "\n", - "sudo cp /dbfs/FileStore/jars/xgboost4j-gpu_2.12-1.6.1.jar /databricks/jars/\n", - "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.10.0.jar /databricks/jars/\n", - "sudo cp /dbfs/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar /databricks/jars/\"\"\", True)" + "sudo cp /dbfs/FileStore/jars/xgboost4j-gpu_2.12-1.7.1.jar /databricks/jars/\n", + "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.12.0.jar /databricks/jars/\n", + "sudo cp /dbfs/FileStore/jars/xgboost4j-spark-gpu_2.12-1.7.1.jar /databricks/jars/\"\"\", True)" ] }, { @@ -132,8 +132,8 @@ "\n", "1. Edit your cluster, adding an initialization script from `dbfs:/databricks/init_scripts/init.sh` in the \"Advanced Options\" under \"Init Scripts\" tab\n", "2. Reboot the cluster\n", - "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", - "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", + "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark-gpu_2.12-1.7.1.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", + "4. 
Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.12/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n",
    "5. Inside the mortgage example notebook, update the data paths\n",
    "   `train_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-train.csv')`\n",
    "   `trans_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-trans.csv')`"
diff --git a/docs/get-started/xgboost-examples/notebook/python-notebook.md b/docs/get-started/xgboost-examples/notebook/python-notebook.md
index 3bfd71174..c8cf57c3c 100644
--- a/docs/get-started/xgboost-examples/notebook/python-notebook.md
+++ b/docs/get-started/xgboost-examples/notebook/python-notebook.md
@@ -67,7 +67,3 @@ and the home directory for Apache Spark respectively.
 - Mortgage ETL Notebook: [Python](../../../../examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb)
 - Taxi ETL Notebook: [Python](../../../../examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb)
 - Note: Agaricus does not have ETL part.
-
-For PySpark based XGBoost, please refer to the
-[Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.04/docs/get-started/xgboost-examples/notebook/python-notebook.md)
-that uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/).
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md b/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md
index 54a251fd1..9a869d59e 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md
@@ -40,7 +40,7 @@ export SPARK_DOCKER_IMAGE=
 export SPARK_DOCKER_TAG=
 
 pushd ${SPARK_HOME}
-wget https://github.com/NVIDIA/spark-rapids-examples/raw/branch-22.10/dockerfile/Dockerfile
+wget https://github.com/NVIDIA/spark-rapids-examples/raw/branch-22.12/dockerfile/Dockerfile
 
 # Optionally install additional jars into ${SPARK_HOME}/jars/
 
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md
index 6132a7563..b41824fe2 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md
@@ -12,11 +12,13 @@ Prerequisites
   * Multi-node clusters with homogenous GPU configuration
   * Software Requirements
     * Ubuntu 18.04, 20.04/CentOS7, CentOS8
-    * CUDA 11.0+
+    * CUDA 11.5+
     * NVIDIA driver compatible with your CUDA
     * NCCL 2.7.8+
-    * Python 3.6+
+    * Python 3.8 or 3.9
     * NumPy
+    * XGBoost 1.7.0+
+    * cudf-cu11
 
 The number of GPUs in each host dictates the number of Spark executors that can run there.
 Additionally, cores per Spark executor and cores per Spark task must match, such that each executor can run 1 task at any given time.
 
@@ -47,6 +49,14 @@ And here are the steps to enable the GPU resources discovery for Spark 3.1+.
    spark.worker.resource.gpu.amount 1
    spark.worker.resource.gpu.discoveryScript ${SPARK_HOME}/examples/src/main/scripts/getGpusResources.sh
    ```
+3. Install the XGBoost, cudf-cu11, numpy libraries on all nodes before running the XGBoost application.
+
+``` bash
+pip install xgboost
+pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+pip install numpy
+pip install scikit-learn
+```
 
 Get Application Files, Jar and Dataset
 -------------------------------
@@ -182,6 +192,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.gpu_main
 
 # tree construction algorithm
 export TREE_METHOD=gpu_hist
+
+# if you enabled the archived python environment
+export PYSPARK_DRIVER_PYTHON=python
+export PYSPARK_PYTHON=./environment/bin/python
 ```
 
 Run spark-submit:
 
@@ -197,8 +211,9 @@ ${SPARK_HOME}/bin/spark-submit
     --driver-memory ${SPARK_DRIVER_MEMORY} \
     --executor-memory ${SPARK_EXECUTOR_MEMORY} \
     --conf spark.cores.max=${TOTAL_CORES} \
-    --jars ${RAPIDS_JAR},${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR} \
-    --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
+    --archives your_pyspark_venv.tar.gz#environment \
+    --jars ${RAPIDS_JAR} \
+    --py-files ${SAMPLE_ZIP} \
     ${MAIN_PY} \
     --mainClass=${EXAMPLE_CLASS} \
     --dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \
@@ -261,6 +276,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.cpu_main
 
 # tree construction algorithm
 export TREE_METHOD=hist
+
+# if you enabled the archived python environment
+export PYSPARK_DRIVER_PYTHON=python
+export PYSPARK_PYTHON=./environment/bin/python
 ```
 
 This is the same command as for the GPU example, repeated for convenience:
 
@@ -271,8 +290,9 @@ ${SPARK_HOME}/bin/spark-submit
     --driver-memory ${SPARK_DRIVER_MEMORY} \
     --executor-memory ${SPARK_EXECUTOR_MEMORY} \
     --conf spark.cores.max=${TOTAL_CORES} \
-    --jars ${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR} \
-    --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
+    --archives your_pyspark_venv.tar.gz#environment \
+    --jars ${RAPIDS_JAR} \
+    --py-files ${SAMPLE_ZIP} \
     ${SPARK_PYTHON_ENTRYPOINT} \
     --mainClass=${EXAMPLE_CLASS} \
     --dataPath=train::${DATA_PATH}/mortgage/output/train/ \
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md
index 9d92da01a..f2bff0fdd 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md
@@ -12,12 +12,14 @@ Prerequisites
   * Multi-node clusters with homogenous GPU configuration
   * Software Requirements
     * Ubuntu 18.04, 20.04/CentOS7, CentOS8
-    * CUDA 11.0+
+    * CUDA 11.5+
     * NVIDIA driver compatible with your CUDA
     * NCCL 2.7.8+
-    * Python 3.6+
+    * Python 3.8 or 3.9
     * NumPy
-
+    * XGBoost 1.7.0+
+    * cudf-cu11
+
 The number of GPUs per NodeManager dictates the number of Spark executors that can run in that NodeManager.
 Additionally, cores per Spark executor and cores per Spark task must match, such that each executor can run 1 task at any given time.
 
@@ -32,6 +34,32 @@ We use `SPARK_HOME` environment variable to point to the Apache Spark cluster.
 
 And as to how to enable GPU scheduling and isolation for Yarn,
 please refer to [here](https://hadoop.apache.org/docs/r3.1.0/hadoop-yarn/hadoop-yarn-site/UsingGpus.html).
 
+Please make sure to install the XGBoost, cudf-cu11, numpy libraries on all nodes before running the XGBoost application.
+``` bash
+pip install xgboost
+pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+pip install numpy
+pip install scikit-learn
+```
+You can also create an isolated python environment by using [Virtualenv](https://virtualenv.pypa.io/en/latest/),
+and then ship the archive file and unpack/enable the environment on the executors
+by leveraging the --archives option or the spark.archives configuration.
+``` bash
+# create an isolated python environment and install libraries
+python -m venv pyspark_venv
+source pyspark_venv/bin/activate
+pip install xgboost
+pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+pip install numpy
+pip install scikit-learn
+# venv-pack packages the environment into an archive
+pip install venv-pack
+venv-pack -o pyspark_venv.tar.gz
+
+# enable archive python environment on executors
+export PYSPARK_DRIVER_PYTHON=python # Do not set in cluster modes.
+export PYSPARK_PYTHON=./environment/bin/python
+spark-submit --archives pyspark_venv.tar.gz#environment app.py
+```
+
 Get Application Files, Jar and Dataset
 -------------------------------
@@ -114,6 +142,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.gpu_main
 
 # tree construction algorithm
 export TREE_METHOD=gpu_hist
+
+# if you enabled the archived python environment
+export PYSPARK_DRIVER_PYTHON=python
+export PYSPARK_PYTHON=./environment/bin/python
 ```
 
 Run spark-submit:
 
@@ -129,11 +161,12 @@ ${SPARK_HOME}/bin/spark-submit
     --files ${SPARK_HOME}/examples/src/main/scripts/getGpusResources.sh \
     --master yarn \
     --deploy-mode ${SPARK_DEPLOY_MODE} \
+    --archives your_pyspark_venv.tar.gz#environment \
     --num-executors ${SPARK_NUM_EXECUTORS} \
     --driver-memory ${SPARK_DRIVER_MEMORY} \
     --executor-memory ${SPARK_EXECUTOR_MEMORY} \
-    --jars ${RAPIDS_JAR},${XGBOOST4J_JAR} \
-    --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
+    --jars ${RAPIDS_JAR} \
+    --py-files ${SAMPLE_ZIP} \
     ${MAIN_PY} \
     --mainClass=${EXAMPLE_CLASS} \
     --dataPath=train::${DATA_PATH}/mortgage/out/train/ \
@@ -190,6 +223,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.cpu_main
 
 # tree construction algorithm
 export TREE_METHOD=hist
+
+# if you enabled the archived python environment
+export PYSPARK_DRIVER_PYTHON=python
+export PYSPARK_PYTHON=./environment/bin/python
 ```
 
 This is the same command as for the GPU example, repeated for convenience:
 
@@ -197,12 +234,13 @@ This is the same command as for the GPU example, repeated for convenience:
 ``` bash
 ${SPARK_HOME}/bin/spark-submit \
     --master yarn \
+    --archives your_pyspark_venv.tar.gz#environment \
     --deploy-mode ${SPARK_DEPLOY_MODE} \
     --num-executors ${SPARK_NUM_EXECUTORS} \
     --driver-memory ${SPARK_DRIVER_MEMORY} \
     --executor-memory ${SPARK_EXECUTOR_MEMORY} \
-    --jars ${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR} \
-    --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
+    --jars ${RAPIDS_JAR} \
+    --py-files ${SAMPLE_ZIP} \
     ${MAIN_PY} \
     --mainClass=${EXAMPLE_CLASS} \
     --dataPath=train::${DATA_PATH}/mortgage/output/train/ \
diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md
index ca9442f44..2178d6d75 100644
--- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md
+++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md
@@ -9,7 +9,7 @@ For simplicity export the location to these jars.
All examples assume the packag * [XGBoost4j-Spark Package](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/) 2. Download the RAPIDS Accelerator for Apache Spark plugin jar - * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) + * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar) ### Build XGBoost Python Examples @@ -21,14 +21,3 @@ You need to copy the dataset to `/opt/xgboost`. Use the following links to downl 1. [Mortgage dataset](/docs/get-started/xgboost-examples/dataset/mortgage.md) 2. [Taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) 3. [Agaricus dataset](https://gust.dev/r/xgboost-agaricus) - -### Setup environments - -``` bash -export SPARK_XGBOOST_DIR=/opt/xgboost -export RAPIDS_JAR=${SPARK_XGBOOST_DIR}/rapids-4-spark_2.12-22.10.0.jar -export XGBOOST4J_JAR=${SPARK_XGBOOST_DIR}/xgboost4j_3.0-1.4.2-0.3.0.jar -export XGBOOST4J_SPARK_JAR=${SPARK_XGBOOST_DIR}/xgboost4j-spark_3.0-1.4.2-0.3.0.jar -export SAMPLE_ZIP=${SPARK_XGBOOST_DIR}/samples.zip -export MAIN_PY=${SPARK_XGBOOST_DIR}/main.py -``` diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md index 5bdc4f7cc..2303fdfe0 100644 --- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md +++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md @@ -5,7 +5,7 @@ For simplicity export the location to these jars. All examples assume the packag ### Download the jars 1. Download the RAPIDS Accelerator for Apache Spark plugin jar - * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) + * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar) ### Build XGBoost Scala Examples @@ -17,11 +17,3 @@ You need to copy the dataset to `/opt/xgboost`. Use the following links to downl 1. [Mortgage dataset](/docs/get-started/xgboost-examples/dataset/mortgage.md) 2. [Taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) 3. 
[Agaricus dataset](https://gust.dev/r/xgboost-agaricus) - -### Setup environments - -``` bash -export SPARK_XGBOOST_DIR=/opt/xgboost -export RAPIDS_JAR=${SPARK_XGBOOST_DIR}/rapids-4-spark_2.12-22.10.0.jar -export SAMPLE_JAR=${SPARK_XGBOOST_DIR}/sample_xgboost_apps-0.2.3-jar-with-dependencies.jar -``` diff --git a/docs/img/guides/mortgage-perf.png b/docs/img/guides/mortgage-perf.png index 23715ce9a..11c94865a 100644 Binary files a/docs/img/guides/mortgage-perf.png and b/docs/img/guides/mortgage-perf.png differ diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile b/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile index ba511c45f..9b9c6fd58 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile +++ b/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile @@ -17,7 +17,7 @@ ARG CUDA_VER=11.5.1 FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04 -ARG BRANCH_VER=22.10 +ARG BRANCH_VER=22.12 RUN apt-get update RUN apt-get install -y wget ninja-build git diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/README.md b/examples/ML+DL-Examples/Spark-cuML/pca/README.md index 1086c6907..ca573aaf1 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/README.md +++ b/examples/ML+DL-Examples/Spark-cuML/pca/README.md @@ -12,7 +12,7 @@ User can also download the release jar from Maven central: [rapids-4-spark-ml_2.12-22.02.0-cuda11.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-ml_2.12/22.02.0/rapids-4-spark-ml_2.12-22.02.0-cuda11.jar) -[rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) +[rapids-4-spark_2.12-22.12.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar) ## Sample code @@ -48,7 +48,7 @@ It is assumed that a Standalone Spark cluster has been set up, the `SPARK_MASTER ``` bash RAPIDS_ML_JAR=PATH_TO_rapids-4-spark-ml_2.12-22.02.0-cuda11.jar - PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-22.10.0.jar + PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-22.12.0.jar jupyter toree install \ --spark_home=${SPARK_HOME} \ diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml index 875ada38a..9cc790476 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml +++ b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml @@ -21,7 +21,7 @@ com.nvidia PCAExample jar - 22.10.0-SNAPSHOT + 22.12.0-SNAPSHOT 8 @@ -51,7 +51,7 @@ com.nvidia rapids-4-spark-ml_2.12 - 22.10.0-SNAPSHOT + 22.12.0-SNAPSHOT diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh index a167ad0cc..f5b287351 100755 --- a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh +++ b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh @@ -15,8 +15,8 @@ # limitations under the License. 
# -ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/22.10.0-SNAPSHOT/rapids-4-spark-ml_2.12-22.10.0-SNAPSHOT.jar -PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/22.10.0-SNAPSHOT/rapids-4-spark_2.12-22.10.0-SNAPSHOT.jar +ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/22.12.0-SNAPSHOT/rapids-4-spark-ml_2.12-22.12.0-SNAPSHOT.jar +PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/22.12.0-SNAPSHOT/rapids-4-spark_2.12-22.12.0-SNAPSHOT.jar $SPARK_HOME/bin/spark-submit \ --master spark://127.0.0.1:7077 \ @@ -38,4 +38,4 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.network.timeout=1000s \ --jars $ML_JAR,$PLUGIN_JAR \ --class com.nvidia.spark.examples.pca.Main \ -/workspace/target/PCAExample-22.10.0-SNAPSHOT.jar +/workspace/target/PCAExample-22.12.0-SNAPSHOT.jar diff --git a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb index d5249e8fd..9e854115b 100644 --- a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb +++ b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb @@ -22,7 +22,7 @@ "import os\n", "# Change to your cluster ip:port and directories\n", "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"spark:your-ip:port\")\n", - "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-path/rapids-4-spark_2.12-22.10.0.jar\")\n" + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-path/rapids-4-spark_2.12-22.12.0.jar\")\n" ] }, { diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md index 40d3d6cdd..922bd5fd6 100644 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md @@ -108,7 +108,7 @@ See above Prerequisites section First finish the steps in "Building with Native Code Examples and run test cases" section, then do the following in the docker. 
### Get jars from Maven Central -[rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) +[rapids-4-spark_2.12-22.12.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar) ### Launch a local mode Spark diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml index 252d3abc3..812bbc778 100644 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml @@ -25,7 +25,7 @@ user defined functions for use with the RAPIDS Accelerator for Apache Spark - 22.10.0-SNAPSHOT + 22.12.0-SNAPSHOT 1.8 @@ -37,7 +37,7 @@ cuda11 2.12 - 22.10.0 + 22.12.0-SNAPSHOT 3.1.1 2.12.15 ${project.build.directory}/cpp-build diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt index 6ec503c13..593312611 100755 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.12/RAPIDS.cmake ${CMAKE_BINARY_DIR}/RAPIDS.cmake) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) @@ -32,7 +32,7 @@ if(DEFINED GPU_ARCHS) endif() rapids_cuda_init_architectures(UDFEXAMPLESJNI) -project(UDFEXAMPLESJNI VERSION 22.10.0 LANGUAGES C CXX CUDA) +project(UDFEXAMPLESJNI VERSION 22.12.0 LANGUAGES C CXX CUDA) option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) option(BUILD_UDF_BENCHMARKS "Build the benchmarks" OFF) @@ -84,10 +84,10 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w --expt-extended-lambda --expt-relax set(CUDA_USE_STATIC_CUDA_RUNTIME OFF) rapids_cpm_init() -rapids_cpm_find(cudf 22.10.00 +rapids_cpm_find(cudf 22.12.00 CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/cudf.git - GIT_TAG branch-22.10 + GIT_TAG branch-22.12 GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "BUILD_TESTS OFF" diff --git a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile index f9bbff653..6f3f7852c 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile +++ b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile @@ -38,9 +38,6 @@ RUN conda --version RUN conda install -c conda-forge openjdk=8 maven=3.8.1 -y -# install cuDF dependency. 
-RUN conda install -c rapidsai -c nvidia -c conda-forge -c defaults libcuspatial=22.10 python=3.8 -y
-
 RUN wget --quiet \
     https://github.com/Kitware/CMake/releases/download/v3.21.3/cmake-3.21.3-linux-x86_64.tar.gz \
     && tar -xzf cmake-3.21.3-linux-x86_64.tar.gz \
diff --git a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb
index a054441a4..d16d68975 100644
--- a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb
+++ b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb
@@ -48,7 +48,7 @@ RUN wget -q https://repo.continuum.io/miniconda/Miniconda3-py38_4.9.2-Linux-x86_
     conda config --system --set always_yes True && \
     conda clean --all
-RUN conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.10
+RUN conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.12
 RUN conda install -c conda-forge libgdal==3.3.1
 RUN pip install jupyter
 ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64
diff --git a/examples/UDF-Examples/Spark-cuSpatial/README.md b/examples/UDF-Examples/Spark-cuSpatial/README.md
index 3828a8177..6a4e84ff2 100644
--- a/examples/UDF-Examples/Spark-cuSpatial/README.md
+++ b/examples/UDF-Examples/Spark-cuSpatial/README.md
@@ -45,13 +45,17 @@ or [in local machine](#build-in-local-machine) after prerequisites.
    docker build -f Dockerfile . -t build-spark-cuspatial
    docker run -it build-spark-cuspatial bash
    ```
-2. Get the code, then run `mvn package`.
+2. Open a bash shell in the Docker container and install libcuspatial
+   ```Bash
+   conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.12
+   ```
+3. Get the code, then run `mvn package`.
    ```Bash
    git clone https://github.com/NVIDIA/spark-rapids-examples.git
    cd spark-rapids-examples/examples/UDF-Examples/Spark-cuSpatial/
    mvn package
    ```
-3. You'll get the jar named `spark-cuspatial-.jar` in the target folder.
+4. You'll get the jar named `spark-cuspatial-.jar` in the target folder.
 
 Note: The docker env is just for building the jar, not for running the application.
@@ -65,9 +69,7 @@ Note: The docker env is just for building the jar, not for running the applicati
 4. [cuspatial](https://github.com/rapidsai/cuspatial): install libcuspatial
    ```Bash
    # Install libcuspatial from conda
-   conda install -c rapidsai -c nvidia -c conda-forge -c defaults libcuspatial=22.10
-   # or below command for the nightly (aka SNAPSHOT) version.
-   conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.10
+   conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.12
    ```
 5. Build the JAR using `mvn package`.
    ```Bash
@@ -79,22 +81,18 @@ Note: The docker env is just for building the jar, not for running the applicati
 
 ## Run
 ### GPU Demo on Spark Standalone on-premises cluster
-1. Install necessary libraries. Besides `cudf` and `cuspatial`, the `gdal` library that is compatible with the installed `cuspatial` may also be needed.
-   ```
-   conda install -c conda-forge libgdal=3.3.1
-   ```
-2. Set up [a standalone cluster](/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md) of Spark. Make sure the conda/lib is included in LD_LIBRARY_PATH, so that spark executors can load libcuspatial.so.
+1. Set up [a standalone cluster](/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md) of Spark. Make sure the conda/lib is included in LD_LIBRARY_PATH, so that spark executors can load libcuspatial.so.
 
-3. 
Download Spark RAPIDS JAR - * [Spark RAPIDS JAR v22.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) or above -4. Prepare sample dataset and JARs. Copy the [sample dataset](../../../datasets/cuspatial_data.tar.gz) to `/data/cuspatial_data/`. +2. Download Spark RAPIDS JAR + * [Spark RAPIDS JAR v22.12.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar) or above +3. Prepare sample dataset and JARs. Copy the [sample dataset](../../../datasets/cuspatial_data.tar.gz) to `/data/cuspatial_data/`. Copy Spark RAPIDS JAR and `spark-cuspatial-.jar` to `/data/cuspatial_data/jars/`. If you build the `spark-cuspatial-.jar` in docker, please copy the jar from docker to local: ``` docker cp YOUR_DOCKER_CONTAINER:/PATH/TO/spark-cuspatial-.jar ./YOUR_LOCAL_PATH ``` Note: update the paths in `gpu-run.sh` accordingly. -5. Run `gpu-run.sh` +4. Run `gpu-run.sh` ```Bash ./gpu-run.sh ``` diff --git a/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh b/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh index fead762aa..c98b916ff 100755 --- a/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh +++ b/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh @@ -31,7 +31,7 @@ rm -rf $DATA_OUT_PATH # the path to keep the jars of spark-rapids & spark-cuspatial JARS=$ROOT_PATH/jars -JARS_PATH=${JARS_PATH:-$JARS/rapids-4-spark_2.12-22.10.0.jar,$JARS/spark-cuspatial-22.10.0-SNAPSHOT.jar} +JARS_PATH=${JARS_PATH:-$JARS/rapids-4-spark_2.12-22.12.0-SNAPSHOT.jar,$JARS/spark-cuspatial-22.12.0-SNAPSHOT.jar} $SPARK_HOME/bin/spark-submit --master spark://$HOSTNAME:7077 \ --name "Gpu Spatial Join UDF" \ diff --git a/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb b/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb index 04f77452f..3fa3744a3 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb +++ b/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb @@ -9,7 +9,7 @@ "source": [ "from pyspark.sql import SparkSession\n", "import os\n", - "jarsPath = os.getenv(\"JARS_PATH\", \"/data/cuspatial_data/jars/rapids-4-spark_2.12-22.10.0.jar,/data/cuspatial_data/jars/spark-cuspatial-22.10.0-SNAPSHOT.jar\")\n", + "jarsPath = os.getenv(\"JARS_PATH\", \"/data/cuspatial_data/jars/rapids-4-spark_2.12-22.12.0.jar,/data/cuspatial_data/jars/spark-cuspatial-22.12.0-SNAPSHOT.jar\")\n", "spark = SparkSession.builder \\\n", " .config(\"spark.jars\", jarsPath) \\\n", " .config(\"spark.sql.adaptive.enabled\", \"false\") \\\n", diff --git a/examples/UDF-Examples/Spark-cuSpatial/pom.xml b/examples/UDF-Examples/Spark-cuSpatial/pom.xml index 100cc3f1d..1f609009f 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/pom.xml +++ b/examples/UDF-Examples/Spark-cuSpatial/pom.xml @@ -24,13 +24,13 @@ UDF of the cuSpatial case for the RAPIDS Accelerator The RAPIDS accelerated user defined function of the cuSpatial case for use with the RAPIDS Accelerator for Apache Spark - 22.10.0-SNAPSHOT + 22.12.0-SNAPSHOT 1.8 1.8 8 - 22.10.0 + 22.12.0-SNAPSHOT 2.12 3.2.0 ${project.build.directory}/cpp-build diff --git a/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt b/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt index 50675a42a..506b1697a 100755 --- a/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt +++ b/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt @@ -16,7 +16,7 
@@
 
cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR)
 
-project(SPATIALUDJNI VERSION 22.10.0 LANGUAGES C CXX CUDA)
+project(SPATIALUDJNI VERSION 22.12.0 LANGUAGES C CXX CUDA)
 
 ###################################################################################################
 # - build type ------------------------------------------------------------------------------------
diff --git a/examples/UDF-Examples/Spark-cuSpatial/src/main/native/src/PointInPolygonJni.cpp b/examples/UDF-Examples/Spark-cuSpatial/src/main/native/src/PointInPolygonJni.cpp
index dd15cc78d..ecbc0b6b2 100644
--- a/examples/UDF-Examples/Spark-cuSpatial/src/main/native/src/PointInPolygonJni.cpp
+++ b/examples/UDF-Examples/Spark-cuSpatial/src/main/native/src/PointInPolygonJni.cpp
@@ -132,7 +132,7 @@ inline bool is_invalid_column(cudf::column_view const& col) {
  * double type, and have at least one valid row. Otherwise, the behavior is undefined.
  */
 inline double reduce_as_double(cudf::column_view const& col,
-                               std::unique_ptr const& agg) {
+                               cudf::reduce_aggregation const& agg) {
   auto s = cudf::reduce(col, agg, col.type());
   // s is always valid
   auto p_num_scalar = reinterpret_cast*>(s.get());
@@ -279,10 +279,10 @@ Java_com_nvidia_spark_rapids_udf_PointInPolygon_pointInPolygon(JNIEnv* env, jcla
   auto min_agg = cudf::make_min_aggregation();
   auto max_agg = cudf::make_max_aggregation();
 
-  auto x_min = reduce_as_double(*ply_x, min_agg);
-  auto x_max = reduce_as_double(*ply_x, max_agg);
-  auto y_min = reduce_as_double(*ply_y, min_agg);
-  auto y_max = reduce_as_double(*ply_y, max_agg);
+  auto x_min = reduce_as_double(*ply_x, *min_agg);
+  auto x_max = reduce_as_double(*ply_x, *max_agg);
+  auto y_min = reduce_as_double(*ply_y, *min_agg);
+  auto y_max = reduce_as_double(*ply_y, *max_agg);
 
   // 2) quadtree construction
   cudf::size_type min_size = 512;
diff --git a/examples/XGBoost-Examples/.gitignore b/examples/XGBoost-Examples/.gitignore
new file mode 100644
index 000000000..dadfea074
--- /dev/null
+++ b/examples/XGBoost-Examples/.gitignore
@@ -0,0 +1 @@
+samples.zip
diff --git a/examples/XGBoost-Examples/README.md b/examples/XGBoost-Examples/README.md
index 69a831af0..5d38f816f 100644
--- a/examples/XGBoost-Examples/README.md
+++ b/examples/XGBoost-Examples/README.md
@@ -1,19 +1,18 @@
 # Spark XGBoost Examples
 
-Spark XGBoost examples here showcase the need for end-to-end GPU acceleration.
+Spark XGBoost examples here showcase the need for ETL+Training pipeline GPU acceleration.
 The Scala based XGBoost examples here use [DMLC’s version](https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark_2.12/).
-For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that
-uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/).
+The PySpark based XGBoost examples require [installing RAPIDS via pip](https://rapids.ai/pip.html#install).
 Most data scientists spend a lot of time not only on
 Training models but also processing the large amounts of data needed to train these models.
-As you can see below, XGBoost training on GPUs can be up to 10X and data processing using
-RAPIDS Accelerator can also be accelerated with an end-to-end speed-up of 7X on GPU compared to CPU.
+As you can see below, PySpark+XGBoost training on GPUs can be up to 13X faster, and data processing using
+RAPIDS Accelerator can also be accelerated, with an end-to-end speed-up of 11X on GPU compared to CPU.
In the public cloud, better performance can lead to significantly lower costs as demonstrated in this [blog](https://developer.nvidia.com/blog/gpu-accelerated-spark-xgboost/).
 
 ![mortgage-speedup](/docs/img/guides/mortgage-perf.png)
 
-Note that the test result is based on 21 years [Fannie Mea Single-Family Loan Performance Data](https://capitalmarkets.fanniemae.com/credit-risk-transfer/single-family-credit-risk-transfer/fannie-mae-single-family-loan-performance-data)
-with a 4 A100 GPU and 512 CPU vcores cluster, the performance is affected by many aspects,
+Note that the Training test result is based on 4 years of [Fannie Mae Single-Family Loan Performance Data](https://capitalmarkets.fanniemae.com/credit-risk-transfer/single-family-credit-risk-transfer/fannie-mae-single-family-loan-performance-data)
+with an 8 A100 GPU and 1024 CPU vcore cluster; the performance is affected by many aspects,
 including data size and type of GPU.
 
 In this folder, there are three blue prints for users to learn about using
@@ -94,6 +93,9 @@ Please follow below steps to run the example notebooks in different notebook env
 - [Jupyter Notebook for Python](/docs/get-started/xgboost-examples/notebook/python-notebook.md)
 
 Note:
+Updating the default value of `spark.sql.execution.arrow.maxRecordsPerBatch` to a larger number (such as 200000) will
+significantly improve performance by accelerating data transfer between the JVM and the Python process.
+
 For the CrossValidator job, we need to set `spark.task.resource.gpu.amount=1` to allow only 1 training task running on 1 GPU(executor),
 otherwise the customized CrossValidator may schedule more than 1 xgboost training tasks into one executor simultaneously and trigger
 [issue-131](https://github.com/NVIDIA/spark-rapids-examples/issues/131).
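For reference, the two notes above are ordinary Spark settings, and the CPU-versus-GPU vectorization point made in the agaricus notebook below comes down to a single `VectorAssembler` call. The following is a minimal PySpark sketch, not code from the examples: the app name, batch size, and toy columns are illustrative, and it assumes a cluster whose executors actually advertise GPU resources (see the discovery-script setup in the on-prem guides).

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = (
    SparkSession.builder
    .appName("xgboost-example-sketch")  # illustrative name
    # larger Arrow batches speed up JVM <-> Python data transfer
    .config("spark.sql.execution.arrow.maxRecordsPerBatch", "200000")
    # one XGBoost training task per GPU, as the CrossValidator note requires
    .config("spark.executor.resource.gpu.amount", "1")
    .config("spark.task.resource.gpu.amount", "1")
    .getOrCreate()
)

# Toy stand-in for the notebooks' training data. GPU training can consume the
# feature columns directly, while CPU training needs one assembled vector column.
features = ["feature_0", "feature_1"]
train_data = spark.createDataFrame(
    [(1.0, 0.2, 0.4), (0.0, 0.1, 0.9)], ["label"] + features)
train_vec = VectorAssembler(inputCols=features, outputCol="features").transform(train_data)
```

With the `xgboost.spark` estimators, the assembled column is then passed as `features_col="features"` for CPU training, while the GPU path can take the list of feature column names directly.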
diff --git a/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb b/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb
index 9d1b1e311..a49b8eca0 100644
--- a/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb
+++ b/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb
@@ -9,16 +9,12 @@
    "Agaricus is an example of xgboost classifier for multiple classification. This notebook will show you how to load data, train the xgboost model.\n",
    "\n",
    "A few libraries required for this notebook:\n",
-    "  1. NumPy\n",
-    "  2. cudf jar\n",
-    "  3. xgboost4j jar\n",
-    "  4. xgboost4j-spark jar\n",
-    "  5. rapids-4-spark.jar\n",
+    "  1. cudf-cu11\n",
+    "  2. xgboost\n",
+    "  3. scikit-learn\n",
+    "  4. numpy\n",
    "  \n",
-    "This notebook also illustrates the ease of porting a sample CPU based Spark xgboost4j code into GPU. There is only one change required for running Spark XGBoost on GPU. That is replacing the API `setFeaturesCol(feature)` on CPU with the new API `setFeaturesCols(features)`. This also eliminates the need for vectorization (assembling multiple feature columns in to one column) since we can read multiple columns.\n",
-    "\n",
-    "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n",
-    "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)."
+    "This notebook also illustrates the ease of porting sample CPU based Spark XGBoost code to GPU. There is no change required for running Spark XGBoost on GPU because both CPU and GPU call the same API. For CPU runs, we need to vectorize the training dataset before fitting it to the classifier."
   ]
  },
 {
@@ -34,12 +30,16 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier\n",
+    "from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel\n",
    "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.types import FloatType, StructField, StructType\n",
    "from time import time\n",
-    "import os"
+    "from pyspark.conf import SparkConf\n",
+    "import os\n",
+    "# if you pass/unpack the archive file and enable the environment\n",
+    "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
+    "# os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\""
   ]
  },
 {
@@ -64,9 +64,66 @@
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 06:57:40,306 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
+      "Setting default log level to \"WARN\".\n",
+      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
+      "2022-11-30 06:57:40,550 WARN resource.ResourceUtils: The configuration of cores (exec = 2 task = 1, runnable tasks = 2) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n",
+      "2022-11-30 06:57:54,195 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 22.12.0 using cudf 22.12.0.\n",
+      "2022-11-30 06:57:54,210 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n",
+      "2022-11-30 06:57:54,214 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n",
+      "2022-11-30 06:57:54,214 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. 
Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n", + "2022-11-30 06:57:54,685 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n" + ] + } + ], "source": [ - "spark = SparkSession.builder.getOrCreate()\n", + "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n", + "\n", + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n", + "\n", + "# You need to update with your real hardware resource \n", + "driverMem = os.getenv(\"DRIVER_MEM\", \"2g\")\n", + "executorMem = os.getenv(\"EXECUTOR_MEM\", \"2g\")\n", + "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n", + "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n", + "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"2\"))\n", + "# Common spark settings\n", + "conf = SparkConf()\n", + "conf.setMaster(SPARK_MASTER_URL)\n", + "conf.setAppName(\"Microbenchmark on GPU\")\n", + "conf.set(\"spark.executor.instances\",\"1\")\n", + "conf.set(\"spark.driver.memory\", driverMem)\n", + "## The tasks will run on GPU memory, so there is no need to set a high host memory\n", + "conf.set(\"spark.executor.memory\", executorMem)\n", + "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n", + "conf.set(\"spark.executor.cores\", executorCores)\n", + "\n", + "\n", + "# Plugin settings\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n", + "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n", + "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n", + "# since pyspark and xgboost share the same GPU, we need to allocate some memory to xgboost to avoid GPU OOM while training \n", + "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.7\")\n", + "conf.set(\"spark.locality.wait\",\"0\")\n", + "##############note: only support value=1 https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n", + "conf.set(\"spark.task.resource.gpu.amount\", 1) \n", + "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n", + "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n", + "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n", + "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n", + "\n", + "# if you pass/unpack the archive file and enable the environment\n", + "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n", + "# Create spark session\n", + "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", + "\n", "reader = spark.read" ] }, @@ -89,8 +146,17 @@ "\n", "# You need to update them to your real paths!\n", "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n", - "train_data = reader.schema(schema).option('header', True).csv(dataRoot + '/agaricus/csv/train')\n", - "trans_data = reader.schema(schema).option('header', True).csv(dataRoot + '/agaricus/csv/test')" + "train_path = dataRoot + \"/agaricus/csv/train\"\n", + "eval_path = dataRoot + \"/agaricus/csv/eval\"\n", + "\n", + "data_format = 'csv'\n", + "has_header = 'true'\n", + "if data_format == 'csv':\n", + " train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n", + " trans_data = reader.schema(schema).option('header',has_header).csv(eval_path)\n", + "else :\n", + " train_data = reader.load(train_path)\n", + " trans_data = reader.load(eval_path)" ] }, { @@ 
-127,28 +193,34 @@
   "outputs": [],
   "source": [
    "params = { \n",
-    "    'eta': 0.1,\n",
-    "    'missing': 0.0,\n",
-    "    'treeMethod': 'gpu_hist',\n",
-    "    'maxDepth': 2,\n",
-    "    'numWorkers': 1,\n",
-    "    'numRound' : 100,\n",
+    "    \"tree_method\": \"gpu_hist\",\n",
+    "    \"grow_policy\": \"depthwise\",\n",
+    "    \"num_workers\": 1,\n",
+    "    \"use_gpu\": \"true\",\n",
    "}\n",
-    "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)"
+    "params['features_col'] = features\n",
+    "params['label_col'] = label\n",
+    "    \n",
+    "classifier = SparkXGBClassifier(**params)"
   ]
  },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "The CPU version classifier provides the API `setFeaturesCol` which only accepts a single column name, so vectorization for multiple feature columns is required.\n",
-    "```Python\n",
-    "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCol('features')\n",
-    "```\n",
-    "\n",
    "The parameter `num_workers` should be set to the number of GPUs in Spark cluster for GPU version, while for CPU version it is usually equal to the number of the CPU cores.\n",
    "\n",
-    "Concerning the tree method, GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training."
+    "Concerning the tree method, the GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training.\n",
+    "\n",
+    "An example of CPU classifier:\n",
+    "```\n",
+    "classifier = SparkXGBClassifier(\n",
+    "  features_col=features,\n",
+    "  label_col=label, \n",
+    "  num_workers=1024,\n",
+    "  use_gpu=False,\n",
+    ")\n",
+    "```"
   ]
  },
 {
@@ -163,11 +235,30 @@
   "execution_count": 5,
   "metadata": {},
   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-30 07:00:45,526 WARN util.package: Truncated the string representation of a plan since it was too large. 
This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", + "[Stage 5:> (0 + 1) / 1]\r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Training takes 27.95 seconds\n" + "Training takes 13.92 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r", + "/data/home/yuanli/work/reviews/pr252/pyspark_venv_20221125/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n", + " warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n" ] } ], @@ -192,10 +283,26 @@ "cell_type": "code", "execution_count": 6, "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "If features_cols param set, then features_col param is ignored.\n" + ] + } + ], + "source": [ + "model.write().overwrite().save(dataRoot + '/model/agaricus')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, "outputs": [], "source": [ - "model.write().overwrite().save(dataRoot + '/new-model-path')\n", - "loaded_model = XGBoostClassificationModel().load(dataRoot + '/new-model-path')" + "loaded_model = SparkXGBClassifierModel().load(dataRoot + '/model/agaricus')" ] }, { @@ -207,22 +314,330 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 07:01:07,030 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#798, probability#1062]\n", + " @Expression label#254 could run on GPU\n", + " @Expression feature_0#255 could run on GPU\n", + " @Expression feature_1#256 could run on GPU\n", + " @Expression feature_2#257 could run on GPU\n", + " @Expression feature_3#258 could run on GPU\n", + " @Expression feature_4#259 could run on GPU\n", + " @Expression feature_5#260 could run on GPU\n", + " @Expression feature_6#261 could run on GPU\n", + " @Expression feature_7#262 could run on GPU\n", + " @Expression feature_8#263 could run on GPU\n", + " @Expression feature_9#264 could run on GPU\n", + " @Expression feature_10#265 could run on GPU\n", + " @Expression feature_11#266 could run on GPU\n", + " @Expression feature_12#267 could run on GPU\n", + " @Expression feature_13#268 could run on GPU\n", + " @Expression feature_14#269 could run on GPU\n", + " @Expression feature_15#270 could run on GPU\n", + " @Expression feature_16#271 could run on GPU\n", + " @Expression feature_17#272 could run on GPU\n", + " @Expression feature_18#273 could run on GPU\n", + " @Expression feature_19#274 could run on GPU\n", + " @Expression feature_20#275 could run on GPU\n", + " @Expression feature_21#276 could run on GPU\n", + " @Expression feature_22#277 could run on GPU\n", + " @Expression feature_23#278 could run on GPU\n", + " @Expression feature_24#279 could run on GPU\n", + " @Expression feature_25#280 could run on GPU\n", + " @Expression feature_26#281 could run on GPU\n", + " @Expression feature_27#282 could run on GPU\n", + " @Expression feature_28#283 could run on GPU\n", + " @Expression feature_29#284 could run on GPU\n", + " @Expression feature_30#285 could run on GPU\n", + " @Expression feature_31#286 could run on GPU\n", + " @Expression feature_32#287 could run on GPU\n", + " @Expression feature_33#288 could run on GPU\n", + " @Expression 
feature_34#289 could run on GPU\n", + " @Expression feature_35#290 could run on GPU\n", + " @Expression feature_36#291 could run on GPU\n", + " @Expression feature_37#292 could run on GPU\n", + " @Expression feature_38#293 could run on GPU\n", + " @Expression feature_39#294 could run on GPU\n", + " @Expression feature_40#295 could run on GPU\n", + " @Expression feature_41#296 could run on GPU\n", + " @Expression feature_42#297 could run on GPU\n", + " @Expression feature_43#298 could run on GPU\n", + " @Expression feature_44#299 could run on GPU\n", + " @Expression feature_45#300 could run on GPU\n", + " @Expression feature_46#301 could run on GPU\n", + " @Expression feature_47#302 could run on GPU\n", + " @Expression feature_48#303 could run on GPU\n", + " @Expression feature_49#304 could run on GPU\n", + " @Expression feature_50#305 could run on GPU\n", + " @Expression feature_51#306 could run on GPU\n", + " @Expression feature_52#307 could run on GPU\n", + " @Expression feature_53#308 could run on GPU\n", + " @Expression feature_54#309 could run on GPU\n", + " @Expression feature_55#310 could run on GPU\n", + " @Expression feature_56#311 could run on GPU\n", + " @Expression feature_57#312 could run on GPU\n", + " @Expression feature_58#313 could run on GPU\n", + " @Expression feature_59#314 could run on GPU\n", + " @Expression feature_60#315 could run on GPU\n", + " @Expression feature_61#316 could run on GPU\n", + " @Expression feature_62#317 could run on GPU\n", + " @Expression feature_63#318 could run on GPU\n", + " @Expression feature_64#319 could run on GPU\n", + " @Expression feature_65#320 could run on GPU\n", + " @Expression feature_66#321 could run on GPU\n", + " @Expression feature_67#322 could run on GPU\n", + " @Expression feature_68#323 could run on GPU\n", + " @Expression feature_69#324 could run on GPU\n", + " @Expression feature_70#325 could run on GPU\n", + " @Expression feature_71#326 could run on GPU\n", + " @Expression feature_72#327 could run on GPU\n", + " @Expression feature_73#328 could run on GPU\n", + " @Expression feature_74#329 could run on GPU\n", + " @Expression feature_75#330 could run on GPU\n", + " @Expression feature_76#331 could run on GPU\n", + " @Expression feature_77#332 could run on GPU\n", + " @Expression feature_78#333 could run on GPU\n", + " @Expression feature_79#334 could run on GPU\n", + " @Expression feature_80#335 could run on GPU\n", + " @Expression feature_81#336 could run on GPU\n", + " @Expression feature_82#337 could run on GPU\n", + " @Expression feature_83#338 could run on GPU\n", + " @Expression feature_84#339 could run on GPU\n", + " @Expression feature_85#340 could run on GPU\n", + " @Expression feature_86#341 could run on GPU\n", + " @Expression feature_87#342 could run on GPU\n", + " @Expression feature_88#343 could run on GPU\n", + " @Expression feature_89#344 could run on GPU\n", + " @Expression feature_90#345 could run on GPU\n", + " @Expression feature_91#346 could run on GPU\n", + " @Expression feature_92#347 could run on GPU\n", + " @Expression feature_93#348 could run on GPU\n", + " @Expression feature_94#349 could run on GPU\n", + " @Expression feature_95#350 could run on GPU\n", + " @Expression feature_96#351 could run on GPU\n", + " @Expression feature_97#352 could run on GPU\n", + " @Expression feature_98#353 could run on GPU\n", + " @Expression feature_99#354 could run on GPU\n", + " @Expression feature_100#355 could run on GPU\n", + " @Expression feature_101#356 could run on GPU\n", + " @Expression feature_102#357 
could run on GPU\n", + " @Expression feature_103#358 could run on GPU\n", + " @Expression feature_104#359 could run on GPU\n", + " @Expression feature_105#360 could run on GPU\n", + " @Expression feature_106#361 could run on GPU\n", + " @Expression feature_107#362 could run on GPU\n", + " @Expression feature_108#363 could run on GPU\n", + " @Expression feature_109#364 could run on GPU\n", + " @Expression feature_110#365 could run on GPU\n", + " @Expression feature_111#366 could run on GPU\n", + " @Expression feature_112#367 could run on GPU\n", + " @Expression feature_113#368 could run on GPU\n", + " @Expression feature_114#369 could run on GPU\n", + " @Expression feature_115#370 could run on GPU\n", + " @Expression feature_116#371 could run on GPU\n", + " @Expression feature_117#372 could run on GPU\n", + " @Expression feature_118#373 could run on GPU\n", + " @Expression feature_119#374 could run on GPU\n", + " @Expression feature_120#375 could run on GPU\n", + " @Expression feature_121#376 could run on GPU\n", + " @Expression feature_122#377 could run on GPU\n", + " @Expression feature_123#378 could run on GPU\n", + " @Expression feature_124#379 could run on GPU\n", + " @Expression feature_125#380 could run on GPU\n", + " !Expression UDF(pythonUDF0#1327.rawPrediction) AS rawPrediction#798 cannot run on GPU because expression Alias UDF(pythonUDF0#1327.rawPrediction) AS rawPrediction#798 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#1327.rawPrediction) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#1327.rawPrediction) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3659/488666387 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#1327.rawPrediction) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#1327.rawPrediction could run on GPU\n", + " @Expression pythonUDF0#1327 could run on GPU\n", + " @Expression pythonUDF0#1327.prediction AS prediction#931 could run on GPU\n", + " @Expression pythonUDF0#1327.prediction could run on GPU\n", + " @Expression pythonUDF0#1327 could run on GPU\n", + " !Expression UDF(pythonUDF0#1327.probability) AS probability#1062 cannot run on GPU because expression Alias UDF(pythonUDF0#1327.probability) AS probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#1327.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#1327.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3659/488666387 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#1327.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#1327.probability could run on GPU\n", + " @Expression pythonUDF0#1327 could run on GPU\n", + "\n", + "2022-11-30 07:01:07,071 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#798, 
probability#1062]\n", + " @Expression label#254 could run on GPU\n", + " @Expression feature_0#255 could run on GPU\n", + " @Expression feature_1#256 could run on GPU\n", + " @Expression feature_2#257 could run on GPU\n", + " @Expression feature_3#258 could run on GPU\n", + " @Expression feature_4#259 could run on GPU\n", + " @Expression feature_5#260 could run on GPU\n", + " @Expression feature_6#261 could run on GPU\n", + " @Expression feature_7#262 could run on GPU\n", + " @Expression feature_8#263 could run on GPU\n", + " @Expression feature_9#264 could run on GPU\n", + " @Expression feature_10#265 could run on GPU\n", + " @Expression feature_11#266 could run on GPU\n", + " @Expression feature_12#267 could run on GPU\n", + " @Expression feature_13#268 could run on GPU\n", + " @Expression feature_14#269 could run on GPU\n", + " @Expression feature_15#270 could run on GPU\n", + " @Expression feature_16#271 could run on GPU\n", + " @Expression feature_17#272 could run on GPU\n", + " @Expression feature_18#273 could run on GPU\n", + " @Expression feature_19#274 could run on GPU\n", + " @Expression feature_20#275 could run on GPU\n", + " @Expression feature_21#276 could run on GPU\n", + " @Expression feature_22#277 could run on GPU\n", + " @Expression feature_23#278 could run on GPU\n", + " @Expression feature_24#279 could run on GPU\n", + " @Expression feature_25#280 could run on GPU\n", + " @Expression feature_26#281 could run on GPU\n", + " @Expression feature_27#282 could run on GPU\n", + " @Expression feature_28#283 could run on GPU\n", + " @Expression feature_29#284 could run on GPU\n", + " @Expression feature_30#285 could run on GPU\n", + " @Expression feature_31#286 could run on GPU\n", + " @Expression feature_32#287 could run on GPU\n", + " @Expression feature_33#288 could run on GPU\n", + " @Expression feature_34#289 could run on GPU\n", + " @Expression feature_35#290 could run on GPU\n", + " @Expression feature_36#291 could run on GPU\n", + " @Expression feature_37#292 could run on GPU\n", + " @Expression feature_38#293 could run on GPU\n", + " @Expression feature_39#294 could run on GPU\n", + " @Expression feature_40#295 could run on GPU\n", + " @Expression feature_41#296 could run on GPU\n", + " @Expression feature_42#297 could run on GPU\n", + " @Expression feature_43#298 could run on GPU\n", + " @Expression feature_44#299 could run on GPU\n", + " @Expression feature_45#300 could run on GPU\n", + " @Expression feature_46#301 could run on GPU\n", + " @Expression feature_47#302 could run on GPU\n", + " @Expression feature_48#303 could run on GPU\n", + " @Expression feature_49#304 could run on GPU\n", + " @Expression feature_50#305 could run on GPU\n", + " @Expression feature_51#306 could run on GPU\n", + " @Expression feature_52#307 could run on GPU\n", + " @Expression feature_53#308 could run on GPU\n", + " @Expression feature_54#309 could run on GPU\n", + " @Expression feature_55#310 could run on GPU\n", + " @Expression feature_56#311 could run on GPU\n", + " @Expression feature_57#312 could run on GPU\n", + " @Expression feature_58#313 could run on GPU\n", + " @Expression feature_59#314 could run on GPU\n", + " @Expression feature_60#315 could run on GPU\n", + " @Expression feature_61#316 could run on GPU\n", + " @Expression feature_62#317 could run on GPU\n", + " @Expression feature_63#318 could run on GPU\n", + " @Expression feature_64#319 could run on GPU\n", + " @Expression feature_65#320 could run on GPU\n", + " @Expression feature_66#321 could run on GPU\n", + " 
@Expression feature_67#322 could run on GPU\n", + " @Expression feature_68#323 could run on GPU\n", + " @Expression feature_69#324 could run on GPU\n", + " @Expression feature_70#325 could run on GPU\n", + " @Expression feature_71#326 could run on GPU\n", + " @Expression feature_72#327 could run on GPU\n", + " @Expression feature_73#328 could run on GPU\n", + " @Expression feature_74#329 could run on GPU\n", + " @Expression feature_75#330 could run on GPU\n", + " @Expression feature_76#331 could run on GPU\n", + " @Expression feature_77#332 could run on GPU\n", + " @Expression feature_78#333 could run on GPU\n", + " @Expression feature_79#334 could run on GPU\n", + " @Expression feature_80#335 could run on GPU\n", + " @Expression feature_81#336 could run on GPU\n", + " @Expression feature_82#337 could run on GPU\n", + " @Expression feature_83#338 could run on GPU\n", + " @Expression feature_84#339 could run on GPU\n", + " @Expression feature_85#340 could run on GPU\n", + " @Expression feature_86#341 could run on GPU\n", + " @Expression feature_87#342 could run on GPU\n", + " @Expression feature_88#343 could run on GPU\n", + " @Expression feature_89#344 could run on GPU\n", + " @Expression feature_90#345 could run on GPU\n", + " @Expression feature_91#346 could run on GPU\n", + " @Expression feature_92#347 could run on GPU\n", + " @Expression feature_93#348 could run on GPU\n", + " @Expression feature_94#349 could run on GPU\n", + " @Expression feature_95#350 could run on GPU\n", + " @Expression feature_96#351 could run on GPU\n", + " @Expression feature_97#352 could run on GPU\n", + " @Expression feature_98#353 could run on GPU\n", + " @Expression feature_99#354 could run on GPU\n", + " @Expression feature_100#355 could run on GPU\n", + " @Expression feature_101#356 could run on GPU\n", + " @Expression feature_102#357 could run on GPU\n", + " @Expression feature_103#358 could run on GPU\n", + " @Expression feature_104#359 could run on GPU\n", + " @Expression feature_105#360 could run on GPU\n", + " @Expression feature_106#361 could run on GPU\n", + " @Expression feature_107#362 could run on GPU\n", + " @Expression feature_108#363 could run on GPU\n", + " @Expression feature_109#364 could run on GPU\n", + " @Expression feature_110#365 could run on GPU\n", + " @Expression feature_111#366 could run on GPU\n", + " @Expression feature_112#367 could run on GPU\n", + " @Expression feature_113#368 could run on GPU\n", + " @Expression feature_114#369 could run on GPU\n", + " @Expression feature_115#370 could run on GPU\n", + " @Expression feature_116#371 could run on GPU\n", + " @Expression feature_117#372 could run on GPU\n", + " @Expression feature_118#373 could run on GPU\n", + " @Expression feature_119#374 could run on GPU\n", + " @Expression feature_120#375 could run on GPU\n", + " @Expression feature_121#376 could run on GPU\n", + " @Expression feature_122#377 could run on GPU\n", + " @Expression feature_123#378 could run on GPU\n", + " @Expression feature_124#379 could run on GPU\n", + " @Expression feature_125#380 could run on GPU\n", + " !Expression rawPrediction#798 cannot run on GPU because expression AttributeReference rawPrediction#798 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression prediction#931 could run on GPU\n", + " !Expression probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n" + ] + }, + { + "name": "stderr", 
+ "output_type": "stream", + "text": [ + "2022-11-30 07:01:09,857 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + " !Exec cannot run on GPU because unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062, rawPrediction#798]; not all expressions can be replaced\n", + " @Expression cast(label#254 as string) AS label#3936 could run on GPU\n", + " @Expression cast(label#254 as string) could run on GPU\n", + " @Expression label#254 could run on GPU\n", + " @Expression cast(rawPrediction#798 as string) AS rawPrediction#3937 could run on GPU\n", + " !Expression cast(rawPrediction#798 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n", + " !Expression rawPrediction#798 cannot run on GPU because expression AttributeReference rawPrediction#798 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression cast(probability#1062 as string) AS probability#3938 could run on GPU\n", + " !Expression cast(probability#1062 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n", + " !Expression probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression cast(prediction#931 as string) AS prediction#3939 could run on GPU\n", + " @Expression cast(prediction#931 as string) could run on GPU\n", + " @Expression prediction#931 could run on GPU\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062, rawPrediction#798]; not all expressions can be replaced\n", + " @Expression label#254 could run on GPU\n", + " @Expression prediction#931 could run on GPU\n", + " !Expression probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression rawPrediction#798 cannot run on GPU because expression AttributeReference rawPrediction#798 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Transformation takes 2.63 seconds\n", + "Transformation takes 3.26 seconds\n", "+-----+--------------------+--------------------+----------+\n", "|label| rawPrediction| probability|prediction|\n", "+-----+--------------------+--------------------+----------+\n", - "| 1.0|[-0.9667757749557...|[0.03322422504425...| 1.0|\n", - "| 0.0|[-0.0080436170101...|[0.99195638298988...| 0.0|\n", - "| 0.0|[-0.0080436170101...|[0.99195638298988...| 0.0|\n", - "| 0.0|[-0.1416745483875...|[0.85832545161247...| 0.0|\n", - "| 0.0|[-0.0747678577899...|[0.92523214221000...| 0.0|\n", + "| 1.0|[-9.6646747589111...|[6.35385513305664...| 1.0|\n", + "| 0.0|[-8.3923015594482...|[2.26557254791259...| 1.0|\n", + "| 
0.0|[-8.0568389892578...|[3.16858291625976...| 1.0|\n", + "| 0.0|[1.91234850883483...|[0.87128275632858...| 0.0|\n", + "| 0.0|[-8.5582475662231...|[1.91867351531982...| 1.0|\n", "+-----+--------------------+--------------------+----------+\n", "only showing top 5 rows\n", "\n" @@ -247,15 +662,54 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 07:01:10,292 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#931, label#5899, 1.0#5900, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(label,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#931 could run on GPU\n", + " @Expression label#5899 could run on GPU\n", + " @Expression 1.0#5900 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#5905 cannot run on GPU because expression AttributeReference obj#5905 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062]; unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062]\n", + " @Expression prediction#931 could run on GPU\n", + " @Expression cast(label#254 as double) AS label#5899 could run on GPU\n", + " @Expression cast(label#254 as double) could run on GPU\n", + " @Expression label#254 could run on GPU\n", + " @Expression 1.0 AS 1.0#5900 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062]\n", + " @Expression label#254 could run on GPU\n", + " @Expression prediction#931 could run on GPU\n", + " !Expression probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Evaluation takes 0.29 
seconds\n", - "Accuracy is 0.9987577063864658\n" + "Evaluation takes 1.0 seconds\n", + "Accuracy is 0.9069677632722861\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + "[Stage 12:> (0 + 1) / 1]\r", + "\r", + " \r" ] } ], @@ -275,7 +729,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ diff --git a/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/consts.py b/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/consts.py deleted file mode 100644 index 045bce986..000000000 --- a/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/consts.py +++ /dev/null @@ -1,28 +0,0 @@ -# -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from pyspark.sql.types import * - -label = 'label' -features = [ 'feature_' + str(i) for i in range(0, 126) ] -schema = StructType([ StructField(x, FloatType()) for x in [label] + features ]) - -default_params = { - 'eta': 0.1, - 'missing': 0.0, - 'maxDepth': 2, - 'numWorkers': 1, -} diff --git a/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/cpu_main.py b/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/cpu_main.py deleted file mode 100644 index bbc35e617..000000000 --- a/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/cpu_main.py +++ /dev/null @@ -1,64 +0,0 @@ -# -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from com.nvidia.spark.examples.agaricus.consts import * -from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from pyspark.sql import SparkSession - -def main(args, xgboost_args): - spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) - - train_data, eval_data, trans_data = valid_input_data(spark, args, '', schema) - - if args.mode in [ 'all', 'train' ]: - classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCol('features')) - if eval_data: - eval_data = vectorize_data_frame(eval_data, label) - classifier.setEvalSets({ 'test': eval_data }) - if not train_data: - print('-' * 80) - print('Usage: train data path required when mode is all or train') - exit(1) - train_data = vectorize_data_frame(train_data, label) - model = with_benchmark('Training', lambda: classifier.fit(train_data)) - - if args.modelPath: - writer = model.write().overwrite() if args.overwrite else model - writer.save(args.modelPath) - else: - model = XGBoostClassificationModel().load(args.modelPath) - - if args.mode in [ 'all', 'transform' ]: - def transform(): - cv_trans_data = vectorize_data_frame(trans_data, label) - result = model.transform(cv_trans_data).cache() - result.foreachPartition(lambda _: None) - return result - - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) - result = with_benchmark('Transformation', transform) - show_sample(args, result, label) - with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) - - spark.stop() diff --git a/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/gpu_main.py b/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/main.py similarity index 62% rename from examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/gpu_main.py rename to examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/main.py index 3f466dc18..03a41e91d 100644 --- a/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/gpu_main.py +++ b/examples/XGBoost-Examples/agaricus/python/com/nvidia/spark/examples/agaricus/main.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,47 +13,64 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -from com.nvidia.spark.examples.agaricus.consts import * +from pyspark.sql.types import * + from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * from pyspark.sql import SparkSession +from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel + +label = 'label' +feature_names = ['feature_' + str(i) for i in range(0, 126)] +schema = StructType([StructField(x, FloatType()) for x in [label] + feature_names]) + + def main(args, xgboost_args): spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) + .builder + .appName(args.mainClass) + .getOrCreate()) train_data, eval_data, trans_data = valid_input_data(spark, args, '', schema) - if args.mode in [ 'all', 'train' ]: - classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCols(features)) - if eval_data: - classifier.setEvalSets({ 'test': eval_data }) - if not train_data: + if args.mode in ['all', 'train']: + if train_data is None: print('-' * 80) print('Usage: train data path required when mode is all or train') + print('-' * 80) exit(1) + + train_data, features = transform_data(train_data, label, args.use_gpu) + xgboost_args['features_col'] = features + xgboost_args['label_col'] = label + classifier = SparkXGBClassifier(**xgboost_args) + + if eval_data: + # TODO + pass + model = with_benchmark('Training', lambda: classifier.fit(train_data)) if args.modelPath: writer = model.write().overwrite() if args.overwrite else model writer.save(args.modelPath) else: - model = XGBoostClassificationModel().load(args.modelPath) + model = SparkXGBClassifierModel.load(args.modelPath) + + if args.mode in ['all', 'transform']: + if trans_data is None: + print('-' * 80) + print('Usage: trans data path required when mode is all or transform') + print('-' * 80) + exit(1) + + trans_data, _ = transform_data(trans_data, label, args.use_gpu) - if args.mode in [ 'all', 'transform' ]: def transform(): result = model.transform(trans_data).cache() result.foreachPartition(lambda _: None) return result - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) result = with_benchmark('Transformation', transform) show_sample(args, result, label) with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb index 974b6094d..a3b93140a 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb @@ -6,20 +6,17 @@ "source": [ "# Dataset\n", "\n", - "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", + "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. 
Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.12/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", "\n", "# ETL + XGBoost train & transform\n", "\n", - "This notebook is an end-to-end example of ETL + XGBoost Train & Transform by using [Spark-Rapids](https://github.com/NVIDIA/spark-rapids) and [XGBoost](https://github.com/nvidia/spark-xgboost) with GPU accelerated.\n", + "This notebook is an end-to-end example of ETL + XGBoost Train & Transform using [Spark-Rapids](https://github.com/NVIDIA/spark-rapids) and [XGBoost](https://github.com/dmlc/xgboost) with GPU acceleration.\n", "
The main steps:\n", "1. Run ETL to generate 2 datasets for train and test
\n", " You can choose to save the datasets or not by setting \"is_save_dataset\" to True or False.
\n", " It means you don't need to save the dataset to disk after ETL and directly feed the dataframe to XGBoost train or transform.\n", "2. Run XGBoost train with the train dataset\n", - "3. Run XGBoost transform with the test dataset\n", - "\n", - "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n", - "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)." + "3. Run XGBoost transform with the test dataset" ] }, { @@ -31,10 +28,13 @@ "import time\n", "import os\n", "from pyspark import broadcast\n", + "from pyspark.conf import SparkConf\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.functions import *\n", "from pyspark.sql.types import *\n", - "from pyspark.sql.window import Window" + "from pyspark.sql.window import Window\n", + "# if you pass/unpack the archive file and enable the environment\n", + "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"" ] }, { @@ -54,7 +54,7 @@ "source": [ "# The input path of dataset\n", "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n", - "orig_raw_path = dataRoot + \"/mortgage/input/\"", + "orig_raw_path = dataRoot + \"/mortgage/input/\"\n", "orig_raw_path_csv2parquet = dataRoot + \"/mortgage/output/csv2parquet/\"" ] }, @@ -64,10 +64,47 @@ "metadata": {}, "outputs": [], "source": [ - "spark = (SparkSession\n", - " .builder\n", - " .appName(\"MortgageETL+XGBoost\")\n", - " .getOrCreate())" + "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n", + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n", + "\n", + "# You need to update with your real hardware resource \n", + "driverMem = os.getenv(\"DRIVER_MEM\", \"10g\")\n", + "executorMem = os.getenv(\"EXECUTOR_MEM\", \"10g\")\n", + "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n", + "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n", + "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"4\"))\n", + "\n", + "# Common spark settings\n", + "conf = SparkConf()\n", + "conf.setMaster(SPARK_MASTER_URL)\n", + "conf.setAppName(\"Microbenchmark on GPU\")\n", + "conf.set(\"spark.driver.memory\", driverMem)\n", + "## The tasks will run on GPU memory, so there is no need to set a high host memory\n", + "conf.set(\"spark.executor.memory\", executorMem)\n", + "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n", + "conf.set(\"spark.executor.cores\", executorCores)\n", + "\n", + "# Plugin settings\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n", + "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n", + "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n", + "##############note: only support value=1 see https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n", + "conf.set(\"spark.task.resource.gpu.amount\", 1) \n", + "# since pyspark and xgboost share the same GPU, we need to allocate some memory to xgboost to avoid GPU OOM while training \n", + "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.6\")\n", + "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n", + "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n", + "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n", + "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n", 
+ "conf.set(\"spark.jars\", RAPIDS_JAR)\n", + "\n", + "# if you pass/unpack the archive file and enable the environment\n", + "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n", + "\n", + "# Create spark session\n", + "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", + "reader = spark.read" ] }, { @@ -737,9 +774,7 @@ "spark.conf.set(\"spark.rapids.sql.explain\", \"ALL\")\n", "spark.conf.set(\"spark.rapids.sql.batchSizeBytes\", \"512M\")\n", "spark.conf.set(\"spark.rapids.sql.reader.batchSizeBytes\", \"768M\")\n", - "spark.conf.set(\"spark.rapids.sql.hasNans\", \"false\")\n", - "# use GPU to read CSV\n", - "spark.conf.set(\"spark.rapids.sql.csv.read.double.enabled\", \"true\")" + "spark.conf.set(\"spark.rapids.sql.hasNans\", \"false\")" ] }, { @@ -805,7 +840,7 @@ "metadata": {}, "outputs": [], "source": [ - "from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier\n", + "from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel\n", "from pyspark.ml.evaluation import MulticlassClassificationEvaluator" ] }, @@ -893,16 +928,17 @@ "metadata": {}, "outputs": [], "source": [ - "# This sample uses 2 workers(GPUs) to run XGBoost training \n", + "# This sample uses 1 worker(GPU) to run XGBoost training, you can change according to your GPU resources\n", "params = { \n", - " \"treeMethod\": \"gpu_hist\",\n", - " \"objective\":\"binary:logistic\",\n", - " \"growPolicy\": \"depthwise\",\n", - " \"nthread\": 1,\n", - " \"numRound\": 100,\n", - " \"numWorkers\": 1,\n", + " \"tree_method\": \"gpu_hist\",\n", + " \"grow_policy\": \"depthwise\",\n", + " \"num_workers\": 1,\n", + " \"use_gpu\": \"true\",\n", "}\n", - "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)" + "params['features_col'] = features\n", + "params['label_col'] = label\n", + " \n", + "classifier = SparkXGBClassifier(**params)" ] }, { @@ -934,8 +970,16 @@ "metadata": {}, "outputs": [], "source": [ - "model.write().overwrite().save(output_path_model)\n", - "loaded_model = XGBoostClassificationModel().load(output_path_model)" + "model.write().overwrite().save(output_path_model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_model = SparkXGBClassifierModel().load(output_path_model)" ] }, { diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb index 93dd98866..4551654f5 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb @@ -6,10 +6,10 @@ "source": [ "## Prerequirement\n", "### 1. Download data\n", - "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", + "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. 
Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.12/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", "\n", "### 2. Download needed jars\n", - "* [rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar)\n", + "* [rapids-4-spark_2.12-22.12.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar)\n", "\n", "\n", "### 3. Start Spark Standalone\n", @@ -17,7 +17,7 @@ "\n", "### 4. Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-22.10.0.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-22.12.0.jar\n", "$ export PYSPARK_DRIVER_PYTHON=jupyter \n", "$ export PYSPARK_DRIVER_PYTHON_OPTS=notebook\n", "```\n", diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb index 94a682cef..ea128ef9c 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb @@ -11,13 +11,10 @@ "Here we take the 'Mortgage' application as an example.\n", "\n", "A few libraries are required for this notebook:\n", - " 1. NumPy\n", - " 2. cudf jar\n", - " 2. xgboost4j jar\n", - " 3. xgboost4j-spark jar\n", - " \n", - "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n", - "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)." + " 1. cudf-cu11\n", + " 2. xgboost\n", + " 3. scikit-learn\n", + " 4. numpy" ] }, { @@ -33,21 +30,17 @@ "metadata": {}, "outputs": [], "source": [ - "from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier\n", - "from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator\n", + "from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel\n", + "from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n", "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", - "from pyspark.ml.tuning import ParamGridBuilder\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.types import FloatType, IntegerType, StructField, StructType, DoubleType\n", + "from pyspark.conf import SparkConf\n", "from time import time\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As shown above, here `CrossValidator` is imported from package `ml.dmlc.xgboost4j.scala.spark.rapids`, not the spark's `tuning.CrossValidator`." + "import os\n", + "# if you ship a packed Python environment as an archive, unpack it and enable it here\n", + "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n", + "# os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\"" ] }, { @@ -61,9 +54,62 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-25 09:34:43,524 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel).\n", + "2022-11-25 09:34:43,952 WARN resource.ResourceUtils: The configuration of cores (exec = 4 task = 1, runnable tasks = 4) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n", + "2022-11-25 09:34:58,155 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 22.12.0-SNAPSHOT using cudf 22.12.0-SNAPSHOT.\n", + "2022-11-25 09:34:58,171 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n", + "2022-11-25 09:34:58,175 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n", + "2022-11-25 09:34:58,175 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n" + ] + } + ], "source": [ + "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n", + "\n", + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n", + "\n", + "# Update these values to match your real hardware resources\n", + "driverMem = os.getenv(\"DRIVER_MEM\", \"2g\")\n", + "executorMem = os.getenv(\"EXECUTOR_MEM\", \"2g\")\n", + "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n", + "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n", + "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"4\"))\n", + "# Common spark settings\n", + "conf = SparkConf()\n", + "conf.setMaster(SPARK_MASTER_URL)\n", + "conf.setAppName(\"Microbenchmark on GPU\")\n", + "conf.set(\"spark.driver.memory\", driverMem)\n", + "## The tasks mostly run in GPU memory, so there is no need for a large host memory\n", + "conf.set(\"spark.executor.memory\", executorMem)\n", + "## The tasks run on the GPU, so there is no need for many CPU cores\n", + "conf.set(\"spark.executor.cores\", executorCores)\n", + "\n", + "# Plugin settings\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n", + "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n", + "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n", + "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.7\")\n", + "conf.set(\"spark.locality.wait\",\"0\")\n", + "# note: xgboost.spark only supports spark.task.resource.gpu.amount=1, see https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n", + "conf.set(\"spark.task.resource.gpu.amount\", 1) \n", + "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n", + "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n", + "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n", + "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n", + "# if you ship a packed Python environment as an archive, unpack it and enable it here\n", + "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n", + "# Create spark session\n", + "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", + "\n", + "reader = spark.read" ] }, { @@ -117,8 +163,14 @@ "train_path = dataRoot + \"/mortgage/output/train\"\n", "eval_path = dataRoot + \"/mortgage/output/eval\"\n", "\n", - "train_data = spark.read.parquet(train_path)\n", - "trans_data = spark.read.parquet(eval_path)" + "data_format = 'parquet'\n", + "has_header = 'true'\n",
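+ "# parquet files carry their own schema; the CSV branch below needs the explicit schema and the header option\n",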
== 'csv':\n", + " train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n", + " trans_data = reader.schema(schema).option('header',has_header).csv(eval_path)\n", + "else :\n", + " train_data = reader.load(train_path)\n", + " trans_data = reader.load(eval_path)" ] }, { @@ -134,38 +186,31 @@ "metadata": {}, "outputs": [], "source": [ - "# First build a classifier of GPU version using *setFeaturesCols* to set feature columns\n", "params = { \n", - " 'eta': 0.1,\n", - " 'gamma': 0.1,\n", - " 'missing': 0.0,\n", - " 'treeMethod': 'gpu_hist',\n", - " 'maxDepth': 10, \n", - " 'maxLeaves': 256,\n", - " 'growPolicy': 'depthwise',\n", - " 'objective': 'binary:logistic',\n", - " 'minChildWeight': 30.0,\n", - " 'lambda_': 1.0,\n", - " 'scalePosWeight': 2.0,\n", - " 'subsample': 1.0,\n", - " 'nthread': 1,\n", - " 'numRound': 100,\n", - " 'numWorkers': 1,\n", + " \"tree_method\": \"gpu_hist\",\n", + " \"grow_policy\": \"depthwise\",\n", + " \"num_workers\": 1,\n", + " \"use_gpu\": \"true\",\n", "}\n", - "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)\n", + "\n", + "params['features_col'] = features\n", + "params['label_col'] = label\n", + " \n", + "classifier = SparkXGBClassifier(**params)\n", + "\n", "# Then build the evaluator and the hyperparameters\n", "evaluator = (MulticlassClassificationEvaluator()\n", " .setLabelCol(label))\n", "param_grid = (ParamGridBuilder()\n", - " .addGrid(classifier.maxDepth, [3, 6])\n", - " .addGrid(classifier.numRound, [100, 200])\n", + " .addGrid(classifier.max_depth, [3, 6])\n", + " .addGrid(classifier.n_estimators, [100, 200])\n", " .build())\n", "# Finally the corss validator\n", "cross_validator = (CrossValidator()\n", " .setEstimator(classifier)\n", " .setEvaluator(evaluator)\n", " .setEstimatorParamMaps(param_grid)\n", - " .setNumFolds(3))" + " .setNumFolds(2))" ] }, { @@ -180,11 +225,242 @@ "execution_count": 5, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-25 09:35:01,049 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", + "If features_cols param set, then features_col param is ignored.\n", + "/data/home/yuanli/work/reviews/pr252/pyspark_venv_20221125/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n", + " warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n", + "2022-11-25 09:35:26,758 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#2153, delinquency_12#2255, 1.0#2256, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#2153 could run on GPU\n", + " @Expression delinquency_12#2255 could run on GPU\n", + " @Expression 1.0#2256 could run on GPU\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#2186 cannot run on GPU because expression AttributeReference probability#2186 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#2261 cannot run on GPU because expression AttributeReference obj#2261 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#2186]\n", + " @Expression pythonUDF0#2552.prediction AS prediction#2153 could run on GPU\n", + " @Expression pythonUDF0#2552.prediction could run on GPU\n", + " @Expression pythonUDF0#2552 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#2255 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#2256 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#2552.probability) AS probability#2186 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#2552.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#2552.probability) AS probability#2186 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression UDF(pythonUDF0#2552.probability) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#2552.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n", + " @Expression pythonUDF0#2552.probability could run on GPU\n", + " @Expression pythonUDF0#2552 could run on GPU\n", + "\n", + "If features_cols param set, then features_col param is ignored. \n", + "2022-11-25 09:35:34,074 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#4415, delinquency_12#4517, 1.0#4518, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#4415 could run on GPU\n", + " @Expression delinquency_12#4517 could run on GPU\n", + " @Expression 1.0#4518 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#4448 cannot run on GPU because expression AttributeReference probability#4448 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#4523 cannot run on GPU because expression AttributeReference obj#4523 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#4448]; not all expressions can be replaced\n", + " @Expression pythonUDF0#4814.prediction AS prediction#4415 could run on GPU\n", + " @Expression pythonUDF0#4814.prediction could run on GPU\n", + " @Expression pythonUDF0#4814 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#4517 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#4518 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#4814.probability) AS probability#4448 cannot run on GPU because expression Alias UDF(pythonUDF0#4814.probability) AS probability#4448 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#4814.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#4814.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#4814.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#4814.probability could run on GPU\n", + " @Expression pythonUDF0#4814 could run on GPU\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-25 09:35:37,859 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#6677, delinquency_12#6779, 1.0#6780, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#6677 could run on GPU\n", + " @Expression delinquency_12#6779 could run on GPU\n", + " @Expression 1.0#6780 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#6710 cannot run on GPU because expression AttributeReference probability#6710 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#6785 cannot run on GPU because expression AttributeReference obj#6785 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#6710]; not all expressions can be replaced\n", + " @Expression pythonUDF0#7076.prediction AS prediction#6677 could run on GPU\n", + " @Expression pythonUDF0#7076.prediction could run on GPU\n", + " @Expression pythonUDF0#7076 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#6779 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#6780 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#7076.probability) AS probability#6710 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#7076.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#7076.probability) AS probability#6710 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression UDF(pythonUDF0#7076.probability) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#7076.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n", + " @Expression pythonUDF0#7076.probability could run on GPU\n", + " @Expression pythonUDF0#7076 could run on GPU\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-25 09:35:41,551 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#8939, delinquency_12#9041, 1.0#9042, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#8939 could run on GPU\n", + " @Expression delinquency_12#9041 could run on GPU\n", + " @Expression 1.0#9042 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#8972 cannot run on GPU because expression AttributeReference probability#8972 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#9047 cannot run on GPU because expression AttributeReference obj#9047 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#8972]; not all expressions can be replaced\n", + " @Expression pythonUDF0#9338.prediction AS prediction#8939 could run on GPU\n", + " @Expression pythonUDF0#9338.prediction could run on GPU\n", + " @Expression pythonUDF0#9338 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#9041 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#9042 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#9338.probability) AS probability#8972 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#9338.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#9338.probability) AS probability#8972 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression UDF(pythonUDF0#9338.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#9338.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#9338.probability could run on GPU\n", + " @Expression pythonUDF0#9338 could run on GPU\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-25 09:35:45,231 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#11491, delinquency_12#11593, 1.0#11594, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#11491 could run on GPU\n", + " @Expression delinquency_12#11593 could run on GPU\n", + " @Expression 1.0#11594 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#11524 cannot run on GPU because expression AttributeReference probability#11524 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#11599 cannot run on GPU because expression AttributeReference obj#11599 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#11524]\n", + " @Expression pythonUDF0#11890.prediction AS prediction#11491 could run on GPU\n", + " @Expression pythonUDF0#11890.prediction could run on GPU\n", + " @Expression pythonUDF0#11890 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#11593 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#11594 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#11890.probability) AS probability#11524 cannot run on GPU because expression Alias UDF(pythonUDF0#11890.probability) AS probability#11524 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#11890.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#11890.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#11890.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#11890.probability could run on GPU\n", + " @Expression pythonUDF0#11890 could run on GPU\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-25 09:35:49,003 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#13753, delinquency_12#13855, 1.0#13856, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#13753 could run on GPU\n", + " @Expression delinquency_12#13855 could run on GPU\n", + " @Expression 1.0#13856 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#13786 cannot run on GPU because expression AttributeReference probability#13786 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#13861 cannot run on GPU because expression AttributeReference obj#13861 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#13786]; not all expressions can be replaced\n", + " @Expression pythonUDF0#14152.prediction AS prediction#13753 could run on GPU\n", + " @Expression pythonUDF0#14152.prediction could run on GPU\n", + " @Expression pythonUDF0#14152 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#13855 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#13856 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#14152.probability) AS probability#13786 cannot run on GPU because expression Alias UDF(pythonUDF0#14152.probability) AS probability#13786 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#14152.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#14152.probability) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#14152.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n", + " @Expression pythonUDF0#14152.probability could run on GPU\n", + " @Expression pythonUDF0#14152 could run on GPU\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-25 09:35:52,578 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#16015, delinquency_12#16117, 1.0#16118, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#16015 could run on GPU\n", + " @Expression delinquency_12#16117 could run on GPU\n", + " @Expression 1.0#16118 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#16048 cannot run on GPU because expression AttributeReference probability#16048 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#16123 cannot run on GPU because expression AttributeReference obj#16123 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#16048]; not all expressions can be replaced\n", + " @Expression pythonUDF0#16414.prediction AS prediction#16015 could run on GPU\n", + " @Expression pythonUDF0#16414.prediction could run on GPU\n", + " @Expression pythonUDF0#16414 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#16117 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#16118 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#16414.probability) AS probability#16048 cannot run on GPU because expression Alias UDF(pythonUDF0#16414.probability) AS probability#16048 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#16414.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#16414.probability) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#16414.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n", + " @Expression pythonUDF0#16414.probability could run on GPU\n", + " @Expression pythonUDF0#16414 could run on GPU\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-25 09:35:56,267 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#18277, delinquency_12#18379, 1.0#18380, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#18277 could run on GPU\n", + " @Expression delinquency_12#18379 could run on GPU\n", + " @Expression 1.0#18380 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#18310 cannot run on GPU because expression AttributeReference probability#18310 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#18385 cannot run on GPU because expression AttributeReference obj#18385 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18310]; not all expressions can be replaced\n", + " @Expression pythonUDF0#18676.prediction AS prediction#18277 could run on GPU\n", + " @Expression pythonUDF0#18676.prediction could run on GPU\n", + " @Expression pythonUDF0#18676 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) AS delinquency_12#18379 could run on GPU\n", + " @Expression cast(delinquency_12#27 as double) could run on GPU\n", + " @Expression delinquency_12#27 could run on GPU\n", + " @Expression 1.0 AS 1.0#18380 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression UDF(pythonUDF0#18676.probability) AS probability#18310 cannot run on GPU because expression Alias UDF(pythonUDF0#18676.probability) AS probability#18310 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#18676.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#18676.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#18676.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#18676.probability could run on GPU\n", + " @Expression pythonUDF0#18676 could run on GPU\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "[Stage 69:> (0 + 1) / 1]\r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Cross-Validation takes 88.53 seconds\n" + "Cross-Validation takes 59.46 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" ] } ], @@ -207,22 +483,126 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-25 09:35:59,886 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#18908, probability#18974]; not all expressions can be replaced\n", + " @Expression orig_channel#56 could run on GPU\n", + " @Expression first_home_buyer#57 could run on GPU\n", + " @Expression loan_purpose#58 could run on GPU\n", + " @Expression property_type#59 could run on GPU\n", + " @Expression occupancy_status#60 could run on GPU\n", + " @Expression property_state#61 could run on GPU\n", + " @Expression product_type#62 could run on GPU\n", + " @Expression relocation_mortgage_indicator#63 could run on GPU\n", + " @Expression seller_name#64 could run on GPU\n", + " @Expression mod_flag#65 could run on GPU\n", + " @Expression orig_interest_rate#66 could run on GPU\n", + " 
@Expression orig_upb#67 could run on GPU\n", + " @Expression orig_loan_term#68 could run on GPU\n", + " @Expression orig_ltv#69 could run on GPU\n", + " @Expression orig_cltv#70 could run on GPU\n", + " @Expression num_borrowers#71 could run on GPU\n", + " @Expression dti#72 could run on GPU\n", + " @Expression borrower_credit_score#73 could run on GPU\n", + " @Expression num_units#74 could run on GPU\n", + " @Expression zip#75 could run on GPU\n", + " @Expression mortgage_insurance_percent#76 could run on GPU\n", + " @Expression current_loan_delinquency_status#77 could run on GPU\n", + " @Expression current_actual_upb#78 could run on GPU\n", + " @Expression interest_rate#79 could run on GPU\n", + " @Expression loan_age#80 could run on GPU\n", + " @Expression msa#81 could run on GPU\n", + " @Expression non_interest_bearing_upb#82 could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " !Expression UDF(pythonUDF0#19041.rawPrediction) AS rawPrediction#18908 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#19041.rawPrediction) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#19041.rawPrediction) AS rawPrediction#18908 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression UDF(pythonUDF0#19041.rawPrediction) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#19041.rawPrediction) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#19041.rawPrediction could run on GPU\n", + " @Expression pythonUDF0#19041 could run on GPU\n", + " @Expression pythonUDF0#19041.prediction AS prediction#18942 could run on GPU\n", + " @Expression pythonUDF0#19041.prediction could run on GPU\n", + " @Expression pythonUDF0#19041 could run on GPU\n", + " !Expression UDF(pythonUDF0#19041.probability) AS probability#18974 cannot run on GPU because expression Alias UDF(pythonUDF0#19041.probability) AS probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#19041.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n", + " !Expression UDF(pythonUDF0#19041.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#19041.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#19041.probability could run on GPU\n", + " @Expression pythonUDF0#19041 could run on GPU\n", + "\n", + "2022-11-25 09:35:59,893 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#18908, probability#18974]; not all expressions can be replaced\n", + " @Expression orig_channel#56 could run on GPU\n", + " @Expression first_home_buyer#57 could run on GPU\n", + " @Expression loan_purpose#58 could run on GPU\n", + " @Expression property_type#59 could run on GPU\n", + " @Expression occupancy_status#60 could run on 
GPU\n", + " @Expression property_state#61 could run on GPU\n", + " @Expression product_type#62 could run on GPU\n", + " @Expression relocation_mortgage_indicator#63 could run on GPU\n", + " @Expression seller_name#64 could run on GPU\n", + " @Expression mod_flag#65 could run on GPU\n", + " @Expression orig_interest_rate#66 could run on GPU\n", + " @Expression orig_upb#67 could run on GPU\n", + " @Expression orig_loan_term#68 could run on GPU\n", + " @Expression orig_ltv#69 could run on GPU\n", + " @Expression orig_cltv#70 could run on GPU\n", + " @Expression num_borrowers#71 could run on GPU\n", + " @Expression dti#72 could run on GPU\n", + " @Expression borrower_credit_score#73 could run on GPU\n", + " @Expression num_units#74 could run on GPU\n", + " @Expression zip#75 could run on GPU\n", + " @Expression mortgage_insurance_percent#76 could run on GPU\n", + " @Expression current_loan_delinquency_status#77 could run on GPU\n", + " @Expression current_actual_upb#78 could run on GPU\n", + " @Expression interest_rate#79 could run on GPU\n", + " @Expression loan_age#80 could run on GPU\n", + " @Expression msa#81 could run on GPU\n", + " @Expression non_interest_bearing_upb#82 could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " !Expression rawPrediction#18908 cannot run on GPU because expression AttributeReference rawPrediction#18908 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression prediction#18942 could run on GPU\n", + " !Expression probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n", + "2022-11-25 09:36:00,975 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. 
Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + " !Exec cannot run on GPU because unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974, rawPrediction#18908]; not all expressions can be replaced\n", + " @Expression cast(delinquency_12#83 as string) AS delinquency_12#19670 could run on GPU\n", + " @Expression cast(delinquency_12#83 as string) could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression cast(rawPrediction#18908 as string) AS rawPrediction#19671 could run on GPU\n", + " !Expression cast(rawPrediction#18908 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n", + " !Expression rawPrediction#18908 cannot run on GPU because expression AttributeReference rawPrediction#18908 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression cast(probability#18974 as string) AS probability#19672 could run on GPU\n", + " !Expression cast(probability#18974 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n", + " !Expression probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression cast(prediction#18942 as string) AS prediction#19673 could run on GPU\n", + " @Expression cast(prediction#18942 as string) could run on GPU\n", + " @Expression prediction#18942 could run on GPU\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974, rawPrediction#18908]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression prediction#18942 could run on GPU\n", + " !Expression probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression rawPrediction#18908 cannot run on GPU because expression AttributeReference rawPrediction#18908 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Transforming takes 3.13 seconds\n", + "Transforming takes 1.15 seconds\n", "+--------------+--------------------+--------------------+----------+\n", "|delinquency_12| rawPrediction| probability|prediction|\n", "+--------------+--------------------+--------------------+----------+\n", - "| 0|[2.57163572311401...|[0.92901364713907...| 0.0|\n", - "| 0|[2.63977861404418...|[0.93337820470333...| 0.0|\n", - "| 0|[2.50156974792480...|[0.92425179481506...| 0.0|\n", - "| 0|[2.63977861404418...|[0.93337820470333...| 0.0|\n", - "| 0|[2.09173870086669...|[0.89009761810302...| 0.0|\n", + "| 0|[10.2152490615844...|[0.99996340274810...| 0.0|\n", + "| 0|[8.85215473175048...|[0.99985694885253...| 0.0|\n", + "| 0|[8.85215473175048...|[0.99985694885253...| 0.0|\n", + "| 0|[8.85215473175048...|[0.99985694885253...| 0.0|\n", + "| 0|[10.2152490615844...|[0.99996340274810...| 0.0|\n", "+--------------+--------------------+--------------------+----------+\n", "only showing top 5 rows\n", "\n" @@ -247,15 +627,53 @@ }, { "cell_type": "code", 
- "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-25 09:36:01,155 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#18942, delinquency_12#20148, 1.0#20149, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#18942 could run on GPU\n", + " @Expression delinquency_12#20148 could run on GPU\n", + " @Expression 1.0#20149 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#20154 cannot run on GPU because expression AttributeReference obj#20154 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974]\n", + " @Expression prediction#18942 could run on GPU\n", + " @Expression cast(delinquency_12#83 as double) AS delinquency_12#20148 could run on GPU\n", + " @Expression cast(delinquency_12#83 as double) could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression 1.0 AS 1.0#20149 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974]\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression prediction#18942 could run on GPU\n", + " !Expression probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n", + "[Stage 72:> (0 + 1) / 1]\r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Evaluation takes 0.29 seconds\n", - "Accuracy is 0.9868033296704449\n" + "Evaluation takes 1.41 seconds\n", + "Accuracy is 1.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" ] } ], @@ 
-268,7 +686,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb index e2c64c15e..e103567b4 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb @@ -9,16 +9,12 @@ "The goal of this notebook is to show how to train a XGBoost Model with Spark RAPIDS XGBoost library on GPUs. The dataset used with this notebook is derived from Fannie Mae’s Single-Family Loan Performance Data with all rights reserved by Fannie Mae. This processed dataset is redistributed with permission and consent from Fannie Mae. This notebook uses XGBoost to train 12-month mortgage loan delinquency prediction model .\n", "\n", "A few libraries required for this notebook:\n", - " 1. NumPy\n", - " 2. cudf jar\n", - " 3. xgboost4j jar\n", - " 4. xgboost4j-spark jar\n", - " 5. rapids-4-spark.jar\n", + " 1. cudf-cu11\n", + " 2. xgboost\n", + " 3. scikit-learn\n", + " 4. numpy\n", "\n", - "This notebook also illustrates the ease of porting a sample CPU based Spark xgboost4j code into GPU. There is only one change required for running Spark XGBoost on GPU. That is replacing the API `setFeaturesCol(feature)` on CPU with the new API `setFeaturesCols(features)`. This also eliminates the need for vectorization (assembling multiple feature columns in to one column) since we can read multiple columns.\n", - "\n", - "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n", - "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)." + "This notebook also illustrates the ease of porting a sample CPU-based Spark xgboost4j application to GPU. No code change is required to run Spark XGBoost on GPU, because the CPU and GPU versions call the same API. For a CPU run, the training dataset must be vectorized before fitting the classifier." ] }, { @@ -34,12 +30,24 @@ "metadata": {}, "outputs": [], "source": [ - "from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier\n", + "import os\n", + "\n", + "# if you pass/unpack the archive file and enable the environment\n", + "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel\n", "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.types import FloatType, IntegerType, StructField, StructType, DoubleType\n", - "from time import time\n", - "import os" + "from pyspark.conf import SparkConf\n", + "from time import time" ] }, { @@ -62,11 +70,68 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel).\n", + "22/11/24 06:14:05 WARN org.apache.spark.resource.ResourceUtils: The configuration of cores (exec = 4 task = 1, runnable tasks = 4) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n", + "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker\n", + "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster\n", + "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat\n", + "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator\n", + "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: RAPIDS Accelerator 22.12.0 using cudf 22.12.0.\n", + "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n", + "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n", + "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n" + ] + } + ], "source": [ - "spark = SparkSession.builder.getOrCreate()\n", + "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n", + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n", + "\n", + "# You need to update with your real hardware resource \n", + "driverMem = os.getenv(\"DRIVER_MEM\", \"10g\")\n", + "executorMem = os.getenv(\"EXECUTOR_MEM\", \"10g\")\n", + "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n", + "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n", + "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"4\"))\n", + "\n", + "# Common spark settings\n", + "conf = SparkConf()\n", + "conf.setMaster(SPARK_MASTER_URL)\n", + "conf.setAppName(\"Microbenchmark on GPU\")\n", + "conf.set(\"spark.driver.memory\", driverMem)\n", + "## The tasks will run on GPU memory, so there is no need to set a high host memory\n", + "conf.set(\"spark.executor.memory\", executorMem)\n", + "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n", + "conf.set(\"spark.executor.cores\", executorCores)\n", + "\n", + "# Plugin settings\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n", + "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n", + "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n", + "##############note: only support value=1 see https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n", + "conf.set(\"spark.task.resource.gpu.amount\", 1) \n", + "# since pyspark and xgboost share the same GPU, we need to allocate some memory to xgboost to avoid GPU OOM while training \n", + "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.6\")\n", + "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n", + "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n", + "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n", + "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.jars\", RAPIDS_JAR)\n", + "\n", + "# if you pass/unpack the archive file and enable the environment\n", + "# 
conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n", + "\n", + "# Create spark session\n", + "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", "reader = spark.read" ] }, @@ -79,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -121,8 +186,15 @@ "train_path = dataRoot + \"/mortgage/output/train\"\n", "eval_path = dataRoot + \"/mortgage/output/eval\"\n", "\n", - "train_data = reader.parquet(train_path)\n", - "trans_data = reader.parquet(eval_path)" + "data_format = 'parquet'\n", + "has_header = 'true'\n", + "if data_format == 'csv':\n", + " train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n", + " trans_data = reader.schema(schema).option('header',has_header).csv(eval_path)\n", + "else :\n", + " train_data = reader.load(train_path)\n", + " trans_data = reader.load(eval_path)\n", + " " ] }, { @@ -154,42 +226,39 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "params = { \n", - " 'eta': 0.1,\n", - " 'gamma': 0.1,\n", - " 'missing': 0.0,\n", - " 'treeMethod': 'gpu_hist',\n", - " 'maxDepth': 10, \n", - " 'maxLeaves': 256,\n", - " 'objective':'binary:logistic',\n", - " 'growPolicy': 'depthwise',\n", - " 'minChildWeight': 30.0,\n", - " 'lambda_': 1.0,\n", - " 'scalePosWeight': 2.0,\n", - " 'subsample': 1.0,\n", - " 'nthread': 1,\n", - " 'numRound': 100,\n", - " 'numWorkers': 1,\n", + " \"tree_method\": \"gpu_hist\",\n", + " \"grow_policy\": \"depthwise\",\n", + " \"num_workers\": 1,\n", + " \"use_gpu\": \"true\",\n", "}\n", - "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)" + "params['features_col'] = features\n", + "params['label_col'] = label\n", + " \n", + "classifier = SparkXGBClassifier(**params)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The CPU version classifier provides the API `setFeaturesCol` which only accepts a single column name, so vectorization for multiple feature columns is required.\n", - "```Python\n", - "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCol('features')\n", - "```\n", - "\n", "The parameter `num_workers` should be set to the number of GPUs in Spark cluster for GPU version, while for CPU version it is usually equal to the number of the CPU cores.\n", "\n", - "Concerning the tree method, GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training." + "Concerning the tree method, GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training.\n", + "\n", + "An example of CPU classifier:\n", + "```\n", + "classifier = SparkXGBClassifier(\n", + " feature_col=features,\n", + " label_col=label, \n", + " num_workers=1024,\n", + " use_gpu=False,\n", + ")\n", + "```" ] }, { @@ -201,14 +270,42 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "If features_cols param set, then features_col param is ignored.\n", + "22/11/24 06:14:44 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. 
This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", + "[Stage 12:> (0 + 1) / 1]\r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Training takes 25.67 seconds\n" + "[06:15:10] WARNING: ../src/learner.cc:553: \n", + " If you are loading a serialized model (like pickle in Python, RDS in R) generated by\n", + " older XGBoost, please export the model by calling `Booster.save_model` from that version\n", + " first, then load it back in current version. See:\n", + "\n", + " https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html\n", + "\n", + " for more details about differences between saving model and serializing.\n", + "\n", + "Training takes 28.6 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r", + "/home/yuali_nvidia_com/.local/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n", + " warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n" ] } ], @@ -231,12 +328,29 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "If features_cols param set, then features_col param is ignored.\n", + " \r" + ] + } + ], + "source": [ + "model.write().overwrite().save(dataRoot + '/model/mortgage')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "model.write().overwrite().save(dataRoot + '/mortgage/model')\n", - "loaded_model = XGBoostClassificationModel().load(dataRoot + '/mortgage/model')" + "loaded_model = SparkXGBClassifierModel().load(dataRoot + '/model/mortgage')" ] }, { @@ -248,22 +362,126 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22/11/24 06:15:13 WARN com.nvidia.spark.rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#209, probability#275]\n", + " @Expression orig_channel#56 could run on GPU\n", + " @Expression first_home_buyer#57 could run on GPU\n", + " @Expression loan_purpose#58 could run on GPU\n", + " @Expression property_type#59 could run on GPU\n", + " @Expression occupancy_status#60 could run on GPU\n", + " @Expression property_state#61 could run on GPU\n", + " @Expression product_type#62 could run on GPU\n", + " @Expression relocation_mortgage_indicator#63 could run on GPU\n", + " @Expression seller_name#64 could run on GPU\n", + " @Expression mod_flag#65 could run on GPU\n", + " @Expression orig_interest_rate#66 could run on GPU\n", + " @Expression orig_upb#67 could run on GPU\n", + " @Expression orig_loan_term#68 could run on GPU\n", + " @Expression orig_ltv#69 could run on GPU\n", + " @Expression orig_cltv#70 could run on GPU\n", + " @Expression num_borrowers#71 could run on GPU\n", + " @Expression dti#72 could run on GPU\n", + " @Expression borrower_credit_score#73 could run on GPU\n", + " @Expression num_units#74 could run on GPU\n", + " @Expression zip#75 could run on GPU\n", + " @Expression mortgage_insurance_percent#76 could run on GPU\n", + " @Expression current_loan_delinquency_status#77 could run on GPU\n", + " @Expression current_actual_upb#78 could run on GPU\n", + " @Expression interest_rate#79 could run on GPU\n", + " 
@Expression loan_age#80 could run on GPU\n", + " @Expression msa#81 could run on GPU\n", + " @Expression non_interest_bearing_upb#82 could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " !Expression UDF(pythonUDF0#342.rawPrediction) AS rawPrediction#209 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#342.rawPrediction) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#342.rawPrediction) AS rawPrediction#209 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression UDF(pythonUDF0#342.rawPrediction) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#342.rawPrediction) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3898/645590696 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n", + " @Expression pythonUDF0#342.rawPrediction could run on GPU\n", + " @Expression pythonUDF0#342 could run on GPU\n", + " @Expression pythonUDF0#342.prediction AS prediction#243 could run on GPU\n", + " @Expression pythonUDF0#342.prediction could run on GPU\n", + " @Expression pythonUDF0#342 could run on GPU\n", + " !Expression UDF(pythonUDF0#342.probability) AS probability#275 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#342.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#342.probability) AS probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression UDF(pythonUDF0#342.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3898/645590696 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#342.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression pythonUDF0#342.probability could run on GPU\n", + " @Expression pythonUDF0#342 could run on GPU\n", + "\n", + "22/11/24 06:15:13 WARN com.nvidia.spark.rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#209, probability#275]\n", + " @Expression orig_channel#56 could run on GPU\n", + " @Expression first_home_buyer#57 could run on GPU\n", + " @Expression loan_purpose#58 could run on GPU\n", + " @Expression property_type#59 could run on GPU\n", + " @Expression occupancy_status#60 could run on GPU\n", + " @Expression property_state#61 could run on GPU\n", + " @Expression product_type#62 could run on GPU\n", + " @Expression relocation_mortgage_indicator#63 could run on GPU\n", + " @Expression seller_name#64 could run on GPU\n", + " @Expression mod_flag#65 could run on GPU\n", + " @Expression orig_interest_rate#66 could run on GPU\n", + " @Expression orig_upb#67 could run on GPU\n", + " @Expression orig_loan_term#68 could run on GPU\n", + " @Expression orig_ltv#69 could run on GPU\n", + " @Expression orig_cltv#70 could run on GPU\n", + " @Expression num_borrowers#71 could run on GPU\n", + " @Expression dti#72 could run on GPU\n", + " @Expression borrower_credit_score#73 could run on GPU\n", + " @Expression num_units#74 could run on 
GPU\n", + " @Expression zip#75 could run on GPU\n", + " @Expression mortgage_insurance_percent#76 could run on GPU\n", + " @Expression current_loan_delinquency_status#77 could run on GPU\n", + " @Expression current_actual_upb#78 could run on GPU\n", + " @Expression interest_rate#79 could run on GPU\n", + " @Expression loan_age#80 could run on GPU\n", + " @Expression msa#81 could run on GPU\n", + " @Expression non_interest_bearing_upb#82 could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " !Expression rawPrediction#209 cannot run on GPU because expression AttributeReference rawPrediction#209 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression prediction#243 could run on GPU\n", + " !Expression probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n", + "22/11/24 06:15:28 WARN com.nvidia.spark.rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + " !Exec cannot run on GPU because not all expressions can be replaced; unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275, rawPrediction#209]\n", + " @Expression cast(delinquency_12#83 as string) AS delinquency_12#971 could run on GPU\n", + " @Expression cast(delinquency_12#83 as string) could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression cast(rawPrediction#209 as string) AS rawPrediction#972 could run on GPU\n", + " !Expression cast(rawPrediction#209 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n", + " !Expression rawPrediction#209 cannot run on GPU because expression AttributeReference rawPrediction#209 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression cast(probability#275 as string) AS probability#973 could run on GPU\n", + " !Expression cast(probability#275 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n", + " !Expression probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " @Expression cast(prediction#243 as string) AS prediction#974 could run on GPU\n", + " @Expression cast(prediction#243 as string) could run on GPU\n", + " @Expression prediction#243 could run on GPU\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275, rawPrediction#209]; not all expressions can be replaced\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression prediction#243 could run on GPU\n", + " !Expression probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression rawPrediction#209 cannot run on 
GPU because expression AttributeReference rawPrediction#209 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Transformation takes 11.39 seconds\n", + "Transformation takes 15.62 seconds\n", "+--------------+--------------------+--------------------+----------+\n", "|delinquency_12| rawPrediction| probability|prediction|\n", "+--------------+--------------------+--------------------+----------+\n", - "| 0|[7.76566505432128...|[0.99957613222068...| 0.0|\n", - "| 0|[4.50240230560302...|[0.98903913144022...| 0.0|\n", - "| 0|[4.50240230560302...|[0.98903913144022...| 0.0|\n", - "| 0|[4.50240230560302...|[0.98903913144022...| 0.0|\n", - "| 0|[4.50240230560302...|[0.98903913144022...| 0.0|\n", + "| 0|[8.84631538391113...|[0.99985611438751...| 0.0|\n", + "| 0|[9.41864871978759...|[0.99991881847381...| 0.0|\n", + "| 0|[9.41864871978759...|[0.99991881847381...| 0.0|\n", + "| 0|[9.41864871978759...|[0.99991881847381...| 0.0|\n", + "| 0|[8.84631538391113...|[0.99985611438751...| 0.0|\n", "+--------------+--------------------+--------------------+----------+\n", "only showing top 5 rows\n", "\n" @@ -288,40 +506,83 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def check_classification_accuracy(data_frame, label):\n", + " accuracy = (MulticlassClassificationEvaluator()\n", + " .setLabelCol(label)\n", + " .evaluate(data_frame))\n", + " print('-' * 100)\n", + " print('Accuracy is ' + str(accuracy))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22/11/24 06:15:28 WARN com.nvidia.spark.rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#243, delinquency_12#1450, 1.0#1449, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#243 could run on GPU\n", + " @Expression delinquency_12#1450 could run on GPU\n", + " @Expression 1.0#1449 could run on GPU\n", + " ! newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n", + " ! 
newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n", + " !Expression probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Expression obj#1455 cannot run on GPU because expression AttributeReference obj#1455 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275]; unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275]; not all expressions can be replaced\n", + " @Expression prediction#243 could run on GPU\n", + " @Expression cast(delinquency_12#83 as double) AS delinquency_12#1450 could run on GPU\n", + " @Expression cast(delinquency_12#83 as double) could run on GPU\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression 1.0 AS 1.0#1449 could run on GPU\n", + " @Expression 1.0 could run on GPU\n", + " !Expression probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + " !Exec cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275]; not all expressions can be replaced\n", + " @Expression delinquency_12#83 could run on GPU\n", + " @Expression prediction#243 could run on GPU\n", + " !Expression probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n", + "\n", + "[Stage 19:> (0 + 1) / 1]\r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Evaluation takes 1.03 seconds\n", - "Accuracy is 0.9876786703104035\n" + "----------------------------------------------------------------------------------------------------\n", + "Accuracy is 1.0\n", + "Evaluation takes 2.29 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" ] } ], "source": [ - "accuracy = with_benchmark(\n", - " 'Evaluation',\n", - " lambda: MulticlassClassificationEvaluator().setLabelCol(label).evaluate(result))\n", - "print('Accuracy is ' + str(accuracy))" + "with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label))" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "spark.stop()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb index 794a0fa36..fb4128d35 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb @@ -16,18 +16,18 @@ "source": [ "## Prerequirement\n", "### 1. 
Download data\n", - "\n", - "Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", + "\n", + "Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.12/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", "\n", "### 2. Download needed jars\n", - "* [rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar)\n", + "* [rapids-4-spark_2.12-22.12.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar)\n", "\n", "### 3. Start Spark Standalone\n", "Before Running the script, please setup Spark standalone mode\n", "\n", "### 4. Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-22.10.0.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-22.12.0.jar\n", "\n", "```\n", "\n", diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py index 1cca6e6d8..eefa7358c 100644 --- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py +++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -278,19 +278,3 @@ 'non_interest_bearing_upb', 'delinquency_12', ] - -default_params = { - 'eta': 0.1, - 'gamma': 0.1, - 'missing': 0.0, - 'maxDepth': 10, - 'maxLeaves': 256, - 'growPolicy': 'depthwise', - 'minChildWeight': 30.0, - 'lambda_': 1.0, - 'scalePosWeight': 2.0, - 'subsample': 1.0, - 'nthread': 1, - 'numRound': 100, - 'numWorkers': 1, -} diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cpu_cross_validator_main.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cpu_cross_validator_main.py deleted file mode 100644 index f5924957e..000000000 --- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cpu_cross_validator_main.py +++ /dev/null @@ -1,77 +0,0 @@ -# -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from com.nvidia.spark.examples.mortgage.consts import * -from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator -from pyspark.ml.tuning import ParamGridBuilder -from pyspark.sql import SparkSession - -def main(args, xgboost_args): - spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) - - train_data, eval_data, trans_data = valid_input_data(spark, args, '', schema) - features = [x.name for x in schema if x.name != label] - - if args.mode in [ 'all', 'train' ]: - classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCol('features')) - evaluator = (MulticlassClassificationEvaluator() - .setLabelCol(label)) - param_grid = (ParamGridBuilder() - .addGrid(classifier.maxDepth, [5, 10]) - .addGrid(classifier.numRound, [100, 200]) - .build()) - cross_validator = (CrossValidator() - .setEstimator(classifier) - .setEvaluator(evaluator) - .setEstimatorParamMaps(param_grid) - .setNumFolds(3)) - if not train_data: - print('-' * 80) - print('Usage: training data path required when mode is all or train') - exit(1) - - train_data = vectorize_data_frame(train_data, label) - model = with_benchmark('Training', lambda: cross_validator.fit(train_data)) - # get the best model to do transform - model = model.bestModel - if args.modelPath: - writer = model.write().overwrite() if args.overwrite else model - writer.save(args.modelPath) - else: - model = XGBoostClassificationModel().load(args.modelPath) - - if args.mode in [ 'all', 'transform' ]: - def transform(): - vec_df = vectorize_data_frame(trans_data, label) - result = model.transform(vec_df).cache() - result.foreachPartition(lambda _: None) - return result - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) - result = with_benchmark('Transformation', transform) - show_sample(args, result, label) - with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) - - spark.stop() diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cpu_main.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cpu_main.py deleted file mode 100644 index e2edd80e0..000000000 --- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cpu_main.py +++ /dev/null @@ -1,64 +0,0 @@ -# -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from com.nvidia.spark.examples.mortgage.consts import * -from com.nvidia.spark.examples.mortgage.etl import etl -from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from pyspark.sql import SparkSession - -def main(args, xgboost_args): - spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) - - train_data, eval_data, trans_data = valid_input_data(spark, args, '', schema) - - if args.mode in [ 'all', 'train' ]: - classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCol('features')) - if eval_data: - eval_data = vectorize_data_frame(eval_data, label) - classifier.setEvalSets({ 'test': eval_data }) - if not train_data: - print('-' * 80) - print('Usage: training data path required when mode is all or train') - exit(1) - train_data = vectorize_data_frame(train_data, label) - model = with_benchmark('Training', lambda: classifier.fit(train_data)) - - if args.modelPath: - writer = model.write().overwrite() if args.overwrite else model - writer.save(args.modelPath) - else: - model = XGBoostClassificationModel().load(args.modelPath) - - if args.mode in [ 'all', 'transform' ]: - def transform(): - vec_df = vectorize_data_frame(trans_data, label) - result = model.transform(vec_df).cache() - result.foreachPartition(lambda _: None) - return result - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) - result = with_benchmark('Transformation', transform) - show_sample(args, result, label) - with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) - - spark.stop() diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/gpu_cross_validator_main.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cross_validator_main.py similarity index 68% rename from examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/gpu_cross_validator_main.py rename to examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cross_validator_main.py index c717d3d59..b6305a893 100644 --- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/gpu_cross_validator_main.py +++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/cross_validator_main.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,32 +13,41 @@ # See the License for the specific language governing permissions and # limitations under the License. 
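The rewritten entry points below call a `transform_data(df, label, use_gpu)` helper from the shared utility module; its body is not part of this diff. A rough sketch of what such a helper could look like, assuming it assembles a vector column for CPU runs and passes the raw feature columns through for GPU runs:

```python
from pyspark.ml.feature import VectorAssembler

def transform_data(df, label, use_gpu):
    # Hypothetical reimplementation; the real helper lives in
    # com.nvidia.spark.examples.utility.utils and may differ.
    features = [c for c in df.columns if c != label]
    if use_gpu:
        # GPU estimators accept the feature columns directly
        return df, features
    # CPU path: assemble the features into a single vector column
    df = VectorAssembler(inputCols=features, outputCol='features').transform(df)
    return df.select('features', label), 'features'
```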
# -from pyspark.ml.tuning import ParamGridBuilder +from pyspark.ml.tuning import ParamGridBuilder, CrossValidator -from com.nvidia.spark.examples.mortgage.consts import * +from .consts import * from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator from pyspark.sql import SparkSession +from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel + + def main(args, xgboost_args): spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) + .builder + .appName(args.mainClass) + .getOrCreate()) train_data, eval_data, trans_data = valid_input_data(spark, args, '', schema) - features = [x.name for x in schema if x.name != label] - if args.mode in [ 'all', 'train' ]: - classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCols(features)) + if args.mode in ['all', 'train']: + if train_data is None: + print('-' * 80) + print('Usage: training data path required when mode is all or train') + exit(1) + + train_data, features = transform_data(train_data, label, args.use_gpu) + xgboost_args['features_col'] = features + xgboost_args['label_col'] = label + + classifier = SparkXGBClassifier(**xgboost_args) + evaluator = (MulticlassClassificationEvaluator() .setLabelCol(label)) + param_grid = (ParamGridBuilder() - .addGrid(classifier.maxDepth, [5, 10]) - .addGrid(classifier.numRound, [100, 200]) + .addGrid(classifier.max_depth, [6, 8]) + .addGrid(classifier.n_estimators, [20, 40]) .build()) cross_validator = (CrossValidator() .setEstimator(classifier) @@ -57,17 +66,21 @@ def main(args, xgboost_args): writer = model.write().overwrite() if args.overwrite else model writer.save(args.modelPath) else: - model = XGBoostClassificationModel().load(args.modelPath) + model = SparkXGBClassifierModel.load(args.modelPath) - if args.mode in [ 'all', 'transform' ]: - def transform(): - result = model.transform(trans_data).cache() - result.foreachPartition(lambda _: None) - return result + if args.mode in ['all', 'transform']: if not trans_data: print('-' * 80) print('Usage: trans data path required when mode is all or transform') exit(1) + + trans_data, _ = transform_data(trans_data, label, args.use_gpu) + + def transform(): + result = model.transform(trans_data).cache() + result.foreachPartition(lambda _: None) + return result + result = with_benchmark('Transformation', transform) show_sample(args, result, label) with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py index 47052737c..d59279d67 100644 --- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py +++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
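The cross-validator hunk above also shows the parameter renaming that runs through the whole migration: the camelCase names of the old xgboost4j-spark estimators become the sklearn-style names of `xgboost.spark`. The pairs that actually appear in this diff (illustrative, not exhaustive):

```python
# old xgboost4j-spark name -> new xgboost.spark (sklearn-style) name
param_name_map = {
    'maxDepth':   'max_depth',
    'numRound':   'n_estimators',
    'numWorkers': 'num_workers',
    'treeMethod': 'tree_method',
    'growPolicy': 'grow_policy',
}
```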
 #
-from com.nvidia.spark.examples.mortgage.consts import *
+from .consts import *
 from pyspark.sql.functions import *
 from pyspark.sql.types import *
 from pyspark.sql.window import Window
diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py
index 55f5df5fc..ee09604ba 100644
--- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py
+++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py
@@ -13,16 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from com.nvidia.spark.examples.mortgage.consts import *
-from com.nvidia.spark.examples.mortgage.etl import etl, extract_paths
+from .etl import etl, extract_paths
 from com.nvidia.spark.examples.utility.utils import *
 from pyspark.sql import SparkSession
+
 def main(args, xgboost_args):
     spark = (SparkSession
-        .builder
-        .appName(args.mainClass)
-        .getOrCreate())
+             .builder
+             .appName(args.mainClass)
+             .getOrCreate())
     etled_df = etl(spark, args)
     # there should be exactly one 'out::' output path
     outPath = extract_paths(args.dataPaths, 'out::')[0]
diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/gpu_main.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/main.py
similarity index 69%
rename from examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/gpu_main.py
rename to examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/main.py
index 7a4b2e06f..021887e4f 100644
--- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/gpu_main.py
+++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/main.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,48 +13,58 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
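`etl_main.py` above relies on the convention that every path passed via `dataPaths` is tagged with a role prefix such as `train::`, `trans::`, `raw::`, or `out::`. The real `extract_paths` lives in the utility module; the snippet below is a minimal reimplementation consistent with how it is used here, with hypothetical paths:

```python
def extract_paths(paths, prefix):
    # keep only the paths tagged with the given role prefix, stripping the tag
    return [p[len(prefix):] for p in paths if p.startswith(prefix)]

data_paths = ['train::/data/mortgage/train', 'out::/data/mortgage/out']
assert extract_paths(data_paths, 'out::') == ['/data/mortgage/out']
```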
# -from com.nvidia.spark.examples.mortgage.consts import * -from com.nvidia.spark.examples.mortgage.etl import etl + +from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel + +from .consts import * from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * from pyspark.sql import SparkSession + def main(args, xgboost_args): spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) + .builder + .appName(args.mainClass) + .getOrCreate()) train_data, eval_data, trans_data = valid_input_data(spark, args, '', schema) - features = [x.name for x in schema if x.name != label] - if args.mode in [ 'all', 'train' ]: - classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCols(features)) - if eval_data: - classifier.setEvalSets({ 'test': eval_data }) - if not train_data: + if args.mode in ['all', 'train']: + if train_data is None: print('-' * 80) print('Usage: training data path required when mode is all or train') exit(1) + + train_data, features = transform_data(train_data, label, args.use_gpu) + xgboost_args['features_col'] = features + xgboost_args['label_col'] = label + classifier = SparkXGBClassifier(**xgboost_args) + + if eval_data: + # TODO + pass + model = with_benchmark('Training', lambda: classifier.fit(train_data)) if args.modelPath: writer = model.write().overwrite() if args.overwrite else model writer.save(args.modelPath) else: - model = XGBoostClassificationModel().load(args.modelPath) + model = SparkXGBClassifierModel.load(args.modelPath) + + if args.mode in ['all', 'transform']: + trans_data, _ = transform_data(trans_data, label, args.use_gpu) - if args.mode in [ 'all', 'transform' ]: def transform(): result = model.transform(trans_data).cache() result.foreachPartition(lambda _: None) return result + if not trans_data: print('-' * 80) print('Usage: trans data path required when mode is all or transform') exit(1) + result = with_benchmark('Transformation', transform) show_sample(args, result, label) with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) diff --git a/examples/XGBoost-Examples/pack_pyspark_example.sh b/examples/XGBoost-Examples/pack_pyspark_example.sh new file mode 100755 index 000000000..e446d27da --- /dev/null +++ b/examples/XGBoost-Examples/pack_pyspark_example.sh @@ -0,0 +1,6 @@ +# Follow these steps to package the Python zip file +rm -fr samples.zip +cd agaricus/python ; zip -r ../../samples.zip com ; cd ../.. +cd mortgage/python ; zip -r ../../samples.zip com ; cd ../.. +cd taxi/python ; zip -r ../../samples.zip com ; cd ../.. +cd utility/python ; zip -r ../../samples.zip com ; cd ../.. diff --git a/examples/XGBoost-Examples/pom.xml b/examples/XGBoost-Examples/pom.xml index 9d040878d..d6977f8c5 100644 --- a/examples/XGBoost-Examples/pom.xml +++ b/examples/XGBoost-Examples/pom.xml @@ -38,7 +38,7 @@ UTF-8 - 1.6.1 + 1.7.1 3.1.1 2.12.8 2.12 diff --git a/examples/XGBoost-Examples/taxi/notebooks/python/cv-taxi-gpu.ipynb b/examples/XGBoost-Examples/taxi/notebooks/python/cv-taxi-gpu.ipynb index f7530c133..829d3c541 100644 --- a/examples/XGBoost-Examples/taxi/notebooks/python/cv-taxi-gpu.ipynb +++ b/examples/XGBoost-Examples/taxi/notebooks/python/cv-taxi-gpu.ipynb @@ -11,13 +11,10 @@ "Here takes the application 'Taxi' as an example.\n", "\n", "A few libraries are required for this notebook:\n", - " 1. NumPy\n", - " 2. cudf jar\n", - " 2. xgboost4j jar\n", - " 3. 
xgboost4j-spark jar\n", - " \n", - "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n", - "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)." + " 1. cudf-cu11\n", + " 2. xgboost\n", + " 3. scikit-learn\n", + " 4. numpy" ] }, { @@ -33,21 +30,16 @@ "metadata": {}, "outputs": [], "source": [ - "from ml.dmlc.xgboost4j.scala.spark import XGBoostRegressionModel, XGBoostRegressor\n", - "from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator\n", + "from xgboost.spark import SparkXGBRegressor, SparkXGBRegressorModel\n", + "from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n", "from pyspark.ml.evaluation import RegressionEvaluator\n", - "from pyspark.ml.tuning import ParamGridBuilder\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.types import FloatType, IntegerType, StructField, StructType\n", "from time import time\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As shown above, here `CrossValidator` is imported from package `ml.dmlc.xgboost4j.scala.spark.rapids`, not the spark's `tuning.CrossValidator`." + "from pyspark.conf import SparkConf\n", + "import os\n", + "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n", + "# os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\"" ] }, { @@ -61,9 +53,64 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 08:02:09,748 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "2022-11-30 08:02:10,103 WARN resource.ResourceUtils: The configuration of cores (exec = 2 task = 1, runnable tasks = 2) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n", + "2022-11-30 08:02:23,737 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 22.12.0 using cudf 22.12.0.\n", + "2022-11-30 08:02:23,752 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n", + "2022-11-30 08:02:23,756 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n", + "2022-11-30 08:02:23,757 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. 
Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n",
+      "2022-11-30 08:02:24,226 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n"
+     ]
+    }
+   ],
    "source": [
-    "spark = SparkSession.builder.appName(\"taxi-cv-gpu-python\").getOrCreate()"
+    "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n",
+    "\n",
+    "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n",
+    "\n",
+    "# You need to update these to match your real hardware resources\n",
+    "driverMem = os.getenv(\"DRIVER_MEM\", \"2g\")\n",
+    "executorMem = os.getenv(\"EXECUTOR_MEM\", \"2g\")\n",
+    "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n",
+    "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n",
+    "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"2\"))\n",
+    "# Common spark settings\n",
+    "conf = SparkConf()\n",
+    "conf.setMaster(SPARK_MASTER_URL)\n",
+    "conf.setAppName(\"Microbenchmark on GPU\")\n",
+    "conf.set(\"spark.executor.instances\",\"1\")\n",
+    "conf.set(\"spark.driver.memory\", driverMem)\n",
+    "## The tasks will run on GPU memory, so there is no need to set a high host memory\n",
+    "conf.set(\"spark.executor.memory\", executorMem)\n",
+    "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n",
+    "conf.set(\"spark.executor.cores\", executorCores)\n",
+    "\n",
+    "# Plugin settings\n",
+    "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n",
+    "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n",
+    "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n",
+    "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.7\")\n",
+    "conf.set(\"spark.locality.wait\",\"0\")\n",
+    "# note: only value=1 is supported, see https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n",
+    "conf.set(\"spark.task.resource.gpu.amount\", 1) \n",
+    "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n",
+    "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n",
+    "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n",
+    "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n",
+    "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n",
+    "# if you pass/unpack the archive file and enable the environment\n",
+    "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n",
+    "# Create spark session\n",
+    "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
+    "\n",
+    "reader = spark.read"
   ]
  },
@@ -103,8 +150,17 @@
     "\n",
     "# You need to update them to your real paths!\n",
     "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
-    "train_data = spark.read.parquet(dataRoot + '/taxi/parquet/train')\n",
-    "trans_data = spark.read.parquet(dataRoot + '/taxi/parquet/eval')"
+    "train_path = dataRoot + \"/taxi/csv/train\"\n",
+    "eval_path = dataRoot + \"/taxi/csv/test\"\n",
+    "\n",
+    "data_format = 'csv'\n",
+    "has_header = 'true'\n",
+    "if data_format == 'csv':\n",
+    "    train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n",
+    "    trans_data = reader.schema(schema).option('header',has_header).csv(eval_path)\n",
+    "else:\n",
+    "    train_data = reader.load(train_path)\n",
+    "    trans_data = reader.load(eval_path)"
   ]
  },
@@ -121,29 +177,29 @@
    "outputs": [],
    "source": [
     "# First build a regressor of the GPU version; the feature columns are set through the *features_col* parameter\n",
-    "params = {\n",
-    "    'eta': 0.05,\n",
-    " 
'maxDepth': 8,\n",
-    "    'subsample': 0.8,\n",
-    "    'gamma': 1.0,\n",
-    "    'numRound': 100,\n",
-    "    'numWorkers': 1,\n",
-    "    'treeMethod': 'gpu_hist',\n",
+    "params = { \n",
+    "    \"tree_method\": \"gpu_hist\",\n",
+    "    \"grow_policy\": \"depthwise\",\n",
+    "    \"num_workers\": 1,\n",
+    "    \"use_gpu\": \"true\",\n",
     "}\n",
-    "regressor = XGBoostRegressor(**params).setLabelCol(label).setFeaturesCols(features)\n",
+    "params['features_col'] = features\n",
+    "params['label_col'] = label\n",
+    "\n",
+    "regressor = SparkXGBRegressor(**params)\n",
     "# Then build the evaluator and the hyperparameters\n",
     "evaluator = (RegressionEvaluator()\n",
     "             .setLabelCol(label))\n",
     "param_grid = (ParamGridBuilder()\n",
-    "              .addGrid(regressor.maxDepth, [3, 6])\n",
-    "              .addGrid(regressor.numRound, [100, 200])\n",
+    "              .addGrid(regressor.max_depth, [3, 6])\n",
+    "              .addGrid(regressor.n_estimators, [100, 200])\n",
     "              .build())\n",
     "# Finally the cross validator\n",
     "cross_validator = (CrossValidator()\n",
     "                   .setEstimator(regressor)\n",
     "                   .setEvaluator(evaluator)\n",
     "                   .setEstimatorParamMaps(param_grid)\n",
-    "                   .setNumFolds(3))"
+    "                   .setNumFolds(2))"
   ]
  },
@@ -158,11 +214,108 @@
    "execution_count": 5,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "/data/home/yuanli/work/reviews/pr252/pyspark_venv_20221125/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n",
+      "  warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n",
+      "2022-11-30 08:03:14,308 WARN rapids.GpuOverrides: \n",
+      "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! createexternalrow(prediction#889, fare_amount#890, 1.0#891, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression prediction#889 could run on GPU\n",
+      "    @Expression fare_amount#890 could run on GPU\n",
+      "    @Expression 1.0#891 could run on GPU\n",
+      "    !Expression obj#895 cannot run on GPU because expression AttributeReference obj#895 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n",
+      "2022-11-30 08:03:14,317 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-30 08:03:20,073 WARN rapids.GpuOverrides: \n",
+      "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! 
createexternalrow(prediction#1789, fare_amount#1790, 1.0#1791, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#1789 could run on GPU\n", + " @Expression fare_amount#1790 could run on GPU\n", + " @Expression 1.0#1791 could run on GPU\n", + " !Expression obj#1795 cannot run on GPU because expression AttributeReference obj#1795 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-30 08:03:23,687 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#2689, fare_amount#2690, 1.0#2691, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#2689 could run on GPU\n", + " @Expression fare_amount#2690 could run on GPU\n", + " @Expression 1.0#2691 could run on GPU\n", + " !Expression obj#2695 cannot run on GPU because expression AttributeReference obj#2695 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-30 08:03:27,457 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#3589, fare_amount#3590, 1.0#3591, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#3589 could run on GPU\n", + " @Expression fare_amount#3590 could run on GPU\n", + " @Expression 1.0#3591 could run on GPU\n", + " !Expression obj#3595 cannot run on GPU because expression AttributeReference obj#3595 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-30 08:03:30,964 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! 
createexternalrow(prediction#4659, fare_amount#4660, 1.0#4661, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#4659 could run on GPU\n", + " @Expression fare_amount#4660 could run on GPU\n", + " @Expression 1.0#4661 could run on GPU\n", + " !Expression obj#4665 cannot run on GPU because expression AttributeReference obj#4665 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-30 08:03:34,524 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#5559, fare_amount#5560, 1.0#5561, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#5559 could run on GPU\n", + " @Expression fare_amount#5560 could run on GPU\n", + " @Expression 1.0#5561 could run on GPU\n", + " !Expression obj#5565 cannot run on GPU because expression AttributeReference obj#5565 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-30 08:03:38,067 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! createexternalrow(prediction#6459, fare_amount#6460, 1.0#6461, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#6459 could run on GPU\n", + " @Expression fare_amount#6460 could run on GPU\n", + " @Expression 1.0#6461 could run on GPU\n", + " !Expression obj#6465 cannot run on GPU because expression AttributeReference obj#6465 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n", + "If features_cols param set, then features_col param is ignored.\n", + "2022-11-30 08:03:41,793 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! 
createexternalrow(prediction#7359, fare_amount#7360, 1.0#7361, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#7359 could run on GPU\n", + " @Expression fare_amount#7360 could run on GPU\n", + " @Expression 1.0#7361 could run on GPU\n", + " !Expression obj#7365 cannot run on GPU because expression AttributeReference obj#7365 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "If features_cols param set, then features_col param is ignored.\n", + "[Stage 34:> (0 + 1) / 1]\r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Cross-Validation takes 73.77 seconds\n" + "Cross-Validation takes 55.19 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" ] } ], @@ -192,16 +345,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "Transforming takes 1.33 seconds\n", - "+-----------+-----------------+\n", - "|fare_amount| prediction|\n", - "+-----------+-----------------+\n", - "| 2.5|34.38509750366211|\n", - "| 45.0|37.97528839111328|\n", - "| 2.5|28.55727195739746|\n", - "| 45.0|40.39316177368164|\n", - "| 45.0|36.12188720703125|\n", - "+-----------+-----------------+\n", + "Transforming takes 0.23 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 08:03:45,503 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+-----------+\n", + "|fare_amount| prediction|\n", + "+-----------+-----------+\n", + "| 5.0| 5.01032114|\n", + "| 34.0| 31.134758|\n", + "| 10.0|9.288980484|\n", + "| 16.5|15.33446312|\n", + "| 7.0|8.197098732|\n", + "+-----------+-----------+\n", "only showing top 5 rows\n", "\n" ] @@ -232,8 +401,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Evaluation takes 0.26 seconds\n", - "RMSE is 3.5167114187894883\n" + "Evaluation takes 0.05 seconds\n", + "RMSE is 2.055690464034438\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 08:03:45,728 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! 
createexternalrow(prediction#7645, fare_amount#8271, 1.0#8272, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression prediction#7645 could run on GPU\n",
+      "    @Expression fare_amount#8271 could run on GPU\n",
+      "    @Expression 1.0#8272 could run on GPU\n",
+      "    !Expression obj#8276 cannot run on GPU because expression AttributeReference obj#8276 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n"
+     ]
+    }
+   ],
diff --git a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb
index 54b181513..c41e3dd72 100644
--- a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb
+++ b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb
@@ -19,14 +19,14 @@
     "All data could be found at https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page\n",
     "\n",
     "### 2. Download needed jars\n",
-    "* [rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar)\n",
+    "* [rapids-4-spark_2.12-22.12.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar)\n",
     "\n",
     "### 3. Start Spark Standalone\n",
     "Before running the script, please set up Spark standalone mode\n",
     "\n",
     "### 4. Add ENV\n",
     "```\n",
-    "$ export SPARK_JARS=rapids-4-spark_2.12-22.10.0.jar\n",
+    "$ export SPARK_JARS=rapids-4-spark_2.12-22.12.0.jar\n",
     "$ export PYSPARK_DRIVER_PYTHON=jupyter \n",
     "$ export PYSPARK_DRIVER_PYTHON_OPTS=notebook\n",
     "```\n",
diff --git a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-gpu.ipynb b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-gpu.ipynb
index 3fdfa540a..593d381d2 100644
--- a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-gpu.ipynb
+++ b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-gpu.ipynb
@@ -4,21 +4,17 @@
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "# Introduction to XGBoost Spark3.0 with GPU\n",
+    "# Introduction to XGBoost Spark3.1 with GPU\n",
     "\n",
     "Taxi is an example of an xgboost regressor. This notebook will show you how to load data, train the xgboost model and use this model to predict the \"fare_amount\" of your taxi trip.\n",
     "\n",
     "A few libraries are required for this notebook:\n",
-    " 1. NumPy\n",
-    " 2. cudf jar\n",
-    " 3. xgboost4j jar\n",
-    " 4. xgboost4j-spark jar\n",
-    " 5. rapids-4-spark.jar \n",
+    " 1. cudf-cu11\n",
+    " 2. xgboost\n",
+    " 3. scikit-learn\n",
+    " 4. numpy\n",
     "\n",
-    "This notebook also illustrates the ease of porting a sample CPU based Spark xgboost4j code into GPU. There is only one change required for running Spark XGBoost on GPU. That is replacing the API `setFeaturesCol(feature)` on CPU with the new API `setFeaturesCols(features)`. This also eliminates the need for vectorization (assembling multiple feature columns in to one column) since we can read multiple columns.\n",
-    "\n",
-    "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n",
-    "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)."
+    "This notebook also illustrates the ease of porting a sample CPU based Spark xgboost4j code into GPU. 
There is no change required for running Spark XGBoost on GPU, because the CPU and GPU versions call the same API. For a CPU run, we need to vectorize the training dataset before fitting the regressor."
   ]
  },
  {
@@ -34,12 +30,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from ml.dmlc.xgboost4j.scala.spark import XGBoostRegressionModel, XGBoostRegressor\n",
+    "from xgboost.spark import SparkXGBRegressor, SparkXGBRegressorModel\n",
     "from pyspark.ml.evaluation import RegressionEvaluator\n",
     "from pyspark.sql import SparkSession\n",
     "from pyspark.sql.types import FloatType, IntegerType, StructField, StructType\n",
     "from time import time\n",
-    "import os"
+    "from pyspark.conf import SparkConf\n",
+    "import os\n",
+    "# if you pass/unpack the archive file and enable the environment\n",
+    "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
+    "# os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\""
   ]
  },
@@ -62,11 +62,67 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 07:51:19,104 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
+      "Setting default log level to \"WARN\".\n",
+      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
+      "2022-11-30 07:51:19,480 WARN resource.ResourceUtils: The configuration of cores (exec = 2 task = 1, runnable tasks = 2) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n",
+      "2022-11-30 07:51:33,277 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 22.12.0 using cudf 22.12.0.\n",
+      "2022-11-30 07:51:33,292 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n",
+      "2022-11-30 07:51:33,295 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n",
+      "2022-11-30 07:51:33,295 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. 
Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n", + "2022-11-30 07:51:33,798 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n" + ] + } + ], "source": [ - "spark = SparkSession.builder.getOrCreate()\n", + "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n", + "\n", + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n", + "\n", + "# You need to update with your real hardware resource \n", + "driverMem = os.getenv(\"DRIVER_MEM\", \"2g\")\n", + "executorMem = os.getenv(\"EXECUTOR_MEM\", \"2g\")\n", + "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n", + "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n", + "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"2\"))\n", + "# Common spark settings\n", + "conf = SparkConf()\n", + "conf.setMaster(SPARK_MASTER_URL)\n", + "conf.setAppName(\"Microbenchmark on GPU\")\n", + "conf.set(\"spark.executor.instances\",\"1\")\n", + "conf.set(\"spark.driver.memory\", driverMem)\n", + "## The tasks will run on GPU memory, so there is no need to set a high host memory\n", + "conf.set(\"spark.executor.memory\", executorMem)\n", + "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n", + "conf.set(\"spark.executor.cores\", executorCores)\n", + "\n", + "# Plugin settings\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n", + "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n", + "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n", + "# since pyspark and xgboost share the same GPU, we need to allocate some memory to xgboost to avoid GPU OOM while training \n", + "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.7\")\n", + "conf.set(\"spark.locality.wait\",\"0\")\n", + "##############note: only support value=1 https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n", + "conf.set(\"spark.task.resource.gpu.amount\", 1) \n", + "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n", + "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n", + "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n", + "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n", + "\n", + "# if you pass/unpack the archive file and enable the environment\n", + "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n", + "# Create spark session\n", + "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", + "\n", "reader = spark.read" ] }, @@ -79,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -106,8 +162,17 @@ "\n", "# You need to update them to your real paths!\n", "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n", - "train_data = reader.schema(schema).option('header', True).csv(dataRoot + '/taxi/csv/train')\n", - "trans_data = reader.schema(schema).option('header', True).csv(dataRoot + '/taxi/csv/test')" + "train_path = dataRoot + \"/taxi/csv/train\"\n", + "eval_path = dataRoot + \"/taxi/csv/test\"\n", + "\n", + "data_format = 'csv'\n", + "has_header = 'true'\n", + "if data_format == 'csv':\n", + " train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n", + " trans_data = 
reader.schema(schema).option('header',has_header).csv(eval_path)\n",
+    "else:\n",
+    "    train_data = reader.load(train_path)\n",
+    "    trans_data = reader.load(eval_path)"
   ]
  },
@@ -139,34 +204,39 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = { \n",
-    "    'eta': 0.05,\n",
-    "    'treeMethod': 'gpu_hist',\n",
-    "    'maxDepth': 8,\n",
-    "    'subsample': 0.8,\n",
-    "    'gamma': 1.0,\n",
-    "    'numRound': 100,\n",
-    "    'numWorkers': 1,\n",
+    "    \"tree_method\": \"gpu_hist\",\n",
+    "    \"grow_policy\": \"depthwise\",\n",
+    "    \"num_workers\": 1,\n",
+    "    \"use_gpu\": \"true\",\n",
    "}\n",
-    "regressor = XGBoostRegressor(**params).setLabelCol(label).setFeaturesCols(features)"
+    "params['features_col'] = features\n",
+    "params['label_col'] = label\n",
+    "    \n",
+    "regressor = SparkXGBRegressor(**params)"
   ]
  },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "The CPU version regressor provides the API `setFeaturesCol` which only accepts a single column name, so vectorization for multiple feature columns is required.\n",
-    "```Python\n",
-    "regressor = XGBoostRegressor(**params).setLabelCol(label).setFeaturesCol('features')\n",
-    "```\n",
-    "\n",
    "The parameter `num_workers` should be set to the number of GPUs in the Spark cluster for the GPU version, while for the CPU version it is usually equal to the number of CPU cores.\n",
    "\n",
-    "Concerning the tree method, GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training.\n"
+    "Concerning the tree method, the GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training.\n",
+    "\n",
+    "An example of a CPU classifier:\n",
+    "```\n",
+    "classifier = SparkXGBClassifier(\n",
+    "  features_col=features,\n",
+    "  label_col=label, \n",
+    "  num_workers=1024,\n",
+    "  use_gpu=False,\n",
+    ")\n",
+    "```"
   ]
  },
@@ -178,16 +248,34 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "[Stage 2:>                                                          (0 + 1) / 1]\r"
+     ]
+    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Training takes 17.73 seconds\n"
+      "Training takes 24.08 seconds\n"
     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\r",
+      "                                                                                \r",
+      "/data/home/yuanli/work/reviews/pr252/pyspark_venv_20221125/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n",
+      "  warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n"
     ]
    }
   ],
@@ -210,12 +298,28 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n"
+     ]
+    }
+   ],
+   "source": [
+    "model.write().overwrite().save(dataRoot + '/model/taxi')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
-    "model.write().overwrite().save(dataRoot + '/new-model-path')\n",
-    "loaded_model = XGBoostRegressionModel().load(dataRoot + '/new-model-path')"
+    "loaded_model = SparkXGBRegressorModel().load(dataRoot + '/model/taxi')"
   ]
  },
@@ -227,25 +331,48 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
   "metadata": {
    "scrolled": false
   },
  "outputs": [
{ + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 07:52:27,357 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transformation takes 0.93 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 07:52:28,189 WARN rapids.GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + "\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Transformation takes 2.55 seconds\n", - "+------------+---------------+-------------+-----------+------------------+\n", - "| vendor_id|passenger_count|trip_distance|fare_amount| prediction|\n", - "+------------+---------------+-------------+-----------+------------------+\n", - "|1.55973043E9| 1.0| 1.1| 6.2| 5.670516490936279|\n", - "|1.55973043E9| 4.0| 2.7| 9.4|10.054250717163086|\n", - "|1.55973043E9| 1.0| 1.5| 6.1| 7.01417350769043|\n", - "|1.55973043E9| 1.0| 4.1| 12.6|14.309316635131836|\n", - "|1.55973043E9| 1.0| 4.6| 13.4|13.990922927856445|\n", - "+------------+---------------+-------------+-----------+------------------+\n", + "+--------------+---------------+-------------+-----------+-----------+\n", + "| vendor_id|passenger_count|trip_distance|fare_amount| prediction|\n", + "+--------------+---------------+-------------+-----------+-----------+\n", + "|1.559730432E09| 2.0| 0.699999988| 5.0|5.046935558|\n", + "|1.559730432E09| 3.0| 10.69999981| 34.0|31.72706413|\n", + "|1.559730432E09| 1.0| 2.299999952| 10.0|9.294451714|\n", + "|1.559730432E09| 1.0| 4.400000095| 16.5|15.05233097|\n", + "|1.559730432E09| 1.0| 1.5| 7.0|8.995832443|\n", + "+--------------+---------------+-------------+-----------+-----------+\n", "only showing top 5 rows\n", "\n" ] @@ -276,7 +403,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": { "scrolled": true }, @@ -285,8 +412,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Evaluation takes 0.45 seconds\n", - "RMSE is 3.3195416959403032\n" + "Evaluation takes 0.22 seconds\n", + "RMSE is 1.9141528471228921\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-11-30 07:52:28,580 WARN rapids.GpuOverrides: \n", + "! cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n", + " ! 
createexternalrow(prediction#87, fare_amount#728, 1.0#729, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n", + " @Expression prediction#87 could run on GPU\n", + " @Expression fare_amount#728 could run on GPU\n", + " @Expression 1.0#729 could run on GPU\n", + " !Expression obj#733 cannot run on GPU because expression AttributeReference obj#733 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n", + "\n" ] } ], @@ -306,7 +447,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ diff --git a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb index 9b1d891ce..485518326 100644 --- a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb +++ b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb @@ -19,14 +19,14 @@ "All data could be found at https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page\n", "\n", "### 2. Download needed jar\n", - "* [rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar)\n", + "* [rapids-4-spark_2.12-22.12.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.12.0/rapids-4-spark_2.12-22.12.0.jar)\n", "\n", "### 3. Start Spark Standalone\n", "Before running the script, please setup Spark standalone mode\n", "\n", "### 4. Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-22.10.0.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-22.12.0.jar\n", "\n", "```\n", "\n", diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/consts.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/consts.py index 90915619a..578d23183 100644 --- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/consts.py +++ b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/consts.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -57,12 +57,3 @@ StructField('day_of_week', FloatType()), StructField('is_weekend', FloatType()), ]) - -default_params = { - 'eta': 0.05, - 'maxDepth': 8, - 'subsample': 0.8, - 'gamma': 1.0, - 'numRound': 100, - 'numWorkers': 1, -} diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cpu_cross_validator_main.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cpu_cross_validator_main.py deleted file mode 100644 index 69ffc53bb..000000000 --- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cpu_cross_validator_main.py +++ /dev/null @@ -1,76 +0,0 @@ -# -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
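The `RMSE is ...` lines in the taxi notebook output above come from a standard Spark ML evaluation step. A sketch of how such a figure is produced, assuming `predictions` is the DataFrame returned by `model.transform(trans_data)`:

```python
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol='fare_amount',   # the taxi label column
                                predictionCol='prediction',
                                metricName='rmse')
rmse = evaluator.evaluate(predictions)  # predictions: output of model.transform()
print('RMSE is', rmse)
```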
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from com.nvidia.spark.examples.taxi.consts import * -from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from pyspark.sql import SparkSession -from pyspark.ml.tuning import ParamGridBuilder -from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator - -def main(args, xgboost_args): - spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) - - train_data, eval_data, trans_data = valid_input_data(spark, args, raw_schema, final_schema) - - if args.mode in [ 'all', 'train' ]: - regressor = (XGBoostRegressor(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCol('features')) - param_grid = (ParamGridBuilder() - .addGrid(regressor.maxDepth, [5, 10]) - .addGrid(regressor.numRound, [100, 200]) - .build()) - evaluator = (RegressionEvaluator() - .setLabelCol(label)) - - cross_validator = (CrossValidator() - .setEstimator(regressor) - .setEvaluator(evaluator) - .setEstimatorParamMaps(param_grid) - .setNumFolds(3)) - if not train_data: - print('-' * 80) - print('Usage: training data path required when mode is all or train') - exit(1) - train_data = vectorize_data_frame(train_data, label) - model = with_benchmark('Training', lambda: cross_validator.fit(train_data)) - - # get the best model to do transform - model = model.bestModel - if args.modelPath: - writer = model.write().overwrite() if args.overwrite else model - writer.save(args.modelPath) - else: - model = XGBoostRegressionModel().load(args.modelPath) - - if args.mode in [ 'all', 'transform' ]: - def transform(): - vec_df = vectorize_data_frame(trans_data, label) - result = model.transform(vec_df).cache() - result.foreachPartition(lambda _: None) - return result - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) - result = with_benchmark('Transformation', transform) - show_sample(args, result, label) - with_benchmark('Evaluation', lambda: check_regression_accuracy(result, label)) - - spark.stop() diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cpu_main.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cpu_main.py deleted file mode 100644 index e31241926..000000000 --- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cpu_main.py +++ /dev/null @@ -1,64 +0,0 @@ -# -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from com.nvidia.spark.examples.taxi.consts import * -from com.nvidia.spark.examples.taxi.pre_process import pre_process -from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from pyspark.sql import SparkSession - -def main(args, xgboost_args): - spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) - - train_data, eval_data, trans_data = valid_input_data(spark, args, raw_schema, final_schema) - - if args.mode in [ 'all', 'train' ]: - regressor = (XGBoostRegressor(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCol('features')) - if eval_data: - train_eval_data = vectorize_data_frame(eval_data, label) - regressor.setEvalSets({ 'test': train_eval_data }) - if not train_data: - print('-' * 80) - print('Usage: training data path required when mode is all or train') - exit(1) - train_data = vectorize_data_frame(train_data, label) - model = with_benchmark('Training', lambda: regressor.fit(train_data)) - - if args.modelPath: - writer = model.write().overwrite() if args.overwrite else model - writer.save(args.modelPath) - else: - model = XGBoostRegressionModel().load(args.modelPath) - - if args.mode in [ 'all', 'transform' ]: - def transform(): - vec_df = vectorize_data_frame(trans_data, label) - result = model.transform(vec_df).cache() - result.foreachPartition(lambda _: None) - return result - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) - result = with_benchmark('Transformation', transform) - show_sample(args, result, label) - with_benchmark('Evaluation', lambda: check_regression_accuracy(result, label)) - - spark.stop() diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/gpu_cross_validator_main.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cross_validator_main.py similarity index 70% rename from examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/gpu_cross_validator_main.py rename to examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cross_validator_main.py index b3b34c083..956c8d2ce 100644 --- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/gpu_cross_validator_main.py +++ b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/cross_validator_main.py @@ -13,42 +13,48 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -from com.nvidia.spark.examples.taxi.consts import * +from .consts import * from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * -from pyspark.ml.tuning import ParamGridBuilder -from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator +from pyspark.ml.tuning import ParamGridBuilder, CrossValidator from pyspark.sql import SparkSession +from xgboost.spark import SparkXGBRegressor, SparkXGBRegressorModel + + def main(args, xgboost_args): spark = (SparkSession - .builder - .appName(args.mainClass) - .getOrCreate()) + .builder + .appName(args.mainClass) + .getOrCreate()) train_data, eval_data, trans_data = valid_input_data(spark, args, raw_schema, final_schema) - features = [x.name for x in final_schema if x.name != label] + if args.mode in ['all', 'train']: + if train_data is None: + print('-' * 80) + print('Usage: training data path required when mode is all or train') + print('-' * 80) + exit(1) + + train_data, features = transform_data(train_data, label, args.use_gpu) + xgboost_args['features_col'] = features + xgboost_args['label_col'] = label + + regressor = SparkXGBRegressor(**xgboost_args) - if args.mode in [ 'all', 'train' ]: - regressor = (XGBoostRegressor(**merge_dicts(default_params, xgboost_args)) - .setLabelCol(label) - .setFeaturesCols(features)) param_grid = (ParamGridBuilder() - .addGrid(regressor.maxDepth, [5, 10]) - .addGrid(regressor.numRound, [100, 200]) + .addGrid(regressor.max_depth, [6, 8]) + .addGrid(regressor.n_estimators, [20, 40]) .build()) + evaluator = (RegressionEvaluator() - .setLabelCol(label)) + .setLabelCol(label)) + cross_validator = (CrossValidator() .setEstimator(regressor) .setEvaluator(evaluator) .setEstimatorParamMaps(param_grid) .setNumFolds(3)) - if not train_data: - print('-' * 80) - print('Usage: training data path required when mode is all or train') - exit(1) model = with_benchmark('Training', lambda: cross_validator.fit(train_data)) # get the best model to do transform @@ -57,17 +63,22 @@ def main(args, xgboost_args): writer = model.write().overwrite() if args.overwrite else model writer.save(args.modelPath) else: - model = XGBoostRegressionModel().load(args.modelPath) + model = SparkXGBRegressorModel.load(args.modelPath) + + if args.mode in ['all', 'transform']: + if trans_data is None: + print('-' * 80) + print('Usage: trans data path required when mode is all or transform') + print('-' * 80) + exit(1) + + trans_data, _ = transform_data(trans_data, label, args.use_gpu) - if args.mode in [ 'all', 'transform' ]: def transform(): result = model.transform(trans_data).cache() result.foreachPartition(lambda _: None) return result - if not trans_data: - print('-' * 80) - print('Usage: trans data path required when mode is all or transform') - exit(1) + result = with_benchmark('Transformation', transform) show_sample(args, result, label) with_benchmark('Evaluation', lambda: check_regression_accuracy(result, label)) diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py index e5f409c1c..18d12faf7 100644 --- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py +++ b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
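In `cross_validator_main.py` above, `cross_validator.fit(...)` now returns a standard `pyspark.ml.tuning.CrossValidatorModel`, and the winning estimator is picked off as `bestModel` before saving. A short sketch of that flow, assuming `cross_validator`, `train_data`, and `trans_data` are built as in the file:

```python
cv_model = cross_validator.fit(train_data)
print(cv_model.avgMetrics)         # average metric for each param-grid entry
best_model = cv_model.bestModel    # here a SparkXGBRegressorModel
predictions = best_model.transform(trans_data)
```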
diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py
index e5f409c1c..18d12faf7 100644
--- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py
+++ b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/etl_main.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,17 +13,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from com.nvidia.spark.examples.taxi.consts import *
-from com.nvidia.spark.examples.taxi.pre_process import pre_process
+from .consts import *
+from .pre_process import pre_process
 from com.nvidia.spark.examples.utility.utils import *
-from ml.dmlc.xgboost4j.scala.spark import *
 from pyspark.sql import SparkSession
+
 def main(args, xgboost_args):
     spark = (SparkSession
-        .builder
-        .appName(args.mainClass)
-        .getOrCreate())
+             .builder
+             .appName(args.mainClass)
+             .getOrCreate())
 
     raw_data_path = extract_paths(args.dataPaths, 'raw::')
     output_path = extract_paths(args.dataPaths, 'out::')[0]
     if not raw_data_path:
@@ -36,6 +36,6 @@
         exit(1)
     raw_data = prepare_data(spark, args, raw_schema, raw_data_path)
     etled_train, etled_eval, etled_trans = pre_process(raw_data).randomSplit(list(map(float, args.splitRatios)))
-    etled_train.write.mode("overwrite").parquet(output_path+'/train')
-    etled_eval.write.mode("overwrite").parquet(output_path+'/eval')
-    etled_trans.write.mode("overwrite").parquet(output_path+'/trans')
+    etled_train.write.mode("overwrite").parquet(output_path + '/train')
+    etled_eval.write.mode("overwrite").parquet(output_path + '/eval')
+    etled_trans.write.mode("overwrite").parquet(output_path + '/trans')
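For context on the three-way `randomSplit` above: `args.splitRatios` is derived from the `--dataRatios` pair (see the `_attach_derived_args` change later in this patch). A worked sketch, assuming a hypothetical `80:20` ratio pair and a placeholder DataFrame `raw_df`:

```python
# With dataRatios of 80:20 (an illustrative choice, not a documented default):
train_ratio, eval_ratio = 80, 20
train_eval_ratio = 100 - train_ratio - eval_ratio           # 0
split_ratios = [train_ratio, train_eval_ratio, eval_ratio]  # [80, 0, 20]

# DataFrame.randomSplit normalizes the weights, so [80.0, 0.0, 20.0] yields
# roughly 80% / 0% / 20% of the pre-processed rows, written under
# output_path as /train, /eval, and /trans respectively.
df_train, df_eval, df_trans = raw_df.randomSplit([float(x) for x in split_ratios])
```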
diff --git a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/gpu_main.py b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/main.py
similarity index 70%
rename from examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/gpu_main.py
rename to examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/main.py
index c9316d99e..2281e3e95 100644
--- a/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/gpu_main.py
+++ b/examples/XGBoost-Examples/taxi/python/com/nvidia/spark/examples/taxi/main.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,49 +13,59 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from com.nvidia.spark.examples.taxi.consts import *
-from com.nvidia.spark.examples.taxi.pre_process import pre_process
+from .consts import *
 from com.nvidia.spark.examples.utility.utils import *
-from ml.dmlc.xgboost4j.scala.spark import *
 from pyspark.sql import SparkSession
+from xgboost.spark import SparkXGBRegressor, SparkXGBRegressorModel
+
+
 def main(args, xgboost_args):
     spark = (SparkSession
-        .builder
-        .appName(args.mainClass)
-        .getOrCreate())
+             .builder
+             .appName(args.mainClass)
+             .getOrCreate())
 
     train_data, eval_data, trans_data = valid_input_data(spark, args, raw_schema, final_schema)
-    features = [x.name for x in final_schema if x.name != label]
-
-    if args.mode in [ 'all', 'train' ]:
-        regressor = (XGBoostRegressor(**merge_dicts(default_params, xgboost_args))
-            .setLabelCol(label)
-            .setFeaturesCols(features))
-        if eval_data:
-            regressor.setEvalSets({ 'test': eval_data })
+    if args.mode in ['all', 'train']:
         if not train_data:
             print('-' * 80)
             print('Usage: training data path required when mode is all or train')
+            print('-' * 80)
             exit(1)
+
+        train_data, features = transform_data(train_data, label, args.use_gpu)
+        xgboost_args['features_col'] = features
+        xgboost_args['label_col'] = label
+        regressor = SparkXGBRegressor(**xgboost_args)
+
+        if eval_data:
+            # evaluation sets are not wired up for SparkXGBRegressor in this example
+            pass
+
         model = with_benchmark('Training', lambda: regressor.fit(train_data))
 
         if args.modelPath:
             writer = model.write().overwrite() if args.overwrite else model
             writer.save(args.modelPath)
     else:
-        model = XGBoostRegressionModel().load(args.modelPath)
+        model = SparkXGBRegressorModel.load(args.modelPath)
 
-    if args.mode in [ 'all', 'transform' ]:
-        def transform():
-            result = model.transform(trans_data).cache()
-            result.foreachPartition(lambda _: None)
-            return result
+    if args.mode in ['all', 'transform']:
         if not trans_data:
             print('-' * 80)
             print('Usage: trans data path required when mode is all or transform')
+            print('-' * 80)
             exit(1)
+
+        trans_data, _ = transform_data(trans_data, label, args.use_gpu)
+
+        def transform():
+            result = model.transform(trans_data).cache()
+            result.foreachPartition(lambda _: None)
+            return result
+
        result = with_benchmark('Transformation', transform)
        show_sample(args, result, label)
        with_benchmark('Evaluation', lambda: check_regression_accuracy(result, label))
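The migration above replaces the Scala-backed `XGBoostRegressor` setters with plain constructor parameters on `xgboost.spark.SparkXGBRegressor`, and model loading becomes a classmethod on the model class. A minimal round-trip sketch, assuming a prepared `train_df`/`trans_df` plus the `features` and `label` values produced by `transform_data`; the model path and the `use_gpu`/`num_workers` values are illustrative only:

```python
from xgboost.spark import SparkXGBRegressor, SparkXGBRegressorModel

# features is a list of column names on GPU, or the single 'features'
# vector column on CPU, exactly as returned by transform_data above.
regressor = SparkXGBRegressor(features_col=features, label_col=label,
                              use_gpu=True, num_workers=2)
model = regressor.fit(train_df)

model.write().overwrite().save('/tmp/taxi-model')        # same MLWriter API as before
model = SparkXGBRegressorModel.load('/tmp/taxi-model')   # classmethod: no () after the class
predictions = model.transform(trans_df)
```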
diff --git a/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/main.py b/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/main.py
index a06000d59..d997454bf 100644
--- a/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/main.py
+++ b/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/main.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,9 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from com.nvidia.spark.examples.utility.args import parse_arguments
+from .utility.args import parse_arguments
 from importlib import import_module
 
+
 def main():
     args, xgboost_args = parse_arguments()
     getattr(import_module(args.mainClass), 'main')(args, xgboost_args)
diff --git a/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/args.py b/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/args.py
index fe30ea68f..6318a1c2d 100644
--- a/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/args.py
+++ b/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/args.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,94 +13,33 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import typing
 from argparse import ArgumentParser
 from distutils.util import strtobool
 from re import match
 from sys import exit
 
+
 def _to_bool(literal):
     return bool(strtobool(literal))
 
-def _to_ratio_pair(literal): # e.g., '80:20'
+
+def _to_ratio_pair(literal):  # e.g., '80:20'
     return match(r'^\d+:\d+$', literal) and [int(x) for x in literal.split(':')]
 
+
 MAX_CHUNK_SIZE = 2 ** 31 - 1
 
 _examples = [
-    'com.nvidia.spark.examples.agaricus.cpu_main',
-    'com.nvidia.spark.examples.agaricus.gpu_main',
-    'com.nvidia.spark.examples.mortgage.cpu_main',
-    'com.nvidia.spark.examples.mortgage.gpu_main',
-    'com.nvidia.spark.examples.mortgage.gpu_cross_validator_main',
-    'com.nvidia.spark.examples.mortgage.cpu_cross_validator_main',
-    'com.nvidia.spark.examples.taxi.cpu_main',
-    'com.nvidia.spark.examples.taxi.gpu_main',
-    'com.nvidia.spark.examples.taxi.gpu_cross_validator_main',
-    'com.nvidia.spark.examples.taxi.cpu_cross_validator_main',
+    'com.nvidia.spark.examples.agaricus.main',
+    'com.nvidia.spark.examples.mortgage.main',
     'com.nvidia.spark.examples.mortgage.etl_main',
-    'com.nvidia.spark.examples.taxi.etl_main'
+    'com.nvidia.spark.examples.mortgage.cross_validator_main',
+    'com.nvidia.spark.examples.taxi.main',
+    'com.nvidia.spark.examples.taxi.etl_main',
+    'com.nvidia.spark.examples.taxi.cross_validator_main',
 ]
 
-_xgboost_simple_args = [
-    ('cacheTrainingSet', _to_bool),
-    ('maximizeEvaluationMetrics', _to_bool),
-    ('useExternalMemory', _to_bool),
-    ('checkpointInterval', int),
-    ('maxBins', int),
-    ('maxDepth', int),
-    ('maxLeaves', int),
-    ('nthread', int),
-    ('numClass', int),
-    ('numEarlyStoppingRounds', int),
-    ('numRound', int),
-    ('numWorkers', int),
-    ('seed', int),
-    ('silent', int),
-    ('timeoutRequestWorkers', int),
-    ('treeLimit', int),
-    ('verbosity', int),
-    ('alpha', float),
-    ('baseScore', float),
-    ('colsampleBylevel', float),
-    ('colsampleBytree', float),
-    ('eta', float),
-    ('gamma', float),
-    ('lambda_', float),
-    ('lambdaBias', float),
-    ('maxDeltaStep', float),
-    ('minChildWeight', float),
-    ('missing', float),
-    ('rateDrop', float),
-    ('scalePosWeight', float),
-    ('sketchEps', float),
-    ('skipDrop', float),
-    ('subsample', float),
-    ('trainTestRatio', float),
-    ('baseMarginCol', str),
-    ('checkpointPath', str),
-    ('contribPredictionCol', str),
-    ('evalMetric', str),
-    ('featuresCol', str),
-    ('groupCol', str),
-    ('growPolicy', str),
-    ('interactionConstraints', str),
-    ('labelCol', str),
-    ('leafPredictionCol', str),
-    ('monotoneConstraints', str),
-    ('normalizeType', str),
-    ('objective', str),
-    ('objectiveType', str),
-    ('predictionCol', str),
-    ('probabilityCol', str),
-    ('rawPredictionCol', str),
-    ('sampleType', str),
-    ('treeMethod', str),
-    ('weightCol', str),
-]
-
-_xgboost_array_args = [
-    ('thresholds', float),
-]
 
 def _validate_args(args):
     usage = ''
@@ -119,12 +58,36 @@
         print('Usage:\n' + usage)
         exit(1)
 
+
 def _attach_derived_args(args):
     args.trainRatio = args.dataRatios[0]
     args.evalRatio = args.dataRatios[1]
     args.trainEvalRatio = 100 - args.trainRatio - args.evalRatio
     args.splitRatios = [args.trainRatio, args.trainEvalRatio, args.evalRatio]
 
+
+def _inspect_xgb_parameters() -> typing.Dict[str, type]:
+    """inspect XGBModel parameters from __init__"""
+    from xgboost import XGBModel
+    from typing import get_type_hints, get_origin
+    xgb_parameters = {}
+    xgb_model_sig = get_type_hints(XGBModel.__init__)
+    for k, v in xgb_model_sig.items():
+        if k != "kwargs" and k != "return":
+            if get_origin(v) == typing.Union:
+                xgb_parameters[k] = v.__args__[0]
+            else:
+                xgb_parameters[k] = v
+
+    # some extra parameters used by xgboost pyspark
+    xgb_parameters['objective'] = str
+    xgb_parameters['force_repartition'] = _to_bool
+    xgb_parameters['use_gpu'] = _to_bool
+    xgb_parameters['num_workers'] = int
+    xgb_parameters['enable_sparse_data_optim'] = _to_bool
+    return xgb_parameters
+
+
 def parse_arguments():
     parser = ArgumentParser()
@@ -142,23 +105,18 @@
     parser.add_argument('--numRows', type=int, default=5)
     parser.add_argument('--showFeatures', type=_to_bool, default=True)
 
-    # xgboost simple args
-    for arg, arg_type in _xgboost_simple_args:
-        parser.add_argument('--' + arg, type=arg_type)
-
-    # xgboost array args
-    for arg, arg_type in _xgboost_array_args:
-        parser.add_argument('--' + arg, type=arg_type, action='append')
+    xgboost_all_args = _inspect_xgb_parameters()
+    for arg, tp in xgboost_all_args.items():
+        parser.add_argument('--' + arg, type=tp)
 
     parsed_all = parser.parse_args()
     _validate_args(parsed_all)
     _attach_derived_args(parsed_all)
 
-    xgboost_args = [ arg for (arg, _) in _xgboost_simple_args + _xgboost_array_args ]
     parsed_xgboost = {
         k: v
         for k, v in vars(parsed_all).items()
-        if k in xgboost_args and v is not None
+        if k in xgboost_all_args and v is not None
     }
 
     return parsed_all, parsed_xgboost
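The net effect of `_inspect_xgb_parameters` above is that the hand-maintained flag tables are replaced by reflection over `XGBModel.__init__` type hints, so every XGBoost constructor parameter automatically becomes a CLI flag whose parsed value is forwarded untouched to `SparkXGBRegressor(**xgboost_args)`. A rough self-contained sketch of the reflection step, assuming xgboost >= 1.7 is installed:

```python
import typing
from typing import get_type_hints, get_origin
from xgboost import XGBModel

# Mirror the patch's reflection: Optional[T] hints collapse to T, and the
# 'kwargs'/'return' entries are skipped.
hints = get_type_hints(XGBModel.__init__)
params = {k: (v.__args__[0] if get_origin(v) == typing.Union else v)
          for k, v in hints.items() if k not in ('kwargs', 'return')}
print(params['max_depth'], params['learning_rate'])  # <class 'int'> <class 'float'>
```

Each entry then yields an `argparse` flag, so a command line such as `--max_depth=8 --n_estimators=100 --use_gpu=1` parses into `{'max_depth': 8, 'n_estimators': 100, 'use_gpu': True}`.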
diff --git a/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/utils.py b/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/utils.py
index 1b2818f3d..4b4037869 100644
--- a/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/utils.py
+++ b/examples/XGBoost-Examples/utility/python/com/nvidia/spark/examples/utility/utils.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,33 +13,41 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import typing
+
 from pyspark.ml.evaluation import *
 from pyspark.ml.feature import VectorAssembler
+from pyspark.sql import DataFrame
 from pyspark.sql.functions import col
 from pyspark.sql.types import FloatType
 from com.nvidia.spark.examples.taxi.pre_process import pre_process
 from time import time
 
+
 def merge_dicts(dict_x, dict_y):
     result = dict_x.copy()
     result.update(dict_y)
     return result
 
+
 def show_sample(args, data_frame, label):
     data_frame = data_frame if args.showFeatures else data_frame.select(label, 'prediction')
     data_frame.show(args.numRows)
 
+
 def vectorize_data_frame(data_frame, label):
-    features = [ x.name for x in data_frame.schema if x.name != label ]
-    to_floats = [ col(x.name).cast(FloatType()) for x in data_frame.schema ]
+    features = [x.name for x in data_frame.schema if x.name != label]
+    to_floats = [col(x.name).cast(FloatType()) for x in data_frame.schema]
     return (VectorAssembler()
-        .setInputCols(features)
-        .setOutputCol('features')
-        .transform(data_frame.select(to_floats))
-        .select(col('features'), col(label)))
+            .setInputCols(features)
+            .setOutputCol('features')
+            .transform(data_frame.select(to_floats))
+            .select(col('features'), col(label)))
 
+
 def vectorize_data_frames(data_frames, label):
-    return [ vectorize_data_frame(x, label) for x in data_frames ]
+    return [vectorize_data_frame(x, label) for x in data_frames]
 
+
 def with_benchmark(phrase, action):
     start = time()
@@ -49,33 +57,50 @@
     print('{} takes {} seconds'.format(phrase, round(end - start, 2)))
     return result
 
+
 def check_classification_accuracy(data_frame, label):
     accuracy = (MulticlassClassificationEvaluator()
-        .setLabelCol(label)
-        .evaluate(data_frame))
+                .setLabelCol(label)
+                .evaluate(data_frame))
     print('-' * 100)
     print('Accuracy is ' + str(accuracy))
 
+
 def check_regression_accuracy(data_frame, label):
     accuracy = (RegressionEvaluator()
-        .setLabelCol(label)
-        .evaluate(data_frame))
+                .setLabelCol(label)
+                .evaluate(data_frame))
     print('-' * 100)
     print('RMSE is ' + str(accuracy))
 
+
 def prepare_data(spark, args, schema, dataPath):
     reader = (spark
-        .read
-        .format(args.format))
+              .read
+              .format(args.format))
     if args.format == 'csv':
         reader.schema(schema).option('header', args.hasHeader)
     return reader.load(dataPath)
 
+
 def extract_paths(paths, prefix):
-    results = [ path[len(prefix):] for path in paths if path.startswith(prefix) ]
+    results = [path[len(prefix):] for path in paths if path.startswith(prefix)]
     return results
 
+
+def transform_data(
+    df: DataFrame,
+    label: str,
+    use_gpu: typing.Optional[bool],
+) -> typing.Tuple[DataFrame, typing.Union[str, typing.List[str]]]:
+    if use_gpu:
+        features = [x.name for x in df.schema if x.name != label]
+    else:
+        df = vectorize_data_frame(df, label)
+        features = 'features'
+    return df, features
+
+
 def valid_input_data(spark, args, raw_schema, final_schema):
     e2e = False
     for path in args.dataPaths:
@@ -88,9 +113,9 @@
     eval_path = ''
 
     if e2e:
-        raw_train_path = extract_paths(args.dataPaths,'rawTrain::')
-        raw_eval_path = extract_paths(args.dataPaths,'rawEval::')
-        raw_trans_path = extract_paths(args.dataPaths,'rawTrans::')
+        raw_train_path = extract_paths(args.dataPaths, 'rawTrain::')
+        raw_eval_path = extract_paths(args.dataPaths, 'rawEval::')
+        raw_trans_path = extract_paths(args.dataPaths, 'rawTrans::')
 
         train_data = ''
         eval_data = ''
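The new `transform_data` helper encodes the key CPU/GPU difference in one place: on GPU the raw numeric columns are passed through unchanged and their names are returned (xgboost.spark accepts a list of column names for `features_col`), while on CPU the columns are first cast to float and packed into a single vector column by `VectorAssembler`. A small sketch, assuming a placeholder DataFrame `df` with columns `f0`, `f1`, and `label`:

```python
# GPU path: df is returned unchanged; features is a list of column names.
gpu_df, gpu_features = transform_data(df, 'label', use_gpu=True)
print(gpu_features)   # ['f0', 'f1']

# CPU path: columns are assembled into one vector column;
# features is the name of that single column.
cpu_df, cpu_features = transform_data(df, 'label', use_gpu=False)
print(cpu_features)   # 'features'
```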
diff --git a/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala b/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala
index d8cca3fcd..a77a60add 100644
--- a/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala
+++ b/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala
@@ -64,7 +64,6 @@ object XGBoostArgs {
     "overwrite" -> XGBoostArg(parse = stringToBool, message = booleanMessage),
     "hasHeader" -> XGBoostArg(parse = stringToBool, message = booleanMessage),
     "saveDict" -> XGBoostArg(parse = stringToBool, message = booleanMessage),
-    "rabitTrackerHost" -> XGBoostArg(),
   )
 
   private def help: Unit = {
diff --git a/tools/databricks/README.md b/tools/databricks/README.md
new file mode 100644
index 000000000..467c5d277
--- /dev/null
+++ b/tools/databricks/README.md
@@ -0,0 +1,12 @@
+# Databricks Tools Demo Notebooks
+
+The RAPIDS Accelerator for Apache Spark includes two key tools for understanding the benefits of
+GPU acceleration and for analyzing GPU Spark jobs. For customers on Databricks, the demo
+notebooks offer a simple interface for running the tools against a set of Spark event logs from
+CPU (qualification) or GPU (profiling) application runs.
+
+To use a demo notebook, import it in the Databricks Notebook UI via File->Import Notebook.
+
+Once the demo notebook is imported, attach it to an available compute cluster. Once the notebook
+is attached, enter the log path location in the text widget at the top of the notebook. After
+that, select *Run all* to execute the tools for the specific logs in the log path.
diff --git a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb
new file mode 100644
index 000000000..6d7d66d22
--- /dev/null
+++ b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb
@@ -0,0 +1 @@
+{"cells":[{"cell_type":"markdown","source":["# Welcome to the Profiling Tool for the RAPIDS Accelerator for Apache Spark\nTo run the tool, you need to enter a log path that represents the DBFS location for your Spark GPU event logs. Then you can select \"Run all\" to execute the notebook. 
After the notebook completes, you will see various output tables show up below.\n\n## GPU Job Tuning Recommendations\nThis has general suggestions for tuning your applications to run optimally on GPUs.\n\n## Per-Job Profile\nThe profiler output includes information about the application, data sources, executors, SQL stages, Spark properties, and key application metrics at the job and stage levels."],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"5156a76c-7af7-465d-aff4-41a2e54e3595","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["import json\nimport requests\nimport base64\nimport shlex\nimport subprocess\nimport pandas as pd\n\nTOOL_JAR_URL = 'https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/22.12.0/rapids-4-spark-tools_2.12-22.12.0.jar'\nTOOL_JAR_LOCAL_PATH = '/tmp/rapids-4-spark-tools.jar'\n\n# Profiling tool output directory.\nOUTPUT_DIR = '/tmp' \n\nresponse = requests.get(TOOL_JAR_URL)\nopen(TOOL_JAR_LOCAL_PATH, \"wb\").write(response.content)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"53b4d770-9db6-4bd7-9b93-d036d375eac5","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dbutils.widgets.text(\"log_path\", \"\")"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"f0e4371a-d2d9-4449-81ed-8f6c61ae8f80","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["eventlog_string=dbutils.widgets.get(\"log_path\") \n\nq_command_string=\"java -Xmx10g -cp /tmp/rapids-4-spark-tools.jar:/databricks/jars/* com.nvidia.spark.rapids.tool.profiling.ProfileMain --csv --auto-tuner -o {} \".format(OUTPUT_DIR) + eventlog_string\nargs = shlex.split(q_command_string)\ncmd_out = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)\n\nif cmd_out.returncode != 0:\n dbutils.notebook.exit(\"Profiling Tool failed with stderr:\" + cmd_out.stderr)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"e9e7cecf-c2dc-4a0f-aea1-61a323e4ccc4","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["import os\n\napp_df = pd.DataFrame(columns = ['appId', 'appName'])\n\nfor x in os.scandir(OUTPUT_DIR + \"/rapids_4_spark_profile/\"):\n tmp_df = pd.read_csv(x.path + \"/application_information.csv\")\n app_df = app_df.append(tmp_df[['appId', 'appName']])"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"be0a2da7-1ee3-475e-96f9-303779edfd85","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## GPU Job Tuning Recommendations"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"a1e326ec-5701-4b08-ae0f-7df0c8440038","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["app_list = app_df[\"appId\"].tolist()\napp_recommendations = pd.DataFrame(columns=['app', 'recommendations'])\n\nfor app in app_list:\n app_file = open(OUTPUT_DIR + \"/rapids_4_spark_profile/\" + app + \"/profile.log\")\n recommendations_start = 0\n recommendations_str = \"\"\n for line in app_file:\n if recommendations_start == 1:\n recommendations_str = recommendations_str + line\n if \"### D. 
Recommended Configuration ###\" in line:\n recommendations_start = 1\n app_recommendations = app_recommendations.append({'app': app, 'recommendations': recommendations_str}, ignore_index=True)\n \ndisplay(app_recommendations)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"4979f78c-44a0-4e54-b803-e5e194b71104","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Per-App Profile"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"1d4f9927-e9d8-4897-b604-f7832dc634aa","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["for x in os.scandir(OUTPUT_DIR + \"/rapids_4_spark_profile/\"):\n print(\"APPLICATION ID = \" + str(x))\n log = open(x.path + \"/profile.log\")\n print(log.read())"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"9a8f1a58-e86f-4bd0-a245-878186feb8b9","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template","dashboards":[{"elements":[{"elementNUID":"be0a2da7-1ee3-475e-96f9-303779edfd85","dashboardResultIndex":0,"guid":"05eef9d3-7c55-4e26-8d1f-fa80338359e6","resultIndex":null,"options":null,"position":{"x":0,"y":0,"height":6,"width":24,"z":null},"elementType":"command"}],"guid":"a9ea7799-040a-484e-a59d-c3cdf5072953","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"91c1bfb2-695a-4e5c-8a25-848a433108dc","origId":2690941040041430,"title":"Executive View","width":1600,"globalVars":{}},{"elements":[],"guid":"0896a45f-af1b-4849-b6c2-2b6abcb8b97b","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"62243296-4562-4f06-90ac-d7a609f19c16","origId":2690941040041431,"title":"App View","width":1920,"globalVars":{}}],"notebookMetadata":{"pythonIndentUnit":2,"widgetLayout":[{"name":"log_path","width":576,"breakBefore":false},{"name":"Apps","width":494,"breakBefore":false}]},"language":"python","widgets":{"log_path":{"nuid":"c7ce3870-db19-4813-b1cb-cead3f4c36f1","currentValue":"/dbfs/","widgetInfo":{"widgetType":"text","name":"log_path","defaultValue":"","label":null,"options":{"widgetType":"text","validationRegex":null}}}},"notebookOrigID":2690941040041407}},"nbformat":4,"nbformat_minor":0} diff --git a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb new file mode 100644 index 000000000..db4f756fb --- /dev/null +++ b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","source":["# Welcome to the Qualification Tool for the RAPIDS Accelerator for Apache Spark\nTo run the tool, you need to enter a log path that represents the DBFS location for your Spark CPU event logs. Then you can select \"Run all\" to execute the notebook. After the notebook completes, you will see various output tables show up below.\n\n## Summary Output\nThe report represents the entire app execution, including unsupported operators and non-SQL operations. 
By default, the applications and queries are sorted in descending order by the following fields:\n- Recommendation;\n- Estimated GPU Speed-up;\n- Estimated GPU Time Saved; and\n- End Time.\n\n## Stages Output\nFor each stage used in SQL operations, the Qualification tool generates the following information:\n1. App ID\n1. Stage ID\n1. Average Speedup Factor: the average estimated speed-up of all the operators in the given stage.\n1. Stage Task Duration: amount of time spent in tasks of SQL Dataframe operations for the given stage.\n1. Unsupported Task Duration: sum of task durations for the unsupported operators. For more details, see Supported Operators.\n1. Stage Estimated: True or False indicates if we had to estimate the stage duration.\n\n## Execs Output\nThe Qualification tool generates a report of the “Exec” in the “SparkPlan” or “Executor Nodes” along with the estimated acceleration on the GPU. Please refer to the Supported Operators guide for more details on limitations on UDFs and unsupported operators.\n1. App ID\n1. SQL ID\n1. Exec Name: example Filter, HashAggregate\n1. Expression Name\n1. Task Speedup Factor: it is simply the average acceleration of the operators based on the original CPU duration of the operator divided by the GPU duration. The tool uses historical queries and benchmarks to estimate a speed-up at an individual operator level to calculate how much a specific operator would accelerate on GPU.\n1. Exec Duration: wall-Clock time measured since the operator starts till it is completed.\n1. SQL Node Id\n1. Exec Is Supported: whether the Exec is supported by RAPIDS or not. Please refer to the Supported Operators section.\n1. Exec Stages: an array of stage IDs\n1. Exec Children\n1. Exec Children Node Ids\n1. Exec Should Remove: whether the Op is removed from the migrated plan."],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"df33c614-2ecc-47a0-8600-bc891681997f","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["import json\nimport requests\nimport base64\nimport shlex\nimport subprocess\nimport pandas as pd\n\nTOOL_JAR_URL = 'https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/22.12.0/rapids-4-spark-tools_2.12-22.12.0.jar'\nTOOL_JAR_LOCAL_PATH = '/tmp/rapids-4-spark-tools.jar'\n\n# Qualification tool output directory.\nOUTPUT_DIR = '/tmp/'\n\nresponse = requests.get(TOOL_JAR_URL)\nopen(TOOL_JAR_LOCAL_PATH, \"wb\").write(response.content)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"53b4d770-9db6-4bd7-9b93-d036d375eac5","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dbutils.widgets.text(\"log_path\", \"\")\neventlog_string=dbutils.widgets.get(\"log_path\")\n\nq_command_string=\"java -Xmx10g -cp /tmp/rapids-4-spark-tools.jar:/databricks/jars/* com.nvidia.spark.rapids.tool.qualification.QualificationMain -o {} \".format(OUTPUT_DIR) + eventlog_string\nargs = shlex.split(q_command_string)\ncmd_out = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)\n\n\nif cmd_out.returncode != 0:\n dbutils.notebook.exit(\"Qualification Tool failed with stderr:\" + cmd_out.stderr)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"e9e7cecf-c2dc-4a0f-aea1-61a323e4ccc4","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Summary 
Output"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"bbe50fde-0bd6-4281-95fd-6a1ec6f17ab2","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["summary_output=pd.read_csv(OUTPUT_DIR + \"rapids_4_spark_qualification_output/rapids_4_spark_qualification_output.csv\")\ndisplay(summary_output)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"fb8edb26-e173-47ff-92a1-463baec7c06b","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Stages Output"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"6756159b-30ca-407a-ab6b-9c29ced01ea6","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["stages_output=pd.read_csv(OUTPUT_DIR + \"rapids_4_spark_qualification_output/rapids_4_spark_qualification_output_stages.csv\")\ndisplay(stages_output)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"cdde6177-db5f-434a-995b-776678a64a3a","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Execs Output"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"4d7ce219-ae75-4a0c-a78c-4e7f25b8cd6f","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["execs_output=pd.read_csv(OUTPUT_DIR + \"rapids_4_spark_qualification_output/rapids_4_spark_qualification_output_execs.csv\")\ndisplay(execs_output)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"998b0c51-0cb6-408e-a01a-d1f5b1a61e1f","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template","dashboards":[{"elements":[],"guid":"0ed3c80b-b2f6-4c89-9a92-1af2f168d5ea","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"91c1bfb2-695a-4e5c-8a25-848a433108dc","origId":2721260844584915,"title":"Executive View","width":1600,"globalVars":{}},{"elements":[],"guid":"ab4cecf9-0471-4fee-aa33-8927bb7e1bb1","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"62243296-4562-4f06-90ac-d7a609f19c16","origId":2721260844584916,"title":"App View","width":1920,"globalVars":{}}],"notebookMetadata":{"pythonIndentUnit":2,"widgetLayout":[{"name":"log_path","width":1152,"breakBefore":false}]},"language":"python","widgets":{"log_path":{"nuid":"88986aa6-6e67-4d09-aeeb-7c96ea1ea8f1","currentValue":"/dbfs/","widgetInfo":{"widgetType":"text","name":"log_path","defaultValue":"","label":null,"options":{"widgetType":"text","validationRegex":null}}}},"notebookOrigID":2721260844584890}},"nbformat":4,"nbformat_minor":0}