diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index f89865c3f..b292dca00 100644 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE on: pull_request_target: branches: - - branch-22.06 + - branch-22.08 types: [closed] jobs: @@ -29,13 +29,13 @@ jobs: steps: - uses: actions/checkout@v2 with: - ref: branch-22.06 # force to fetch from latest upstream instead of PR ref + ref: branch-22.08 # force to fetch from latest upstream instead of PR ref - name: auto-merge job uses: ./.github/workflows/auto-merge env: OWNER: NVIDIA REPO_NAME: spark-rapids-examples - HEAD: branch-22.06 - BASE: branch-22.08 + HEAD: branch-22.08 + BASE: branch-22.10 AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR diff --git a/datasets/mortgage-small.tar.gz b/datasets/mortgage-small.tar.gz deleted file mode 100644 index 2f7c6a016..000000000 Binary files a/datasets/mortgage-small.tar.gz and /dev/null differ diff --git a/docs/get-started/xgboost-examples/building-sample-apps/python.md b/docs/get-started/xgboost-examples/building-sample-apps/python.md index 28f35d40d..53f4e66c6 100644 --- a/docs/get-started/xgboost-examples/building-sample-apps/python.md +++ b/docs/get-started/xgboost-examples/building-sample-apps/python.md @@ -17,7 +17,8 @@ Two files are required by PySpark: + *samples.zip* - the package including all example code + the package including all example code. + Executing the above build commands generates the samples.zip file in 'spark-rapids-examples/examples/XGBoost-Examples' folder + *main.py* diff --git a/docs/get-started/xgboost-examples/csp/databricks/databricks.md b/docs/get-started/xgboost-examples/csp/databricks/databricks.md index 073b1902e..c09576d0e 100644 --- a/docs/get-started/xgboost-examples/csp/databricks/databricks.md +++ b/docs/get-started/xgboost-examples/csp/databricks/databricks.md @@ -49,7 +49,7 @@ cluster. - [Databricks 10.4 LTS ML](https://docs.databricks.com/release-notes/runtime/9.1ml.html#system-environment) has CUDA 11 - installed. Users will need to use 22.06.0 or later on Databricks 10.4 LTS ML. In this case use + installed. Users will need to use 22.04.0 or later on Databricks 10.4 LTS ML. In this case use [generate-init-script-10.4.ipynb](generate-init-script-10.4.ipynb) which will install the RAPIDS Spark plugin. 
diff --git a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb index 53d61c456..3be77a4b5 100644 --- a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb +++ b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb @@ -24,7 +24,7 @@ "source": [ "%sh\n", "cd ../../dbfs/FileStore/jars/\n", - "sudo wget -O rapids-4-spark_2.12-22.06.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0.jar\n", + "sudo wget -O rapids-4-spark_2.12-22.08.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar\n", "sudo wget -O xgboost4j-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.6.1/xgboost4j-gpu_2.12-1.6.1.jar\n", "sudo wget -O xgboost4j-spark-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.6.1/xgboost4j-spark-gpu_2.12-1.6.1.jar\n", "ls -ltr\n", @@ -60,7 +60,7 @@ "sudo rm -f /databricks/jars/spark--maven-trees--ml--10.x--xgboost-gpu--ml.dmlc--xgboost4j-spark-gpu_2.12--ml.dmlc__xgboost4j-spark-gpu_2.12__1.5.2.jar\n", "\n", "sudo cp /dbfs/FileStore/jars/xgboost4j-gpu_2.12-1.6.1.jar /databricks/jars/\n", - "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.06.0.jar /databricks/jars/\n", + "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.08.0.jar /databricks/jars/\n", "sudo cp /dbfs/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar /databricks/jars/\"\"\", True)" ] }, @@ -133,7 +133,7 @@ "1. Edit your cluster, adding an initialization script from `dbfs:/databricks/init_scripts/init.sh` in the \"Advanced Options\" under \"Init Scripts\" tab\n", "2. Reboot the cluster\n", "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", - "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.06/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", + "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.08/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", "5. 
Inside the mortgage example notebook, update the data paths\n", " `train_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-train.csv')`\n", " `trans_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-trans.csv')`" diff --git a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb index e81611b43..540132062 100644 --- a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb +++ b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb @@ -24,7 +24,7 @@ "source": [ "%sh\n", "cd ../../dbfs/FileStore/jars/\n", - "sudo wget -O rapids-4-spark_2.12-22.06.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0.jar\n", + "sudo wget -O rapids-4-spark_2.12-22.08.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar\n", "sudo wget -O xgboost4j-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.6.1/xgboost4j-gpu_2.12-1.6.1.jar\n", "sudo wget -O xgboost4j-spark-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.6.1/xgboost4j-spark-gpu_2.12-1.6.1.jar\n", "ls -ltr\n", @@ -60,7 +60,7 @@ "sudo rm -f /databricks/jars/spark--maven-trees--ml--9.x--xgboost-gpu--ml.dmlc--xgboost4j-spark-gpu_2.12--ml.dmlc__xgboost4j-spark-gpu_2.12__1.4.1.jar\n", "\n", "sudo cp /dbfs/FileStore/jars/xgboost4j-gpu_2.12-1.6.1.jar /databricks/jars/\n", - "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.06.0.jar /databricks/jars/\n", + "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.08.0.jar /databricks/jars/\n", "sudo cp /dbfs/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar /databricks/jars/\"\"\", True)" ] }, @@ -133,7 +133,7 @@ "1. Edit your cluster, adding an initialization script from `dbfs:/databricks/init_scripts/init.sh` in the \"Advanced Options\" under \"Init Scripts\" tab\n", "2. Reboot the cluster\n", "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", - "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.06/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", + "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.08/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", "5. Inside the mortgage example notebook, update the data paths\n", " `train_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-train.csv')`\n", " `trans_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-trans.csv')`" diff --git a/docs/get-started/xgboost-examples/dataset/mortgage.md b/docs/get-started/xgboost-examples/dataset/mortgage.md new file mode 100644 index 000000000..1c36155fa --- /dev/null +++ b/docs/get-started/xgboost-examples/dataset/mortgage.md @@ -0,0 +1,22 @@ +# How to download the Mortgage dataset + + + +## Steps to download the data + +1. Go to the [Fannie Mae](https://capitalmarkets.fanniemae.com/credit-risk-transfer/single-family-credit-risk-transfer/fannie-mae-single-family-loan-performance-data) website +2. 
Click on [Single-Family Loan Performance Data](https://datadynamics.fanniemae.com/data-dynamics/?&_ga=2.181456292.2043790680.1657122341-289272350.1655822609#/reportMenu;category=HP) + * Register as a new user if you are using the website for the first time + * Use the credentials to login +3. Select [HP](https://datadynamics.fanniemae.com/data-dynamics/#/reportMenu;category=HP) +4. Click on **Download Data** and choose *Single-Family Loan Performance Data* +5. You will find a tabular list of 'Acquisition and Performance' files sorted based on year and quarter. Click on the file to download (e.g. `2017Q1.zip`) +6. Unzip the downloaded file to extract the csv file (e.g. `2017Q1.csv`) +7. Copy only the csv files to a new folder for the ETL to read + +## Notes +1. Refer to the [Loan Performance Data Tutorial](https://capitalmarkets.fanniemae.com/media/9066/display) for more details. +2. Note that *Single-Family Loan Performance Data* has 2 components. However, the Mortgage ETL requires only the first one (primary dataset) + * Primary Dataset: Acquisition and Performance Files + * HARP Dataset +3. Use the [Resources](https://datadynamics.fanniemae.com/data-dynamics/#/resources/HP) section to learn more about the dataset \ No newline at end of file diff --git a/docs/get-started/xgboost-examples/notebook/python-notebook.md b/docs/get-started/xgboost-examples/notebook/python-notebook.md index 94486d58c..3bfd71174 100644 --- a/docs/get-started/xgboost-examples/notebook/python-notebook.md +++ b/docs/get-started/xgboost-examples/notebook/python-notebook.md @@ -20,6 +20,10 @@ and the home directory for Apache Spark respectively. 3. Launch the notebook: + Note: For ETL jobs, set `spark.task.resource.gpu.amount` to `1/spark.executor.cores`. + + For ETL: + ``` bash PYSPARK_DRIVER_PYTHON=jupyter \ PYSPARK_DRIVER_PYTHON_OPTS=notebook \ @@ -28,14 +32,38 @@ and the home directory for Apache Spark respectively. --jars ${RAPIDS_JAR},${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR}\ --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ - --conf spark.rapids.memory.gpu.pooling.enabled=false \ --conf spark.executor.resource.gpu.amount=1 \ + --conf spark.executor.cores=10 \ + --conf spark.task.resource.gpu.amount=0.1 \ + --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer \ + --conf spark.rapids.sql.hasNans=false \ + --conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh \ + --files $SPARK_HOME/examples/src/main/scripts/getGpusResources.sh + ``` + + For XGBoost: + + ``` bash + PYSPARK_DRIVER_PYTHON=jupyter \ + PYSPARK_DRIVER_PYTHON_OPTS=notebook \ + pyspark \ + --master ${SPARK_MASTER} \ + --jars ${RAPIDS_JAR},${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR}\ + --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \ + --conf spark.plugins=com.nvidia.spark.SQLPlugin \ + --conf spark.rapids.memory.gpu.pool=NONE \ + --conf spark.executor.resource.gpu.amount=1 \ + --conf spark.executor.cores=10 \ --conf spark.task.resource.gpu.amount=1 \ + --conf spark.rapids.sql.hasNans=false \ --conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh \ --files $SPARK_HOME/examples/src/main/scripts/getGpusResources.sh ``` + + 4. Launch ETL Part + - Mortgage ETL Notebook: [Python](../../../../examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb) - Taxi ETL Notebook: [Python](../../../../examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb) - Note: Agaricus does not have ETL part. 
diff --git a/docs/get-started/xgboost-examples/notebook/toree.md b/docs/get-started/xgboost-examples/notebook/toree.md index 5be31c180..e338fd909 100644 --- a/docs/get-started/xgboost-examples/notebook/toree.md +++ b/docs/get-started/xgboost-examples/notebook/toree.md @@ -29,18 +29,39 @@ and the home directory for Apache Spark respectively. 4. Install a new kernel with gpu enabled and launch the notebook + Note: For ETL jobs, set `spark.task.resource.gpu.amount` to `1/spark.executor.cores`. + + For ETL: ``` bash jupyter toree install \ --spark_home=${SPARK_HOME} \ --user \ --toree_opts='--nosparkcontext' \ - --kernel_name="XGBoost4j-Spark" \ + --kernel_name="ETL-Spark" \ + --spark_opts='--master ${SPARK_MASTER} \ + --jars ${RAPIDS_JAR},${SAMPLE_JAR} \ + --conf spark.plugins=com.nvidia.spark.SQLPlugin \ + --conf spark.executor.extraClassPath=${RAPIDS_JAR} \ + --conf spark.executor.cores=10 \ + --conf spark.task.resource.gpu.amount=0.1 \ + --conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh \ + --files $SPARK_HOME/examples/src/main/scripts/getGpusResources.sh' + ``` + + For XGBoost: + ``` bash + jupyter toree install \ + --spark_home=${SPARK_HOME} \ + --user \ + --toree_opts='--nosparkcontext' \ + --kernel_name="XGBoost-Spark" \ --spark_opts='--master ${SPARK_MASTER} \ --jars ${RAPIDS_JAR},${SAMPLE_JAR} \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ --conf spark.executor.extraClassPath=${RAPIDS_JAR} \ - --conf spark.rapids.memory.gpu.pooling.enabled=false \ + --conf spark.rapids.memory.gpu.pool=NONE \ --conf spark.executor.resource.gpu.amount=1 \ + --conf spark.executor.cores=10 \ --conf spark.task.resource.gpu.amount=1 \ --conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh \ --files $SPARK_HOME/examples/src/main/scripts/getGpusResources.sh' diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md b/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md index 887c39d02..11d1fb4dd 100644 --- a/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md +++ b/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md @@ -40,7 +40,7 @@ export SPARK_DOCKER_IMAGE= export SPARK_DOCKER_TAG= pushd ${SPARK_HOME} -wget https://github.com/NVIDIA/spark-rapids-examples/raw/branch-22.06/dockerfile/Dockerfile +wget https://github.com/NVIDIA/spark-rapids-examples/raw/branch-22.08/dockerfile/Dockerfile # Optionally install additional jars into ${SPARK_HOME}/jars/ @@ -60,9 +60,10 @@ on cluster filesystems like HDFS, or in [object stores like S3 and GCS](https:// Note that using [application dependencies](https://spark.apache.org/docs/latest/running-on-kubernetes.html#dependency-management) from the submission client’s local file system is currently not yet supported. -Note: the `mortgage_eval_merged.csv` and `mortgage_train_merged.csv` are not Mortgage raw data, -they are the data produced by Mortgage ETL job. If user wants to use a larger size Mortgage data, please refer to [Launch ETL job](#etl). -Taxi ETL job is the same. But Agaricus does not have ETL process, it is combined with XGBoost as there is just a filter operation. +#### Note: +1. Mortgage and Taxi jobs have ETLs to generate the processed data. +2. For convenience, a subset of the [Taxi](/datasets/) dataset is made available in this repo that can be readily used for launching the XGBoost job. Use [ETL](#etl) to generate larger datasets for training and testing. +3. 
Agaricus does not have an ETL process, it is combined with XGBoost as there is just a filter operation. Save Kubernetes Template Resources ---------------------------------- @@ -89,16 +90,23 @@ to execute using a GPU which is already in use -- causing undefined behavior and Launch Mortgage or Taxi ETL Part --------------------------- +Use the ETL app to process raw Mortgage data. You can either use this ETLed data to split into training and evaluation data or run the ETL on different subsets of the dataset to produce training and evaluation datasets. + +Note: For ETL jobs, Set `spark.task.resource.gpu.amount` to `1/spark.executor.cores`. Run spark-submit ``` bash ${SPARK_HOME}/bin/spark-submit \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ - --conf spark.rapids.memory.gpu.pooling.enabled=false \ --conf spark.executor.resource.gpu.amount=1 \ - --conf spark.task.resource.gpu.amount=1 \ + --conf spark.executor.cores=10 \ + --conf spark.task.resource.gpu.amount=0.1 \ + --conf spark.rapids.sql.incompatibleDateFormats.enabled=true \ + --conf spark.rapids.sql.csv.read.double.enabled=true \ --conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh \ + --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer \ + --conf spark.rapids.sql.hasNans=false \ --files $SPARK_HOME/examples/src/main/scripts/getGpusResources.sh \ --jars ${RAPIDS_JAR} \ --master \ @@ -106,18 +114,17 @@ ${SPARK_HOME}/bin/spark-submit \ --num-executors ${SPARK_NUM_EXECUTORS} \ --driver-memory ${SPARK_DRIVER_MEMORY} \ --executor-memory ${SPARK_EXECUTOR_MEMORY} \ - --class ${EXAMPLE_CLASS} \ --class com.nvidia.spark.examples.mortgage.ETLMain \ $SAMPLE_JAR \ -format=csv \ - -dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-train/" \ - -dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-train/" \ - -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/train/" - -# if generating eval data, change the data path to eval as well as the corresponding perf-eval and acq-eval data -# -dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-eval" -# -dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-eval" -# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/eval/" + -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" \ + -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/train/" \ + -dataPath="tmp::${SPARK_XGBOOST_DIR}/mortgage/output/tmp/" + +# if generating eval data, change the data path to eval +# -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" +# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/" +# -dataPath="tmp::${SPARK_XGBOOST_DIR}/mortgage/output/tmp/" # if running Taxi ETL benchmark, change the class and data path params to # -class com.nvidia.spark.examples.taxi.ETLMain # -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path" @@ -163,9 +170,9 @@ export SPARK_DRIVER_MEMORY=4g export SPARK_EXECUTOR_MEMORY=8g # example class to use -export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.GPUMain -# or change to com.nvidia.spark.examples.taxi.GPUMain to run Taxi Xgboost benchmark -# or change to com.nvidia.spark.examples.agaricus.GPUMain to run Agaricus Xgboost benchmark +export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.Main +# or change to com.nvidia.spark.examples.taxi.Main to run Taxi Xgboost benchmark +# or change to com.nvidia.spark.examples.agaricus.Main to run Agaricus Xgboost benchmark # tree construction algorithm export TREE_METHOD=gpu_hist @@ -176,9 +183,10 @@ Run spark-submit: ``` bash ${SPARK_HOME}/bin/spark-submit \ --conf spark.plugins=com.nvidia.spark.SQLPlugin 
\ - --conf spark.rapids.memory.gpu.pooling.enabled=false \ + --conf spark.rapids.memory.gpu.pool=NONE \ --conf spark.executor.resource.gpu.amount=1 \ --conf spark.task.resource.gpu.amount=1 \ + --conf spark.rapids.sql.hasNans=false \ --conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh \ --files $SPARK_HOME/examples/src/main/scripts/getGpusResources.sh \ --jars ${RAPIDS_JAR} \ @@ -192,9 +200,9 @@ ${SPARK_HOME}/bin/spark-submit --conf spark.kubernetes.executor.podTemplateFile=${TEMPLATE_PATH} \ --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \ ${SAMPLE_JAR} \ - -dataPath=train::${DATA_PATH}/mortgage/csv/train/mortgage_train_merged.csv \ - -dataPath=trans::${DATA_PATH}/mortgage/csv/test/mortgage_eval_merged.csv \ - -format=csv \ + -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \ + -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/output/eval/ \ + -format=parquet \ -numWorkers=${SPARK_NUM_EXECUTORS} \ -treeMethod=${TREE_METHOD} \ -numRound=100 \ diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md index 55ac2a1c4..6132a7563 100644 --- a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md +++ b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md @@ -53,6 +53,13 @@ Get Application Files, Jar and Dataset Make sure you have prepared the necessary packages and dataset by following this [guide](/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md) + +#### Note: +1. Mortgage and Taxi jobs have ETLs to generate the processed data. +2. For convenience, a subset of the [Taxi](/datasets/) dataset is made available in this repo that can be readily used for launching the XGBoost job. Use [ETL](#etl) to generate larger datasets for training and testing. +3. Agaricus does not have an ETL process, it is combined with XGBoost as there is just a filter operation. + + Launch a Standalone Spark Cluster --------------------------------- @@ -83,30 +90,57 @@ Launch a Standalone Spark Cluster Launch Mortgage or Taxi ETL Part --------------------------- +Use the ETL app to process raw Mortgage data. You can either use this ETLed data to split into training and evaluation data or run the ETL on different subsets of the dataset to produce training and evaluation datasets. -Run spark-submit - +Note: For ETL jobs, set `spark.task.resource.gpu.amount` to `1/spark.executor.cores`. 
+### ETL on GPU ``` bash ${SPARK_HOME}/bin/spark-submit \ --master spark://$HOSTNAME:7077 \ --executor-memory 32G \ --conf spark.executor.resource.gpu.amount=1 \ - --conf spark.task.resource.gpu.amount=1 \ + --conf spark.executor.cores=10 \ + --conf spark.task.resource.gpu.amount=0.1 \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ --conf spark.rapids.sql.incompatibleDateFormats.enabled=true \ --conf spark.rapids.sql.csv.read.double.enabled=true \ + --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer \ + --conf spark.rapids.sql.hasNans=false \ + --py-files ${SAMPLE_ZIP} \ + main.py \ + --mainClass='com.nvidia.spark.examples.mortgage.etl_main' \ + --format=csv \ + --dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" \ + --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/train/" \ + --dataPath="tmp::${SPARK_XGBOOST_DIR}/mortgage/output/tmp/" + +# if generating eval data, change the data path to eval +# --dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" +# --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/" +# --dataPath="tmp::${SPARK_XGBOOST_DIR}/mortgage/output/tmp/" +# if running Taxi ETL benchmark, change the class and data path params to +# -class com.nvidia.spark.examples.taxi.ETLMain +# -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path" +# -dataPath="out::${SPARK_XGBOOST_DIR}/taxi/your-path" +``` +### ETL on CPU +```bash +${SPARK_HOME}/bin/spark-submit \ + --master spark://$HOSTNAME:7077 \ + --executor-memory 32G \ + --conf spark.executor.instances=1 \ --py-files ${SAMPLE_ZIP} \ main.py \ --mainClass='com.nvidia.spark.examples.mortgage.etl_main' \ --format=csv \ - --dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-train/" \ - --dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-train/" \ - --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/train/" - -# if generating eval data, change the data path to eval as well as the corresponding perf-eval and acq-eval data -# --dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-eval" -# --dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-eval" -# --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/eval/" + --dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" \ + --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/train/" \ + --dataPath="tmp::${SPARK_XGBOOST_DIR}/mortgage/output/tmp/" + +# if generating eval data, change the data path to eval +# --dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" +# --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/" +# --dataPath="tmp::${SPARK_XGBOOST_DIR}/mortgage/output/tmp/" # if running Taxi ETL benchmark, change the class and data path params to # -class com.nvidia.spark.examples.taxi.ETLMain # -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path" @@ -155,9 +189,10 @@ Run spark-submit: ``` bash ${SPARK_HOME}/bin/spark-submit \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ - --conf spark.rapids.memory.gpu.pooling.enabled=false \ + --conf spark.rapids.memory.gpu.pool=NONE \ --conf spark.executor.resource.gpu.amount=1 \ --conf spark.task.resource.gpu.amount=1 \ + --conf spark.rapids.sql.hasNans=false \ --master ${SPARK_MASTER} \ --driver-memory ${SPARK_DRIVER_MEMORY} \ --executor-memory ${SPARK_EXECUTOR_MEMORY} \ @@ -166,8 +201,8 @@ ${SPARK_HOME}/bin/spark-submit --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \ ${MAIN_PY} \ --mainClass=${EXAMPLE_CLASS} \ - --dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/out/train/ \ - --dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/out/eval/ \ + --dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ 
\ + --dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/output/eval/ \ --format=parquet \ --numWorkers=${SPARK_NUM_EXECUTORS} \ --treeMethod=${TREE_METHOD} \ @@ -240,8 +275,8 @@ ${SPARK_HOME}/bin/spark-submit --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \ ${SPARK_PYTHON_ENTRYPOINT} \ --mainClass=${EXAMPLE_CLASS} \ - --dataPath=train::${DATA_PATH}/mortgage/out/train/ \ - --dataPath=trans::${DATA_PATH}/mortgage/out/eval/ \ + --dataPath=train::${DATA_PATH}/mortgage/output/train/ \ + --dataPath=trans::${DATA_PATH}/mortgage/output/eval/ \ --format=parquet \ --numWorkers=${SPARK_NUM_EXECUTORS} \ --treeMethod=${TREE_METHOD} \ diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md index 5493340c2..c1b512b07 100644 --- a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md +++ b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md @@ -53,9 +53,11 @@ Get Jars and Dataset Make sure you have prepared the necessary packages and dataset by following this [guide](/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md) -Note: the `mortgage_eval_merged.csv` and `mortgage_train_merged.csv` are not Mortgage raw data, -they are the data produced by Mortgage ETL job. If user wants to use a larger size Mortgage data, please refer to [Launch ETL job](#etl). -Taxi ETL job is the same. But Agaricus does not have ETL process, it is combined with XGBoost as there is just a filter operation. +#### Note: +1. Mortgage and Taxi jobs have ETLs to generate the processed data. +2. For convenience, a subset of the [Taxi](/datasets/) dataset is made available in this repo that can be readily used for launching the XGBoost job. Use [ETL](#etl) to generate larger datasets for training and testing. +3. Agaricus does not have an ETL process, it is combined with XGBoost as there is just a filter operation. + Launch a Standalone Spark Cluster --------------------------------- @@ -90,31 +92,59 @@ Launch a Standalone Spark Cluster Launch Mortgage or Taxi ETL Part --------------------------- -If user wants to use a larger size dataset other than the default one, we provide an ETL app to process raw Mortgage data. - +Use the ETL app to process raw Mortgage data. You can either use this ETLed data to split into training and evaluation data or run the ETL on different subsets of the dataset to produce training and evaluation datasets. Run spark-submit +Note: For ETL jobs, set `spark.task.resource.gpu.amount` to `1/spark.executor.cores`. 
+ +### ETL on GPU ``` bash ${SPARK_HOME}/bin/spark-submit \ --master spark://$HOSTNAME:7077 \ --executor-memory 32G \ - --conf spark.rapids.memory.gpu.pooling.enabled=false \ --conf spark.executor.resource.gpu.amount=1 \ - --conf spark.task.resource.gpu.amount=1 \ + --conf spark.executor.cores=10 \ + --conf spark.task.resource.gpu.amount=0.1 \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ --conf spark.rapids.sql.incompatibleDateFormats.enabled=true \ --conf spark.rapids.sql.csv.read.double.enabled=true \ + --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer \ + --conf spark.rapids.sql.hasNans=false \ --class com.nvidia.spark.examples.mortgage.ETLMain \ $SAMPLE_JAR \ -format=csv \ - -dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-train/" \ - -dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-train/" \ - -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/train/" - -# if generating eval data, change the data path to eval as well as the corresponding perf-eval and acq-eval data -# -dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-eval" -# -dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-eval" -# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/eval/" + -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" \ + -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/train/" \ + -dataPath="tmp::${SPARK_XGBOOST_DIR}/mortgage/output/tmp/" + +# if generating eval data, change the data path to eval +# -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" +# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/" +# -dataPath="tmp::${SPARK_XGBOOST_DIR}/mortgage/output/tmp/" +# if running Taxi ETL benchmark, change the class and data path params to +# -class com.nvidia.spark.examples.taxi.ETLMain +# -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path" +# -dataPath="out::${SPARK_XGBOOST_DIR}/taxi/your-path" +``` + +### ETL on CPU + +```bash +${SPARK_HOME}/bin/spark-submit \ +--master spark://$HOSTNAME:7077 \ +--executor-memory 32G \ +--conf spark.executor.instances=1 \ +--conf spark.sql.broadcastTimeout=700 \ +--class com.nvidia.spark.examples.mortgage.ETLMain \ +$SAMPLE_JAR \ +-format=csv \ +-dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" \ +-dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/train/" \ +-dataPath="tmp::${SPARK_XGBOOST_DIR}/mortgage/output/tmp/" + +# if generating eval data, change the data path to eval +# -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" +# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/" # if running Taxi ETL benchmark, change the class and data path params to # -class com.nvidia.spark.examples.taxi.ETLMain # -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path" @@ -150,9 +180,9 @@ export SPARK_DRIVER_MEMORY=4g export SPARK_EXECUTOR_MEMORY=8g # example class to use -export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.GPUMain -# or change to com.nvidia.spark.examples.taxi.GPUMain to run Taxi Xgboost benchmark -# or change to com.nvidia.spark.examples.agaricus.GPUMain to run Agaricus Xgboost benchmark +export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.Main +# or change to com.nvidia.spark.examples.taxi.Main to run Taxi Xgboost benchmark +# or change to com.nvidia.spark.examples.agaricus.Main to run Agaricus Xgboost benchmark # tree construction algorithm export TREE_METHOD=gpu_hist @@ -163,7 +193,8 @@ Run spark-submit: ``` bash ${SPARK_HOME}/bin/spark-submit \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ - --conf spark.rapids.memory.gpu.pooling.enabled=false \ + --conf 
spark.rapids.memory.gpu.pool=NONE \ + --conf spark.rapids.sql.hasNans=false \ --conf spark.executor.resource.gpu.amount=1 \ --conf spark.task.resource.gpu.amount=1 \ --master ${SPARK_MASTER} \ @@ -172,9 +203,9 @@ ${SPARK_HOME}/bin/spark-submit --conf spark.cores.max=${TOTAL_CORES} \ --class ${EXAMPLE_CLASS} \ ${SAMPLE_JAR} \ - -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/csv/train/mortgage_train_merged.csv \ - -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/csv/test/mortgage_eval_merged.csv \ - -format=csv \ + -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \ + -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/output/eval/ \ + -format=parquet \ -numWorkers=${SPARK_NUM_EXECUTORS} \ -treeMethod=${TREE_METHOD} \ -numRound=100 \ @@ -229,7 +260,7 @@ export SPARK_DRIVER_MEMORY=4g export SPARK_EXECUTOR_MEMORY=8g # example class to use -export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.CPUMain +export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.Main # Please make sure to change the class while running Taxi or Agaricus benchmark # tree construction algorithm @@ -238,7 +269,7 @@ export TREE_METHOD=hist This is the same command as for the GPU example, repeated for convenience: -``` bash +```bash ${SPARK_HOME}/bin/spark-submit \ --master ${SPARK_MASTER} \ --driver-memory ${SPARK_DRIVER_MEMORY} \ @@ -246,9 +277,9 @@ ${SPARK_HOME}/bin/spark-submit --conf spark.cores.max=${TOTAL_CORES} \ --class ${EXAMPLE_CLASS} \ ${SAMPLE_JAR} \ - -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/csv/train/mortgage_train_merged.csv \ - -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/csv/test/mortgage_eval_merged.csv \ - -format=csv \ + -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \ + -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/output/eval/ \ + -format=parquet \ -numWorkers=${SPARK_NUM_EXECUTORS} \ -treeMethod=${TREE_METHOD} \ -numRound=100 \ diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md index 7966791a2..9d92da01a 100644 --- a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md +++ b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md @@ -47,25 +47,35 @@ Then create a directory in HDFS, and run below commands, Launch Mortgage or Taxi ETL Part --------------------------- -Run spark-submit: +Use the ETL app to process raw Mortgage data. You can either use this ETLed data to split into training and evaluation data or run the ETL on different subsets of the dataset to produce training and evaluation datasets. + +Note: For ETL jobs, Set `spark.task.resource.gpu.amount` to `1/spark.executor.cores`. 
``` bash # location where data was downloaded export DATA_PATH=hdfs:/tmp/xgboost4j_spark_python/ ${SPARK_HOME}/bin/spark-submit \ --master yarn - --deploy-mode cluster + --master yarn \ + --deploy-mode cluster \ + --conf spark.executor.cores=10 \ + --conf spark.task.resource.gpu.amount=0.1 \ + --conf spark.rapids.sql.incompatibleDateFormats.enabled=true \ + --conf spark.rapids.sql.csv.read.double.enabled=true \ + --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer \ + --conf spark.rapids.sql.hasNans=false \ --jars ${RAPIDS_JAR}\ ${MAIN_PY} \ --mainClass='com.nvidia.spark.examples.mortgage.etl_main' \ --format=csv \ - --dataPath="perf::${DATA_PATH}/mortgage/data/mortgage/perf/" \ - --dataPath="acq::${DATA_PATH}/mortgage/data/mortgage/acq/" \ - --dataPath="out::${DATA_PATH}/mortgage/data/mortgage/out/train/" - -# if generate eval data, change the data path to eval -# --dataPath="out::${DATA_PATH}/mortgage/data/mortgage/out/eval/ + --dataPath="data::${DATA_PATH}/mortgage/data/mortgage/input/" \ + --dataPath="out::${DATA_PATH}/mortgage/data/mortgage/output/train/" \ + --dataPath="tmp::${SPARK_XGBOOST_DIR}/mortgage/output/tmp/" + +# if generating eval data, change the data path to eval +# --dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" +# --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/" +# --dataPath="tmp::${SPARK_XGBOOST_DIR}/mortgage/output/tmp/" # if running Taxi ETL benchmark, change the class and data path params to # -class com.nvidia.spark.examples.taxi.ETLMain # -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path" @@ -111,9 +121,10 @@ Run spark-submit: ``` bash ${SPARK_HOME}/bin/spark-submit \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ - --conf spark.rapids.memory.gpu.pooling.enabled=false \ + --conf spark.rapids.memory.gpu.pool=NONE \ --conf spark.executor.resource.gpu.amount=1 \ --conf spark.task.resource.gpu.amount=1 \ + --conf spark.rapids.sql.hasNans=false \ --conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh \ --files ${SPARK_HOME}/examples/src/main/scripts/getGpusResources.sh \ --master yarn \ @@ -194,8 +205,8 @@ ${SPARK_HOME}/bin/spark-submit --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \ ${MAIN_PY} \ --mainClass=${EXAMPLE_CLASS} \ - --dataPath=train::${DATA_PATH}/mortgage/out/train/ \ - --dataPath=trans::${DATA_PATH}/mortgage/out/eval/ \ + --dataPath=train::${DATA_PATH}/mortgage/output/train/ \ + --dataPath=trans::${DATA_PATH}/mortgage/output/eval/ \ --format=parquet \ --numWorkers=${SPARK_NUM_EXECUTORS} \ --treeMethod=${TREE_METHOD} \ diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-scala.md b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-scala.md index 9e6e4367b..dc6918ee4 100644 --- a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-scala.md +++ b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-scala.md @@ -35,6 +35,11 @@ Get Jars and Dataset Make sure you have prepared the necessary packages and dataset by following this [guide](/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md) +#### Note: +1. Mortgage and Taxi jobs have ETLs to generate the processed data. +2. For convenience, a subset of the [Taxi](/datasets/) dataset is made available in this repo that can be readily used for launching the XGBoost job. Use [ETL](#etl) to generate larger datasets for training and testing. +3. Agaricus does not have an ETL process, it is combined with XGBoost as there is just a filter operation. 
+ Create a directory in HDFS, and copy: ``` bash @@ -45,19 +50,24 @@ Create a directory in HDFS, and copy: Launch Mortgage or Taxi ETL Part --------------------------- -Note: the `mortgage_eval_merged.csv` and `mortgage_train_merged.csv` are not Mortgage raw data, -they are the data produced by Mortgage ETL job. If user wants to use a larger size Mortgage data, please refer to [Launch ETL job](#etl). -Taxi ETL job is the same. But Agaricus does not have ETL process, it is combined with XGBoost as there is just a filter operation. +Use the ETL app to process raw Mortgage data. You can either use this ETLed data to split into training and evaluation data or run the ETL on different subsets of the dataset to produce training and evaluation datasets. + +Note: For ETL jobs, Set `spark.task.resource.gpu.amount` to `1/spark.executor.cores`. + Run spark-submit ``` bash ${SPARK_HOME}/bin/spark-submit \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ - --conf spark.rapids.memory.gpu.pooling.enabled=false \ --conf spark.executor.resource.gpu.amount=1 \ - --conf spark.task.resource.gpu.amount=1 \ + --conf spark.executor.cores=10 \ + --conf spark.task.resource.gpu.amount=0.1 \ + --conf spark.rapids.sql.incompatibleDateFormats.enabled=true \ + --conf spark.rapids.sql.csv.read.double.enabled=true \ --conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh \ + --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer \ + --conf spark.rapids.sql.hasNans=false \ --files $SPARK_HOME/examples/src/main/scripts/getGpusResources.sh \ --jars ${RAPIDS_JAR} \ --master yarn \ @@ -65,18 +75,17 @@ ${SPARK_HOME}/bin/spark-submit \ --num-executors ${SPARK_NUM_EXECUTORS} \ --driver-memory ${SPARK_DRIVER_MEMORY} \ --executor-memory ${SPARK_EXECUTOR_MEMORY} \ - --class ${EXAMPLE_CLASS} \ --class com.nvidia.spark.examples.mortgage.ETLMain \ $SAMPLE_JAR \ -format=csv \ - -dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-train/" \ - -dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-train/" \ - -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/train/" - -# if generating eval data, change the data path to eval as well as the corresponding perf-eval and acq-eval data -# -dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-eval" -# -dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-eval" -# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/eval/" + -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" \ + -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/train/" \ + -dataPath="tmp::${SPARK_XGBOOST_DIR}/mortgage/output/tmp/" + +# if generating eval data, change the data path to eval +# -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" +# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/" +# -dataPath="tmp::${SPARK_XGBOOST_DIR}/mortgage/output/tmp/" # if running Taxi ETL benchmark, change the class and data path params to # -class com.nvidia.spark.examples.taxi.ETLMain # -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path" @@ -106,9 +115,9 @@ export SPARK_DRIVER_MEMORY=4g export SPARK_EXECUTOR_MEMORY=8g # example class to use -export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.GPUMain -# or change to com.nvidia.spark.examples.taxi.GPUMain to run Taxi Xgboost benchmark -# or change to com.nvidia.spark.examples.agaricus.GPUMain to run Agaricus Xgboost benchmark +export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.Main +# or change to com.nvidia.spark.examples.taxi.Main to run Taxi Xgboost benchmark +# or change to com.nvidia.spark.examples.agaricus.Main to run Agaricus 
Xgboost benchmark # tree construction algorithm export TREE_METHOD=gpu_hist @@ -119,9 +128,10 @@ Run spark-submit: ``` bash ${SPARK_HOME}/bin/spark-submit \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ - --conf spark.rapids.memory.gpu.pooling.enabled=false \ + --conf spark.rapids.memory.gpu.pool=NONE \ --conf spark.executor.resource.gpu.amount=1 \ --conf spark.task.resource.gpu.amount=1 \ + --conf spark.rapids.sql.hasNans=false \ --conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh \ --files $SPARK_HOME/examples/src/main/scripts/getGpusResources.sh \ --jars ${RAPIDS_JAR} \ @@ -132,9 +142,9 @@ ${SPARK_HOME}/bin/spark-submit --executor-memory ${SPARK_EXECUTOR_MEMORY} \ --class ${EXAMPLE_CLASS} \ ${SAMPLE_JAR} \ - -dataPath=train::${DATA_PATH}/mortgage/csv/train/mortgage_train_merged.csv \ - -dataPath=trans::${DATA_PATH}/mortgage/csv/test/mortgage_eval_merged.csv \ - -format=csv \ + -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \ + -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/output/eval/ \ + -format=parquet \ -numWorkers=${SPARK_NUM_EXECUTORS} \ -treeMethod=${TREE_METHOD} \ -numRound=100 \ @@ -181,7 +191,7 @@ export SPARK_DRIVER_MEMORY=4g export SPARK_EXECUTOR_MEMORY=8g # example class to use -export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.CPUMain +export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.Main # Please make sure to change the class while running Taxi or Agaricus benchmark # tree construction algorithm @@ -199,9 +209,9 @@ ${SPARK_HOME}/bin/spark-submit --executor-memory ${SPARK_EXECUTOR_MEMORY} \ --class ${EXAMPLE_CLASS} \ ${SAMPLE_JAR} \ - -dataPath=train::${DATA_PATH}/mortgage/csv/train/mortgage_train_merged.csv \ - -dataPath=trans::${DATA_PATH}/mortgage/csv/test/mortgage_eval_merged.csv \ - -format=csv \ + -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \ + -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/output/eval/ \ + -format=parquet \ -numWorkers=${SPARK_NUM_EXECUTORS} \ -treeMethod=${TREE_METHOD} \ -numRound=100 \ diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md index cbeeccdbb..6f511be5b 100644 --- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md +++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md @@ -9,7 +9,7 @@ For simplicity export the location to these jars. All examples assume the packag * [XGBoost4j-Spark Package](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/) 2. Download the RAPIDS Accelerator for Apache Spark plugin jar - * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0.jar) + * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar) ### Build XGBoost Python Examples @@ -17,15 +17,16 @@ Following this [guide](/docs/get-started/xgboost-examples/building-sample-apps/p ### Download dataset -You need to download Mortgage dataset to `/opt/xgboost` from this [site](https://docs.rapids.ai/datasets/mortgage-data) -, download Taxi dataset from this [site](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) -, download Agaricus dataset from this [site](https://gust.dev/r/xgboost-agaricus). +You need to copy the dataset to `/opt/xgboost`. Use the following links to download the data. +1. 
[Mortgage dataset](/docs/get-started/xgboost-examples/dataset/mortgage.md) +2. [Taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) +3. [Agaricus dataset](https://gust.dev/r/xgboost-agaricus) ### Setup environments ``` bash export SPARK_XGBOOST_DIR=/opt/xgboost -export RAPIDS_JAR=${SPARK_XGBOOST_DIR}/rapids-4-spark_2.12-22.06.0.jar +export RAPIDS_JAR=${SPARK_XGBOOST_DIR}/rapids-4-spark_2.12-22.08.0.jar export XGBOOST4J_JAR=${SPARK_XGBOOST_DIR}/xgboost4j_3.0-1.4.2-0.3.0.jar export XGBOOST4J_SPARK_JAR=${SPARK_XGBOOST_DIR}/xgboost4j-spark_3.0-1.4.2-0.3.0.jar export SAMPLE_ZIP=${SPARK_XGBOOST_DIR}/samples.zip diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md index a5f451778..e5bf88571 100644 --- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md +++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md @@ -5,7 +5,7 @@ For simplicity export the location to these jars. All examples assume the packag ### Download the jars 1. Download the RAPIDS Accelerator for Apache Spark plugin jar - * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0.jar) + * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar) ### Build XGBoost Scala Examples @@ -13,14 +13,15 @@ Following this [guide](/docs/get-started/xgboost-examples/building-sample-apps/s ### Download dataset -You need to download mortgage dataset to `/opt/xgboost` from this [site](https://docs.rapids.ai/datasets/mortgage-data) -, download Taxi dataset from this [site](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) -, download Agaricus dataset from this [site](https://gust.dev/r/xgboost-agaricus). +You need to copy the dataset to `/opt/xgboost`. Use the following links to download the data. +1. [Mortgage dataset](/docs/get-started/xgboost-examples/dataset/mortgage.md) +2. [Taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) +3. 
[Agaricus dataset](https://gust.dev/r/xgboost-agaricus) ### Setup environments ``` bash export SPARK_XGBOOST_DIR=/opt/xgboost -export RAPIDS_JAR=${SPARK_XGBOOST_DIR}/rapids-4-spark_2.12-22.06.0.jar +export RAPIDS_JAR=${SPARK_XGBOOST_DIR}/rapids-4-spark_2.12-22.08.0.jar export SAMPLE_JAR=${SPARK_XGBOOST_DIR}/sample_xgboost_apps-0.2.3-jar-with-dependencies.jar ``` diff --git a/docs/img/guides/cuspatial/Nycd-Community-Districts.png b/docs/img/guides/cuspatial/Nycd-Community-Districts.png new file mode 100644 index 000000000..fa96b3b60 Binary files /dev/null and b/docs/img/guides/cuspatial/Nycd-Community-Districts.png differ diff --git a/docs/img/guides/cuspatial/Nyct2000.png b/docs/img/guides/cuspatial/Nyct2000.png new file mode 100644 index 000000000..055f3de8f Binary files /dev/null and b/docs/img/guides/cuspatial/Nyct2000.png differ diff --git a/docs/img/guides/cuspatial/install-jar.png b/docs/img/guides/cuspatial/install-jar.png new file mode 100644 index 000000000..0d11c81ec Binary files /dev/null and b/docs/img/guides/cuspatial/install-jar.png differ diff --git a/docs/img/guides/cuspatial/sample-polygon.png b/docs/img/guides/cuspatial/sample-polygon.png new file mode 100644 index 000000000..f8afb907f Binary files /dev/null and b/docs/img/guides/cuspatial/sample-polygon.png differ diff --git a/docs/img/guides/cuspatial/taxi-zones.png b/docs/img/guides/cuspatial/taxi-zones.png new file mode 100644 index 000000000..a8682cb03 Binary files /dev/null and b/docs/img/guides/cuspatial/taxi-zones.png differ diff --git a/docs/img/guides/mortgage-perf.png b/docs/img/guides/mortgage-perf.png index 0548ffd0e..23715ce9a 100644 Binary files a/docs/img/guides/mortgage-perf.png and b/docs/img/guides/mortgage-perf.png differ diff --git a/examples/ML+DL-Examples/Spark-DL/criteo_train/Dockerfile.conda_db b/examples/ML+DL-Examples/Spark-DL/criteo_train/Dockerfile.conda_db index 135a3328b..475b99149 100644 --- a/examples/ML+DL-Examples/Spark-DL/criteo_train/Dockerfile.conda_db +++ b/examples/ML+DL-Examples/Spark-DL/criteo_train/Dockerfile.conda_db @@ -13,15 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu20.04 - +FROM nvidia/cuda:11.4.3-cudnn8-devel-ubuntu20.04 ENV DEBIAN_FRONTEND=noninteractive # Disable NVIDIA repos to prevent accidental upgrades. 
RUN cd /etc/apt/sources.list.d && \ - mv cuda.list cuda.list.disabled && \ - mv nvidia-ml.list nvidia-ml.list.disabled + mv cuda.list cuda.list.disabled # See https://github.com/databricks/containers/blob/master/ubuntu/minimal/Dockerfile RUN apt-get update && \ @@ -52,21 +49,20 @@ RUN wget -q https://repo.continuum.io/miniconda/Miniconda3-py38_4.9.2-Linux-x86_ conda clean --all # install openjdk8, cmake, openmpi openmpi-mpicc -RUN conda install cmake openmpi openmpi-mpicc -y -RUN pip install jupyter +RUN conda install cmake openmpi openmpi-mpicc -y ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64 ENV PATH $PATH:/usr/lib/jvm/java-1.8.0-openjdk-amd64/jre/bin:/usr/lib/jvm/java-1.8.0-openjdk-amd64/bin -RUN pip uninstall tensorflow -y; pip install tensorflow +RUN conda install -y -c nvidia -c rapidsai -c numba -c conda-forge nvtabular=1.2.2 python=3.8 cudatoolkit=11.4 scikit-learn -RUN HOROVOD_WITH_MPI=1 HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_TENSORFLOW=1 \ +RUN pip uninstall tensorflow -y; pip install tensorflow-gpu==2.8 +RUN pip install torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio===0.11.0+cu115 -f https://download.pytorch.org/whl/cu115/torch_stable.html +RUN rm -rf /databricks/conda/include/google +RUN HOROVOD_WITH_MPI=1 HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 \ pip install horovod[spark] --no-cache-dir +RUN pip install pynvml jupyter matplotlib -RUN conda install -c nvidia -c rapidsai -c numba -c conda-forge nvtabular=0.9.0 python=3.8 cudatoolkit=11.2 -RUN pip install pynvml -RUN conda install -c conda-forge ipython==7.19.0 matplotlib==3.4.2 jinja2==2.11.3 -RUN pip uninstall pandas -y; pip install pandas==1.1.5 RUN apt-get update && apt-get install wget openssh-client openssh-server \ -y --allow-downgrades --allow-change-held-packages --no-install-recommends RUN useradd --create-home --shell /bin/bash --groups sudo ubuntu @@ -75,6 +71,8 @@ ENV PYSPARK_PYTHON=/databricks/conda/bin/python ENV USER root ENV DEFAULT_DATABRICKS_ROOT_CONDA_ENV=base ENV DATABRICKS_ROOT_CONDA_ENV=base +# disable gds due to errors +ENV LIBCUDF_CUFILE_POLICY=OFF # required by DB RUN pip install virtualenv RUN pip install adlfs diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile b/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile index 3f0ae80e8..ea40e1ed4 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile +++ b/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile @@ -17,7 +17,7 @@ ARG CUDA_VER=11.5.1 FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04 -ARG BRANCH_VER=22.06 +ARG BRANCH_VER=22.08 RUN apt-get update RUN apt-get install -y wget ninja-build git diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/README.md b/examples/ML+DL-Examples/Spark-cuML/pca/README.md index f844553b5..4c1d9e861 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/README.md +++ b/examples/ML+DL-Examples/Spark-cuML/pca/README.md @@ -12,7 +12,7 @@ User can also download the release jar from Maven central: [rapids-4-spark-ml_2.12-22.02.0-cuda11.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-ml_2.12/22.02.0/rapids-4-spark-ml_2.12-22.02.0-cuda11.jar) -[rapids-4-spark_2.12-22.06.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0.jar) +[rapids-4-spark_2.12-22.08.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar) ## Sample code @@ -48,7 +48,7 @@ It is assumed that a Standalone Spark cluster has been set up, the `SPARK_MASTER ``` bash 
RAPIDS_ML_JAR=PATH_TO_rapids-4-spark-ml_2.12-22.02.0-cuda11.jar - PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-22.06.0.jar + PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-22.08.0.jar jupyter toree install \ --spark_home=${SPARK_HOME} \ diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml index 6a856d9f9..7a7b399d5 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml +++ b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml @@ -21,7 +21,7 @@ com.nvidia PCAExample jar - 22.06.0-SNAPSHOT + 22.08.0-SNAPSHOT 8 @@ -51,7 +51,7 @@ com.nvidia rapids-4-spark-ml_2.12 - 22.06.0-SNAPSHOT + 22.08.0-SNAPSHOT diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh index 4e5f796d1..03381d8e9 100755 --- a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh +++ b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh @@ -15,8 +15,8 @@ # limitations under the License. # -ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/22.06.0-SNAPSHOT/rapids-4-spark-ml_2.12-22.06.0-SNAPSHOT.jar -PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/22.06.0-SNAPSHOT/rapids-4-spark_2.12-22.06.0-SNAPSHOT.jar +ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/22.08.0-SNAPSHOT/rapids-4-spark-ml_2.12-22.08.0-SNAPSHOT.jar +PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/22.08.0-SNAPSHOT/rapids-4-spark_2.12-22.08.0-SNAPSHOT.jar $SPARK_HOME/bin/spark-submit \ --master spark://127.0.0.1:7077 \ @@ -38,4 +38,4 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.network.timeout=1000s \ --jars $ML_JAR,$PLUGIN_JAR \ --class com.nvidia.spark.examples.pca.Main \ -/workspace/target/PCAExample-22.06.0-SNAPSHOT.jar +/workspace/target/PCAExample-22.08.0-SNAPSHOT.jar diff --git a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-cpu.ipynb b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-cpu.ipynb index a90f57aa6..ce5c5a797 100644 --- a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-cpu.ipynb +++ b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-cpu.ipynb @@ -120,7 +120,7 @@ "# By default, Spark will try to distribute the data among all the tasks in the cluster, \n", "# but on large clusters with large parquet files the splittable portions of the parquet files end up not being distributed evenly \n", "# and it is faster to re-partition the data to redistribute it than to deal with skew.\n", - "spark.read.parquet(\"hdfs:///data/tpcds_sf3000-parquet/useDecimal=true,useDate=true,filterNull=false/customer\").repartition(512).createOrReplaceTempView(\"customer\")\n", + "spark.read.parquet(dataRoot + \"/tpcds/customer\").repartition(512).createOrReplaceTempView(\"customer\")\n", "\n", "print(\"-\"*50)" ] diff --git a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb index a72b24727..cc1d11331 100644 --- a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb +++ b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb @@ -22,7 +22,7 @@ "import os\n", "# Change to your cluster ip:port and directories\n", "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"spark:your-ip:port\")\n", - "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-path/rapids-4-spark_2.12-22.06.0.jar\")\n" + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", 
\"/your-path/rapids-4-spark_2.12-22.08.0.jar\")\n" ] }, { @@ -497,7 +497,7 @@ ], "source": [ "start = time() \n", - "spark.read.parquet(\"hdfs:///data/tpcds_sf3000-parquet/useDecimal=true,useDate=true,filterNull=false/customer\").limit(1000000).write.format(\"parquet\").mode(\"overwrite\").save(\"/data/tmp/customer1m\")\n", + "spark.read.parquet(dataRoot + \"/tpcds/customer\").limit(1000000).write.format(\"parquet\").mode(\"overwrite\").save(\"/data/tmp/customer1m\")\n", "end = time()\n", "# Parquet file scanning and writing will be about 3 times faster running on GPU\n", "print(\"scanning and writing parquet cost : {} seconds\".format(round(end - start, 2)))\n", diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md index d924f4b06..242719b2e 100644 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md @@ -108,7 +108,7 @@ See above Prerequisites section First finish the steps in "Building with Native Code Examples and run test cases" section, then do the following in the docker. ### Get jars from Maven Central -[rapids-4-spark_2.12-22.06.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0.jar) +[rapids-4-spark_2.12-22.08.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar) ### Launch a local mode Spark diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml index 237d2d2cb..95f18cbc4 100644 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml @@ -25,7 +25,7 @@ user defined functions for use with the RAPIDS Accelerator for Apache Spark - 22.06.0-SNAPSHOT + 22.08.0-SNAPSHOT 1.8 @@ -37,7 +37,7 @@ cuda11 2.12 - 22.06.0 + 22.08.0 3.1.1 2.12.15 ${project.build.directory}/cpp-build diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt index 11b21ae15..b9b4929d5 100755 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.06/RAPIDS.cmake +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.08/RAPIDS.cmake ${CMAKE_BINARY_DIR}/RAPIDS.cmake) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) @@ -32,7 +32,7 @@ if(DEFINED GPU_ARCHS) endif() rapids_cuda_init_architectures(UDFEXAMPLESJNI) -project(UDFEXAMPLESJNI VERSION 22.06.0 LANGUAGES C CXX CUDA) +project(UDFEXAMPLESJNI VERSION 22.08.0 LANGUAGES C CXX CUDA) option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) option(BUILD_UDF_BENCHMARKS "Build the benchmarks" OFF) @@ -84,10 +84,10 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w --expt-extended-lambda --expt-relax set(CUDA_USE_STATIC_CUDA_RUNTIME OFF) rapids_cpm_init() -rapids_cpm_find(cudf 22.06.00 +rapids_cpm_find(cudf 22.08.00 CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/cudf.git - GIT_TAG branch-22.06 + GIT_TAG branch-22.08 GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "BUILD_TESTS OFF" diff --git a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile index 
253ecc290..6d81a260c 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile +++ b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile @@ -18,6 +18,7 @@ ARG CUDA_VER=11.2.2 FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu18.04 +RUN apt-key adv --fetch-keys https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub RUN apt-get update RUN apt-get install -y wget ninja-build git @@ -38,11 +39,11 @@ RUN conda --version RUN conda install -c conda-forge openjdk=8 maven=3.8.1 -y # install cuDF dependency. -RUN conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.06 python=3.8 -y +RUN conda install -c rapidsai -c nvidia -c conda-forge -c defaults libcuspatial=22.08 python=3.8 -y RUN wget --quiet \ https://github.com/Kitware/CMake/releases/download/v3.21.3/cmake-3.21.3-linux-x86_64.tar.gz \ && tar -xzf cmake-3.21.3-linux-x86_64.tar.gz \ && rm -rf cmake-3.21.3-linux-x86_64.tar.gz -ENV PATH="/cmake-3.21.3-linux-x86_64/bin:${PATH}" +ENV PATH="/cmake-3.21.3-linux-x86_64/bin:${PATH}" \ No newline at end of file diff --git a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb index 73a3e8646..98839d1ed 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb +++ b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb @@ -18,6 +18,9 @@ FROM nvidia/cuda:11.2.2-devel-ubuntu18.04 ENV DEBIAN_FRONTEND=noninteractive +# update cuda pub key to avoid GPG error +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub + # See https://github.com/databricks/containers/blob/master/ubuntu/minimal/Dockerfile RUN apt-get update && \ apt-get install --yes --no-install-recommends \ @@ -45,7 +48,7 @@ RUN wget -q https://repo.continuum.io/miniconda/Miniconda3-py38_4.9.2-Linux-x86_ conda config --system --set always_yes True && \ conda clean --all -RUN conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.06 +RUN conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.08 RUN conda install -c conda-forge libgdal==3.3.1 RUN pip install jupyter ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64 diff --git a/examples/UDF-Examples/Spark-cuSpatial/README.md b/examples/UDF-Examples/Spark-cuSpatial/README.md index 49598396e..b90b34be7 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/README.md +++ b/examples/UDF-Examples/Spark-cuSpatial/README.md @@ -5,19 +5,39 @@ It implements a [RapidsUDF](https://nvidia.github.io/spark-rapids/docs/additiona interface to call the cuSpatial functions through JNI. It can be run on a distributed Spark cluster with scalability. ## Performance -We got the end-2-end time as below table when running with 2009 NYC Taxi trip pickup location, -which includes 168,898,952 points, and 3 sets of polygons(taxi_zone, nyct2000, nycd). -The data can be downloaded from [TLC Trip Record Data](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) -and [NYC Open data](https://www1.nyc.gov/site/planning/data-maps/open-data.page#district_political). -| Environment | Taxi_zones (263 Polygons) | Nyct2000 (2216 Polygons) | Nycd (71 Complex Polygons)| +The table below shows the end-to-end hot run times when running with the 2009 NYC Taxi trip pickup locations, +which include 170,896,055 points, and 3 sets of polygons (taxi_zone, nyct2000, nycd Community-Districts). 
+The point data can be downloaded from [TLC Trip Record Data](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). +The polygon data can be downloaded from [taxi_zone dataset](https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc), +[nyct2000 dataset](https://data.cityofnewyork.us/City-Government/2000-Census-Tracts/ysjj-vb9j) and +[nycd Community-Districts dataset](https://data.cityofnewyork.us/City-Government/Community-Districts/yfnk-k7r4) + +| Environment | Taxi_zones (263 Polygons) | Nyct2000 (2216 Polygons) | Nycd Community-Districts (71 Complex Polygons)| | ----------- | :---------: | :---------: | :---------: | -| 4-core CPU | 1122.9 seconds | 5525.4 seconds| 6642.7 seconds | -| 1 GPU(Titan V) on local | 4.5 seconds | 5.7 seconds | 6.6 seconds| -| 2 GPU(T4) on Databricks | 9.1 seconds | 10.0 seconds | 12.1 seconds | +| 4-core CPU | 3.9 minutes | 4.0 minutes| 4.1 minutes | +| 1 GPU(T4) on Databricks | 25 seconds | 27 seconds | 28 seconds| +| 2 GPU(T4) on Databricks | 15 seconds | 14 seconds | 17 seconds | +| 4 GPU(T4) on Databricks | 11 seconds | 11 seconds | 12 seconds | + +Note: Please update the `x,y` column names to `Start_Lon,Start_Lat` in +the [notebook](./notebooks/cuspatial_sample_db.ipynb) if you test with the downloaded points. + +taxi-zones map: + + + +nyct2000 map: + + + +nyct-community-districts map: + + ## Build -You can build the jar file [in Docker](#build-in-docker) with the provided [Dockerfile](Dockerfile) -or you can build it [in local](#build-in-local) machine after some prerequisites. +First, build the UDF JAR from source before running this demo. +You can build the JAR [in Docker](#build-in-docker) with the provided [Dockerfile](Dockerfile), +or [on a local machine](#build-in-local-machine) after installing the prerequisites. ### Build in Docker 1. Build the docker image [Dockerfile](Dockerfile), then run the container. @@ -25,16 +45,18 @@ docker build -f Dockerfile . -t build-spark-cuspatial docker run -it build-spark-cuspatial bash ``` -2. Get the code, then run "mvn package". +2. Get the code, then run `mvn package`. ```Bash git clone https://github.com/NVIDIA/spark-rapids-examples.git cd spark-rapids-examples/examples/UDF-Examples/Spark-cuSpatial/ mvn package ``` -3. You'll get the jar named like "spark-cuspatial-.jar" in the target folder. +3. You'll get the jar named `spark-cuspatial-.jar` in the target folder. + +Note: The Docker environment is only for building the jar, not for running the application. -### Build in Local: -1. essential build tools: +### Build in local machine: +1. Essential build tools: - [cmake(>=3.20)](https://cmake.org/download/), - [ninja(>=1.10)](https://github.com/ninja-build/ninja/releases), - [gcc(>=9.3)](https://gcc.gnu.org/releases.html) @@ -42,40 +64,42 @@ or you can build it [in local](#build-in-local) machine after some prerequisites 3. conda: use [miniconda](https://docs.conda.io/en/latest/miniconda.html) to maintain header files and cmake dependecies 4. [cuspatial](https://github.com/rapidsai/cuspatial): install libcuspatial ```Bash - # get libcuspatial from conda - conda install -c rapidsai -c nvidia -c conda-forge -c defaults libcuspatial=22.04 + # Install libcuspatial from conda + conda install -c rapidsai -c nvidia -c conda-forge -c defaults libcuspatial=22.06 # or below command for the nightly (aka SNAPSHOT) version. 
- conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.06 + conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.08 ``` -5. Get the code, then run "mvn package". +5. Build the JAR using `mvn package`. ```Bash git clone https://github.com/NVIDIA/spark-rapids-examples.git cd spark-rapids-examples/examples/Spark-cuSpatial/ mvn package ``` -6. You'll get "spark-cuspatial-.jar" in the target folder. - +6. `spark-cuspatial-.jar` will be generated in the target folder. ## Run -### Run on-premises clusters: standalone +### GPU Demo on Spark Standalone on-premises cluster 1. Install necessary libraries. Besides `cudf` and `cuspatial`, the `gdal` library that is compatible with the installed `cuspatial` may also be needed. - Install it by running the command below. ``` conda install -c conda-forge libgdal=3.3.1 ``` 2. Set up [a standalone cluster](/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md) of Spark. Make sure the conda/lib is included in LD_LIBRARY_PATH, so that spark executors can load libcuspatial.so. -3. Download spark-rapids jars - * [spark-rapids v22.06.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0.jar) or above -4. Prepare the dataset & jars. Copy the sample dataset from [cuspatial_data](../../../datasets/cuspatial_data.tar.gz) to "/data/cuspatial_data". - Copy spark-rapids & spark-cuspatial-22.06.0-SNAPSHOT.jar to "/data/cuspatial_data/jars". - You can use your own path, but remember to update the paths in "gpu-run.sh" accordingly. -5. Run "gpu-run.sh" +3. Download Spark RAPIDS JAR + * [Spark RAPIDS JAR v22.08.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar) or above +4. Prepare sample dataset and JARs. Copy the [sample dataset](../../../datasets/cuspatial_data.tar.gz) to `/data/cuspatial_data/`. + Copy Spark RAPIDS JAR and `spark-cuspatial-.jar` to `/data/cuspatial_data/jars/`. + If you build the `spark-cuspatial-.jar` in docker, please copy the jar from docker to local: + ``` + docker cp YOUR_DOCKER_CONTAINER:/PATH/TO/spark-cuspatial-.jar ./YOUR_LOCAL_PATH + ``` + Note: update the paths in `gpu-run.sh` accordingly. +5. Run `gpu-run.sh` ```Bash ./gpu-run.sh ``` -### Run on AWS Databricks -1. Build the customized docker image [Dockerfile.awsdb](Dockerfile.awsdb) and push to dockerhub so that it can be accessible by AWS Databricks. +### GPU Demo on AWS Databricks +1. Build a customized docker image using [Dockerfile.awsdb](Dockerfile.awsdb) and push to a Docker registry such as [Docker Hub](https://hub.docker.com/) which can be accessible by AWS Databricks. ```Bash # replace your dockerhub repo, your tag or any other repo AWS DB can access docker build -f Dockerfile.awsdb . -t : @@ -83,15 +107,15 @@ or you can build it [in local](#build-in-local) machine after some prerequisites ``` 2. Follow the [Spark-rapids get-started document](https://nvidia.github.io/spark-rapids/docs/get-started/getting-started-databricks.html#start-a-databricks-cluster) to create a GPU cluster on AWS Databricks. - Something different from the document. + Below are some different steps since a custom docker image is used with Databricks: * Databricks Runtime Version - You should choose a Standard version of the Runtime version like "Runtime: 9.1 LTS(Scala 2.12, Spark 3.1.2)" and - choose GPU instance type like "g4dn.xlarge". Note that ML runtime does not support customized docker container. 
- If you choose a ML version, it says "Support for Databricks container services requires runtime version 5.3+" - and the "Confirm" button is disabled. + Choose a non-ML Databricks Runtime such as `Runtime: 9.1 LTS(Scala 2.12, Spark 3.1.2)` and + choose GPU AWS instance type such as `g4dn.xlarge`. Note that ML runtime does not support customized docker container with below messages: +`Support for Databricks container services requires runtime version 5.3+` + and the `Confirm` button is disabled. * Use your own Docker container - Input "Docker Image URL" as "your-dockerhub-repo:your-tag" - * For the other configurations, you can follow the get-started document. + Input `Docker Image URL` as `your-dockerhub-repo:your-tag` + * Follow the Databricks get-started document for other steps. 3. Copy the sample [cuspatial_data.tar.gz](../../../datasets/cuspatial_data.tar.gz) or your data to DBFS by using Databricks CLI. ```Bash @@ -103,5 +127,38 @@ or you can build it [in local](#build-in-local) machine after some prerequisites points polygons ``` -4. Import the Library "spark-cuspatial-22.06.0-SNAPSHOT.jar" to the Databricks, then install it to your cluster. -5. Import [cuspatial_sample.ipynb](notebooks/cuspatial_sample_db.ipynb) to your workspace in Databricks. Attach to your cluster, then run it. + The sample points and polygons are randomly generated. + + Sample polygons: + + + +4. Upload `spark-cuspatial-.jar` on dbfs and then install it in Databricks cluster. + + + +5. Import [cuspatial_sample.ipynb](notebooks/cuspatial_sample_db.ipynb) to Databricks workspace, attach it to Databricks cluster and run it. + +### CPU Demo on AWS Databricks +1. Create a Databricks cluster. For example, Databricks Runtime 10.3. + +2. Install the Sedona jars and Sedona Python libs on Databricks using web UI. + The Sedona version should be 1.1.1-incubating or higher. + * Install below jars from Maven Coordinates in Libraries tab: + ```Bash + org.apache.sedona:sedona-python-adapter-3.0_2.12:1.2.0-incubating + org.datasyslab:geotools-wrapper:1.1.0-25.2 + ``` + * To enable python support, install below python lib from PyPI in Libraries tab + ```Bash + apache-sedona + ``` +3. From your cluster configuration (Cluster -> Edit -> Configuration -> Advanced options -> Spark) activate the + Sedona functions and the kryo serializer by adding below to the Spark Config + ```Bash + spark.sql.extensions org.apache.sedona.viz.sql.SedonaVizExtensions,org.apache.sedona.sql.SedonaSqlExtensions + spark.serializer org.apache.spark.serializer.KryoSerializer + spark.kryo.registrator org.apache.sedona.core.serde.SedonaKryoRegistrator + ``` + +4. Upload the sample data files to DBFS, start the cluster, attach the [notebook](notebooks/spacial-cpu-apache-sedona_db.ipynb) to the cluster, and run it. 
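For orientation before opening the notebooks: the GPU demo boils down to shipping the shapefile to the executors, registering the JNI-backed cuSpatial UDF from the jar built above, and calling it from Spark SQL. Below is a minimal PySpark sketch condensed from the sample notebook in this repo; the DBFS paths are placeholders for wherever you copied the data, and it assumes the cluster already has the RAPIDS plugin and `spark-cuspatial` jars installed.

```Python
# Minimal sketch of the GPU point-in-polygon flow (paths are placeholders;
# the config key and UDF class come from the sample notebook in this repo).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

inputPath = "dbfs:/data/cuspatial_data/points"
outputPath = "dbfs:/data/output"

# Ship the shapefile (.shp and .shx) to the executors and point the UDF at it.
spark.sparkContext.addFile("dbfs:/data/cuspatial_data/polygons/polygons.shp")
spark.sparkContext.addFile("dbfs:/data/cuspatial_data/polygons/polygons.shx")
spark.conf.set("spark.cuspatial.sql.udf.shapeFileName", "polygons.shp")

# Register the JNI-backed cuSpatial UDF and run it over the point data.
spark.udf.registerJavaFunction("point_in_polygon", "com.nvidia.spark.rapids.udf.PointInPolygon", None)
df = spark.read.parquet(inputPath)
df.selectExpr("x", "y", "point_in_polygon(x, y) as ret") \
  .write.mode("overwrite").parquet(outputPath)
```

The `ret` column holds the indices of the polygons that contain each point (an empty list when no polygon matches), which is the same output shown in the sample notebook below.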
diff --git a/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh b/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh index 27571388d..987a3ea52 100755 --- a/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh +++ b/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh @@ -15,10 +15,10 @@ # # change to your spark folder -SPARK_HOME=/data/spark-3.2.0-bin-hadoop3.2 +SPARK_HOME=${SPARK_HOME:-/data/spark-3.2.0-bin-hadoop3.2} # change this path to your root path for the dataset -ROOT_PATH=/data/cuspatial_data +ROOT_PATH=${ROOT_PATH:-/data/cuspatial_data} # Extract the sample dataset in ../../datasets/cuspatial_data.tar.gz # Copy the polygons and points data into the root path or change the root path to where they are SHAPE_FILE_DIR=$ROOT_PATH/polygons @@ -31,7 +31,7 @@ rm -rf $DATA_OUT_PATH # the path to keep the jars of spark-rapids & spark-cuspatial JARS=$ROOT_PATH/jars -JARS_PATH=$JARS/rapids-4-spark_2.12-22.06.0.jar,$JARS/spark-cuspatial-22.06.0-SNAPSHOT.jar +JARS_PATH=${JARS_PATH:-$JARS/rapids-4-spark_2.12-22.08.0.jar,$JARS/spark-cuspatial-22.08.0-SNAPSHOT.jar} $SPARK_HOME/bin/spark-submit --master spark://$HOSTNAME:7077 \ --name "Gpu Spatial Join UDF" \ diff --git a/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_db.ipynb b/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_db.ipynb index ba535f63d..4797ada9c 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_db.ipynb +++ b/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_db.ipynb @@ -1 +1,328 @@ -{"cells":[{"cell_type":"code","source":["#define the input & output path\ninputPath='dbfs:/data/cuspatial_data/points'\noutputPath='dbfs:/data/output'\n\n# add the shapefile(.shp & .shx)\nspark.sparkContext.addFile(\"dbfs:/data/cuspatial_data/polygons/polygons.shp\")\nspark.sparkContext.addFile(\"dbfs:/data/cuspatial_data/polygons/polygons.shx\")\nspark.conf.set(\"spark.cuspatial.sql.udf.shapeFileName\", \"polygons.shp\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"4f12a9eb-6580-4575-a6a8-4c08820fe8e0"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":0},{"cell_type":"code","source":["# register the UDF\nspark.udf.registerJavaFunction(\"point_in_polygon\", \"com.nvidia.spark.rapids.udf.PointInPolygon\", None)\n\ndf = spark.read.parquet(inputPath)\ndf = df.selectExpr('x', 'y', 'point_in_polygon(x, y) as ret')\ndf.write.mode(\"overwrite\").parquet(outputPath)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"741761ee-d5cf-4c43-ae73-30703d246901"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":0},{"cell_type":"code","source":["print(\"Input rows: \", df.count())"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"592bfcde-ca6f-4b94-aac1-e3b298f25fcb"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
Input rows: 71\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
Input rows: 71\n
"]}}],"execution_count":0},{"cell_type":"code","source":["# show the result\nresult = spark.read.parquet(outputPath)\nprint(\"Output rows: \", result.count())\nresult.show(71)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"f1b744a4-13a3-4260-b8f0-efcbbf1c0337"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
Output rows: 71\n+-------------------+--------------------+---+\n| x| y|ret|\n+-------------------+--------------------+---+\n|0.48171647380517046| 1.9022922214961997| []|\n| 1.2591725716781235| 0.1448705855995005| []|\n| 0.1895259128530169| 0.5431061133894604| []|\n| 3.028362149164369|0.027638405909631958| []|\n| 1.3890664414691907| 1.5177694304735412| []|\n| 3.1907684812039956| 0.2621847215928189| []|\n| 3.7080407833612004| 0.09804238103130436| []|\n| 3.0706987088385853| 0.9376313558467103| []|\n| 2.0697434332621234| 1.1809465376402173| []|\n| 2.175448214220591| 1.2372448404986038| []|\n| 2.113652420701984| 1.2774712415624014| []|\n| 2.9909779614491687| 1.2420487904041893| []|\n| 4.07037627210835| 1.9486902798139454| []|\n| 4.822583857757069| 0.3234041700489503| []|\n| 4.849847745942472| 1.9531893897409585| []|\n| 4.732546857961497| 0.5659923375279095| []|\n| 4.529792124514895| 1.942673409259531| []|\n| 3.2648444465931474| 2.693039435509084| []|\n| 2.1807636574967466| 3.2296461832828114|[3]|\n| 3.7164018490892348| 2.4612194182614333| []|\n| 2.2006520196663066| 3.7672478678985257|[3]|\n| 6.291790729917634| 2.983311357415729|[1]|\n| 2.5104987015171574| 3.0668114607133137|[3]|\n| 2.3007438625108882| 3.6045900851589048|[3]|\n| 6.101327777646798| 2.5239201807166616|[1]|\n| 6.109985464455084| 2.2235950639628523|[1]|\n| 6.4274219368674315| 2.9754616970668213|[1]|\n| 7.886010001346151| 3.538128217886674| []|\n| 7.5085184104988| 3.623862886287816| []|\n| 7.430677191305505| 3.380489849365283| []|\n| 1.7015273093278767| 7.478882372510933| []|\n| 7.769497359206111| 3.253257011908445| []|\n| 3.86008672302403| 7.513564222799629| []|\n| 0.059011873032214| 5.823535317960799| []|\n| 3.154282922203257| 5.788316610960881| []|\n| 2.4264509160270813| 5.188939408363776|[0]|\n| 1.9804558865545805| 1.3472225743317712| []|\n| 0.8178039499335275| 0.8138440641113271| []|\n| 0.2536015260915061| 1.8762161698642947| []|\n| 3.710910700915217| 0.9937713340192049| []|\n| 3.918090468102582| 0.3338651960183463| []|\n| 3.572744183805594| 0.33184908855075124| []|\n| 3.70669993057843| 0.7485845679979923| []|\n| 3.3588457228653024| 0.2346381514128677| []|\n| 2.520755151373394| 1.902015274420646| []|\n| 2.5322042870739683| 1.419555755682142| []|\n| 2.4613232527836137| 1.0484414482621331| []|\n| 4.975578758530645| 0.9606291981013242| []|\n| 4.5584381091040616| 1.8996548860019926| []|\n| 4.300706849071861|0.021365525588281198| []|\n| 3.01954722322135| 2.57810040095543| []|\n| 3.7622247877537456| 2.8709552313924487| []|\n| 4.75489831780737| 0.7800065259479418| []|\n| 2.566986568683904| 3.6607732238530897|[3]|\n| 3.7002781846945347| 2.3345952955903906| []|\n| 2.493975723955388| 3.3999020934055837|[3]|\n| 2.8222482218882474| 3.8159308233351266|[3]|\n| 6.0821276168848994| 2.5470532680258002|[1]|\n| 2.241538022180476| 3.8812819070357545|[3]|\n| 6.325158445513714| 2.8765450351723674|[1]|\n| 6.444584786789386| 2.174562817047202|[1]|\n| 6.6793884701899| 2.5605928243991434|[1]|\n| 7.250745898479374| 3.4154469467473447| []|\n| 7.079453687660189| 3.063690547962938|[1]|\n| 7.897735998643542| 3.380784914178574| []|\n| 2.2065031771469| 6.896038613284851| []|\n| 1.8703303641352362| 4.209727933188015|[3]|\n| 2.7456295127617385| 7.474216636277054| []|\n| 1.9143371250907073| 6.885401350515916| []|\n| 3.7176098065039747| 6.194330707468438| []|\n| 3.1162712022943757| 6.789029097334483| []|\n+-------------------+--------------------+---+\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
Output rows: 71\n+-------------------+--------------------+---+\n x| y|ret|\n+-------------------+--------------------+---+\n0.48171647380517046| 1.9022922214961997| []|\n 1.2591725716781235| 0.1448705855995005| []|\n 0.1895259128530169| 0.5431061133894604| []|\n 3.028362149164369|0.027638405909631958| []|\n 1.3890664414691907| 1.5177694304735412| []|\n 3.1907684812039956| 0.2621847215928189| []|\n 3.7080407833612004| 0.09804238103130436| []|\n 3.0706987088385853| 0.9376313558467103| []|\n 2.0697434332621234| 1.1809465376402173| []|\n 2.175448214220591| 1.2372448404986038| []|\n 2.113652420701984| 1.2774712415624014| []|\n 2.9909779614491687| 1.2420487904041893| []|\n 4.07037627210835| 1.9486902798139454| []|\n 4.822583857757069| 0.3234041700489503| []|\n 4.849847745942472| 1.9531893897409585| []|\n 4.732546857961497| 0.5659923375279095| []|\n 4.529792124514895| 1.942673409259531| []|\n 3.2648444465931474| 2.693039435509084| []|\n 2.1807636574967466| 3.2296461832828114|[3]|\n 3.7164018490892348| 2.4612194182614333| []|\n 2.2006520196663066| 3.7672478678985257|[3]|\n 6.291790729917634| 2.983311357415729|[1]|\n 2.5104987015171574| 3.0668114607133137|[3]|\n 2.3007438625108882| 3.6045900851589048|[3]|\n 6.101327777646798| 2.5239201807166616|[1]|\n 6.109985464455084| 2.2235950639628523|[1]|\n 6.4274219368674315| 2.9754616970668213|[1]|\n 7.886010001346151| 3.538128217886674| []|\n 7.5085184104988| 3.623862886287816| []|\n 7.430677191305505| 3.380489849365283| []|\n 1.7015273093278767| 7.478882372510933| []|\n 7.769497359206111| 3.253257011908445| []|\n 3.86008672302403| 7.513564222799629| []|\n 0.059011873032214| 5.823535317960799| []|\n 3.154282922203257| 5.788316610960881| []|\n 2.4264509160270813| 5.188939408363776|[0]|\n 1.9804558865545805| 1.3472225743317712| []|\n 0.8178039499335275| 0.8138440641113271| []|\n 0.2536015260915061| 1.8762161698642947| []|\n 3.710910700915217| 0.9937713340192049| []|\n 3.918090468102582| 0.3338651960183463| []|\n 3.572744183805594| 0.33184908855075124| []|\n 3.70669993057843| 0.7485845679979923| []|\n 3.3588457228653024| 0.2346381514128677| []|\n 2.520755151373394| 1.902015274420646| []|\n 2.5322042870739683| 1.419555755682142| []|\n 2.4613232527836137| 1.0484414482621331| []|\n 4.975578758530645| 0.9606291981013242| []|\n 4.5584381091040616| 1.8996548860019926| []|\n 4.300706849071861|0.021365525588281198| []|\n 3.01954722322135| 2.57810040095543| []|\n 3.7622247877537456| 2.8709552313924487| []|\n 4.75489831780737| 0.7800065259479418| []|\n 2.566986568683904| 3.6607732238530897|[3]|\n 3.7002781846945347| 2.3345952955903906| []|\n 2.493975723955388| 3.3999020934055837|[3]|\n 2.8222482218882474| 3.8159308233351266|[3]|\n 6.0821276168848994| 2.5470532680258002|[1]|\n 2.241538022180476| 3.8812819070357545|[3]|\n 6.325158445513714| 2.8765450351723674|[1]|\n 6.444584786789386| 2.174562817047202|[1]|\n 6.6793884701899| 2.5605928243991434|[1]|\n 7.250745898479374| 3.4154469467473447| []|\n 7.079453687660189| 3.063690547962938|[1]|\n 7.897735998643542| 3.380784914178574| []|\n 2.2065031771469| 6.896038613284851| []|\n 1.8703303641352362| 4.209727933188015|[3]|\n 2.7456295127617385| 7.474216636277054| []|\n 1.9143371250907073| 6.885401350515916| []|\n 3.7176098065039747| 6.194330707468438| []|\n 3.1162712022943757| 6.789029097334483| []|\n+-------------------+--------------------+---+\n\n
"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"cuspatial_sample","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":3352849421916703}},"nbformat":4,"nbformat_minor":0} +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "4f12a9eb-6580-4575-a6a8-4c08820fe8e0", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "#define the input & output path\n", + "inputPath='dbfs:/data/cuspatial_data/points'\n", + "outputPath='dbfs:/data/output'\n", + "\n", + "# add the shapefile(.shp & .shx)\n", + "spark.sparkContext.addFile(\"dbfs:/data/cuspatial_data/polygons/polygons.shp\")\n", + "spark.sparkContext.addFile(\"dbfs:/data/cuspatial_data/polygons/polygons.shx\")\n", + "spark.conf.set(\"spark.cuspatial.sql.udf.shapeFileName\", \"polygons.shp\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "741761ee-d5cf-4c43-ae73-30703d246901", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# register the UDF\n", + "spark.udf.registerJavaFunction(\"point_in_polygon\", \"com.nvidia.spark.rapids.udf.PointInPolygon\", None)\n", + "\n", + "df = spark.read.parquet(inputPath)\n", + "df = df.selectExpr('x', 'y', 'point_in_polygon(x, y) as ret')\n", + "df.write.mode(\"overwrite\").parquet(outputPath)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "592bfcde-ca6f-4b94-aac1-e3b298f25fcb", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
Input rows: 71\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
Input rows: 71\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "print(\"Input rows: \", df.count())" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "f1b744a4-13a3-4260-b8f0-efcbbf1c0337", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
Output rows: 71\n", + "+-------------------+--------------------+---+\n", + " x| y|ret|\n", + "+-------------------+--------------------+---+\n", + "0.48171647380517046| 1.9022922214961997| []|\n", + " 1.2591725716781235| 0.1448705855995005| []|\n", + " 0.1895259128530169| 0.5431061133894604| []|\n", + " 3.028362149164369|0.027638405909631958| []|\n", + " 1.3890664414691907| 1.5177694304735412| []|\n", + " 3.1907684812039956| 0.2621847215928189| []|\n", + " 3.7080407833612004| 0.09804238103130436| []|\n", + " 3.0706987088385853| 0.9376313558467103| []|\n", + " 2.0697434332621234| 1.1809465376402173| []|\n", + " 2.175448214220591| 1.2372448404986038| []|\n", + " 2.113652420701984| 1.2774712415624014| []|\n", + " 2.9909779614491687| 1.2420487904041893| []|\n", + " 4.07037627210835| 1.9486902798139454| []|\n", + " 4.822583857757069| 0.3234041700489503| []|\n", + " 4.849847745942472| 1.9531893897409585| []|\n", + " 4.732546857961497| 0.5659923375279095| []|\n", + " 4.529792124514895| 1.942673409259531| []|\n", + " 3.2648444465931474| 2.693039435509084| []|\n", + " 2.1807636574967466| 3.2296461832828114|[3]|\n", + " 3.7164018490892348| 2.4612194182614333| []|\n", + " 2.2006520196663066| 3.7672478678985257|[3]|\n", + " 6.291790729917634| 2.983311357415729|[1]|\n", + " 2.5104987015171574| 3.0668114607133137|[3]|\n", + " 2.3007438625108882| 3.6045900851589048|[3]|\n", + " 6.101327777646798| 2.5239201807166616|[1]|\n", + " 6.109985464455084| 2.2235950639628523|[1]|\n", + " 6.4274219368674315| 2.9754616970668213|[1]|\n", + " 7.886010001346151| 3.538128217886674| []|\n", + " 7.5085184104988| 3.623862886287816| []|\n", + " 7.430677191305505| 3.380489849365283| []|\n", + " 1.7015273093278767| 7.478882372510933| []|\n", + " 7.769497359206111| 3.253257011908445| []|\n", + " 3.86008672302403| 7.513564222799629| []|\n", + " 0.059011873032214| 5.823535317960799| []|\n", + " 3.154282922203257| 5.788316610960881| []|\n", + " 2.4264509160270813| 5.188939408363776|[0]|\n", + " 1.9804558865545805| 1.3472225743317712| []|\n", + " 0.8178039499335275| 0.8138440641113271| []|\n", + " 0.2536015260915061| 1.8762161698642947| []|\n", + " 3.710910700915217| 0.9937713340192049| []|\n", + " 3.918090468102582| 0.3338651960183463| []|\n", + " 3.572744183805594| 0.33184908855075124| []|\n", + " 3.70669993057843| 0.7485845679979923| []|\n", + " 3.3588457228653024| 0.2346381514128677| []|\n", + " 2.520755151373394| 1.902015274420646| []|\n", + " 2.5322042870739683| 1.419555755682142| []|\n", + " 2.4613232527836137| 1.0484414482621331| []|\n", + " 4.975578758530645| 0.9606291981013242| []|\n", + " 4.5584381091040616| 1.8996548860019926| []|\n", + " 4.300706849071861|0.021365525588281198| []|\n", + " 3.01954722322135| 2.57810040095543| []|\n", + " 3.7622247877537456| 2.8709552313924487| []|\n", + " 4.75489831780737| 0.7800065259479418| []|\n", + " 2.566986568683904| 3.6607732238530897|[3]|\n", + " 3.7002781846945347| 2.3345952955903906| []|\n", + " 2.493975723955388| 3.3999020934055837|[3]|\n", + " 2.8222482218882474| 3.8159308233351266|[3]|\n", + " 6.0821276168848994| 2.5470532680258002|[1]|\n", + " 2.241538022180476| 3.8812819070357545|[3]|\n", + " 6.325158445513714| 2.8765450351723674|[1]|\n", + " 6.444584786789386| 2.174562817047202|[1]|\n", + " 6.6793884701899| 2.5605928243991434|[1]|\n", + " 7.250745898479374| 3.4154469467473447| []|\n", + " 7.079453687660189| 3.063690547962938|[1]|\n", + " 7.897735998643542| 3.380784914178574| []|\n", + " 2.2065031771469| 6.896038613284851| []|\n", + " 1.8703303641352362| 
4.209727933188015|[3]|\n", + " 2.7456295127617385| 7.474216636277054| []|\n", + " 1.9143371250907073| 6.885401350515916| []|\n", + " 3.7176098065039747| 6.194330707468438| []|\n", + " 3.1162712022943757| 6.789029097334483| []|\n", + "+-------------------+--------------------+---+\n", + "\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
Output rows: 71\n+-------------------+--------------------+---+\n| x| y|ret|\n+-------------------+--------------------+---+\n|0.48171647380517046| 1.9022922214961997| []|\n| 1.2591725716781235| 0.1448705855995005| []|\n| 0.1895259128530169| 0.5431061133894604| []|\n| 3.028362149164369|0.027638405909631958| []|\n| 1.3890664414691907| 1.5177694304735412| []|\n| 3.1907684812039956| 0.2621847215928189| []|\n| 3.7080407833612004| 0.09804238103130436| []|\n| 3.0706987088385853| 0.9376313558467103| []|\n| 2.0697434332621234| 1.1809465376402173| []|\n| 2.175448214220591| 1.2372448404986038| []|\n| 2.113652420701984| 1.2774712415624014| []|\n| 2.9909779614491687| 1.2420487904041893| []|\n| 4.07037627210835| 1.9486902798139454| []|\n| 4.822583857757069| 0.3234041700489503| []|\n| 4.849847745942472| 1.9531893897409585| []|\n| 4.732546857961497| 0.5659923375279095| []|\n| 4.529792124514895| 1.942673409259531| []|\n| 3.2648444465931474| 2.693039435509084| []|\n| 2.1807636574967466| 3.2296461832828114|[3]|\n| 3.7164018490892348| 2.4612194182614333| []|\n| 2.2006520196663066| 3.7672478678985257|[3]|\n| 6.291790729917634| 2.983311357415729|[1]|\n| 2.5104987015171574| 3.0668114607133137|[3]|\n| 2.3007438625108882| 3.6045900851589048|[3]|\n| 6.101327777646798| 2.5239201807166616|[1]|\n| 6.109985464455084| 2.2235950639628523|[1]|\n| 6.4274219368674315| 2.9754616970668213|[1]|\n| 7.886010001346151| 3.538128217886674| []|\n| 7.5085184104988| 3.623862886287816| []|\n| 7.430677191305505| 3.380489849365283| []|\n| 1.7015273093278767| 7.478882372510933| []|\n| 7.769497359206111| 3.253257011908445| []|\n| 3.86008672302403| 7.513564222799629| []|\n| 0.059011873032214| 5.823535317960799| []|\n| 3.154282922203257| 5.788316610960881| []|\n| 2.4264509160270813| 5.188939408363776|[0]|\n| 1.9804558865545805| 1.3472225743317712| []|\n| 0.8178039499335275| 0.8138440641113271| []|\n| 0.2536015260915061| 1.8762161698642947| []|\n| 3.710910700915217| 0.9937713340192049| []|\n| 3.918090468102582| 0.3338651960183463| []|\n| 3.572744183805594| 0.33184908855075124| []|\n| 3.70669993057843| 0.7485845679979923| []|\n| 3.3588457228653024| 0.2346381514128677| []|\n| 2.520755151373394| 1.902015274420646| []|\n| 2.5322042870739683| 1.419555755682142| []|\n| 2.4613232527836137| 1.0484414482621331| []|\n| 4.975578758530645| 0.9606291981013242| []|\n| 4.5584381091040616| 1.8996548860019926| []|\n| 4.300706849071861|0.021365525588281198| []|\n| 3.01954722322135| 2.57810040095543| []|\n| 3.7622247877537456| 2.8709552313924487| []|\n| 4.75489831780737| 0.7800065259479418| []|\n| 2.566986568683904| 3.6607732238530897|[3]|\n| 3.7002781846945347| 2.3345952955903906| []|\n| 2.493975723955388| 3.3999020934055837|[3]|\n| 2.8222482218882474| 3.8159308233351266|[3]|\n| 6.0821276168848994| 2.5470532680258002|[1]|\n| 2.241538022180476| 3.8812819070357545|[3]|\n| 6.325158445513714| 2.8765450351723674|[1]|\n| 6.444584786789386| 2.174562817047202|[1]|\n| 6.6793884701899| 2.5605928243991434|[1]|\n| 7.250745898479374| 3.4154469467473447| []|\n| 7.079453687660189| 3.063690547962938|[1]|\n| 7.897735998643542| 3.380784914178574| []|\n| 2.2065031771469| 6.896038613284851| []|\n| 1.8703303641352362| 4.209727933188015|[3]|\n| 2.7456295127617385| 7.474216636277054| []|\n| 1.9143371250907073| 6.885401350515916| []|\n| 3.7176098065039747| 6.194330707468438| []|\n| 3.1162712022943757| 6.789029097334483| []|\n+-------------------+--------------------+---+\n\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# show the result\n", + "result = spark.read.parquet(outputPath)\n", + "print(\"Output rows: \", result.count())\n", + "result.show(71)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "cuspatial_sample", + "notebookOrigID": 3352849421916703, + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb b/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb index a53a6d813..f13889ed1 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb +++ b/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb @@ -8,8 +8,10 @@ "outputs": [], "source": [ "from pyspark.sql import SparkSession\n", + "import os\n", + "jarsPath = os.getenv(\"JARS_PATH\", \"/data/cuspatial_data/jars/rapids-4-spark_2.12-22.08.0.jar,/data/cuspatial_data/jars/spark-cuspatial-22.08.0-SNAPSHOT.jar\")\n", "spark = SparkSession.builder \\\n", - " .config(\"spark.jars\", \"/data/cuspatial_data/jars/rapids-4-spark_2.12-22.06.0.jar,/data/cuspatial_data/jars/spark-cuspatial-22.06.0-SNAPSHOT.jar\") \\\n", + " .config(\"spark.jars\", jarsPath) \\\n", " .config(\"spark.sql.adaptive.enabled\", \"false\") \\\n", " .config(\"spark.executor.memory\", \"20GB\") \\\n", " .config(\"spark.executor.cores\", \"6\") \\\n", @@ -28,11 +30,12 @@ "outputs": [], "source": [ "# prepare shape files\n", - "spark.sparkContext.addFile(\"/data/cuspatial_data/polygons/polygons.shp\")\n", - "spark.sparkContext.addFile(\"/data/cuspatial_data/polygons/polygons.shx\")\n", + "rootPath = os.getenv(\"ROOT_PATH\", \"/data/cuspatial_data\")\n", + "spark.sparkContext.addFile(rootPath + \"/polygons/polygons.shp\")\n", + "spark.sparkContext.addFile(rootPath + \"/polygons/polygons.shx\")\n", "\n", - "inputPath = \"/data/cuspatial_data/points/\"\n", - "outputPath = \"/data/cuspatial_data/output/\"" + "inputPath = rootPath + \"/points/\"\n", + "outputPath = rootPath + \"/output/\"" ] }, { diff --git a/examples/UDF-Examples/Spark-cuSpatial/notebooks/spacial-cpu-apache-sedona_db.ipynb b/examples/UDF-Examples/Spark-cuSpatial/notebooks/spacial-cpu-apache-sedona_db.ipynb new file mode 100644 index 000000000..47cf5e872 --- /dev/null +++ b/examples/UDF-Examples/Spark-cuSpatial/notebooks/spacial-cpu-apache-sedona_db.ipynb @@ -0,0 +1,364 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "61a74001-716b-4411-aecb-77d07058d200", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
Requirement already satisfied: geopandas in /databricks/python3/lib/python3.8/site-packages (0.11.0)\n", + "Requirement already satisfied: pyproj>=2.6.1.post1 in /databricks/python3/lib/python3.8/site-packages (from geopandas) (3.3.1)\n", + "Requirement already satisfied: shapely<2,>=1.7 in /databricks/python3/lib/python3.8/site-packages (from geopandas) (1.8.2)\n", + "Requirement already satisfied: pandas>=1.0.0 in /databricks/python3/lib/python3.8/site-packages (from geopandas) (1.2.4)\n", + "Requirement already satisfied: packaging in /databricks/python3/lib/python3.8/site-packages (from geopandas) (20.9)\n", + "Requirement already satisfied: fiona>=1.8 in /databricks/python3/lib/python3.8/site-packages (from geopandas) (1.8.21)\n", + "Requirement already satisfied: six>=1.7 in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (1.15.0)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.8/dist-packages (from fiona>=1.8->geopandas) (52.0.0)\n", + "Requirement already satisfied: click>=4.0 in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (8.1.3)\n", + "Requirement already satisfied: certifi in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (2020.12.5)\n", + "Requirement already satisfied: munch in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (2.5.0)\n", + "Requirement already satisfied: cligj>=0.5 in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (0.7.2)\n", + "Requirement already satisfied: click-plugins>=1.0 in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (1.1.1)\n", + "Requirement already satisfied: attrs>=17 in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (20.3.0)\n", + "Requirement already satisfied: pytz>=2017.3 in /databricks/python3/lib/python3.8/site-packages (from pandas>=1.0.0->geopandas) (2020.5)\n", + "Requirement already satisfied: numpy>=1.16.5 in /databricks/python3/lib/python3.8/site-packages (from pandas>=1.0.0->geopandas) (1.20.1)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /databricks/python3/lib/python3.8/site-packages (from pandas>=1.0.0->geopandas) (2.8.1)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /databricks/python3/lib/python3.8/site-packages (from packaging->geopandas) (2.4.7)\n", + "WARNING: You are using pip version 21.0.1; however, version 22.1.2 is available.\n", + "You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
Requirement already satisfied: geopandas in /databricks/python3/lib/python3.8/site-packages (0.11.0)\nRequirement already satisfied: pyproj>=2.6.1.post1 in /databricks/python3/lib/python3.8/site-packages (from geopandas) (3.3.1)\nRequirement already satisfied: shapely<2,>=1.7 in /databricks/python3/lib/python3.8/site-packages (from geopandas) (1.8.2)\nRequirement already satisfied: pandas>=1.0.0 in /databricks/python3/lib/python3.8/site-packages (from geopandas) (1.2.4)\nRequirement already satisfied: packaging in /databricks/python3/lib/python3.8/site-packages (from geopandas) (20.9)\nRequirement already satisfied: fiona>=1.8 in /databricks/python3/lib/python3.8/site-packages (from geopandas) (1.8.21)\nRequirement already satisfied: six>=1.7 in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (1.15.0)\nRequirement already satisfied: setuptools in /usr/local/lib/python3.8/dist-packages (from fiona>=1.8->geopandas) (52.0.0)\nRequirement already satisfied: click>=4.0 in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (8.1.3)\nRequirement already satisfied: certifi in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (2020.12.5)\nRequirement already satisfied: munch in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (2.5.0)\nRequirement already satisfied: cligj>=0.5 in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (0.7.2)\nRequirement already satisfied: click-plugins>=1.0 in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (1.1.1)\nRequirement already satisfied: attrs>=17 in /databricks/python3/lib/python3.8/site-packages (from fiona>=1.8->geopandas) (20.3.0)\nRequirement already satisfied: pytz>=2017.3 in /databricks/python3/lib/python3.8/site-packages (from pandas>=1.0.0->geopandas) (2020.5)\nRequirement already satisfied: numpy>=1.16.5 in /databricks/python3/lib/python3.8/site-packages (from pandas>=1.0.0->geopandas) (1.20.1)\nRequirement already satisfied: python-dateutil>=2.7.3 in /databricks/python3/lib/python3.8/site-packages (from pandas>=1.0.0->geopandas) (2.8.1)\nRequirement already satisfied: pyparsing>=2.0.2 in /databricks/python3/lib/python3.8/site-packages (from packaging->geopandas) (2.4.7)\nWARNING: You are using pip version 21.0.1; however, version 22.1.2 is available.\nYou should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sh \n", + "pip install geopandas" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "4fd97bf3-dc08-495f-9cfe-e9e551f40e16", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "import os\n", + "import geopandas as gpd\n", + "from pyspark.sql.functions import col, expr, when\n", + "from sedona.register import SedonaRegistrator\n", + "from sedona.utils import SedonaKryoRegistrator, KryoSerializer" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "6b15de34-d411-457b-89fb-7232587ae949", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
/databricks/spark/python/pyspark/sql/pandas/conversion.py:340: UserWarning: createDataFrame attempted Arrow optimization because 'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, failed by the reason below:\n", + " Did not pass numpy.dtype object\n", + "Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.\n", + " warnings.warn(msg)\n", + "Out[9]: 4
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
/databricks/spark/python/pyspark/sql/pandas/conversion.py:340: UserWarning: createDataFrame attempted Arrow optimization because 'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, failed by the reason below:\n Did not pass numpy.dtype object\nAttempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.\n warnings.warn(msg)\nOut[9]: 4
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# read the shapefile\n", + "polygons = gpd.read_file(\"/dbfs/data/cuspatial_data/polygons/polygons.shp\")\n", + "polygons_df = spark.createDataFrame(\n", + " polygons\n", + ")\n", + "polygons_df.createOrReplaceTempView(\"polygons\")\n", + "polygons_df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3a4d1a80-72e4-490d-8152-f6f231cac37f", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
+--------------------+\n", + " mypoint|\n", + "+--------------------+\n", + "POINT (0.48171647...|\n", + "POINT (1.25917257...|\n", + "POINT (0.18952591...|\n", + "POINT (3.02836214...|\n", + "POINT (1.38906644...|\n", + "+--------------------+\n", + "only showing top 5 rows\n", + "\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
+--------------------+\n| mypoint|\n+--------------------+\n|POINT (0.48171647...|\n|POINT (1.25917257...|\n|POINT (0.18952591...|\n|POINT (3.02836214...|\n|POINT (1.38906644...|\n+--------------------+\nonly showing top 5 rows\n\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "point_parquet_df = spark.read.format(\"parquet\").\\\n", + " load(\"dbfs:/data/cuspatial_data/points\")\n", + "\n", + "point_parquet_df.createOrReplaceTempView(\"pointtable\")\n", + "point_df = spark.sql(\"select ST_Point(x, y) as mypoint from pointtable\")\n", + "point_df.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "8ec977f5-937e-45ce-89d6-46fa3b48cc39", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
+--------------------+------------------+------------------+\n", + " geometry| x| y|\n", + "+--------------------+------------------+------------------+\n", + "POLYGON ((2.08811...|2.1807636574967466|3.2296461832828114|\n", + "POLYGON ((2.08811...|2.2006520196663066|3.7672478678985257|\n", + "POLYGON ((2.08811...|2.5104987015171574|3.0668114607133137|\n", + "POLYGON ((2.08811...|2.3007438625108882|3.6045900851589048|\n", + "POLYGON ((2.08811...| 2.566986568683904|3.6607732238530897|\n", + "POLYGON ((2.08811...| 2.493975723955388|3.3999020934055837|\n", + "POLYGON ((2.08811...|2.8222482218882474|3.8159308233351266|\n", + "POLYGON ((2.08811...| 2.241538022180476|3.8812819070357545|\n", + "POLYGON ((2.08811...|1.8703303641352362| 4.209727933188015|\n", + "POLYGON ((2.48845...|2.4264509160270813| 5.188939408363776|\n", + "POLYGON ((5.03982...| 6.291790729917634| 2.983311357415729|\n", + "POLYGON ((5.03982...| 6.101327777646798|2.5239201807166616|\n", + "POLYGON ((5.03982...| 6.109985464455084|2.2235950639628523|\n", + "POLYGON ((5.03982...|6.4274219368674315|2.9754616970668213|\n", + "POLYGON ((5.03982...|6.0821276168848994|2.5470532680258002|\n", + "POLYGON ((5.03982...| 6.325158445513714|2.8765450351723674|\n", + "POLYGON ((5.03982...| 6.444584786789386| 2.174562817047202|\n", + "POLYGON ((5.03982...| 6.6793884701899|2.5605928243991434|\n", + "POLYGON ((5.03982...| 7.079453687660189| 3.063690547962938|\n", + "+--------------------+------------------+------------------+\n", + "\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
+--------------------+------------------+------------------+\n| geometry| x| y|\n+--------------------+------------------+------------------+\n|POLYGON ((2.08811...|2.1807636574967466|3.2296461832828114|\n|POLYGON ((2.08811...|2.2006520196663066|3.7672478678985257|\n|POLYGON ((2.08811...|2.5104987015171574|3.0668114607133137|\n|POLYGON ((2.08811...|2.3007438625108882|3.6045900851589048|\n|POLYGON ((2.08811...| 2.566986568683904|3.6607732238530897|\n|POLYGON ((2.08811...| 2.493975723955388|3.3999020934055837|\n|POLYGON ((2.08811...|2.8222482218882474|3.8159308233351266|\n|POLYGON ((2.08811...| 2.241538022180476|3.8812819070357545|\n|POLYGON ((2.08811...|1.8703303641352362| 4.209727933188015|\n|POLYGON ((2.48845...|2.4264509160270813| 5.188939408363776|\n|POLYGON ((5.03982...| 6.291790729917634| 2.983311357415729|\n|POLYGON ((5.03982...| 6.101327777646798|2.5239201807166616|\n|POLYGON ((5.03982...| 6.109985464455084|2.2235950639628523|\n|POLYGON ((5.03982...|6.4274219368674315|2.9754616970668213|\n|POLYGON ((5.03982...|6.0821276168848994|2.5470532680258002|\n|POLYGON ((5.03982...| 6.325158445513714|2.8765450351723674|\n|POLYGON ((5.03982...| 6.444584786789386| 2.174562817047202|\n|POLYGON ((5.03982...| 6.6793884701899|2.5605928243991434|\n|POLYGON ((5.03982...| 7.079453687660189| 3.063690547962938|\n+--------------------+------------------+------------------+\n\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "result = spark.sql(\"\\\n", + "SELECT polygons.*, pointtable.* FROM polygons ,pointtable \\\n", + "WHERE ST_Contains(polygons.geometry, ST_Point(pointtable.x, pointtable.y)) \\\n", + "\")\n", + "result.show()" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "spacial-cpu-apache-sedona", + "notebookOrigID": 1618423020047086, + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/UDF-Examples/Spark-cuSpatial/pom.xml b/examples/UDF-Examples/Spark-cuSpatial/pom.xml index 1196fd775..6f8f08a49 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/pom.xml +++ b/examples/UDF-Examples/Spark-cuSpatial/pom.xml @@ -24,13 +24,13 @@ UDF of the cuSpatial case for the RAPIDS Accelerator The RAPIDS accelerated user defined function of the cuSpatial case for use with the RAPIDS Accelerator for Apache Spark - 22.06.0-SNAPSHOT + 22.08.0-SNAPSHOT 1.8 1.8 8 - 22.06.0 + 22.08.0 2.12 3.2.0 ${project.build.directory}/cpp-build diff --git a/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt b/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt index 0ab52a7c8..40eff8c31 100755 --- a/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt +++ b/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) -project(SPATIALUDJNI VERSION 22.06.0 LANGUAGES C CXX CUDA) +project(SPATIALUDJNI VERSION 22.08.0 LANGUAGES C CXX CUDA) ################################################################################################### # - build type ------------------------------------------------------------------------------------ diff --git a/examples/XGBoost-Examples/README.md b/examples/XGBoost-Examples/README.md index 4d064c8d7..f4e654530 100644 --- a/examples/XGBoost-Examples/README.md +++ b/examples/XGBoost-Examples/README.md @@ -6,7 +6,7 @@ For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 bran uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/). Most data scientists spend a lot of time not only on Training models but also processing the large amounts of data needed to train these models. -As you can see below, XGBoost training on GPUs can be upto 7X and data processing using +As you can see below, XGBoost training on GPUs can be up to 10X and data processing using RAPIDS Accelerator can also be accelerated with an end-to-end speed-up of 7X on GPU compared to CPU. In the public cloud, better performance can lead to significantly lower costs as demonstrated in this [blog](https://developer.nvidia.com/blog/gpu-accelerated-spark-xgboost/). 
@@ -96,3 +96,8 @@ otherwise the customized CrossValidator may schedule more than 1 xgboost trainin For XGBoost job, if the number of shuffle stage tasks before training is less than the num_worker, the training tasks will be scheduled to run on part of nodes instead of all nodes due to Spark Data Locality feature. The workaround is to increase the partitions of the shuffle stage by setting `spark.sql.files.maxPartitionBytes=RightNum`. +If you are running XGBoost scala notebooks on Dataproc, please make sure to update below configs to avoid job failure: +``` +spark.dynamicAllocation.enabled=false +spark.task.resource.gpu.amount=1 +``` \ No newline at end of file diff --git a/examples/XGBoost-Examples/agaricus/notebooks/scala/agaricus-gpu.ipynb b/examples/XGBoost-Examples/agaricus/notebooks/scala/agaricus-gpu.ipynb index 738a5a0cb..06efa2ba9 100644 --- a/examples/XGBoost-Examples/agaricus/notebooks/scala/agaricus-gpu.ipynb +++ b/examples/XGBoost-Examples/agaricus/notebooks/scala/agaricus-gpu.ipynb @@ -604,9 +604,9 @@ ], "metadata": { "kernelspec": { - "display_name": "XGBoost4j-Spark-2206 - Scala", + "display_name": "XGBoost4j-Spark - Scala", "language": "scala", - "name": "xgboost4j-spark-2206_scala" + "name": "XGBoost4j-Spark_scala" }, "language_info": { "codemirror_mode": "text/x-scala", diff --git a/examples/XGBoost-Examples/app-parameters/supported_xgboost_parameters_python.md b/examples/XGBoost-Examples/app-parameters/supported_xgboost_parameters_python.md index 10b1c293a..b3cf72ebf 100644 --- a/examples/XGBoost-Examples/app-parameters/supported_xgboost_parameters_python.md +++ b/examples/XGBoost-Examples/app-parameters/supported_xgboost_parameters_python.md @@ -30,6 +30,7 @@ This is a description of all the parameters available when you are running examp * `--dataPath=raw::[path]`: Path to the raw data files to be transformed by taxi/ETLMain. * `--dataPath=perf::[path]`,`-dataPath=acq::[path]`: Paths to the raw data files to be transformed by mortgage/ETLMain. * `--dataPath=out::`: Path where to place the output data files for both mortgage/ETLMain and taxi/ETLMain. + * `--dataPath=tmp::`: Path where to place the output data files for converting raw csv format to parquet. 6. `--modelPath=[path]`: Path to save model after training, or where to load model for transforming only. Required only when mode is 'transform'. 7. `--overwrite=[true|false]`: Whether to overwrite the current model data under 'modelPath'. Default is false. You may need to set to true to avoid IOException when saving the model to a path already exists. 8. `--hasHeader=[true|false]`: Indicate whether the csv file has header. diff --git a/examples/XGBoost-Examples/app-parameters/supported_xgboost_parameters_scala.md b/examples/XGBoost-Examples/app-parameters/supported_xgboost_parameters_scala.md index 5b1d77410..838404342 100644 --- a/examples/XGBoost-Examples/app-parameters/supported_xgboost_parameters_scala.md +++ b/examples/XGBoost-Examples/app-parameters/supported_xgboost_parameters_scala.md @@ -19,6 +19,7 @@ This is a description of all the parameters available when you are running examp * `-dataPath=raw::[path]`: Path to the raw data files to be transformed by taxi/ETLMain. * `-dataPath=perf::[path]`,`-dataPath=acq::[path]`: Paths to the raw data files to be transformed by mortgage/ETLMain. * `-dataPath=out::`: Path where to place the output data files for both mortgage/ETLMain and taxi/ETLMain. + * `-dataPath=tmp::`: Path where to place the output data files for converting raw csv format to parquet. 5. 
`-modelPath=[path]`: Path to save model after training, or where to load model for transforming only. Required only when mode is 'transform'. 6. `-overwrite=[true|false]`: Whether to overwrite the current model data under 'modelPath'. Default is false. You may need to set to true to avoid IOException when saving the model to a path already exists. 7. `-hasHeader=[true|false]`: Indicate whether the csv file has header. diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb index a544f5795..63a11ccfd 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb @@ -4,6 +4,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "# Dataset\n", + "\n", + "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.08/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", + "\n", "# ETL + XGBoost train & transform\n", "\n", "This notebook is an end-to-end example of ETL + XGBoost Train & Transform by using [Spark-Rapids](https://github.com/NVIDIA/spark-rapids) and [XGBoost](https://github.com/nvidia/spark-xgboost) with GPU accelerated.\n", @@ -50,8 +54,8 @@ "source": [ "# The input path of dataset\n", "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n", - "orig_perf_path=dataRoot + \"/mortgage/Performance/\"\n", - "orig_acq_path=dataRoot + \"/mortgage/Acquisition/\"" + "orig_raw_path = dataRoot + \"/mortgage/input/\"", + "orig_raw_path_csv2parquet = dataRoot + \"/mortgage/output/csv2parquet/\"" ] }, { @@ -72,15 +76,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Set True to save the dataset after ETL\n", + "# Set True to save processed dataset after ETL\n", "# Set False, the dataset after ETL will be directly used in XGBoost train and transform\n", + "\n", "is_save_dataset=True\n", - "# the path to save the train dataset\n", - "output_path_train=dataRoot + \"/mortgage/output/train/\"\n", - "# the path to save the test dataset\n", - "output_path_test=dataRoot + \"/mortgage/output/test/\"\n", + "output_path_data=dataRoot + \"/mortgage/output/data/\"\n", "# the path to save the xgboost model\n", - "output_path_model=dataRoot + \"/mortgage/new-model-path\"" + "output_path_model=dataRoot + \"/mortgage/output/model/\"" ] }, { @@ -97,65 +99,117 @@ "outputs": [], "source": [ "# File schema\n", - "_csv_perf_schema = StructType([\n", - " StructField(\"loan_id\", LongType()),\n", - " StructField(\"monthly_reporting_period\", StringType()),\n", - " StructField(\"servicer\", StringType()),\n", - " StructField(\"interest_rate\", DoubleType()),\n", - " StructField(\"current_actual_upb\", DoubleType()),\n", - " StructField(\"loan_age\", DoubleType()),\n", - " StructField(\"remaining_months_to_legal_maturity\", DoubleType()),\n", - " StructField(\"adj_remaining_months_to_maturity\", DoubleType()),\n", - " StructField(\"maturity_date\", StringType()),\n", - " StructField(\"msa\", DoubleType()),\n", - " StructField(\"current_loan_delinquency_status\", IntegerType()),\n", - " StructField(\"mod_flag\", StringType()),\n", - " StructField(\"zero_balance_code\", StringType()),\n", - " StructField(\"zero_balance_effective_date\", 
StringType()),\n", - " StructField(\"last_paid_installment_date\", StringType()),\n", - " StructField(\"foreclosed_after\", StringType()),\n", - " StructField(\"disposition_date\", StringType()),\n", - " StructField(\"foreclosure_costs\", DoubleType()),\n", - " StructField(\"prop_preservation_and_repair_costs\", DoubleType()),\n", - " StructField(\"asset_recovery_costs\", DoubleType()),\n", - " StructField(\"misc_holding_expenses\", DoubleType()),\n", - " StructField(\"holding_taxes\", DoubleType()),\n", - " StructField(\"net_sale_proceeds\", DoubleType()),\n", - " StructField(\"credit_enhancement_proceeds\", DoubleType()),\n", - " StructField(\"repurchase_make_whole_proceeds\", StringType()),\n", - " StructField(\"other_foreclosure_proceeds\", DoubleType()),\n", - " StructField(\"non_interest_bearing_upb\", DoubleType()),\n", - " StructField(\"principal_forgiveness_upb\", StringType()),\n", - " StructField(\"repurchase_make_whole_proceeds_flag\", StringType()),\n", - " StructField(\"foreclosure_principal_write_off_amount\", StringType()),\n", - " StructField(\"servicing_activity_indicator\", StringType())])\n", "\n", - "_csv_acq_schema = StructType([\n", - " StructField(\"loan_id\", LongType()),\n", - " StructField(\"orig_channel\", StringType()),\n", - " StructField(\"seller_name\", StringType()),\n", - " StructField(\"orig_interest_rate\", DoubleType()),\n", - " StructField(\"orig_upb\", IntegerType()),\n", - " StructField(\"orig_loan_term\", IntegerType()),\n", - " StructField(\"orig_date\", StringType()),\n", - " StructField(\"first_pay_date\", StringType()),\n", - " StructField(\"orig_ltv\", DoubleType()),\n", - " StructField(\"orig_cltv\", DoubleType()),\n", - " StructField(\"num_borrowers\", DoubleType()),\n", - " StructField(\"dti\", DoubleType()),\n", - " StructField(\"borrower_credit_score\", DoubleType()),\n", - " StructField(\"first_home_buyer\", StringType()),\n", - " StructField(\"loan_purpose\", StringType()),\n", - " StructField(\"property_type\", StringType()),\n", - " StructField(\"num_units\", IntegerType()),\n", - " StructField(\"occupancy_status\", StringType()),\n", - " StructField(\"property_state\", StringType()),\n", - " StructField(\"zip\", IntegerType()),\n", - " StructField(\"mortgage_insurance_percent\", DoubleType()),\n", - " StructField(\"product_type\", StringType()),\n", - " StructField(\"coborrow_credit_score\", DoubleType()),\n", - " StructField(\"mortgage_insurance_type\", DoubleType()),\n", - " StructField(\"relocation_mortgage_indicator\", StringType())])" + "_csv_raw_schema = StructType([\n", + " StructField(\"reference_pool_id\", StringType()),\n", + " StructField(\"loan_id\", LongType()),\n", + " StructField(\"monthly_reporting_period\", StringType()),\n", + " StructField(\"orig_channel\", StringType()),\n", + " StructField(\"seller_name\", StringType()),\n", + " StructField(\"servicer\", StringType()),\n", + " StructField(\"master_servicer\", StringType()),\n", + " StructField(\"orig_interest_rate\", DoubleType()),\n", + " StructField(\"interest_rate\", DoubleType()),\n", + " StructField(\"orig_upb\", DoubleType()),\n", + " StructField(\"upb_at_issuance\", StringType()),\n", + " StructField(\"current_actual_upb\", DoubleType()),\n", + " StructField(\"orig_loan_term\", IntegerType()),\n", + " StructField(\"orig_date\", StringType()),\n", + " StructField(\"first_pay_date\", StringType()), \n", + " StructField(\"loan_age\", DoubleType()),\n", + " StructField(\"remaining_months_to_legal_maturity\", DoubleType()),\n", + " 
StructField(\"adj_remaining_months_to_maturity\", DoubleType()),\n", + " StructField(\"maturity_date\", StringType()),\n", + " StructField(\"orig_ltv\", DoubleType()),\n", + " StructField(\"orig_cltv\", DoubleType()),\n", + " StructField(\"num_borrowers\", DoubleType()),\n", + " StructField(\"dti\", DoubleType()),\n", + " StructField(\"borrower_credit_score\", DoubleType()),\n", + " StructField(\"coborrow_credit_score\", DoubleType()),\n", + " StructField(\"first_home_buyer\", StringType()),\n", + " StructField(\"loan_purpose\", StringType()),\n", + " StructField(\"property_type\", StringType()),\n", + " StructField(\"num_units\", IntegerType()),\n", + " StructField(\"occupancy_status\", StringType()),\n", + " StructField(\"property_state\", StringType()),\n", + " StructField(\"msa\", DoubleType()),\n", + " StructField(\"zip\", IntegerType()),\n", + " StructField(\"mortgage_insurance_percent\", DoubleType()),\n", + " StructField(\"product_type\", StringType()),\n", + " StructField(\"prepayment_penalty_indicator\", StringType()),\n", + " StructField(\"interest_only_loan_indicator\", StringType()),\n", + " StructField(\"interest_only_first_principal_and_interest_payment_date\", StringType()),\n", + " StructField(\"months_to_amortization\", StringType()),\n", + " StructField(\"current_loan_delinquency_status\", IntegerType()),\n", + " StructField(\"loan_payment_history\", StringType()),\n", + " StructField(\"mod_flag\", StringType()),\n", + " StructField(\"mortgage_insurance_cancellation_indicator\", StringType()),\n", + " StructField(\"zero_balance_code\", StringType()),\n", + " StructField(\"zero_balance_effective_date\", StringType()),\n", + " StructField(\"upb_at_the_time_of_removal\", StringType()),\n", + " StructField(\"repurchase_date\", StringType()),\n", + " StructField(\"scheduled_principal_current\", StringType()),\n", + " StructField(\"total_principal_current\", StringType()),\n", + " StructField(\"unscheduled_principal_current\", StringType()),\n", + " StructField(\"last_paid_installment_date\", StringType()),\n", + " StructField(\"foreclosed_after\", StringType()),\n", + " StructField(\"disposition_date\", StringType()),\n", + " StructField(\"foreclosure_costs\", DoubleType()),\n", + " StructField(\"prop_preservation_and_repair_costs\", DoubleType()),\n", + " StructField(\"asset_recovery_costs\", DoubleType()),\n", + " StructField(\"misc_holding_expenses\", DoubleType()),\n", + " StructField(\"holding_taxes\", DoubleType()),\n", + " StructField(\"net_sale_proceeds\", DoubleType()),\n", + " StructField(\"credit_enhancement_proceeds\", DoubleType()),\n", + " StructField(\"repurchase_make_whole_proceeds\", StringType()),\n", + " StructField(\"other_foreclosure_proceeds\", DoubleType()),\n", + " StructField(\"non_interest_bearing_upb\", DoubleType()),\n", + " StructField(\"principal_forgiveness_upb\", StringType()),\n", + " StructField(\"original_list_start_date\", StringType()),\n", + " StructField(\"original_list_price\", StringType()),\n", + " StructField(\"current_list_start_date\", StringType()),\n", + " StructField(\"current_list_price\", StringType()),\n", + " StructField(\"borrower_credit_score_at_issuance\", StringType()),\n", + " StructField(\"co-borrower_credit_score_at_issuance\", StringType()),\n", + " StructField(\"borrower_credit_score_current\", StringType()),\n", + " StructField(\"co-Borrower_credit_score_current\", StringType()),\n", + " StructField(\"mortgage_insurance_type\", DoubleType()),\n", + " StructField(\"servicing_activity_indicator\", StringType()),\n", 
+ " StructField(\"current_period_modification_loss_amount\", StringType()),\n", + " StructField(\"cumulative_modification_loss_amount\", StringType()),\n", + " StructField(\"current_period_credit_event_net_gain_or_loss\", StringType()),\n", + " StructField(\"cumulative_credit_event_net_gain_or_loss\", StringType()),\n", + " StructField(\"homeready_program_indicator\", StringType()),\n", + " StructField(\"foreclosure_principal_write_off_amount\", StringType()),\n", + " StructField(\"relocation_mortgage_indicator\", StringType()),\n", + " StructField(\"zero_balance_code_change_date\", StringType()),\n", + " StructField(\"loan_holdback_indicator\", StringType()),\n", + " StructField(\"loan_holdback_effective_date\", StringType()),\n", + " StructField(\"delinquent_accrued_interest\", StringType()),\n", + " StructField(\"property_valuation_method\", StringType()),\n", + " StructField(\"high_balance_loan_indicator\", StringType()),\n", + " StructField(\"arm_initial_fixed-rate_period_lt_5_yr_indicator\", StringType()),\n", + " StructField(\"arm_product_type\", StringType()),\n", + " StructField(\"initial_fixed-rate_period\", StringType()),\n", + " StructField(\"interest_rate_adjustment_frequency\", StringType()),\n", + " StructField(\"next_interest_rate_adjustment_date\", StringType()),\n", + " StructField(\"next_payment_change_date\", StringType()),\n", + " StructField(\"index\", StringType()),\n", + " StructField(\"arm_cap_structure\", StringType()),\n", + " StructField(\"initial_interest_rate_cap_up_percent\", StringType()),\n", + " StructField(\"periodic_interest_rate_cap_up_percent\", StringType()),\n", + " StructField(\"lifetime_interest_rate_cap_up_percent\", StringType()),\n", + " StructField(\"mortgage_margin\", StringType()),\n", + " StructField(\"arm_balloon_indicator\", StringType()),\n", + " StructField(\"arm_plan_number\", StringType()),\n", + " StructField(\"borrower_assistance_plan\", StringType()),\n", + " StructField(\"hltv_refinance_option_indicator\", StringType()),\n", + " StructField(\"deal_name\", StringType()),\n", + " StructField(\"repurchase_make_whole_proceeds_flag\", StringType()),\n", + " StructField(\"alternative_delinquency_resolution\", StringType()),\n", + " StructField(\"alternative_delinquency_resolution_count\", StringType()),\n", + " StructField(\"total_deferral_amount\", StringType())\n", + " ])" ] }, { @@ -312,14 +366,14 @@ "outputs": [], "source": [ "def _get_quarter_from_csv_file_name():\n", - " return substring_index(substring_index(input_file_name(), \".\", 1), \"_\", -1)" + " return substring_index(substring_index(input_file_name(), \".\", 1), \"/\", -1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "* Define function to read Performance CSV data file" + "* Define function to read raw CSV data file" ] }, { @@ -328,37 +382,98 @@ "metadata": {}, "outputs": [], "source": [ - "def read_perf_csv(spark, path):\n", - " return spark.read.format(\"csv\") \\\n", - " .option(\"nullValue\", \"\") \\\n", - " .option(\"header\", \"false\") \\\n", - " .option(\"delimiter\", \"|\") \\\n", - " .schema(_csv_perf_schema) \\\n", + "def read_raw_csv(spark, path):\n", + " return spark.read.format('csv') \\\n", + " .option('nullValue', '') \\\n", + " .option('header', False) \\\n", + " .option('delimiter', '|') \\\n", + " .schema(_csv_raw_schema) \\\n", " .load(path) \\\n", - " .withColumn(\"quarter\", _get_quarter_from_csv_file_name())" + " .withColumn('quarter', _get_quarter_from_csv_file_name())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
- "* Define function to read Acquisition CSV file" + "* Functions to extract perf and acq columns from raw schema" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "def read_acq_csv(spark, path):\n", - " return spark.read.format(\"csv\") \\\n", - " .option(\"nullValue\", \"\") \\\n", - " .option(\"header\", \"false\") \\\n", - " .option(\"delimiter\", \"|\") \\\n", - " .schema(_csv_acq_schema) \\\n", - " .load(path) \\\n", - " .withColumn(\"quarter\", _get_quarter_from_csv_file_name())" + "def extract_perf_columns(rawDf):\n", + " perfDf = rawDf.select(\n", + " col(\"loan_id\"),\n", + " date_format(to_date(col(\"monthly_reporting_period\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"monthly_reporting_period\"),\n", + " upper(col(\"servicer\")).alias(\"servicer\"),\n", + " col(\"interest_rate\"),\n", + " col(\"current_actual_upb\"),\n", + " col(\"loan_age\"),\n", + " col(\"remaining_months_to_legal_maturity\"),\n", + " col(\"adj_remaining_months_to_maturity\"),\n", + " date_format(to_date(col(\"maturity_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"maturity_date\"),\n", + " col(\"msa\"),\n", + " col(\"current_loan_delinquency_status\"),\n", + " col(\"mod_flag\"),\n", + " col(\"zero_balance_code\"),\n", + " date_format(to_date(col(\"zero_balance_effective_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"zero_balance_effective_date\"),\n", + " date_format(to_date(col(\"last_paid_installment_date\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"last_paid_installment_date\"),\n", + " date_format(to_date(col(\"foreclosed_after\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"foreclosed_after\"),\n", + " date_format(to_date(col(\"disposition_date\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"disposition_date\"),\n", + " col(\"foreclosure_costs\"),\n", + " col(\"prop_preservation_and_repair_costs\"),\n", + " col(\"asset_recovery_costs\"),\n", + " col(\"misc_holding_expenses\"),\n", + " col(\"holding_taxes\"),\n", + " col(\"net_sale_proceeds\"),\n", + " col(\"credit_enhancement_proceeds\"),\n", + " col(\"repurchase_make_whole_proceeds\"),\n", + " col(\"other_foreclosure_proceeds\"),\n", + " col(\"non_interest_bearing_upb\"),\n", + " col(\"principal_forgiveness_upb\"),\n", + " col(\"repurchase_make_whole_proceeds_flag\"),\n", + " col(\"foreclosure_principal_write_off_amount\"),\n", + " col(\"servicing_activity_indicator\"),\n", + " col('quarter')\n", + " )\n", + " return perfDf.select(\"*\").filter(\"current_actual_upb != 0.0\")\n", + "\n", + "def extract_acq_columns(rawDf):\n", + " acqDf = rawDf.select(\n", + " col(\"loan_id\"),\n", + " col(\"orig_channel\"),\n", + " upper(col(\"seller_name\")).alias(\"seller_name\"),\n", + " col(\"orig_interest_rate\"),\n", + " col(\"orig_upb\"),\n", + " col(\"orig_loan_term\"),\n", + " date_format(to_date(col(\"orig_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"orig_date\"),\n", + " date_format(to_date(col(\"first_pay_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"first_pay_date\"),\n", + " col(\"orig_ltv\"),\n", + " col(\"orig_cltv\"),\n", + " col(\"num_borrowers\"),\n", + " col(\"dti\"),\n", + " col(\"borrower_credit_score\"),\n", + " col(\"first_home_buyer\"),\n", + " col(\"loan_purpose\"),\n", + " col(\"property_type\"),\n", + " col(\"num_units\"),\n", + " col(\"occupancy_status\"),\n", + " col(\"property_state\"),\n", + " col(\"zip\"),\n", + " col(\"mortgage_insurance_percent\"),\n", + " col(\"product_type\"),\n", + " col(\"coborrow_credit_score\"),\n", + " col(\"mortgage_insurance_type\"),\n", + " 
col(\"relocation_mortgage_indicator\"),\n", + " dense_rank().over(Window.partitionBy(\"loan_id\").orderBy(to_date(col(\"monthly_reporting_period\"),\"MMyyyy\"))).alias(\"rank\"),\n", + " col('quarter')\n", + " )\n", + "\n", + " return acqDf.select(\"*\").filter(col(\"rank\")==1)" ] }, { @@ -372,7 +487,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -398,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -477,7 +592,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -524,7 +639,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -548,7 +663,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -583,7 +698,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -610,7 +725,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -619,12 +734,9 @@ "# CPU run, set to false, it can only make ETL run on CPU when is_save_dataset=True.\n", "# spark.conf.set(\"spark.rapids.sql.enabled\", \"false\")\n", "spark.conf.set(\"spark.sql.files.maxPartitionBytes\", \"1G\")\n", - "spark.conf.set(\"spark.sql.shuffle.partitions\", \"192\")\n", "spark.conf.set(\"spark.rapids.sql.explain\", \"ALL\")\n", - "spark.conf.set(\"spark.rapids.sql.incompatibleOps.enabled\", \"true\")\n", "spark.conf.set(\"spark.rapids.sql.batchSizeBytes\", \"512M\")\n", "spark.conf.set(\"spark.rapids.sql.reader.batchSizeBytes\", \"768M\")\n", - "spark.conf.set(\"spark.rapids.sql.incompatibleDateFormats.enabled\", \"true\")\n", "spark.conf.set(\"spark.rapids.sql.hasNans\", \"false\")\n", "# use GPU to read CSV\n", "spark.conf.set(\"spark.rapids.sql.csv.read.double.enabled\", \"true\")" @@ -639,7 +751,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": { "scrolled": false }, @@ -648,27 +760,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "ETL takes 41.10439682006836\n" + "ETL takes 135.9117729663849\n" ] } ], "source": [ "\n", "# read raw dataset\n", - "perf = read_perf_csv(spark, orig_perf_path)\n", - "acq = read_acq_csv(spark, orig_acq_path)\n", + "rawDf = read_raw_csv(spark, orig_raw_path)\n", + "rawDf.write.parquet(orig_raw_path_csv2parquet, mode='overwrite')\n", + "rawDf = spark.read.parquet(orig_raw_path_csv2parquet)\n", + "\n", + "acq = extract_acq_columns(rawDf)\n", + "perf = extract_perf_columns(rawDf)\n", "\n", "# run main function to process data\n", "out = run_mortgage(spark, perf, acq)\n", "\n", - "# split 80% for training, 20% for test\n", - "splits = out.randomSplit([0.8, 0.2])\n", - "\n", "# save processed data\n", "if is_save_dataset:\n", " start = time.time()\n", - " splits[0].write.parquet(output_path_train, mode=\"overwrite\")\n", - " splits[1].write.parquet(output_path_test, mode=\"overwrite\")\n", + " out.write.parquet(output_path_data, mode=\"overwrite\")\n", " end = time.time()\n", " print(\"ETL takes {}\".format(end - start))" ] @@ -689,7 +801,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -706,7 +818,7 @@ }, { "cell_type": "code", - "execution_count": 20, + 
"execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -725,7 +837,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -742,7 +854,7 @@ " StructField(\"seller_name\", FloatType()),\n", " StructField(\"mod_flag\", FloatType()),\n", " StructField(\"orig_interest_rate\", FloatType()),\n", - " StructField(\"orig_upb\", IntegerType()),\n", + " StructField(\"orig_upb\", DoubleType()),\n", " StructField(\"orig_loan_term\", IntegerType()),\n", " StructField(\"orig_ltv\", FloatType()),\n", " StructField(\"orig_cltv\", FloatType()),\n", @@ -764,17 +876,20 @@ "\n", "if is_save_dataset:\n", " # load dataset from file\n", - " train_data = reader.parquet(output_path_train)\n", - " test_data = reader.parquet(output_path_test)\n", + " etlDf = reader.parquet(output_path_data)\n", + " splits = etlDf.randomSplit([0.8, 0.2])\n", + " train_data = splits[0]\n", + " test_data = splits[1]\n", "else:\n", " # use Dataframe from ETL directly\n", + " splits = out.randomSplit([0.8, 0.2])\n", " train_data = splits[0]\n", " test_data = splits[1]" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -785,21 +900,21 @@ " \"growPolicy\": \"depthwise\",\n", " \"nthread\": 1,\n", " \"numRound\": 100,\n", - " \"numWorkers\": 2,\n", + " \"numWorkers\": 1,\n", "}\n", "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Training takes 23.666603565216064 seconds\n" + "Training takes 18.92583155632019 seconds\n" ] } ], @@ -815,7 +930,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -825,22 +940,22 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Transformation takes 10.464573383331299 seconds\n", + "Transformation takes 8.959877967834473 seconds\n", "+--------------+--------------------+--------------------+----------+\n", "|delinquency_12| rawPrediction| probability|prediction|\n", "+--------------+--------------------+--------------------+----------+\n", - "| 0|[11.3724613189697...|[0.99998849205439...| 0.0|\n", - "| 0|[8.75509834289550...|[0.99984236936143...| 0.0|\n", - "| 0|[8.56840324401855...|[0.99981002029380...| 0.0|\n", - "| 0|[8.45872020721435...|[0.99978800168901...| 0.0|\n", - "| 0|[8.45872020721435...|[0.99978800168901...| 0.0|\n", + "| 0|[7.92072248458862...|[0.99963699193904...| 0.0|\n", + "| 0|[7.92072248458862...|[0.99963699193904...| 0.0|\n", + "| 0|[8.43130302429199...|[0.99978211015695...| 0.0|\n", + "| 0|[8.20779895782470...|[0.99972755435737...| 0.0|\n", + "| 0|[8.885986328125,-...|[0.99986170543706...| 0.0|\n", "+--------------+--------------------+--------------------+----------+\n", "only showing top 5 rows\n", "\n" @@ -858,15 +973,15 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Evaluation takes 0.770418643951416 seconds\n", - "Accuracy is 0.9881320119084719\n" + "Evaluation takes 0.6158628463745117 seconds\n", + "Accuracy is 0.9861453808970397\n" ] } ], @@ -879,7 +994,7 @@ }, { "cell_type": "code", - "execution_count": 27, + 
"execution_count": 30, "metadata": {}, "outputs": [], "source": [ diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb index d36474176..433d35880 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb @@ -6,10 +6,10 @@ "source": [ "## Prerequirement\n", "### 1. Download data\n", - "All data could be found at https://docs.rapids.ai/datasets/mortgage-data\n", + "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.08/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", "\n", "### 2. Download needed jars\n", - "* [rapids-4-spark_2.12-22.06.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0.jar)\n", + "* [rapids-4-spark_2.12-22.08.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar)\n", "\n", "\n", "### 3. Start Spark Standalone\n", @@ -17,7 +17,7 @@ "\n", "### 4. Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-22.06.0.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-22.08.0.jar\n", "$ export PYSPARK_DRIVER_PYTHON=jupyter \n", "$ export PYSPARK_DRIVER_PYTHON_OPTS=notebook\n", "```\n", @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -82,70 +82,121 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# File schema\n", - "_csv_perf_schema = StructType([\n", - " StructField('loan_id', LongType()),\n", - " StructField('monthly_reporting_period', StringType()),\n", - " StructField('servicer', StringType()),\n", - " StructField('interest_rate', DoubleType()),\n", - " StructField('current_actual_upb', DoubleType()),\n", - " StructField('loan_age', DoubleType()),\n", - " StructField('remaining_months_to_legal_maturity', DoubleType()),\n", - " StructField('adj_remaining_months_to_maturity', DoubleType()),\n", - " StructField('maturity_date', StringType()),\n", - " StructField('msa', DoubleType()),\n", - " StructField('current_loan_delinquency_status', IntegerType()),\n", - " StructField('mod_flag', StringType()),\n", - " StructField('zero_balance_code', StringType()),\n", - " StructField('zero_balance_effective_date', StringType()),\n", - " StructField('last_paid_installment_date', StringType()),\n", - " StructField('foreclosed_after', StringType()),\n", - " StructField('disposition_date', StringType()),\n", - " StructField('foreclosure_costs', DoubleType()),\n", - " StructField('prop_preservation_and_repair_costs', DoubleType()),\n", - " StructField('asset_recovery_costs', DoubleType()),\n", - " StructField('misc_holding_expenses', DoubleType()),\n", - " StructField('holding_taxes', DoubleType()),\n", - " StructField('net_sale_proceeds', DoubleType()),\n", - " StructField('credit_enhancement_proceeds', DoubleType()),\n", - " StructField('repurchase_make_whole_proceeds', StringType()),\n", - " 
StructField('other_foreclosure_proceeds', DoubleType()),\n", - " StructField('non_interest_bearing_upb', DoubleType()),\n", - " StructField('principal_forgiveness_upb', StringType()),\n", - " StructField('repurchase_make_whole_proceeds_flag', StringType()),\n", - " StructField('foreclosure_principal_write_off_amount', StringType()),\n", - " StructField('servicing_activity_indicator', StringType())])\n", - "\n", - "_csv_acq_schema = StructType([\n", - " StructField('loan_id', LongType()),\n", - " StructField('orig_channel', StringType()),\n", - " StructField('seller_name', StringType()),\n", - " StructField('orig_interest_rate', DoubleType()),\n", - " StructField('orig_upb', IntegerType()),\n", - " StructField('orig_loan_term', IntegerType()),\n", - " StructField('orig_date', StringType()),\n", - " StructField('first_pay_date', StringType()),\n", - " StructField('orig_ltv', DoubleType()),\n", - " StructField('orig_cltv', DoubleType()),\n", - " StructField('num_borrowers', DoubleType()),\n", - " StructField('dti', DoubleType()),\n", - " StructField('borrower_credit_score', DoubleType()),\n", - " StructField('first_home_buyer', StringType()),\n", - " StructField('loan_purpose', StringType()),\n", - " StructField('property_type', StringType()),\n", - " StructField('num_units', IntegerType()),\n", - " StructField('occupancy_status', StringType()),\n", - " StructField('property_state', StringType()),\n", - " StructField('zip', IntegerType()),\n", - " StructField('mortgage_insurance_percent', DoubleType()),\n", - " StructField('product_type', StringType()),\n", - " StructField('coborrow_credit_score', DoubleType()),\n", - " StructField('mortgage_insurance_type', DoubleType()),\n", - " StructField('relocation_mortgage_indicator', StringType())])" + "_csv_raw_schema = StructType([\n", + " StructField(\"reference_pool_id\", StringType()),\n", + " StructField(\"loan_id\", LongType()),\n", + " StructField(\"monthly_reporting_period\", StringType()),\n", + " StructField(\"orig_channel\", StringType()),\n", + " StructField(\"seller_name\", StringType()),\n", + " StructField(\"servicer\", StringType()),\n", + " StructField(\"master_servicer\", StringType()),\n", + " StructField(\"orig_interest_rate\", DoubleType()),\n", + " StructField(\"interest_rate\", DoubleType()),\n", + " StructField(\"orig_upb\", DoubleType()),\n", + " StructField(\"upb_at_issuance\", StringType()),\n", + " StructField(\"current_actual_upb\", DoubleType()),\n", + " StructField(\"orig_loan_term\", IntegerType()),\n", + " StructField(\"orig_date\", StringType()),\n", + " StructField(\"first_pay_date\", StringType()), \n", + " StructField(\"loan_age\", DoubleType()),\n", + " StructField(\"remaining_months_to_legal_maturity\", DoubleType()),\n", + " StructField(\"adj_remaining_months_to_maturity\", DoubleType()),\n", + " StructField(\"maturity_date\", StringType()),\n", + " StructField(\"orig_ltv\", DoubleType()),\n", + " StructField(\"orig_cltv\", DoubleType()),\n", + " StructField(\"num_borrowers\", DoubleType()),\n", + " StructField(\"dti\", DoubleType()),\n", + " StructField(\"borrower_credit_score\", DoubleType()),\n", + " StructField(\"coborrow_credit_score\", DoubleType()),\n", + " StructField(\"first_home_buyer\", StringType()),\n", + " StructField(\"loan_purpose\", StringType()),\n", + " StructField(\"property_type\", StringType()),\n", + " StructField(\"num_units\", IntegerType()),\n", + " StructField(\"occupancy_status\", StringType()),\n", + " StructField(\"property_state\", StringType()),\n", + " StructField(\"msa\", 
DoubleType()),\n", + " StructField(\"zip\", IntegerType()),\n", + " StructField(\"mortgage_insurance_percent\", DoubleType()),\n", + " StructField(\"product_type\", StringType()),\n", + " StructField(\"prepayment_penalty_indicator\", StringType()),\n", + " StructField(\"interest_only_loan_indicator\", StringType()),\n", + " StructField(\"interest_only_first_principal_and_interest_payment_date\", StringType()),\n", + " StructField(\"months_to_amortization\", StringType()),\n", + " StructField(\"current_loan_delinquency_status\", IntegerType()),\n", + " StructField(\"loan_payment_history\", StringType()),\n", + " StructField(\"mod_flag\", StringType()),\n", + " StructField(\"mortgage_insurance_cancellation_indicator\", StringType()),\n", + " StructField(\"zero_balance_code\", StringType()),\n", + " StructField(\"zero_balance_effective_date\", StringType()),\n", + " StructField(\"upb_at_the_time_of_removal\", StringType()),\n", + " StructField(\"repurchase_date\", StringType()),\n", + " StructField(\"scheduled_principal_current\", StringType()),\n", + " StructField(\"total_principal_current\", StringType()),\n", + " StructField(\"unscheduled_principal_current\", StringType()),\n", + " StructField(\"last_paid_installment_date\", StringType()),\n", + " StructField(\"foreclosed_after\", StringType()),\n", + " StructField(\"disposition_date\", StringType()),\n", + " StructField(\"foreclosure_costs\", DoubleType()),\n", + " StructField(\"prop_preservation_and_repair_costs\", DoubleType()),\n", + " StructField(\"asset_recovery_costs\", DoubleType()),\n", + " StructField(\"misc_holding_expenses\", DoubleType()),\n", + " StructField(\"holding_taxes\", DoubleType()),\n", + " StructField(\"net_sale_proceeds\", DoubleType()),\n", + " StructField(\"credit_enhancement_proceeds\", DoubleType()),\n", + " StructField(\"repurchase_make_whole_proceeds\", StringType()),\n", + " StructField(\"other_foreclosure_proceeds\", DoubleType()),\n", + " StructField(\"non_interest_bearing_upb\", DoubleType()),\n", + " StructField(\"principal_forgiveness_upb\", StringType()),\n", + " StructField(\"original_list_start_date\", StringType()),\n", + " StructField(\"original_list_price\", StringType()),\n", + " StructField(\"current_list_start_date\", StringType()),\n", + " StructField(\"current_list_price\", StringType()),\n", + " StructField(\"borrower_credit_score_at_issuance\", StringType()),\n", + " StructField(\"co-borrower_credit_score_at_issuance\", StringType()),\n", + " StructField(\"borrower_credit_score_current\", StringType()),\n", + " StructField(\"co-Borrower_credit_score_current\", StringType()),\n", + " StructField(\"mortgage_insurance_type\", DoubleType()),\n", + " StructField(\"servicing_activity_indicator\", StringType()),\n", + " StructField(\"current_period_modification_loss_amount\", StringType()),\n", + " StructField(\"cumulative_modification_loss_amount\", StringType()),\n", + " StructField(\"current_period_credit_event_net_gain_or_loss\", StringType()),\n", + " StructField(\"cumulative_credit_event_net_gain_or_loss\", StringType()),\n", + " StructField(\"homeready_program_indicator\", StringType()),\n", + " StructField(\"foreclosure_principal_write_off_amount\", StringType()),\n", + " StructField(\"relocation_mortgage_indicator\", StringType()),\n", + " StructField(\"zero_balance_code_change_date\", StringType()),\n", + " StructField(\"loan_holdback_indicator\", StringType()),\n", + " StructField(\"loan_holdback_effective_date\", StringType()),\n", + " StructField(\"delinquent_accrued_interest\", 
StringType()),\n", + " StructField(\"property_valuation_method\", StringType()),\n", + " StructField(\"high_balance_loan_indicator\", StringType()),\n", + " StructField(\"arm_initial_fixed-rate_period_lt_5_yr_indicator\", StringType()),\n", + " StructField(\"arm_product_type\", StringType()),\n", + " StructField(\"initial_fixed-rate_period\", StringType()),\n", + " StructField(\"interest_rate_adjustment_frequency\", StringType()),\n", + " StructField(\"next_interest_rate_adjustment_date\", StringType()),\n", + " StructField(\"next_payment_change_date\", StringType()),\n", + " StructField(\"index\", StringType()),\n", + " StructField(\"arm_cap_structure\", StringType()),\n", + " StructField(\"initial_interest_rate_cap_up_percent\", StringType()),\n", + " StructField(\"periodic_interest_rate_cap_up_percent\", StringType()),\n", + " StructField(\"lifetime_interest_rate_cap_up_percent\", StringType()),\n", + " StructField(\"mortgage_margin\", StringType()),\n", + " StructField(\"arm_balloon_indicator\", StringType()),\n", + " StructField(\"arm_plan_number\", StringType()),\n", + " StructField(\"borrower_assistance_plan\", StringType()),\n", + " StructField(\"hltv_refinance_option_indicator\", StringType()),\n", + " StructField(\"deal_name\", StringType()),\n", + " StructField(\"repurchase_make_whole_proceeds_flag\", StringType()),\n", + " StructField(\"alternative_delinquency_resolution\", StringType()),\n", + " StructField(\"alternative_delinquency_resolution_count\", StringType()),\n", + " StructField(\"total_deferral_amount\", StringType())\n", + " ])" ] }, { @@ -157,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -254,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -300,67 +351,129 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 2. 
Define ETL Process\n", - "\n", - "Define the function to do the ETL process\n", - "\n", - "#### 2.1 Define Functions to Read Raw CSV File\n", - "\n", - "* Define function to get quarter from input CSV file name" + "* Functions to extract perf and acq columns from raw schema" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ - "def _get_quarter_from_csv_file_name():\n", - " return substring_index(substring_index(input_file_name(), '.', 1), '_', -1)" + "def extract_perf_columns(rawDf):\n", + " perfDf = rawDf.select(\n", + " col(\"loan_id\"),\n", + " date_format(to_date(col(\"monthly_reporting_period\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"monthly_reporting_period\"),\n", + " upper(col(\"servicer\")).alias(\"servicer\"),\n", + " col(\"interest_rate\"),\n", + " col(\"current_actual_upb\"),\n", + " col(\"loan_age\"),\n", + " col(\"remaining_months_to_legal_maturity\"),\n", + " col(\"adj_remaining_months_to_maturity\"),\n", + " date_format(to_date(col(\"maturity_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"maturity_date\"),\n", + " col(\"msa\"),\n", + " col(\"current_loan_delinquency_status\"),\n", + " col(\"mod_flag\"),\n", + " col(\"zero_balance_code\"),\n", + " date_format(to_date(col(\"zero_balance_effective_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"zero_balance_effective_date\"),\n", + " date_format(to_date(col(\"last_paid_installment_date\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"last_paid_installment_date\"),\n", + " date_format(to_date(col(\"foreclosed_after\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"foreclosed_after\"),\n", + " date_format(to_date(col(\"disposition_date\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"disposition_date\"),\n", + " col(\"foreclosure_costs\"),\n", + " col(\"prop_preservation_and_repair_costs\"),\n", + " col(\"asset_recovery_costs\"),\n", + " col(\"misc_holding_expenses\"),\n", + " col(\"holding_taxes\"),\n", + " col(\"net_sale_proceeds\"),\n", + " col(\"credit_enhancement_proceeds\"),\n", + " col(\"repurchase_make_whole_proceeds\"),\n", + " col(\"other_foreclosure_proceeds\"),\n", + " col(\"non_interest_bearing_upb\"),\n", + " col(\"principal_forgiveness_upb\"),\n", + " col(\"repurchase_make_whole_proceeds_flag\"),\n", + " col(\"foreclosure_principal_write_off_amount\"),\n", + " col(\"servicing_activity_indicator\"),\n", + " col('quarter')\n", + " )\n", + "\n", + " return perfDf.select(\"*\").filter(\"current_actual_upb != 0.0\")\n", + "\n", + "def extract_acq_columns(rawDf):\n", + " acqDf = rawDf.select(\n", + " col(\"loan_id\"),\n", + " col(\"orig_channel\"),\n", + " upper(col(\"seller_name\")).alias(\"seller_name\"),\n", + " col(\"orig_interest_rate\"),\n", + " col(\"orig_upb\"),\n", + " col(\"orig_loan_term\"),\n", + " date_format(to_date(col(\"orig_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"orig_date\"),\n", + " date_format(to_date(col(\"first_pay_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"first_pay_date\"),\n", + " col(\"orig_ltv\"),\n", + " col(\"orig_cltv\"),\n", + " col(\"num_borrowers\"),\n", + " col(\"dti\"),\n", + " col(\"borrower_credit_score\"),\n", + " col(\"first_home_buyer\"),\n", + " col(\"loan_purpose\"),\n", + " col(\"property_type\"),\n", + " col(\"num_units\"),\n", + " col(\"occupancy_status\"),\n", + " col(\"property_state\"),\n", + " col(\"zip\"),\n", + " col(\"mortgage_insurance_percent\"),\n", + " col(\"product_type\"),\n", + " col(\"coborrow_credit_score\"),\n", + " col(\"mortgage_insurance_type\"),\n", + " col(\"relocation_mortgage_indicator\"),\n", + " 
dense_rank().over(Window.partitionBy(\"loan_id\").orderBy(to_date(col(\"monthly_reporting_period\"),\"MMyyyy\"))).alias(\"rank\"),\n", + " col('quarter')\n", + " )\n", + "\n", + " return acqDf.select(\"*\").filter(col(\"rank\")==1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "* Define function to read Performance CSV data file" + "### 2. Define ETL Process\n", + "\n", + "Define the function to do the ETL process\n", + "\n", + "#### 2.1 Define Functions to Read Raw CSV File\n", + "\n", + "* Define function to get quarter from input CSV file name" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ - "def read_perf_csv(spark, path):\n", - " return spark.read.format('csv') \\\n", - " .option('nullValue', '') \\\n", - " .option('header', 'false') \\\n", - " .option('delimiter', '|') \\\n", - " .schema(_csv_perf_schema) \\\n", - " .load(path) \\\n", - " .withColumn('quarter', _get_quarter_from_csv_file_name())" + "def _get_quarter_from_csv_file_name():\n", + " return substring_index(substring_index(input_file_name(), '.', 1), '/', -1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "* Define function to read Acquisition CSV file" + "* Define function to read raw CSV data file" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ - "def read_acq_csv(spark, path):\n", + "def read_raw_csv(spark, path):\n", " return spark.read.format('csv') \\\n", " .option('nullValue', '') \\\n", - " .option('header', 'false') \\\n", + " .option('header', False) \\\n", " .option('delimiter', '|') \\\n", - " .schema(_csv_acq_schema) \\\n", + " .schema(_csv_raw_schema) \\\n", " .load(path) \\\n", " .withColumn('quarter', _get_quarter_from_csv_file_name())" ] @@ -376,7 +489,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -402,7 +515,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -481,7 +594,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -528,7 +641,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -552,7 +665,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -587,7 +700,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -615,31 +728,13 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "# You need to update them to your real paths!\n", "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n", - "orig_perf_path=dataRoot + '/mortgage/Performance/'\n", - "orig_acq_path=dataRoot + '/mortgage/Acquisition/'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Define temporary folder path " - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_perf_path=dataRoot + '/mortgage/perf/'\n", - "tmp_acq_path=dataRoot + '/mortgage/acq/'" + "orig_raw_path = dataRoot + '/mortgage/input/'" ] }, { @@ -651,11 +746,15 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ - 
"output_path=dataRoot + '/mortgage/output/'" + "output_path = dataRoot + '/mortgage/output/data/'\n", + "output_csv2parquet = dataRoot + '/mortgage/output/csv2parquet/'\n", + "output_path_train = dataRoot + '/mortgage/output/train/'\n", + "output_path_eval = dataRoot + '/mortgage/output/eval/'\n", + "save_train_eval_dataset = True" ] }, { @@ -667,12 +766,11 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "spark.conf.set('spark.rapids.sql.explain', 'ALL')\n", - "spark.conf.set('spark.rapids.sql.incompatibleOps.enabled', 'true')\n", "spark.conf.set('spark.rapids.sql.batchSizeBytes', '512M')\n", "spark.conf.set('spark.rapids.sql.reader.batchSizeBytes', '768M')" ] @@ -681,50 +779,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Run Part\n", - "### Read Raw File and Transcode Data\n", - "#### 1. Add additional Spark settings" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [], - "source": [ - "# we want a few big files instead of lots of small files\n", - "spark.conf.set('spark.sql.files.maxPartitionBytes', '200G')" + "## Run Part" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### 2. Read Raw File and Transcode to Parquet" + "### Read Raw File" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "6.568682670593262\n" - ] - } - ], + "outputs": [], "source": [ - "start = time.time()\n", - "# read data and transcode to qarquet\n", - "acq = read_acq_csv(spark, orig_acq_path)\n", - "acq.repartition(12).write.parquet(tmp_acq_path, mode='overwrite')\n", - "perf = read_perf_csv(spark, orig_perf_path)\n", - "perf.coalesce(96).write.parquet(tmp_perf_path, mode='overwrite')\n", - "end = time.time()\n", - "print(end - start)" + "rawDf = read_raw_csv(spark, orig_raw_path)\n", + "rawDf.write.parquet(output_csv2parquet, mode='overwrite')\n", + "rawDf = spark.read.parquet(output_csv2parquet)\n", + "\n", + "acq = extract_acq_columns(rawDf)\n", + "perf = extract_perf_columns(rawDf)" ] }, { @@ -737,7 +813,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -746,7 +822,9 @@ "# CPU run, set to false\n", "# spark.conf.set('spark.rapids.sql.enabled', 'false')\n", "spark.conf.set('spark.sql.files.maxPartitionBytes', '1G')\n", - "spark.conf.set('spark.sql.shuffle.partitions', '192')" + "spark.conf.set(\"spark.rapids.sql.hasNans\", \"false\")\n", + "# use GPU to read CSV\n", + "spark.conf.set(\"spark.rapids.sql.csv.read.double.enabled\", \"true\")" ] }, { @@ -758,7 +836,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 61, "metadata": {}, "outputs": [ { @@ -766,786 +844,881 @@ "output_type": "stream", "text": [ "== Physical Plan ==\n", - "*(5) GpuColumnarToRow false\n", - "+- !GpuProject [gpucoalesce(orig_channel#27851, 0) AS orig_channel#29615, gpucoalesce(first_home_buyer#28053, 0) AS first_home_buyer#29616, gpucoalesce(loan_purpose#28255, 0) AS loan_purpose#29617, gpucoalesce(property_type#28457, 0) AS property_type#29618, gpucoalesce(occupancy_status#28659, 0) AS occupancy_status#29619, gpucoalesce(property_state#28861, 0) AS property_state#29620, gpucoalesce(relocation_mortgage_indicator#29063, 0) AS relocation_mortgage_indicator#29621, gpucoalesce(seller_name#29265, 0) AS seller_name#29622, gpucoalesce(id#27657, 0) AS mod_flag#29623, 
gpucoalesce(gpunanvl(orig_interest_rate#26291, null), 0.0) AS orig_interest_rate#29624, gpucoalesce(orig_upb#26292, 0) AS orig_upb#29625, gpucoalesce(orig_loan_term#26293, 0) AS orig_loan_term#29626, gpucoalesce(gpunanvl(orig_ltv#26296, null), 0.0) AS orig_ltv#29627, gpucoalesce(gpunanvl(orig_cltv#26297, null), 0.0) AS orig_cltv#29628, gpucoalesce(gpunanvl(num_borrowers#26298, null), 0.0) AS num_borrowers#29629, gpucoalesce(gpunanvl(dti#26299, null), 0.0) AS dti#29630, gpucoalesce(gpunanvl(borrower_credit_score#26300, null), 0.0) AS borrower_credit_score#29631, gpucoalesce(num_units#26304, 0) AS num_units#29632, gpucoalesce(zip#26307, 0) AS zip#29633, gpucoalesce(gpunanvl(mortgage_insurance_percent#26308, null), 0.0) AS mortgage_insurance_percent#29634, gpucoalesce(current_loan_delinquency_status#26234, 0) AS current_loan_delinquency_status#29635, gpucoalesce(gpunanvl(current_actual_upb#26228, null), 0.0) AS current_actual_upb#29636, gpucoalesce(gpunanvl(interest_rate#26227, null), 0.0) AS interest_rate#29637, gpucoalesce(gpunanvl(loan_age#26229, null), 0.0) AS loan_age#29638, ... 3 more fields]\n", - " +- !GpuBroadcastHashJoin [mod_flag#26235], [mod_flag#29333], LeftOuter, BuildRight\n", - " :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, num_units#26304, zip#26307, mortgage_insurance_percent#26308, orig_channel#27851, first_home_buyer#28053, loan_purpose#28255, property_type#28457, occupancy_status#28659, ... 3 more fields]\n", - " : +- !GpuBroadcastHashJoin [seller_name#27396], [seller_name#29131], LeftOuter, BuildRight\n", - " : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, num_units#26304, zip#26307, mortgage_insurance_percent#26308, orig_channel#27851, first_home_buyer#28053, loan_purpose#28255, property_type#28457, ... 3 more fields]\n", - " : : +- !GpuBroadcastHashJoin [relocation_mortgage_indicator#26312], [relocation_mortgage_indicator#28929], LeftOuter, BuildRight\n", - " : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, num_units#26304, zip#26307, mortgage_insurance_percent#26308, relocation_mortgage_indicator#26312, orig_channel#27851, first_home_buyer#28053, loan_purpose#28255, ... 
3 more fields]\n", - " : : : +- !GpuBroadcastHashJoin [property_state#26306], [property_state#28727], LeftOuter, BuildRight\n", - " : : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, num_units#26304, property_state#26306, zip#26307, mortgage_insurance_percent#26308, relocation_mortgage_indicator#26312, orig_channel#27851, first_home_buyer#28053, ... 3 more fields]\n", - " : : : : +- !GpuBroadcastHashJoin [occupancy_status#26305], [occupancy_status#28525], LeftOuter, BuildRight\n", - " : : : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, num_units#26304, occupancy_status#26305, property_state#26306, zip#26307, mortgage_insurance_percent#26308, relocation_mortgage_indicator#26312, orig_channel#27851, ... 3 more fields]\n", - " : : : : : +- !GpuBroadcastHashJoin [property_type#26303], [property_type#28323], LeftOuter, BuildRight\n", - " : : : : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, property_type#26303, num_units#26304, occupancy_status#26305, property_state#26306, zip#26307, mortgage_insurance_percent#26308, relocation_mortgage_indicator#26312, ... 3 more fields]\n", - " : : : : : : +- !GpuBroadcastHashJoin [loan_purpose#26302], [loan_purpose#28121], LeftOuter, BuildRight\n", - " : : : : : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, loan_purpose#26302, property_type#26303, num_units#26304, occupancy_status#26305, property_state#26306, zip#26307, mortgage_insurance_percent#26308, ... 3 more fields]\n", - " : : : : : : : +- !GpuBroadcastHashJoin [first_home_buyer#26301], [first_home_buyer#27919], LeftOuter, BuildRight\n", - " : : : : : : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, first_home_buyer#26301, loan_purpose#26302, property_type#26303, num_units#26304, occupancy_status#26305, property_state#26306, zip#26307, ... 
3 more fields]\n", - " : : : : : : : : +- !GpuBroadcastHashJoin [orig_channel#26289], [orig_channel#27717], LeftOuter, BuildRight\n", - " : : : : : : : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, orig_channel#26289, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, first_home_buyer#26301, loan_purpose#26302, property_type#26303, num_units#26304, occupancy_status#26305, property_state#26306, ... 3 more fields]\n", - " : : : : : : : : : +- !GpuShuffledHashJoin [loan_id#26224L, quarter#26255], [loan_id#26288L, quarter#26313], Inner, BuildRight\n", - " : : : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(loan_id#26224L, quarter#26255, 192), true, [id=#17112]\n", - " : : : : : : : : : : +- !GpuProject [quarter#26255, loan_id#26224L, interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036]\n", - " : : : : : : : : : : +- !GpuShuffledHashJoin [quarter#26255, loan_id#26224L, cast(timestamp_year#27100 as bigint), cast(timestamp_month#27064 as bigint)], [quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L], LeftOuter, BuildRight\n", - " : : : : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26255, loan_id#26224L, cast(timestamp_year#27100 as bigint), cast(timestamp_month#27064 as bigint), 192), true, [id=#17081]\n", - " : : : : : : : : : : : +- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : : : : : : : : +- *(1) Project [loan_id#26224L, interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, quarter#26255, month(cast(cast(unix_timestamp(monthly_reporting_period#26225, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#27064, year(cast(cast(unix_timestamp(monthly_reporting_period#26225, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#27100]\n", - " : : : : : : : : : : : +- *(1) GpuColumnarToRow false\n", - " : : : : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26224L) AND gpuisnotnull(quarter#26255))\n", - " : : : : : : : : : : : +- GpuFileScan parquet [loan_id#26224L,monthly_reporting_period#26225,interest_rate#26227,current_actual_upb#26228,loan_age#26229,msa#26233,current_loan_delinquency_status#26234,mod_flag#26235,non_interest_bearing_upb#26250,quarter#26255] Batched: true, DataFilters: [isnotnull(loan_id#26224L), isnotnull(quarter#26255)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN 
cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n", - " : : : : : : : : : : +- *(3) GpuColumnarToRow false\n", - " : : : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n", - " : : : : : : : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", - " : : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n", - " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n", - " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : : : : : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n", - " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n", - " : : : : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n", - " : : : : : : : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n", - " : : : : : : : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, 
quarter#26916], LeftOuter, BuildRight\n", - " : : : : : : : : : :- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : : : : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n", - " : : : : : : : : : : +- *(2) GpuColumnarToRow false\n", - " : : : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n", - " : : : : : : : : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n", - " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n", - " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : : : : : : : : +- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : : : : : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n", - " : : : : : : : : : +- *(3) GpuColumnarToRow false\n", - " : : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n", - " : : : : : : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", - " : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, 
[id=#15140]\n", - " : : : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n", - " : : : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : : : : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n", - " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n", - " : : : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n", - " : : : : : : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n", - " : : : : : : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n", - " : : : : : : : : :- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : : : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n", - " : : : : : : : : : +- *(2) GpuColumnarToRow false\n", - " : : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n", - " : : : : : : : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), 
IsNotNull(quarter)], ReadSchema: struct\n", - " : : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n", - " : : : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n", - " : : : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : : : : : : : +- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : : : : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n", - " : : : : : : : : +- *(3) GpuColumnarToRow false\n", - " : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n", - " : : : : : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", - " : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n", - " : : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n", - " : : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : : : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - 
month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n", - " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n", - " : : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n", - " : : : : : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n", - " : : : : : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n", - " : : : : : : : :- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n", - " : : : : : : : : +- *(2) GpuColumnarToRow false\n", - " : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n", - " : : : : : : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n", - " : : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n", - " : : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : : : : : : +- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : : : : +- *(3) 
Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n", - " : : : : : : : +- *(3) GpuColumnarToRow false\n", - " : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n", - " : : : : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", - " : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n", - " : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n", - " : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n", - " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n", - " : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, 
ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n", - " : : : : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n", - " : : : : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n", - " : : : : : : :- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n", - " : : : : : : : +- *(2) GpuColumnarToRow false\n", - " : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n", - " : : : : : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n", - " : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n", - " : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : : : : : +- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n", - " : : : : : : +- *(3) GpuColumnarToRow false\n", - " : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n", - " : : : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: 
InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", - " : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n", - " : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n", - " : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n", - " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n", - " : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n", - " : : : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n", - " : : : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n", - " : : : : : :- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n", - " : : : : : : +- *(2) GpuColumnarToRow false\n", - " : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n", - " : : : : : : +- GpuFileScan parquet 
[loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n", - " : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n", - " : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : : : : +- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n", - " : : : : : +- *(3) GpuColumnarToRow false\n", - " : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n", - " : : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : : : +- GpuCoalesceBatches RequireSingleBatch\n", - " : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n", - " : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n", - " : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, 
delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n", - " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n", - " : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n", - " : : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n", - " : : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n", - " : : : : :- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n", - " : : : : : +- *(2) GpuColumnarToRow false\n", - " : : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n", - " : : : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n", - " : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n", - " : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, 
None, None))\n", - " : : : : +- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n", - " : : : : +- *(3) GpuColumnarToRow false\n", - " : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n", - " : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : : +- GpuCoalesceBatches RequireSingleBatch\n", - " : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n", - " : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n", - " : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n", - " : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n", - " : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, 
timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n", - " : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n", - " : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n", - " : : : :- GpuRowToColumnar TargetSize(536870912)\n", - " : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n", - " : : : : +- *(2) GpuColumnarToRow false\n", - " : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n", - " : : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n", - " : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n", - " : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : : +- GpuRowToColumnar TargetSize(536870912)\n", - " : : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n", - " : : : +- *(3) GpuColumnarToRow false\n", - " : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n", - " : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: 
[IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : +- GpuCoalesceBatches RequireSingleBatch\n", - " : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n", - " : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n", - " : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n", - " : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n", - " : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n", - " : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n", - " : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n", - " : : :- GpuRowToColumnar TargetSize(536870912)\n", - " : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n", - " : : : +- *(2) GpuColumnarToRow false\n", - " : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n", - " : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: 
InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n", - " : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n", - " : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : : +- GpuRowToColumnar TargetSize(536870912)\n", - " : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n", - " : : +- *(3) GpuColumnarToRow false\n", - " : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n", - " : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : +- GpuCoalesceBatches RequireSingleBatch\n", - " : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n", - " : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n", - " : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n", - " : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, 
ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n", - " : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n", - " : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n", - " : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n", - " : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n", - " : :- GpuRowToColumnar TargetSize(536870912)\n", - " : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n", - " : : +- *(2) GpuColumnarToRow false\n", - " : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n", - " : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", - " : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n", - " : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : +- GpuCoalesceBatches TargetSize(536870912)\n", - " : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n", - " : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n", - " : +- GpuRowToColumnar TargetSize(536870912)\n", - " : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, 
CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n", - " : +- *(3) GpuColumnarToRow false\n", - " : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n", - " : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n", + " : : : : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n", + " : : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n", + " : : : : : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n", + " : : : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n", + " : : : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n", + " : : : : : : : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n", + " 
: : : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n", + " : : : : : : : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n", + " : : : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n", + " : : : : : : : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n", + " : : : : : : : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n", + " : : : : : : : : : : :- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n", + " : : : : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n", + " : : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n", + " : : : : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n", + " : : : : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), 
partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n", + " : : : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n", + " : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n", + " : : : : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n", + " : : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n", + " : : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n", + " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n", + " : : : : : : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND 
isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n", + " : : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n", + " : : : : : : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n", + " : : : : : : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n", + " : : : : : : : : : :- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n", + " : : : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n", + " : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n", + " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n", + " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN 
(current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n", + " : : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n", + " : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n", + " : : : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n", + " : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n", + " : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n", + " : : : : : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n", + " : : : : : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n", + " : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], 
false, [month_y#2248]\n", + " : : : : : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n", + " : : : : : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n", + " : : : : : : : : :- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n", + " : : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n", + " : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n", + " : : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n", + " : : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n", + " : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n", + " : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: 
struct\n", + " : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n", + " : : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n", + " : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n", + " : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n", + " : : : : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n", + " : : : : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n", + " : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n", + " : : : : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n", + " : : : : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n", + " : : : : : : : :- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n", + 
" : : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n", + " : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n", + " : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n", + " : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n", + " : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n", + " : : : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n", + " : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n", + " : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n", + " : : : : : : +- 
GpuColumnarToRow false\n", + " : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n", + " : : : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n", + " : : : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n", + " : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n", + " : : : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n", + " : : : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n", + " : : : : : : :- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n", + " : : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n", + " : : : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n", + " : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), 
gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n", + " : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n", + " : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n", + " : : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : +- GpuColumnarToRow false\n", + " : : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n", + " : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n", + " : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n", + " : : : : : +- GpuColumnarToRow false\n", + " : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n", + " : : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n", + " : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, 
delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n", + " : : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n", + " : : : : : +- GpuColumnarToRow false\n", + " : : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n", + " : : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n", + " : : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n", + " : : : : : :- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n", + " : : : : : : +- GpuColumnarToRow false\n", + " : : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n", + " : : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n", + " : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n", + " : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN 
(current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n", + " : : : : : +- GpuColumnarToRow false\n", + " : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n", + " : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : +- GpuColumnarToRow false\n", + " : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n", + " : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n", + " : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n", + " : : : : +- GpuColumnarToRow false\n", + " : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n", + " : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n", + " : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n", + " : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n", + " : : : : +- GpuColumnarToRow false\n", + " : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, 
delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n", + " : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n", + " : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n", + " : : : : :- GpuRowToColumnar targetsize(536870912)\n", + " : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n", + " : : : : : +- GpuColumnarToRow false\n", + " : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n", + " : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n", + " : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : +- GpuShuffleCoalesce 536870912\n", + " : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n", + " : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n", + " : : : : +- GpuColumnarToRow false\n", + " : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n", + " : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : +- GpuColumnarToRow false\n", + " : : : +- GpuSort 
[quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n", + " : : : +- GpuShuffleCoalesce 536870912\n", + " : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n", + " : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n", + " : : : +- GpuColumnarToRow false\n", + " : : : +- GpuShuffleCoalesce 536870912\n", + " : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n", + " : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n", + " : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n", + " : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n", + " : : : +- GpuColumnarToRow false\n", + " : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n", + " : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n", + " : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n", + " : : : :- GpuRowToColumnar targetsize(536870912)\n", + " : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n", + " : : : : +- GpuColumnarToRow false\n", + " : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n", + " : : : : +- GpuFileGpuScan parquet 
[loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n", + " : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : +- GpuShuffleCoalesce 536870912\n", + " : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n", + " : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n", + " : : : +- GpuColumnarToRow false\n", + " : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n", + " : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : +- GpuColumnarToRow false\n", + " : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n", + " : : +- GpuShuffleCoalesce 536870912\n", + " : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n", + " : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n", + " : : +- GpuColumnarToRow false\n", + " : : +- GpuShuffleCoalesce 536870912\n", + " : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n", + 
" : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n", + " : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n", + " : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n", + " : : +- GpuColumnarToRow false\n", + " : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n", + " : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n", + " : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n", + " : : :- GpuRowToColumnar targetsize(536870912)\n", + " : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n", + " : : : +- GpuColumnarToRow false\n", + " : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n", + " : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n", + " : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : +- GpuShuffleCoalesce 536870912\n", + " : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n", + " : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), 
partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : +- GpuRowToColumnar targetsize(536870912)\n", + " : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n", + " : : +- GpuColumnarToRow false\n", + " : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n", + " : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : +- GpuColumnarToRow false\n", + " : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n", + " : +- GpuShuffleCoalesce 536870912\n", + " : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n", + " : +- GpuRowToColumnar targetsize(536870912)\n", + " : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n", + " : +- GpuColumnarToRow false\n", + " : +- GpuShuffleCoalesce 536870912\n", + " : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n", + " : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n", + " : +- GpuRowToColumnar targetsize(536870912)\n", + " : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n", + " : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 
24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n", + " : +- GpuColumnarToRow false\n", + " : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n", + " : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n", + " : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n", + " : :- GpuRowToColumnar targetsize(536870912)\n", + " : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n", + " : : +- GpuColumnarToRow false\n", + " : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n", + " : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n", + " : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : +- GpuShuffleCoalesce 536870912\n", + " : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n", + " : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n", + " : +- GpuRowToColumnar targetsize(536870912)\n", + " : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n", + " : +- GpuColumnarToRow false\n", + " : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n", + " : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: 
InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + "Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.08/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", "\n", "### 2. Download needed jars\n", - "* [rapids-4-spark_2.12-22.06.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0.jar)\n", + "* [rapids-4-spark_2.12-22.08.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar)\n", "\n", "### 3. Start Spark Standalone\n", "Before Running the script, please setup Spark standalone mode\n", "\n", "### 4. Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-22.06.0.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-22.08.0.jar\n", "\n", "```\n", "\n", - "### 5.Start Jupyter Notebook with spylon-kernal or toree\n", + "### 5.Start Jupyter Notebook with spylon-kernel or toree\n", "\n", "```\n", "$ jupyter notebook --allow-root --notebook-dir=${your-dir} --config=${your-configs}\n", @@ -66,38 +67,17 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "674771a8", + "execution_count": null, + "id": "b2834c06", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dataRoot = /data\n", - "perfPath = /data/mortgage/Performance/\n", - "acqPath = /data/mortgage/Acquisition/\n", - "outPath = /data/mortgage/output/\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "/data" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "val dataRoot = sys.env.getOrElse(\"DATA_ROOT\", \"/data\")\n", - "val perfPath = dataRoot + \"/mortgage/Performance/\"\n", - "val acqPath = dataRoot + \"/mortgage/Acquisition/\"\n", - "val outPath = dataRoot + \"/mortgage/output/\"\n" + "val dataOut = sys.env.getOrElse(\"DATA_OUT\", \"/data\")\n", + "val dataPath = dataRoot + \"/mortgage/input\"\n", + "val outPath = dataOut + \"/mortgage/output\"\n", + "val output_csv2parquet = dataOut + \"/mortgage/output/csv2parquet/\"\n", + "val saveTrainEvalDataset = true" ] }, { @@ -120,7 +100,7 @@ { "data": { "text/plain": [ - "performanceSchema = StructType(StructField(loan_id,LongType,true), StructField(monthly_reporting_period,StringType,true), StructField(servicer,StringType,true), StructField(interest_rate,DoubleType,true), StructField(current_actual_upb,DoubleType,true), StructField(loan_age,DoubleType,true), StructField(remaining_months_to_legal_maturity,DoubleType,true), StructField(adj_remaining_months_to_maturity,DoubleType,true), StructField(maturity_date,StringType,true), StructField(msa,DoubleType,true), StructField(current_loan_delinquency_status,IntegerType,true), StructField(mod_flag,StringType,true), StructField(zero_balance_code,StringType,true), StructField(zero_balance_effective_date,StringType,true), StructField(last_paid_installment_date,StringType,t...\n" + "rawSchema = StructType(StructField(reference_pool_id,StringType,true), StructField(loan_id,LongType,true), StructField(monthly_reporting_period,StringType,true), StructField(orig_channel,StringType,true), StructField(seller_name,StringType,true), StructField(servicer,StringType,true), StructField(master_servicer,StringType,true), StructField(orig_interest_rate,DoubleType,true), StructField(interest_rate,DoubleType,true), 
StructField(orig_upb,IntegerType,true), StructField(upb_at_issuance,StringType,true), StructField(current_actual_upb,DoubleType,true), StructField(orig_loan_term,IntegerType,true), StructField(orig_date,StringType,true), StructField(first_pay_date,StringType,true), StructField(loan_age,DoubleType,true), StructField(remaining_months...\n" ] }, "metadata": {}, @@ -129,7 +109,7 @@ { "data": { "text/plain": [ - "StructType(StructField(loan_id,LongType,true), StructField(monthly_reporting_period,StringType,true), StructField(servicer,StringType,true), StructField(interest_rate,DoubleType,true), StructField(current_actual_upb,DoubleType,true), StructField(loan_age,DoubleType,true), StructField(remaining_months_to_legal_maturity,DoubleType,true), StructField(adj_remaining_months_to_maturity,DoubleType,true), StructField(maturity_date,StringType,true), StructField(msa,DoubleType,true), StructField(current_loan_delinquency_status,IntegerType,true), StructField(mod_flag,StringType,true), StructField(zero_balance_code,StringType,true), StructField(zero_balance_effective_date,StringType,true), StructField(last_paid_installment_date,StringType,t..." + "StructType(StructField(reference_pool_id,StringType,true), StructField(loan_id,LongType,true), StructField(monthly_reporting_period,StringType,true), StructField(orig_channel,StringType,true), StructField(seller_name,StringType,true), StructField(servicer,StringType,true), StructField(master_servicer,StringType,true), StructField(orig_interest_rate,DoubleType,true), StructField(interest_rate,DoubleType,true), StructField(orig_upb,IntegerType,true), StructField(upb_at_issuance,StringType,true), StructField(current_actual_upb,DoubleType,true), StructField(orig_loan_term,IntegerType,true), StructField(orig_date,StringType,true), StructField(first_pay_date,StringType,true), StructField(loan_age,DoubleType,true), StructField(remaining_months..." 
] }, "execution_count": 3, @@ -139,21 +119,57 @@ ], "source": [ "// File schema\n", - "val performanceSchema = StructType(Array(\n", + "val rawSchema = StructType(Array(\n", + " StructField(\"reference_pool_id\", StringType),\n", " StructField(\"loan_id\", LongType),\n", " StructField(\"monthly_reporting_period\", StringType),\n", + " StructField(\"orig_channel\", StringType),\n", + " StructField(\"seller_name\", StringType),\n", " StructField(\"servicer\", StringType),\n", + " StructField(\"master_servicer\", StringType),\n", + " StructField(\"orig_interest_rate\", DoubleType),\n", " StructField(\"interest_rate\", DoubleType),\n", + " StructField(\"orig_upb\", DoubleType),\n", + " StructField(\"upb_at_issuance\", StringType),\n", " StructField(\"current_actual_upb\", DoubleType),\n", + " StructField(\"orig_loan_term\", IntegerType),\n", + " StructField(\"orig_date\", StringType),\n", + " StructField(\"first_pay_date\", StringType), \n", " StructField(\"loan_age\", DoubleType),\n", " StructField(\"remaining_months_to_legal_maturity\", DoubleType),\n", " StructField(\"adj_remaining_months_to_maturity\", DoubleType),\n", " StructField(\"maturity_date\", StringType),\n", + " StructField(\"orig_ltv\", DoubleType),\n", + " StructField(\"orig_cltv\", DoubleType),\n", + " StructField(\"num_borrowers\", DoubleType),\n", + " StructField(\"dti\", DoubleType),\n", + " StructField(\"borrower_credit_score\", DoubleType),\n", + " StructField(\"coborrow_credit_score\", DoubleType),\n", + " StructField(\"first_home_buyer\", StringType),\n", + " StructField(\"loan_purpose\", StringType),\n", + " StructField(\"property_type\", StringType),\n", + " StructField(\"num_units\", IntegerType),\n", + " StructField(\"occupancy_status\", StringType),\n", + " StructField(\"property_state\", StringType),\n", " StructField(\"msa\", DoubleType),\n", + " StructField(\"zip\", IntegerType),\n", + " StructField(\"mortgage_insurance_percent\", DoubleType),\n", + " StructField(\"product_type\", StringType),\n", + " StructField(\"prepayment_penalty_indicator\", StringType),\n", + " StructField(\"interest_only_loan_indicator\", StringType),\n", + " StructField(\"interest_only_first_principal_and_interest_payment_date\", StringType),\n", + " StructField(\"months_to_amortization\", StringType),\n", " StructField(\"current_loan_delinquency_status\", IntegerType),\n", + " StructField(\"loan_payment_history\", StringType),\n", " StructField(\"mod_flag\", StringType),\n", + " StructField(\"mortgage_insurance_cancellation_indicator\", StringType),\n", " StructField(\"zero_balance_code\", StringType),\n", " StructField(\"zero_balance_effective_date\", StringType),\n", + " StructField(\"upb_at_the_time_of_removal\", StringType),\n", + " StructField(\"repurchase_date\", StringType),\n", + " StructField(\"scheduled_principal_current\", StringType),\n", + " StructField(\"total_principal_current\", StringType),\n", + " StructField(\"unscheduled_principal_current\", StringType),\n", " StructField(\"last_paid_installment_date\", StringType),\n", " StructField(\"foreclosed_after\", StringType),\n", " StructField(\"disposition_date\", StringType),\n", @@ -168,37 +184,51 @@ " StructField(\"other_foreclosure_proceeds\", DoubleType),\n", " StructField(\"non_interest_bearing_upb\", DoubleType),\n", " StructField(\"principal_forgiveness_upb\", StringType),\n", - " StructField(\"repurchase_make_whole_proceeds_flag\", StringType),\n", - " StructField(\"foreclosure_principal_write_off_amount\", StringType),\n", - " 
StructField(\"servicing_activity_indicator\", StringType))\n", - " )\n", - "\n", - "val acquisitionSchema = StructType(Array(\n", - " StructField(\"loan_id\", LongType),\n", - " StructField(\"orig_channel\", StringType),\n", - " StructField(\"seller_name\", StringType),\n", - " StructField(\"orig_interest_rate\", DoubleType),\n", - " StructField(\"orig_upb\", IntegerType),\n", - " StructField(\"orig_loan_term\", IntegerType),\n", - " StructField(\"orig_date\", StringType),\n", - " StructField(\"first_pay_date\", StringType),\n", - " StructField(\"orig_ltv\", DoubleType),\n", - " StructField(\"orig_cltv\", DoubleType),\n", - " StructField(\"num_borrowers\", DoubleType),\n", - " StructField(\"dti\", DoubleType),\n", - " StructField(\"borrower_credit_score\", DoubleType),\n", - " StructField(\"first_home_buyer\", StringType),\n", - " StructField(\"loan_purpose\", StringType),\n", - " StructField(\"property_type\", StringType),\n", - " StructField(\"num_units\", IntegerType),\n", - " StructField(\"occupancy_status\", StringType),\n", - " StructField(\"property_state\", StringType),\n", - " StructField(\"zip\", IntegerType),\n", - " StructField(\"mortgage_insurance_percent\", DoubleType),\n", - " StructField(\"product_type\", StringType),\n", - " StructField(\"coborrow_credit_score\", DoubleType),\n", + " StructField(\"original_list_start_date\", StringType),\n", + " StructField(\"original_list_price\", StringType),\n", + " StructField(\"current_list_start_date\", StringType),\n", + " StructField(\"current_list_price\", StringType),\n", + " StructField(\"borrower_credit_score_at_issuance\", StringType),\n", + " StructField(\"co-borrower_credit_score_at_issuance\", StringType),\n", + " StructField(\"borrower_credit_score_current\", StringType),\n", + " StructField(\"co-Borrower_credit_score_current\", StringType),\n", " StructField(\"mortgage_insurance_type\", DoubleType),\n", - " StructField(\"relocation_mortgage_indicator\", StringType))\n", + " StructField(\"servicing_activity_indicator\", StringType),\n", + " StructField(\"current_period_modification_loss_amount\", StringType),\n", + " StructField(\"cumulative_modification_loss_amount\", StringType),\n", + " StructField(\"current_period_credit_event_net_gain_or_loss\", StringType),\n", + " StructField(\"cumulative_credit_event_net_gain_or_loss\", StringType),\n", + " StructField(\"homeready_program_indicator\", StringType),\n", + " StructField(\"foreclosure_principal_write_off_amount\", StringType),\n", + " StructField(\"relocation_mortgage_indicator\", StringType),\n", + " StructField(\"zero_balance_code_change_date\", StringType),\n", + " StructField(\"loan_holdback_indicator\", StringType),\n", + " StructField(\"loan_holdback_effective_date\", StringType),\n", + " StructField(\"delinquent_accrued_interest\", StringType),\n", + " StructField(\"property_valuation_method\", StringType),\n", + " StructField(\"high_balance_loan_indicator\", StringType),\n", + " StructField(\"arm_initial_fixed-rate_period_lt_5_yr_indicator\", StringType),\n", + " StructField(\"arm_product_type\", StringType),\n", + " StructField(\"initial_fixed-rate_period\", StringType),\n", + " StructField(\"interest_rate_adjustment_frequency\", StringType),\n", + " StructField(\"next_interest_rate_adjustment_date\", StringType),\n", + " StructField(\"next_payment_change_date\", StringType),\n", + " StructField(\"index\", StringType),\n", + " StructField(\"arm_cap_structure\", StringType),\n", + " StructField(\"initial_interest_rate_cap_up_percent\", StringType),\n", + " 
StructField(\"periodic_interest_rate_cap_up_percent\", StringType),\n", + " StructField(\"lifetime_interest_rate_cap_up_percent\", StringType),\n", + " StructField(\"mortgage_margin\", StringType),\n", + " StructField(\"arm_balloon_indicator\", StringType),\n", + " StructField(\"arm_plan_number\", StringType),\n", + " StructField(\"borrower_assistance_plan\", StringType),\n", + " StructField(\"hltv_refinance_option_indicator\", StringType),\n", + " StructField(\"deal_name\", StringType),\n", + " StructField(\"repurchase_make_whole_proceeds_flag\", StringType),\n", + " StructField(\"alternative_delinquency_resolution\", StringType),\n", + " StructField(\"alternative_delinquency_resolution_count\", StringType),\n", + " StructField(\"total_deferral_amount\", StringType)\n", + " )\n", " )" ] }, @@ -356,7 +386,7 @@ " // So we strip off the .txt and everything after it\n", " // and then take everything after the last remaining _\n", " def apply(): Column = substring_index(\n", - " substring_index(input_file_name(), \".\", 1), \"_\", -1)\n", + " substring_index(input_file_name(), \".\", 1), \"/\", -1)\n", "}" ] }, @@ -413,7 +443,7 @@ "\n", "val numericCols = List(\n", " (\"orig_interest_rate\", FloatType),\n", - " (\"orig_upb\", IntegerType),\n", + " (\"orig_upb\", DoubleType),\n", " (\"orig_loan_term\", IntegerType),\n", " (\"orig_ltv\", FloatType),\n", " (\"orig_cltv\", FloatType),\n", @@ -556,6 +586,120 @@ " }" ] }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9e1fbb61", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "defined object extractPerfColumns\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "object extractPerfColumns{\n", + " def apply(rawDf : DataFrame) : DataFrame = {\n", + " val perfDf = rawDf.select(\n", + " col(\"loan_id\"),\n", + " date_format(to_date(col(\"monthly_reporting_period\"),\"MMyyyy\"), \"MM/dd/yyyy\").as(\"monthly_reporting_period\"),\n", + " upper(col(\"servicer\")).as(\"servicer\"),\n", + " col(\"interest_rate\"),\n", + " col(\"current_actual_upb\"),\n", + " col(\"loan_age\"),\n", + " col(\"remaining_months_to_legal_maturity\"),\n", + " col(\"adj_remaining_months_to_maturity\"),\n", + " date_format(to_date(col(\"maturity_date\"),\"MMyyyy\"), \"MM/yyyy\").as(\"maturity_date\"),\n", + " col(\"msa\"),\n", + " col(\"current_loan_delinquency_status\"),\n", + " col(\"mod_flag\"),\n", + " col(\"zero_balance_code\"),\n", + " date_format(to_date(col(\"zero_balance_effective_date\"),\"MMyyyy\"), \"MM/yyyy\").as(\"zero_balance_effective_date\"),\n", + " date_format(to_date(col(\"last_paid_installment_date\"),\"MMyyyy\"), \"MM/dd/yyyy\").as(\"last_paid_installment_date\"),\n", + " date_format(to_date(col(\"foreclosed_after\"),\"MMyyyy\"), \"MM/dd/yyyy\").as(\"foreclosed_after\"),\n", + " date_format(to_date(col(\"disposition_date\"),\"MMyyyy\"), \"MM/dd/yyyy\").as(\"disposition_date\"),\n", + " col(\"foreclosure_costs\"),\n", + " col(\"prop_preservation_and_repair_costs\"),\n", + " col(\"asset_recovery_costs\"),\n", + " col(\"misc_holding_expenses\"),\n", + " col(\"holding_taxes\"),\n", + " col(\"net_sale_proceeds\"),\n", + " col(\"credit_enhancement_proceeds\"),\n", + " col(\"repurchase_make_whole_proceeds\"),\n", + " col(\"other_foreclosure_proceeds\"),\n", + " col(\"non_interest_bearing_upb\"),\n", + " col(\"principal_forgiveness_upb\"),\n", + " col(\"repurchase_make_whole_proceeds_flag\"),\n", + " col(\"foreclosure_principal_write_off_amount\"),\n", + " col(\"servicing_activity_indicator\"),\n", + " 
col(\"quarter\")\n", + " )\n", + " \n", + " perfDf.select(\"*\").filter(\"current_actual_upb != 0.0\")\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ce429163", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "defined object extractAcqColumns\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "object extractAcqColumns{\n", + " def apply(rawDf : DataFrame) : DataFrame = {\n", + " val acqDf = rawDf.select(\n", + " col(\"loan_id\"),\n", + " col(\"orig_channel\"),\n", + " upper(col(\"seller_name\")).as(\"seller_name\"),\n", + " col(\"orig_interest_rate\"),\n", + " col(\"orig_upb\"),\n", + " col(\"orig_loan_term\"),\n", + " date_format(to_date(col(\"orig_date\"),\"MMyyyy\"), \"MM/yyyy\").as(\"orig_date\"),\n", + " date_format(to_date(col(\"first_pay_date\"),\"MMyyyy\"), \"MM/yyyy\").as(\"first_pay_date\"),\n", + " col(\"orig_ltv\"),\n", + " col(\"orig_cltv\"),\n", + " col(\"num_borrowers\"),\n", + " col(\"dti\"),\n", + " col(\"borrower_credit_score\"),\n", + " col(\"first_home_buyer\"),\n", + " col(\"loan_purpose\"),\n", + " col(\"property_type\"),\n", + " col(\"num_units\"),\n", + " col(\"occupancy_status\"),\n", + " col(\"property_state\"),\n", + " col(\"zip\"),\n", + " col(\"mortgage_insurance_percent\"),\n", + " col(\"product_type\"),\n", + " col(\"coborrow_credit_score\"),\n", + " col(\"mortgage_insurance_type\"),\n", + " col(\"relocation_mortgage_indicator\"),\n", + " col(\"quarter\"),\n", + " dense_rank().over(Window.partitionBy(\"loan_id\").orderBy(to_date(col(\"monthly_reporting_period\"),\"MMyyyy\"))).as(\"rank\")\n", + " )\n", + "\n", + " acqDf.select(\"*\").filter(col(\"rank\") === 1)\n", + " }\n", + "\n", + "}" + ] + }, { "cell_type": "markdown", "id": "37c64d85", @@ -566,15 +710,15 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "id": "98d37174", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "sparkSession = org.apache.spark.sql.SparkSession@1d87c1c2\n", - "reader = org.apache.spark.sql.DataFrameReader@2e8a7a69\n" + "sparkSession = org.apache.spark.sql.SparkSession@694178ec\n", + "reader = org.apache.spark.sql.DataFrameReader@4b2afd51\n" ] }, "metadata": {}, @@ -583,18 +727,30 @@ { "data": { "text/plain": [ - "org.apache.spark.sql.DataFrameReader@2e8a7a69" + "org.apache.spark.sql.DataFrameReader@4b2afd51" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Build the spark session and data reader as usual\n", - "val sparkSession = SparkSession.builder.appName(\"mortgage-gpu\").getOrCreate\n", - "val reader = sparkSession.read.option(\"header\", true).schema(performanceSchema)" + "val sparkSession = SparkSession.builder.appName(\"mortgage-gpu\").config(\"spark.sql.cache.serializer\", \"com.nvidia.spark.ParquetCachedBatchSerializer\").getOrCreate\n", + "\n", + "// GPU run, set to true\n", + "sparkSession.conf.set(\"spark.rapids.sql.enabled\", true)\n", + "// CPU run, set to false\n", + "// sparkSession.conf.set('spark.rapids.sql.enabled', 'false')\n", + "// remove config(\"spark.sql.cache.serializer\", \"com.nvidia.spark.ParquetCachedBatchSerializer\") for CPU\n", + "sparkSession.conf.set(\"spark.sql.files.maxPartitionBytes\", \"1G\")\n", + "sparkSession.conf.set(\"spark.sql.broadcastTimeout\", 700)\n", + "sparkSession.conf.set(\"spark.rapids.sql.hasNans\", false)\n", + "// use GPU to read CSV\n", + "sparkSession.conf.set(\"spark.rapids.sql.csv.read.double.enabled\", 
true)\n", + "\n", + "val reader = sparkSession.read.schema(rawSchema)" ] }, { @@ -607,7 +763,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "id": "5bac2301", "metadata": {}, "outputs": [ @@ -615,8 +771,9 @@ "data": { "text/plain": [ "optionsMap = Map(header -> true)\n", + "rawDf = [reference_pool_id: string, loan_id: bigint ... 107 more fields]\n", "perfSet = [loan_id: bigint, monthly_reporting_period: string ... 30 more fields]\n", - "acqSet = [loan_id: bigint, orig_channel: string ... 24 more fields]\n" + "acqSet = [loan_id: bigint, orig_channel: string ... 25 more fields]\n" ] }, "metadata": {}, @@ -625,28 +782,28 @@ { "data": { "text/plain": [ - "[loan_id: bigint, orig_channel: string ... 24 more fields]" + "[loan_id: bigint, orig_channel: string ... 25 more fields]" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "val optionsMap = Map(\"header\" -> \"true\")\n", - "val perfSet = reader.options(optionsMap)\n", + "val rawDf_csv = reader.option(\"header\", false)\n", " .option(\"nullValue\", \"\")\n", " .option(\"delimiter\", \"|\")\n", " .option(\"parserLib\", \"univocity\")\n", - " .schema(performanceSchema)\n", - " .csv(perfPath)\n", + " .schema(rawSchema)\n", + " .csv(dataPath)\n", " .withColumn(\"quarter\", GetQuarterFromCsvFileName())\n", - "val acqSet = reader.options(optionsMap)\n", - " .option(\"delimiter\", \"|\")\n", - " .schema(acquisitionSchema)\n", - " .csv(acqPath)\n", - " .withColumn(\"quarter\", GetQuarterFromCsvFileName())" + "\n", + "rawDf_csv.write.mode(\"overwrite\").parquet(output_csv2parquet)\n", + "val rawDf = spark.read.parquet(output_csv2parquet)\n", + "\n", + "val perfSet = extractPerfColumns(rawDf)\n", + "val acqSet = extractAcqColumns(rawDf)" ] }, { @@ -659,7 +816,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "id": "a16155cb", "metadata": {}, "outputs": [ @@ -681,7 +838,7 @@ "List(orig_channel, first_home_buyer, loan_purpose, property_type, occupancy_status, property_state, product_type, relocation_mortgage_indicator, seller_name, mod_flag, orig_interest_rate, orig_upb, orig_loan_term, orig_ltv, orig_cltv, num_borrowers, dti, borrower_credit_score, num_units, zip, mortgage_insurance_percent, current_loan_delinquency_status, current_actual_upb, interest_rate, loan_age, msa, non_interest_bearing_upb, delinquency_12)" ] }, - "execution_count": 11, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -816,7 +973,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "id": "78b76252", "metadata": {}, "outputs": [ @@ -859,7 +1016,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "id": "ffdb0a62", "metadata": {}, "outputs": [ @@ -867,16 +1024,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Elapsed time : 35.638s\n" + "Elapsed time : 399.241s\n" ] }, { "data": { "text/plain": [ - "t0 = 1654138715501\n", + "t0 = 1656695479451\n", "optionsMap = Map(header -> true)\n", "rawDF = [orig_channel: int, first_home_buyer: int ... 
26 more fields]\n", - "t1 = 1654138751139\n" + "t1 = 1656695878692\n" ] }, "metadata": {}, @@ -885,42 +1042,47 @@ { "data": { "text/plain": [ - "1654138751139" + "1656695878692" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val t0 = System.currentTimeMillis\n", - "val optionsMap = Map(\"header\" -> \"true\")\n", "val rawDF = transform(\n", " perfSet,\n", " acqSet,\n", " sparkSession\n", " )\n", - "rawDF.write.mode(\"overwrite\").parquet(new Path(outPath, \"data\").toString)\n", + "\n", + "val etlDataPath = new Path(outPath, \"data\").toString\n", + "rawDF.write.mode(\"overwrite\").parquet(etlDataPath)\n", + "\n", + "if(saveTrainEvalDataset == true)\n", + "{\n", + " val etlDf = sparkSession.read.parquet(etlDataPath)\n", + " val sets = etlDf.randomSplit(Array[Double](0.8, 0.2))\n", + " val train = sets(0)\n", + " val eval = sets(1)\n", + " train.write.mode(\"overwrite\").parquet(new Path(outPath, \"train\").toString)\n", + " eval.write.mode(\"overwrite\").parquet(new Path(outPath, \"eval\").toString)\n", + "}\n", + "\n", + "\n", "val t1 = System.currentTimeMillis\n", "println(\"Elapsed time : \" + ((t1 - t0).toFloat / 1000) + \"s\")\n", "sparkSession.stop()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4388fe96", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "XGBoost4j-Spark-2206 - Scala", + "display_name": "XGBoost4j-Spark Scala", "language": "scala", - "name": "xgboost4j-spark-2206_scala" + "name": "XGBoost4j-Spark_scala" }, "language_info": { "codemirror_mode": "text/x-scala", @@ -933,4 +1095,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb index 6eaec8c72..872e09c2d 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb @@ -47,39 +47,15 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dataRoot = /data\n", - "trainPath = /data/mortgage/csv/train/\n", - "evalPath = /data/mortgage/csv/test/\n", - "transPath = /data/mortgage/csv/test/\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "/data/mortgage/csv/test/" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "// You need to update them to your real paths! The input data files can be the output of mortgage-etl jobs, or you can\n", - "// just use the provided sample datasets upder datasets path. \n", + "// You need to update them to your real paths! 
The input data files are the output of the mortgage-etl job\n", "val dataRoot = sys.env.getOrElse(\"DATA_ROOT\", \"/data\")\n", - "val trainPath = dataRoot + \"/mortgage/csv/train/\"\n", - "val evalPath = dataRoot + \"/mortgage/csv/test/\"\n", - "val transPath = dataRoot + \"/mortgage/csv/test/\"" + "val trainPath = dataRoot + \"/mortgage/output/train/\"\n", + "val evalPath = dataRoot + \"/mortgage/output/eval/\"\n", + "val transPath = dataRoot + \"/mortgage/output/eval/\"" ] }, { @@ -132,7 +108,7 @@ " StructField(\"seller_name\", DoubleType),\n", " StructField(\"mod_flag\", DoubleType),\n", " StructField(\"orig_interest_rate\", DoubleType),\n", - " StructField(\"orig_upb\", IntegerType),\n", + " StructField(\"orig_upb\", DoubleType),\n", " StructField(\"orig_loan_term\", IntegerType),\n", " StructField(\"orig_ltv\", DoubleType),\n", " StructField(\"orig_cltv\", DoubleType),\n", @@ -208,7 +184,8 @@ "source": [ "// Build the spark session and data reader as usual\n", "val sparkSession = SparkSession.builder.appName(\"mortgage-gpu\").getOrCreate\n", - "val reader = sparkSession.read.option(\"header\", true).schema(schema)" + "sparkSession.conf.set(\"spark.rapids.sql.hasNans\", false)\n", + "val reader = sparkSession.read" ] }, { @@ -239,10 +216,9 @@ } ], "source": [ - "// Please make sure to change the api to reader.parquet if you load parquet files.\n", - "val trainSet = reader.csv(trainPath)\n", - "val evalSet = reader.csv(evalPath)\n", - "val transSet = reader.csv(transPath)" + "val trainSet = reader.parquet(trainPath)\n", + "val evalSet = reader.parquet(evalPath)\n", + "val transSet = reader.parquet(transPath)" ] }, { @@ -588,9 +564,9 @@ } ], "source": [ - "xgbClassificationModel.write.overwrite.save(dataRoot + \"/model/mortgage\")\n", + "xgbClassificationModel.write.overwrite.save(dataRoot + \"/mortgage/model/\")\n", "\n", - "val modelFromDisk = XGBoostClassificationModel.load(dataRoot + \"/model/mortgage\")\n", + "val modelFromDisk = XGBoostClassificationModel.load(dataRoot + \"/mortgage/model/\")\n", "\n", "val (results2, _) = Benchmark.time(\"transform2\") {\n", " modelFromDisk.transform(transSet)\n", @@ -606,20 +582,13 @@ "source": [ "sparkSession.close()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "XGBoost4j-Spark-2206 - Scala", + "display_name": "XGBoost4j-Spark - Scala", "language": "scala", - "name": "xgboost4j-spark-2206_scala" + "name": "XGBoost4j-Spark_scala" }, "language_info": { "codemirror_mode": "text/x-scala", @@ -632,4 +601,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage_gpu_crossvalidation.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage_gpu_crossvalidation.ipynb index a83c2bcdb..812436087 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage_gpu_crossvalidation.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage_gpu_crossvalidation.ipynb @@ -23,7 +23,7 @@ "import org.apache.spark.sql.SparkSession\n", "import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\n", "import org.apache.spark.ml.tuning.{ParamGridBuilder,CrossValidator}\n", - "import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType}" + "import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType, DoubleType}" ] }, { @@ -42,36 +42,14 @@ }, { "cell_type": "code", - 
"execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dataRoot = /data\n", - "trainParquetPath = /data/mortgage/parquet/train\n", - "evalParquetPath = /data/mortgage/parquet/eval\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "/data/mortgage/parquet/eval" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "// You need to update them to your real paths!\n", "val dataRoot = sys.env.getOrElse(\"DATA_ROOT\", \"/data\")\n", - "val trainParquetPath=dataRoot + \"/mortgage/parquet/train\"\n", - "val evalParquetPath=dataRoot + \"/mortgage/parquet/eval\"" + "val trainParquetPath=dataRoot + \"/mortgage/output/train\"\n", + "val evalParquetPath=dataRoot + \"/mortgage/output/eval\"" ] }, { @@ -83,30 +61,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "labelColName = delinquency_12\n", - "schema = StructType(StructField(orig_channel,FloatType,true), StructField(first_home_buyer,FloatType,true), StructField(loan_purpose,FloatType,true), StructField(property_type,FloatType,true), StructField(occupancy_status,FloatType,true), StructField(property_state,FloatType,true), StructField(product_type,FloatType,true), StructField(relocation_mortgage_indicator,FloatType,true), StructField(seller_name,FloatType,true), StructField(mod_flag,FloatType,true), StructField(orig_interest_rate,FloatType,true), StructField(orig_upb,IntegerType,true), StructField(orig_loan_term,IntegerType,true), StructField(orig_ltv,FloatType,true), StructField(orig_cltv,FloatType,true), StructField(num_borrowers,FloatType,true), Str...\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "StructType(StructField(orig_channel,FloatType,true), StructField(first_home_buyer,FloatType,true), StructField(loan_purpose,FloatType,true), StructField(property_type,FloatType,true), StructField(occupancy_status,FloatType,true), StructField(property_state,FloatType,true), StructField(product_type,FloatType,true), StructField(relocation_mortgage_indicator,FloatType,true), StructField(seller_name,FloatType,true), StructField(mod_flag,FloatType,true), StructField(orig_interest_rate,FloatType,true), StructField(orig_upb,IntegerType,true), StructField(orig_loan_term,IntegerType,true), StructField(orig_ltv,FloatType,true), StructField(orig_cltv,FloatType,true), StructField(num_borrowers,FloatType,true), Str..." 
- ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "val labelColName = \"delinquency_12\"\n", "val schema = StructType(List(\n", @@ -121,7 +78,7 @@ " StructField(\"seller_name\", FloatType),\n", " StructField(\"mod_flag\", FloatType),\n", " StructField(\"orig_interest_rate\", FloatType),\n", - " StructField(\"orig_upb\", IntegerType),\n", + " StructField(\"orig_upb\", DoubleType),\n", " StructField(\"orig_loan_term\", IntegerType),\n", " StructField(\"orig_ltv\", FloatType),\n", " StructField(\"orig_cltv\", FloatType),\n", @@ -480,20 +437,13 @@ "source": [ "spark.close()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "XGBoost4j-Spark-2206 - Scala", + "display_name": "XGBoost4j-Spark - Scala", "language": "scala", - "name": "xgboost4j-spark-2206_scala" + "name": "XGBoost4j-Spark_scala" }, "language_info": { "codemirror_mode": "text/x-scala", @@ -506,4 +456,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py index 7782d84c5..1cca6e6d8 100644 --- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py +++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py @@ -30,7 +30,7 @@ StructField('seller_name', FloatType()), StructField('mod_flag', FloatType()), StructField('orig_interest_rate', FloatType()), - StructField('orig_upb', IntegerType()), + StructField('orig_upb', DoubleType()), StructField('orig_loan_term', IntegerType()), StructField('orig_ltv', FloatType()), StructField('orig_cltv', FloatType()), @@ -133,67 +133,117 @@ 'Other REFINANCE': 'OTHER REFINANCE', } -performance_schema = StructType([ - StructField('loan_id', LongType()), - StructField('monthly_reporting_period', StringType()), - StructField('servicer', StringType()), - StructField('interest_rate', DoubleType()), - StructField('current_actual_upb', DoubleType()), - StructField('loan_age', DoubleType()), - StructField('remaining_months_to_legal_maturity', DoubleType()), - StructField('adj_remaining_months_to_maturity', DoubleType()), - StructField('maturity_date', StringType()), - StructField('msa', DoubleType()), - StructField('current_loan_delinquency_status', IntegerType()), - StructField('mod_flag', StringType()), - StructField('zero_balance_code', StringType()), - StructField('zero_balance_effective_date', StringType()), - StructField('last_paid_installment_date', StringType()), - StructField('foreclosed_after', StringType()), - StructField('disposition_date', StringType()), - StructField('foreclosure_costs', DoubleType()), - StructField('prop_preservation_and_repair_costs', DoubleType()), - StructField('asset_recovery_costs', DoubleType()), - StructField('misc_holding_expenses', DoubleType()), - StructField('holding_taxes', DoubleType()), - StructField('net_sale_proceeds', DoubleType()), - StructField('credit_enhancement_proceeds', DoubleType()), - StructField('repurchase_make_whole_proceeds', StringType()), - StructField('other_foreclosure_proceeds', DoubleType()), - StructField('non_interest_bearing_upb', DoubleType()), - StructField('principal_forgiveness_upb', StringType()), - StructField('repurchase_make_whole_proceeds_flag', StringType()), - 
StructField('foreclosure_principal_write_off_amount', StringType()), - StructField('servicing_activity_indicator', StringType()), -]) -acquisition_schema = StructType([ - StructField('loan_id', LongType()), - StructField('orig_channel', StringType()), - StructField('seller_name', StringType()), - StructField('orig_interest_rate', DoubleType()), - StructField('orig_upb', IntegerType()), - StructField('orig_loan_term', IntegerType()), - StructField('orig_date', StringType()), - StructField('first_pay_date', StringType()), - StructField('orig_ltv', DoubleType()), - StructField('orig_cltv', DoubleType()), - StructField('num_borrowers', DoubleType()), - StructField('dti', DoubleType()), - StructField('borrower_credit_score', DoubleType()), - StructField('first_home_buyer', StringType()), - StructField('loan_purpose', StringType()), - StructField('property_type', StringType()), - StructField('num_units', IntegerType()), - StructField('occupancy_status', StringType()), - StructField('property_state', StringType()), - StructField('zip', IntegerType()), - StructField('mortgage_insurance_percent', DoubleType()), - StructField('product_type', StringType()), - StructField('coborrow_credit_score', DoubleType()), - StructField('mortgage_insurance_type', DoubleType()), - StructField('relocation_mortgage_indicator', StringType()), -]) +rawSchema = StructType([ + StructField("reference_pool_id", StringType()), + StructField("loan_id", LongType()), + StructField("monthly_reporting_period", StringType()), + StructField("orig_channel", StringType()), + StructField("seller_name", StringType()), + StructField("servicer", StringType()), + StructField("master_servicer", StringType()), + StructField("orig_interest_rate", DoubleType()), + StructField("interest_rate", DoubleType()), + StructField("orig_upb", DoubleType()), + StructField("upb_at_issuance", StringType()), + StructField("current_actual_upb", DoubleType()), + StructField("orig_loan_term", IntegerType()), + StructField("orig_date", StringType()), + StructField("first_pay_date", StringType()), + StructField("loan_age", DoubleType()), + StructField("remaining_months_to_legal_maturity", DoubleType()), + StructField("adj_remaining_months_to_maturity", DoubleType()), + StructField("maturity_date", StringType()), + StructField("orig_ltv", DoubleType()), + StructField("orig_cltv", DoubleType()), + StructField("num_borrowers", DoubleType()), + StructField("dti", DoubleType()), + StructField("borrower_credit_score", DoubleType()), + StructField("coborrow_credit_score", DoubleType()), + StructField("first_home_buyer", StringType()), + StructField("loan_purpose", StringType()), + StructField("property_type", StringType()), + StructField("num_units", IntegerType()), + StructField("occupancy_status", StringType()), + StructField("property_state", StringType()), + StructField("msa", DoubleType()), + StructField("zip", IntegerType()), + StructField("mortgage_insurance_percent", DoubleType()), + StructField("product_type", StringType()), + StructField("prepayment_penalty_indicator", StringType()), + StructField("interest_only_loan_indicator", StringType()), + StructField("interest_only_first_principal_and_interest_payment_date", StringType()), + StructField("months_to_amortization", StringType()), + StructField("current_loan_delinquency_status", IntegerType()), + StructField("loan_payment_history", StringType()), + StructField("mod_flag", StringType()), + StructField("mortgage_insurance_cancellation_indicator", StringType()), + StructField("zero_balance_code", 
StringType()), + StructField("zero_balance_effective_date", StringType()), + StructField("upb_at_the_time_of_removal", StringType()), + StructField("repurchase_date", StringType()), + StructField("scheduled_principal_current", StringType()), + StructField("total_principal_current", StringType()), + StructField("unscheduled_principal_current", StringType()), + StructField("last_paid_installment_date", StringType()), + StructField("foreclosed_after", StringType()), + StructField("disposition_date", StringType()), + StructField("foreclosure_costs", DoubleType()), + StructField("prop_preservation_and_repair_costs", DoubleType()), + StructField("asset_recovery_costs", DoubleType()), + StructField("misc_holding_expenses", DoubleType()), + StructField("holding_taxes", DoubleType()), + StructField("net_sale_proceeds", DoubleType()), + StructField("credit_enhancement_proceeds", DoubleType()), + StructField("repurchase_make_whole_proceeds", StringType()), + StructField("other_foreclosure_proceeds", DoubleType()), + StructField("non_interest_bearing_upb", DoubleType()), + StructField("principal_forgiveness_upb", StringType()), + StructField("original_list_start_date", StringType()), + StructField("original_list_price", StringType()), + StructField("current_list_start_date", StringType()), + StructField("current_list_price", StringType()), + StructField("borrower_credit_score_at_issuance", StringType()), + StructField("co-borrower_credit_score_at_issuance", StringType()), + StructField("borrower_credit_score_current", StringType()), + StructField("co-Borrower_credit_score_current", StringType()), + StructField("mortgage_insurance_type", DoubleType()), + StructField("servicing_activity_indicator", StringType()), + StructField("current_period_modification_loss_amount", StringType()), + StructField("cumulative_modification_loss_amount", StringType()), + StructField("current_period_credit_event_net_gain_or_loss", StringType()), + StructField("cumulative_credit_event_net_gain_or_loss", StringType()), + StructField("homeready_program_indicator", StringType()), + StructField("foreclosure_principal_write_off_amount", StringType()), + StructField("relocation_mortgage_indicator", StringType()), + StructField("zero_balance_code_change_date", StringType()), + StructField("loan_holdback_indicator", StringType()), + StructField("loan_holdback_effective_date", StringType()), + StructField("delinquent_accrued_interest", StringType()), + StructField("property_valuation_method", StringType()), + StructField("high_balance_loan_indicator", StringType()), + StructField("arm_initial_fixed-rate_period_lt_5_yr_indicator", StringType()), + StructField("arm_product_type", StringType()), + StructField("initial_fixed-rate_period", StringType()), + StructField("interest_rate_adjustment_frequency", StringType()), + StructField("next_interest_rate_adjustment_date", StringType()), + StructField("next_payment_change_date", StringType()), + StructField("index", StringType()), + StructField("arm_cap_structure", StringType()), + StructField("initial_interest_rate_cap_up_percent", StringType()), + StructField("periodic_interest_rate_cap_up_percent", StringType()), + StructField("lifetime_interest_rate_cap_up_percent", StringType()), + StructField("mortgage_margin", StringType()), + StructField("arm_balloon_indicator", StringType()), + StructField("arm_plan_number", StringType()), + StructField("borrower_assistance_plan", StringType()), + StructField("hltv_refinance_option_indicator", StringType()), + StructField("deal_name", 
StringType()), + StructField("repurchase_make_whole_proceeds_flag", StringType()), + StructField("alternative_delinquency_resolution", StringType()), + StructField("alternative_delinquency_resolution_count", StringType()), + StructField("total_deferral_amount", StringType()) + ]) categorical_columns = [ 'orig_channel', diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py index eb3f40aef..47052737c 100644 --- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py +++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py @@ -16,9 +16,10 @@ from com.nvidia.spark.examples.mortgage.consts import * from pyspark.sql.functions import * from pyspark.sql.types import * +from pyspark.sql.window import Window from sys import exit -get_quarter = udf(lambda path: path.split(r'.')[0].split('_')[-1], StringType()) +get_quarter = udf(lambda path: path.split(r'.')[0].split('/')[-1], StringType()) standardize_name = udf(lambda name: name_mapping.get(name), StringType()) def load_data(spark, paths, schema, args, extra_csv_opts={}): @@ -31,18 +32,60 @@ def load_data(spark, paths, schema, args, extra_csv_opts={}): (reader .schema(schema) .option('delimiter', '|') - .option('header', args.hasHeader)) + .option('header', False)) for k, v in extra_csv_opts.items(): reader.option(k, v) return reader.load(paths) -def prepare_performance(spark, args): +def prepare_rawDf(spark, args): extra_csv_options = { 'nullValue': '', 'parserLib': 'univocity', } - paths = extract_paths(args.dataPaths, 'perf::') - performance = (load_data(spark, paths, performance_schema, args, extra_csv_options) + paths = extract_paths(args.dataPaths, 'data::') + rawDf = load_data(spark, paths, rawSchema, args, extra_csv_options) + + return rawDf + +def extract_perf_columns(rawDf): + perfDf = rawDf.select( + col("loan_id"), + date_format(to_date(col("monthly_reporting_period"),"MMyyyy"), "MM/dd/yyyy").alias("monthly_reporting_period"), + upper(col("servicer")).alias("servicer"), + col("interest_rate"), + col("current_actual_upb"), + col("loan_age"), + col("remaining_months_to_legal_maturity"), + col("adj_remaining_months_to_maturity"), + date_format(to_date(col("maturity_date"),"MMyyyy"), "MM/yyyy").alias("maturity_date"), + col("msa"), + col("current_loan_delinquency_status"), + col("mod_flag"), + col("zero_balance_code"), + date_format(to_date(col("zero_balance_effective_date"),"MMyyyy"), "MM/yyyy").alias("zero_balance_effective_date"), + date_format(to_date(col("last_paid_installment_date"),"MMyyyy"), "MM/dd/yyyy").alias("last_paid_installment_date"), + date_format(to_date(col("foreclosed_after"),"MMyyyy"), "MM/dd/yyyy").alias("foreclosed_after"), + date_format(to_date(col("disposition_date"),"MMyyyy"), "MM/dd/yyyy").alias("disposition_date"), + col("foreclosure_costs"), + col("prop_preservation_and_repair_costs"), + col("asset_recovery_costs"), + col("misc_holding_expenses"), + col("holding_taxes"), + col("net_sale_proceeds"), + col("credit_enhancement_proceeds"), + col("repurchase_make_whole_proceeds"), + col("other_foreclosure_proceeds"), + col("non_interest_bearing_upb"), + col("principal_forgiveness_upb"), + col("repurchase_make_whole_proceeds_flag"), + col("foreclosure_principal_write_off_amount"), + col("servicing_activity_indicator")) + + return perfDf.select("*").filter("current_actual_upb != 0.0") + + +def prepare_performance(spark, args, 
rawDf): + performance = (extract_perf_columns(rawDf) .withColumn('quarter', get_quarter(input_file_name())) .withColumn('timestamp', to_date(col('monthly_reporting_period'), 'MM/dd/yyyy')) .withColumn('timestamp_year', year(col('timestamp'))) @@ -133,8 +176,42 @@ def prepare_performance(spark, args): .join(to_join, ['quarter', 'loan_id', 'timestamp_year', 'timestamp_month'], 'left') .drop('timestamp_year', 'timestamp_month')) -def prepare_acquisition(spark, args): - return (load_data(spark, extract_paths(args.dataPaths, 'acq::'), acquisition_schema, args) +def extract_acq_columns(rawDf): + acqDf = rawDf.select( + col("loan_id"), + col("orig_channel"), + upper(col("seller_name")).alias("seller_name"), + col("orig_interest_rate"), + col("orig_upb"), + col("orig_loan_term"), + date_format(to_date(col("orig_date"),"MMyyyy"), "MM/yyyy").alias("orig_date"), + date_format(to_date(col("first_pay_date"),"MMyyyy"), "MM/yyyy").alias("first_pay_date"), + col("orig_ltv"), + col("orig_cltv"), + col("num_borrowers"), + col("dti"), + col("borrower_credit_score"), + col("first_home_buyer"), + col("loan_purpose"), + col("property_type"), + col("num_units"), + col("occupancy_status"), + col("property_state"), + col("zip"), + col("mortgage_insurance_percent"), + col("product_type"), + col("coborrow_credit_score"), + col("mortgage_insurance_type"), + col("relocation_mortgage_indicator"), + dense_rank().over(Window.partitionBy("loan_id").orderBy(to_date(col("monthly_reporting_period"),"MMyyyy"))).alias("rank") + ) + + return acqDf.select("*").filter(col("rank")==1) + + + +def prepare_acquisition(spark, args, rawDf): + return (extract_acq_columns(rawDf) .withColumn('quarter', get_quarter(input_file_name())) .withColumn('seller_name', standardize_name(col('seller_name')))) @@ -147,8 +224,12 @@ def extract_paths(paths, prefix): return results def etl(spark, args): - performance = prepare_performance(spark, args) - acquisition = prepare_acquisition(spark, args) + rawDf = prepare_rawDf(spark, args) + rawDf.write.parquet(extract_paths(args.dataPaths, 'tmp::')[0], mode='overwrite') + rawDf = spark.read.parquet(extract_paths(args.dataPaths, 'tmp::')[0]) + + performance = prepare_performance(spark, args, rawDf) + acquisition = prepare_acquisition(spark, args, rawDf) return (performance .join(acquisition, ['loan_id', 'quarter'], 'left_outer') .select( diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py index 6002f5056..55f5df5fc 100644 --- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py +++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py @@ -16,7 +16,6 @@ from com.nvidia.spark.examples.mortgage.consts import * from com.nvidia.spark.examples.mortgage.etl import etl, extract_paths from com.nvidia.spark.examples.utility.utils import * -from ml.dmlc.xgboost4j.scala.spark import * from pyspark.sql import SparkSession def main(args, xgboost_args): diff --git a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/ETLMain.scala b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/ETLMain.scala index f54d3d67c..d6b5db30a 100644 --- a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/ETLMain.scala +++ b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/ETLMain.scala @@ -31,17 +31,17 @@ object 
ETLMain extends Mortgage { val spark = SparkSession.builder().appName(appInfo.mkString("-")).getOrCreate() try { - val (perfPaths, acqPaths, outPath) = checkAndGetPaths(xgbArgs.dataPaths) + val (dataPaths, outPath, tmpPath) = checkAndGetPaths(xgbArgs.dataPaths) println("\n------ Start ETL ------") benchmark.time("ETL") { // ETL the raw data val rawDF = xgbArgs.format match { - case "csv" => XGBoostETL.csv(spark, perfPaths, acqPaths, xgbArgs.hasHeader) - case "orc" => XGBoostETL.orc(spark, perfPaths, acqPaths) - case "parquet" => XGBoostETL.parquet(spark, perfPaths, acqPaths) + case "csv" => XGBoostETL.csv(spark, dataPaths, tmpPath, false) + case "orc" => XGBoostETL.orc(spark, dataPaths) + case "parquet" => XGBoostETL.parquet(spark, dataPaths) case _ => throw new IllegalArgumentException("Unsupported data file format!") } - rawDF.write.mode("overwrite").parquet(new Path(outPath, "data").toString) + rawDF.write.mode("overwrite").parquet(outPath) } if (xgbArgs.saveDict) { XGBoostETL.saveDictTable(new Path(outPath, ".dict").toString) @@ -52,32 +52,32 @@ object ETLMain extends Mortgage { } } - private def checkAndGetPaths(paths: Seq[String]): (Seq[String], Seq[String], String) = { - val prefixes = Array("perf::", "acq::", "out::") + def checkAndGetPaths(paths: Seq[String]): (Seq[String], String, String) = { + val prefixes = Array("data::", "out::", "tmp::") val validPaths = paths.filter(_.nonEmpty).map(_.trim) // get and check perf data paths - val perfPaths = validPaths.filter(_.startsWith(prefixes.head)) - require(perfPaths.nonEmpty, s"$appName ETL requires at least one path for performance data file." + - s" Please specify it by '-dataPath=perf::your_perf_path'") - - // get and check acq data paths - val acqPaths = validPaths.filter(_.startsWith(prefixes(1))) - require(acqPaths.nonEmpty, s"$appName ETL requires at least one path for acquisition data file." + - s" Please specify it by '-dataPath=acq::your_acq_path'") + val dataPaths = validPaths.filter(_.startsWith(prefixes.head)) + require(dataPaths.nonEmpty, s"$appName ETL requires at least one path for data file." + + s" Please specify it by '-dataPath=data::your_data_path'") // get and check out path - val outPath = validPaths.filter(_.startsWith(prefixes(2))) + val outPath = validPaths.filter(_.startsWith(prefixes(1))) require(outPath.nonEmpty, s"$appName ETL requires a path to save the ETLed data file. Please specify it" + " by '-dataPath=out::your_out_path', only the first path is used if multiple paths are found.") + + // get and check tmp path + val tmpPath = validPaths.filter(_.startsWith(prefixes(2))) + require(tmpPath.nonEmpty, s"$appName ETL requires a path to save the temp parquet files. 
Please specify it" + + " by '-dataPath=tmp::your_tmp_path'.") // check data paths not specified type val unknownPaths = validPaths.filterNot(p => prefixes.exists(p.contains(_))) require(unknownPaths.isEmpty, s"Unknown type for data path: ${unknownPaths.head}, $appName requires to specify" + - " the type for each data path by adding the prefix 'perf::' or 'acq::' or 'out::'.") + " the type for each data path by adding the prefix 'data::', 'out::' or 'tmp::'.") - (perfPaths.map(_.stripPrefix(prefixes.head)), - acqPaths.map(_.stripPrefix(prefixes(1))), - outPath.head.stripPrefix(prefixes(2))) + (dataPaths.map(_.stripPrefix(prefixes.head)), + outPath.head.stripPrefix(prefixes(1)), + tmpPath.head.stripPrefix(prefixes(2))) } } diff --git a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Mortgage.scala b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Mortgage.scala index 582492006..c051cff07 100644 --- a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Mortgage.scala +++ b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Mortgage.scala @@ -16,7 +16,7 @@ package com.nvidia.spark.examples.mortgage -import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType} +import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType, DoubleType} private[mortgage] trait Mortgage { val appName = "Mortgage" @@ -37,7 +37,7 @@ private[mortgage] trait Mortgage { protected val numericCols = List( ("orig_interest_rate", FloatType), - ("orig_upb", IntegerType), + ("orig_upb", DoubleType), ("orig_loan_term", IntegerType), ("orig_ltv", FloatType), ("orig_cltv", FloatType), diff --git a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/XGBoostETL.scala b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/XGBoostETL.scala index 0ef25ea2e..7c21b9dbe 100644 --- a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/XGBoostETL.scala +++ b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/XGBoostETL.scala @@ -27,27 +27,64 @@ object GetQuarterFromCsvFileName { // So we strip off the .txt and everything after it // and then take everything after the last remaining _ def apply(): Column = substring_index( - substring_index(input_file_name(), ".", 1), "_", -1) + substring_index(input_file_name(), ".", 1), "/", -1) } private object CsvReader { - def readPerformance(spark: SparkSession, paths: Seq[String], optionsMap: Map[String, String]): DataFrame = { - val performanceSchema = StructType(Array( + def readRaw(spark: SparkSession, paths: Seq[String], optionsMap: Map[String, String]): DataFrame = { + + val rawSchema = StructType(Array( + StructField("reference_pool_id", StringType), StructField("loan_id", LongType), StructField("monthly_reporting_period", StringType), + StructField("orig_channel", StringType), + StructField("seller_name", StringType), StructField("servicer", StringType), + StructField("master_servicer", StringType), + StructField("orig_interest_rate", DoubleType), StructField("interest_rate", DoubleType), + StructField("orig_upb", DoubleType), + StructField("upb_at_issuance", StringType), StructField("current_actual_upb", DoubleType), + StructField("orig_loan_term", IntegerType), + StructField("orig_date", StringType), + StructField("first_pay_date", StringType), StructField("loan_age", DoubleType), 
StructField("remaining_months_to_legal_maturity", DoubleType), StructField("adj_remaining_months_to_maturity", DoubleType), StructField("maturity_date", StringType), + StructField("orig_ltv", DoubleType), + StructField("orig_cltv", DoubleType), + StructField("num_borrowers", DoubleType), + StructField("dti", DoubleType), + StructField("borrower_credit_score", DoubleType), + StructField("coborrow_credit_score", DoubleType), + StructField("first_home_buyer", StringType), + StructField("loan_purpose", StringType), + StructField("property_type", StringType), + StructField("num_units", IntegerType), + StructField("occupancy_status", StringType), + StructField("property_state", StringType), StructField("msa", DoubleType), + StructField("zip", IntegerType), + StructField("mortgage_insurance_percent", DoubleType), + StructField("product_type", StringType), + StructField("prepayment_penalty_indicator", StringType), + StructField("interest_only_loan_indicator", StringType), + StructField("interest_only_first_principal_and_interest_payment_date", StringType), + StructField("months_to_amortization", StringType), StructField("current_loan_delinquency_status", IntegerType), + StructField("loan_payment_history", StringType), StructField("mod_flag", StringType), + StructField("mortgage_insurance_cancellation_indicator", StringType), StructField("zero_balance_code", StringType), StructField("zero_balance_effective_date", StringType), + StructField("upb_at_the_time_of_removal", StringType), + StructField("repurchase_date", StringType), + StructField("scheduled_principal_current", StringType), + StructField("total_principal_current", StringType), + StructField("unscheduled_principal_current", StringType), StructField("last_paid_installment_date", StringType), StructField("foreclosed_after", StringType), StructField("disposition_date", StringType), @@ -62,59 +99,141 @@ private object CsvReader { StructField("other_foreclosure_proceeds", DoubleType), StructField("non_interest_bearing_upb", DoubleType), StructField("principal_forgiveness_upb", StringType), - StructField("repurchase_make_whole_proceeds_flag", StringType), + StructField("original_list_start_date", StringType), + StructField("original_list_price", StringType), + StructField("current_list_start_date", StringType), + StructField("current_list_price", StringType), + StructField("borrower_credit_score_at_issuance", StringType), + StructField("co-borrower_credit_score_at_issuance", StringType), + StructField("borrower_credit_score_current", StringType), + StructField("co-Borrower_credit_score_current", StringType), + StructField("mortgage_insurance_type", DoubleType), + StructField("servicing_activity_indicator", StringType), + StructField("current_period_modification_loss_amount", StringType), + StructField("cumulative_modification_loss_amount", StringType), + StructField("current_period_credit_event_net_gain_or_loss", StringType), + StructField("cumulative_credit_event_net_gain_or_loss", StringType), + StructField("homeready_program_indicator", StringType), StructField("foreclosure_principal_write_off_amount", StringType), - StructField("servicing_activity_indicator", StringType)) + StructField("relocation_mortgage_indicator", StringType), + StructField("zero_balance_code_change_date", StringType), + StructField("loan_holdback_indicator", StringType), + StructField("loan_holdback_effective_date", StringType), + StructField("delinquent_accrued_interest", StringType), + StructField("property_valuation_method", StringType), + 
StructField("high_balance_loan_indicator", StringType), + StructField("arm_initial_fixed-rate_period_lt_5_yr_indicator", StringType), + StructField("arm_product_type", StringType), + StructField("initial_fixed-rate_period", StringType), + StructField("interest_rate_adjustment_frequency", StringType), + StructField("next_interest_rate_adjustment_date", StringType), + StructField("next_payment_change_date", StringType), + StructField("index", StringType), + StructField("arm_cap_structure", StringType), + StructField("initial_interest_rate_cap_up_percent", StringType), + StructField("periodic_interest_rate_cap_up_percent", StringType), + StructField("lifetime_interest_rate_cap_up_percent", StringType), + StructField("mortgage_margin", StringType), + StructField("arm_balloon_indicator", StringType), + StructField("arm_plan_number", StringType), + StructField("borrower_assistance_plan", StringType), + StructField("hltv_refinance_option_indicator", StringType), + StructField("deal_name", StringType), + StructField("repurchase_make_whole_proceeds_flag", StringType), + StructField("alternative_delinquency_resolution", StringType), + StructField("alternative_delinquency_resolution_count", StringType), + StructField("total_deferral_amount", StringType) + ) ) spark.read .options(optionsMap) .option("nullValue", "") .option("delimiter", "|") - .option("parserLib", "univocity") - .schema(performanceSchema) + .schema(rawSchema) .csv(paths: _*) .withColumn("quarter", GetQuarterFromCsvFileName()) } +} - def readAcquisition(spark: SparkSession, paths: Seq[String], optionsMap: Map[String, String]): DataFrame = { - val acquisitionSchema = StructType(Array( - StructField("loan_id", LongType), - StructField("orig_channel", StringType), - StructField("seller_name", StringType), - StructField("orig_interest_rate", DoubleType), - StructField("orig_upb", IntegerType), - StructField("orig_loan_term", IntegerType), - StructField("orig_date", StringType), - StructField("first_pay_date", StringType), - StructField("orig_ltv", DoubleType), - StructField("orig_cltv", DoubleType), - StructField("num_borrowers", DoubleType), - StructField("dti", DoubleType), - StructField("borrower_credit_score", DoubleType), - StructField("first_home_buyer", StringType), - StructField("loan_purpose", StringType), - StructField("property_type", StringType), - StructField("num_units", IntegerType), - StructField("occupancy_status", StringType), - StructField("property_state", StringType), - StructField("zip", IntegerType), - StructField("mortgage_insurance_percent", DoubleType), - StructField("product_type", StringType), - StructField("coborrow_credit_score", DoubleType), - StructField("mortgage_insurance_type", DoubleType), - StructField("relocation_mortgage_indicator", StringType)) +object extractPerfColumns{ + def apply(rawDf : DataFrame) : DataFrame = { + val perfDf = rawDf.select( + col("loan_id"), + date_format(to_date(col("monthly_reporting_period"),"MMyyyy"), "MM/dd/yyyy").as("monthly_reporting_period"), + upper(col("servicer")).as("servicer"), + col("interest_rate"), + col("current_actual_upb"), + col("loan_age"), + col("remaining_months_to_legal_maturity"), + col("adj_remaining_months_to_maturity"), + date_format(to_date(col("maturity_date"),"MMyyyy"), "MM/yyyy").as("maturity_date"), + col("msa"), + col("current_loan_delinquency_status"), + col("mod_flag"), + col("zero_balance_code"), + date_format(to_date(col("zero_balance_effective_date"),"MMyyyy"), "MM/yyyy").as("zero_balance_effective_date"), + 
date_format(to_date(col("last_paid_installment_date"),"MMyyyy"), "MM/dd/yyyy").as("last_paid_installment_date"), + date_format(to_date(col("foreclosed_after"),"MMyyyy"), "MM/dd/yyyy").as("foreclosed_after"), + date_format(to_date(col("disposition_date"),"MMyyyy"), "MM/dd/yyyy").as("disposition_date"), + col("foreclosure_costs"), + col("prop_preservation_and_repair_costs"), + col("asset_recovery_costs"), + col("misc_holding_expenses"), + col("holding_taxes"), + col("net_sale_proceeds"), + col("credit_enhancement_proceeds"), + col("repurchase_make_whole_proceeds"), + col("other_foreclosure_proceeds"), + col("non_interest_bearing_upb"), + col("principal_forgiveness_upb"), + col("repurchase_make_whole_proceeds_flag"), + col("foreclosure_principal_write_off_amount"), + col("servicing_activity_indicator"), + col("quarter") ) - spark.read - .options(optionsMap) - .option("delimiter", "|") - .schema(acquisitionSchema) - .csv(paths: _*) - .withColumn("quarter", GetQuarterFromCsvFileName()) + perfDf.select("*").filter("current_actual_upb != 0.0") } } +object extractAcqColumns{ + def apply(rawDf : DataFrame) : DataFrame = { + val acqDf = rawDf.select( + col("loan_id"), + col("orig_channel"), + upper(col("seller_name")).as("seller_name"), + col("orig_interest_rate"), + col("orig_upb"), + col("orig_loan_term"), + date_format(to_date(col("orig_date"),"MMyyyy"), "MM/yyyy").as("orig_date"), + date_format(to_date(col("first_pay_date"),"MMyyyy"), "MM/yyyy").as("first_pay_date"), + col("orig_ltv"), + col("orig_cltv"), + col("num_borrowers"), + col("dti"), + col("borrower_credit_score"), + col("first_home_buyer"), + col("loan_purpose"), + col("property_type"), + col("num_units"), + col("occupancy_status"), + col("property_state"), + col("zip"), + col("mortgage_insurance_percent"), + col("product_type"), + col("coborrow_credit_score"), + col("mortgage_insurance_type"), + col("relocation_mortgage_indicator"), + col("quarter"), + dense_rank().over(Window.partitionBy("loan_id").orderBy(to_date(col("monthly_reporting_period"),"MMyyyy"))).as("rank") + ) + + acqDf.select("*").filter(col("rank") === 1).drop("rank") + } + +} + object NameMapping { /** * Returns a dataframe with two columns named based off of the column names passed in. 
@@ -414,28 +533,43 @@ object XGBoostETL extends Mortgage {
     }
   }
-  def csv(spark: SparkSession, perfPaths: Seq[String], acqPaths: Seq[String], hasHeader: Boolean): DataFrame = {
+  def csv(spark: SparkSession, dataPaths: Seq[String], tmpPath: String, hasHeader: Boolean): DataFrame = {
     val optionsMap = Map("header" -> hasHeader.toString)
+    val rawDf_csv = CsvReader.readRaw(spark, dataPaths, optionsMap)
+
+    rawDf_csv.write.mode("overwrite").parquet(tmpPath)
+    val rawDf = spark.read.parquet(tmpPath)
+
+    val perfDf = extractPerfColumns(rawDf)
+    val acqDf = extractAcqColumns(rawDf)
     transform(
-      CsvReader.readPerformance(spark, perfPaths, optionsMap),
-      CsvReader.readAcquisition(spark, acqPaths, optionsMap),
+      perfDf,
+      acqDf,
       spark
     )
   }
-  def parquet(spark: SparkSession, perfPaths: Seq[String], acqPaths: Seq[String]): DataFrame = {
+  def parquet(spark: SparkSession, dataPaths: Seq[String]): DataFrame = {
+    val rawDf = spark.read.parquet(dataPaths: _*)
+    val perfDf = extractPerfColumns(rawDf)
+    val acqDf = extractAcqColumns(rawDf)
     transform(
-      spark.read.parquet(perfPaths: _*),
-      spark.read.parquet(acqPaths: _*),
+      perfDf,
+      acqDf,
       spark
     )
   }
-  def orc(spark: SparkSession, perfPaths: Seq[String], acqPaths: Seq[String]): DataFrame = {
+  def orc(spark: SparkSession, dataPaths: Seq[String]): DataFrame = {
+    val rawDf = spark.read.orc(dataPaths: _*)
+    val perfDf = extractPerfColumns(rawDf)
+    val acqDf = extractAcqColumns(rawDf)
     transform(
-      spark.read.orc(perfPaths: _*),
-      spark.read.orc(acqPaths: _*),
+      perfDf,
+      acqDf,
       spark
     )
   }
+
+
 }
diff --git a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb
index 7abeac750..171f47f4c 100644
--- a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb
+++ b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb
@@ -19,14 +19,14 @@
     "All data could be found at https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page\n",
     "\n",
     "### 2. Download needed jars\n",
-    "* [rapids-4-spark_2.12-22.06.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0.jar)\n",
+    "* [rapids-4-spark_2.12-22.08.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar)\n",
     "\n",
     "### 3. Start Spark Standalone\n",
     "Before running the script, please setup Spark standalone mode\n",
     "\n",
     "### 4. Add ENV\n",
     "```\n",
-    "$ export SPARK_JARS=rapids-4-spark_2.12-22.06.0.jar\n",
+    "$ export SPARK_JARS=rapids-4-spark_2.12-22.08.0.jar\n",
     "$ export PYSPARK_DRIVER_PYTHON=jupyter \n",
     "$ export PYSPARK_DRIVER_PYTHON_OPTS=notebook\n",
     "```\n",
diff --git a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb
index 6a849b5b1..0f14cdc65 100644
--- a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb
+++ b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb
@@ -19,18 +19,18 @@
     "All data could be found at https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page\n",
     "\n",
     "### 2. Download needed jar\n",
-    "* [rapids-4-spark_2.12-22.06.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0.jar)\n",
+    "* [rapids-4-spark_2.12-22.08.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar)\n",
     "\n",
     "### 3. Start Spark Standalone\n",
     "Before running the script, please setup Spark standalone mode\n",
     "\n",
     "### 4. Add ENV\n",
     "```\n",
-    "$ export SPARK_JARS=rapids-4-spark_2.12-22.06.0.jar\n",
+    "$ export SPARK_JARS=rapids-4-spark_2.12-22.08.0.jar\n",
     "\n",
     "```\n",
     "\n",
-    "### 5.Start Jupyter Notebook with spylon-kernal or toree\n",
+    "### 5.Start Jupyter Notebook with spylon-kernel or toree\n",
     "\n",
     "```\n",
     "$ jupyter notebook --allow-root --notebook-dir=${your-dir} --config=${your-configs}\n",
@@ -563,9 +563,9 @@
    ],
    "metadata": {
     "kernelspec": {
-     "display_name": "XGBoost4j-Spark-2206 - Scala",
+     "display_name": "XGBoost4j-Spark - Scala",
      "language": "scala",
-     "name": "xgboost4j-spark-2206_scala"
+     "name": "XGBoost4j-Spark_scala"
     },
     "language_info": {
      "codemirror_mode": "text/x-scala",
@@ -578,4 +578,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-gpu.ipynb b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-gpu.ipynb
index b59f74473..58dd84eb0 100644
--- a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-gpu.ipynb
+++ b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-gpu.ipynb
@@ -594,9 +594,9 @@
    ],
    "metadata": {
     "kernelspec": {
-     "display_name": "XGBoost4j-Spark-2206 - Scala",
+     "display_name": "XGBoost4j-Spark - Scala",
      "language": "scala",
-     "name": "xgboost4j-spark-2206_scala"
+     "name": "XGBoost4j-Spark_scala"
     },
     "language_info": {
      "codemirror_mode": "text/x-scala",
diff --git a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi_gpu_crossvalidation.ipynb b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi_gpu_crossvalidation.ipynb
index b8e524e4d..b5d1da710 100644
--- a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi_gpu_crossvalidation.ipynb
+++ b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi_gpu_crossvalidation.ipynb
@@ -490,9 +490,9 @@
    ],
    "metadata": {
     "kernelspec": {
-     "display_name": "XGBoost4j-Spark-2206 - Scala",
+     "display_name": "XGBoost4j-Spark - Scala",
      "language": "scala",
-     "name": "xgboost4j-spark-2206_scala"
+     "name": "XGBoost4j-Spark_scala"
     },
     "language_info": {
      "codemirror_mode": "text/x-scala",
diff --git a/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala b/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala
index 75faaaa5a..d8cca3fcd 100644
--- a/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala
+++ b/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala
@@ -97,10 +97,10 @@ object XGBoostArgs {
     println(" -saveDict=value: Boolean\n" +
       " Whether to save the dictionary table for Mortgage ETL. It is saved under '/.dict'. Default is true.\n")
     println(" -rabitTrackerHost=value: String\n" +
-      " Specify rabit tracker host IP address. In some environments XGBoost might fail to resolve
-      the IP address of the rabit tracker, a symptom is user receiving ``OSError: [Errno 99]
-      Cannot assign requested address`` error during training. A quick workaround is to
-      specify the address explicitly.\n")
+      " Specify rabit tracker host IP address. In some environments XGBoost might fail to resolve\n" +
+      "the IP address of the rabit tracker, a symptom is user receiving ``OSError: [Errno 99]\n" +
+      "Cannot assign requested address`` error during training. A quick workaround is to\n" +
+      "specify the address explicitly.\n")
     println("For XGBoost arguments:")
     println(" Now we pass all XGBoost parameters transparently to XGBoost, no longer to verify them.")
     println(" Both of the formats are supported, such as 'numWorkers'. You can pass as either one below:")
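Taken together, the XGBoostETL changes earlier in this diff collapse the old perfPaths/acqPaths pairs into a single dataPaths argument, plus a tmpPath staging directory for the CSV case. A hedged sketch of what call sites look like after this change; the SparkSession and all paths below are illustrative placeholders:

```
// Sketch only: paths are placeholders; the csv/parquet/orc signatures follow the XGBoostETL hunk above.
val fromCsv = XGBoostETL.csv(
  spark,
  Seq("/data/mortgage/csv"),       // single set of raw input paths, no perf/acq split
  "/tmp/mortgage_raw_parquet",     // staging dir: raw CSV is written here as Parquet, then re-read
  hasHeader = false)

// Parquet and ORC inputs take the same single-path form.
val fromParquet = XGBoostETL.parquet(spark, Seq("/data/mortgage/parquet"))
val fromOrc     = XGBoostETL.orc(spark, Seq("/data/mortgage/orc"))
```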