From a785691d7ba2503be52b738258539d9a27faad42 Mon Sep 17 00:00:00 2001 From: ifigeneia1989 Date: Fri, 6 Dec 2019 12:11:21 -0500 Subject: [PATCH 1/4] typo correction --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d736d11..c9dad15 100755 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ In this project, there are 5 different supervised classifer models designed for 1. In CSDW, download the project using the git url for [here](https://github.com/BrooksIan/ChurnBabyChurn.git) 2. Open a new session, and execute the setup.sh file 3. In Experiments, run the following scripts - * dsforteko_pyspark.py - vanilla random forest churn model + * dsfortelco_pyspark.py - vanilla random forest churn model * gbt_churn_pyspark.py - gradient boost tree churn model with normamlized variables, hyperturning, and crossvalidation * mlp_churn_pyspark.py - multilayer perceptron churn model with normamlized variables, hyperturning, and crossvalidatio * rf_churn_pyspark.py - random forest churn model with normamlized variables, hyperturning, and crossvalidation From cbf6a8d207b6be4e58a844735be5d5b861c7a7d6 Mon Sep 17 00:00:00 2001 From: ifigeneia1989 Date: Fri, 6 Dec 2019 12:12:01 -0500 Subject: [PATCH 2/4] typo fix --- dsfortelco_pyspark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsfortelco_pyspark.py b/dsfortelco_pyspark.py index ca8f09b..eb16194 100755 --- a/dsfortelco_pyspark.py +++ b/dsfortelco_pyspark.py @@ -112,7 +112,7 @@ !rm -r -f models/spark_rf_vanilla.tar !hdfs dfs -get models/spark !hdfs dfs -get models/ -!tar -cvf models/spark_rf._vanilla.tar models/spark/vanilla +!tar -cvf models/spark_rf_vanilla.tar models/spark/vanilla cdsw.track_file("models/spark_rf_vanilla.tar") From adbd1d033a5333648c9edaf167f43f4b739d8e81 Mon Sep 17 00:00:00 2001 From: Ian Brooks Date: Tue, 22 Sep 2020 21:56:34 -0400 Subject: [PATCH 3/4] Rapid Updates --- RapidSetup.sh | 12 ++++++++++++ spark-defaults.conf | 32 +++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 7 deletions(-) create mode 100644 RapidSetup.sh mode change 100755 => 100644 spark-defaults.conf diff --git a/RapidSetup.sh b/RapidSetup.sh new file mode 100644 index 0000000..47536f7 --- /dev/null +++ b/RapidSetup.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +#Set Up Paths for Rapids Jars + +export SPARK_RAPIDS_DIR=/opt/sparkRapidsPlugin +export SPARK_CUDF_JAR=${SPARK_RAPIDS_DIR}/cudf-0.15-SNAPSHOT-cuda10-1.jar +export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_DIR}/rapids-4-spark_2.12-0.2.0-SNAPSHOT.jar + +chmod 775 /opt/sparkRapidsPlugin/*.jar + +#/opt/sparkRapidsPlugin/cudf-0.15-SNAPSHOT-cuda10-1.jar +#/opt/sparkRapidsPlugin/rapids-4-spark_2.12-0.2.0-SNAPSHOT.jar \ No newline at end of file diff --git a/spark-defaults.conf b/spark-defaults.conf old mode 100755 new mode 100644 index 5c06c87..0241fba --- a/spark-defaults.conf +++ b/spark-defaults.conf @@ -1,7 +1,25 @@ -spark.dynamicAllocation.enabled True -spark.dynamicAllocation.maxExecutors 3 -spark.executor.memory 3584m -spark.executor.cores 1 -spark.yarn.executor.memoryOverhead 512m -spark.app.name ds-for-telco -spark.lineage.enabled false +spark.master=k8s://https://172.20.0.1:443 +spark.rapids.sql.concurrentGpuTasks=1 +spark.rapids.sql.format.csv.read.enabled=false +spark.rapids.sql.enabled=false +spark.executor.memory=4G +spark.executor.cores=4 +spark.task.cpus=1 +spark.task.resource.gpu.amount=0.25 +spark.executor.resource.gpu.amount=1 +spark.executor.memoryOverhead=4G +spark.rapids.memory.pinnedPool.size=2G +spark.locality.wait=0s +spark.sql.files.maxPartitionBytes=512m +spark.sql.shuffle.partitions=10 +spark.plugins=com.nvidia.spark.SQLPlugin +spark.executor.resource.gpu.discoveryScript=/opt/sparkRapidsPlugin/getGpusResources.sh +spark.executor.resource.gpu.vendor=nvidia.com + +#spark.dynamicAllocation.enabled True +#spark.dynamicAllocation.maxExecutors 3 +#spark.executor.memory 3584m +#spark.executor.cores 1 +#spark.yarn.executor.memoryOverhead 512m +#spark.app.name ds-for-telco +#spark.lineage.enabled false \ No newline at end of file From 0fe4de5e7370030136b8cf70cc353adaf690889f Mon Sep 17 00:00:00 2001 From: Ian Brooks Date: Wed, 23 Sep 2020 13:33:35 -0400 Subject: [PATCH 4/4] Rapids Updates --- spark-defaults.conf | 32 +++++++------------------------- spark-rapids.conf | 17 +++++++++++++++++ 2 files changed, 24 insertions(+), 25 deletions(-) create mode 100644 spark-rapids.conf diff --git a/spark-defaults.conf b/spark-defaults.conf index 0241fba..ec824c9 100644 --- a/spark-defaults.conf +++ b/spark-defaults.conf @@ -1,25 +1,7 @@ -spark.master=k8s://https://172.20.0.1:443 -spark.rapids.sql.concurrentGpuTasks=1 -spark.rapids.sql.format.csv.read.enabled=false -spark.rapids.sql.enabled=false -spark.executor.memory=4G -spark.executor.cores=4 -spark.task.cpus=1 -spark.task.resource.gpu.amount=0.25 -spark.executor.resource.gpu.amount=1 -spark.executor.memoryOverhead=4G -spark.rapids.memory.pinnedPool.size=2G -spark.locality.wait=0s -spark.sql.files.maxPartitionBytes=512m -spark.sql.shuffle.partitions=10 -spark.plugins=com.nvidia.spark.SQLPlugin -spark.executor.resource.gpu.discoveryScript=/opt/sparkRapidsPlugin/getGpusResources.sh -spark.executor.resource.gpu.vendor=nvidia.com - -#spark.dynamicAllocation.enabled True -#spark.dynamicAllocation.maxExecutors 3 -#spark.executor.memory 3584m -#spark.executor.cores 1 -#spark.yarn.executor.memoryOverhead 512m -#spark.app.name ds-for-telco -#spark.lineage.enabled false \ No newline at end of file +spark.dynamicAllocation.enabled True +spark.dynamicAllocation.maxExecutors 3 +spark.executor.memory 3584m +spark.executor.cores 1 +spark.yarn.executor.memoryOverhead 512m +spark.app.name ds-for-telco +spark.lineage.enabled false \ No newline at end of file diff --git a/spark-rapids.conf b/spark-rapids.conf new file mode 100644 index 0000000..6e5ab96 --- /dev/null +++ b/spark-rapids.conf @@ -0,0 +1,17 @@ +spark.master=k8s://https://172.20.0.1:443 +spark.rapids.sql.concurrentGpuTasks=1 +spark.rapids.sql.format.csv.read.enabled=false +spark.rapids.sql.enabled=false +spark.executor.memory=4G +spark.executor.cores=4 +spark.task.cpus=1 +spark.task.resource.gpu.amount=0.25 +spark.executor.resource.gpu.amount=1 +spark.executor.memoryOverhead=4G +spark.rapids.memory.pinnedPool.size=2G +spark.locality.wait=0s +spark.sql.files.maxPartitionBytes=512m +spark.sql.shuffle.partitions=10 +spark.plugins=com.nvidia.spark.SQLPlugin +spark.executor.resource.gpu.discoveryScript=/opt/sparkRapidsPlugin/getGpusResources.sh +spark.executor.resource.gpu.vendor=nvidia.com \ No newline at end of file