update docs for xgboost1.7.1 and add python notebooks #252

Merged
merged 15 commits into from
Dec 19, 2022
Changes from 9 commits
4 changes: 2 additions & 2 deletions docs/get-started/xgboost-examples/csp/aws/ec2.md
@@ -177,8 +177,8 @@ spark-submit --master spark://$HOSTNAME:7077 \
${SAMPLE_JAR} \
-num_workers=${NUM_EXECUTORS} \
-format=csv \
-dataPath="train::s3a://spark-xgboost-mortgage-dataset/csv/train/2000Q1" \
-dataPath="trans::s3a://spark-xgboost-mortgage-dataset/csv/eval/2000Q1" \
-dataPath="train::your-train-data-path" \
-dataPath="trans::your-eval-data-path" \
-numRound=100 -max_depth=8 -nthread=$NUM_EXECUTOR_CORES -showFeatures=0 \
-tree_method=gpu_hist
```
@@ -12,11 +12,13 @@ Prerequisites
* Multi-node clusters with homogenous GPU configuration
* Software Requirements
* Ubuntu 18.04, 20.04/CentOS7, CentOS8
* CUDA 11.0+
* CUDA 11.5+
* NVIDIA driver compatible with your CUDA
* NCCL 2.7.8+
* Python 3.6+
* Python 3.8 or 3.9
* NumPy
* XGBoost 1.7.0+
* cudf-cu11

The number of GPUs in each host dictates the number of Spark executors that can run there.
Additionally, cores per Spark executor and cores per Spark task must match, such that each executor can run 1 task at any given time.
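
For illustration, a minimal sketch of the matching settings (assuming, say, 4-core executors with one GPU each; the numbers are placeholders, not part of the original docs):

```
spark.executor.cores 4
spark.task.cpus 4
spark.executor.resource.gpu.amount 1
spark.task.resource.gpu.amount 1
```

With these values each executor holds 4 cores and 1 GPU, and every task requests the same amounts, so only one task runs per executor at a time.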
@@ -47,6 +49,13 @@ And here are the steps to enable the GPU resources discovery for Spark 3.1+.
spark.worker.resource.gpu.amount 1
spark.worker.resource.gpu.discoveryScript ${SPARK_HOME}/examples/src/main/scripts/getGpusResources.sh
```
3. Install the XGBoost, cudf-cu11 and numpy libraries on all nodes before running the XGBoost application.

``` bash
pip install xgboost==1.7.0
pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
pip install numpy
```

Collaborator: do we still install numpy?

Collaborator Author: yes, XGBoost depends on numpy.
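
As an optional sanity check (not part of the original steps), you can confirm on each node that the three libraries import cleanly:

``` bash
# quick import check for the freshly installed packages
python -c "import xgboost, cudf, numpy; print(xgboost.__version__, cudf.__version__, numpy.__version__)"
```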

Get Application Files, Jar and Dataset
-------------------------------
@@ -182,6 +191,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.gpu_main

# tree construction algorithm
export TREE_METHOD=gpu_hist

# if you enable the archive python environment
export PYSPARK_DRIVER_PYTHON=python
export PYSPARK_PYTHON=./environment/bin/python
```

Run spark-submit:
@@ -197,8 +210,9 @@ ${SPARK_HOME}/bin/spark-submit
--driver-memory ${SPARK_DRIVER_MEMORY} \
--executor-memory ${SPARK_EXECUTOR_MEMORY} \
--conf spark.cores.max=${TOTAL_CORES} \
--jars ${RAPIDS_JAR},${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR} \
--py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
--archives your_pyspark_venv.tar.gz#environment #if you enabled archive python environment \
--jars ${RAPIDS_JAR} \
--py-files ${SAMPLE_ZIP} \
${MAIN_PY} \
--mainClass=${EXAMPLE_CLASS} \
--dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \
@@ -261,6 +275,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.cpu_main

# tree construction algorithm
export TREE_METHOD=hist

# if you enable the archive python environment
export PYSPARK_DRIVER_PYTHON=python
export PYSPARK_PYTHON=./environment/bin/python
```

This is the same command as for the GPU example, repeated for convenience:
@@ -271,8 +289,9 @@ ${SPARK_HOME}/bin/spark-submit
--driver-memory ${SPARK_DRIVER_MEMORY} \
--executor-memory ${SPARK_EXECUTOR_MEMORY} \
--conf spark.cores.max=${TOTAL_CORES} \
--jars ${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR} \
--py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
--archives your_pyspark_venv.tar.gz#environment #if you enabled archive python environment \
--jars ${RAPIDS_JAR} \
--py-files ${SAMPLE_ZIP} \
${SPARK_PYTHON_ENTRYPOINT} \
--mainClass=${EXAMPLE_CLASS} \
--dataPath=train::${DATA_PATH}/mortgage/output/train/ \
50 changes: 43 additions & 7 deletions docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md
@@ -12,12 +12,14 @@ Prerequisites
* Multi-node clusters with homogenous GPU configuration
* Software Requirements
* Ubuntu 18.04, 20.04/CentOS7, CentOS8
* CUDA 11.0+
* CUDA 11.5+
* NVIDIA driver compatible with your CUDA
* NCCL 2.7.8+
* Python 3.6+
* Python 3.8 or 3.9
* NumPy

* XGBoost 1.7.0+
* cudf-cu11

The number of GPUs per NodeManager dictates the number of Spark executors that can run in that NodeManager.
Additionally, cores per Spark executor and cores per Spark task must match, such that each executor can run 1 task at any given time.
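
As a sketch of what that means in practice (assuming 8 cores and 1 GPU per executor; adjust the values to your NodeManager hosts, and note the application name below is only a placeholder):

``` bash
# one running task per executor, each task getting the executor's single GPU
export SPARK_EXECUTOR_CORES=8

${SPARK_HOME}/bin/spark-submit \
  --master yarn \
  --executor-cores ${SPARK_EXECUTOR_CORES} \
  --conf spark.task.cpus=${SPARK_EXECUTOR_CORES} \
  --conf spark.executor.resource.gpu.amount=1 \
  --conf spark.task.resource.gpu.amount=1 \
  your_app.py   # placeholder application
```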

@@ -32,6 +34,30 @@ We use `SPARK_HOME` environment variable to point to the Apache Spark cluster.
And as to how to enable GPU scheduling and isolation for Yarn,
please refer to [here](https://hadoop.apache.org/docs/r3.1.0/hadoop-yarn/hadoop-yarn-site/UsingGpus.html).

Please make sure to install the XGBoost, cudf-cu11 and numpy libraries on all nodes before running the XGBoost application.
``` bash
pip install xgboost==1.7.0
pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
pip install numpy
```

Collaborator: same as the previous comment.
You can also create an isolated python environment by using [Virtualenv](https://virtualenv.pypa.io/en/latest/),
and then pass/unpack the archive file and enable the environment on the executors
by leveraging the --archives option or the spark.archives configuration.
``` bash
# create an isolated python environment and install libraries
python -m venv pyspark_venv
source pyspark_venv/bin/activate
pip install xgboost==1.7.0
pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
pip install numpy
pip install venv-pack  # venv-pack is required to package the environment below
venv-pack -o pyspark_venv.tar.gz

# enable archive python environment on executors
export PYSPARK_DRIVER_PYTHON=python # Do not set in cluster modes.
export PYSPARK_PYTHON=./environment/bin/python
spark-submit --archives pyspark_venv.tar.gz#environment app.py
```
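
The spark.archives configuration mentioned above is an equivalent way to ship the packed environment; a minimal sketch (Spark 3.1+, reusing the archive and app names from the example above):

``` bash
# same effect as --archives, expressed as configuration
spark-submit \
  --conf spark.archives=pyspark_venv.tar.gz#environment \
  --conf spark.pyspark.python=./environment/bin/python \
  app.py
```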

Get Application Files, Jar and Dataset
-------------------------------

@@ -114,6 +140,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.gpu_main

# tree construction algorithm
export TREE_METHOD=gpu_hist

# if you enable the archive python environment
export PYSPARK_DRIVER_PYTHON=python
export PYSPARK_PYTHON=./environment/bin/python
```

Run spark-submit:
@@ -129,11 +159,12 @@ ${SPARK_HOME}/bin/spark-submit
--files ${SPARK_HOME}/examples/src/main/scripts/getGpusResources.sh \
--master yarn \
--deploy-mode ${SPARK_DEPLOY_MODE} \
--archives your_pyspark_venv.tar.gz#environment #if you enabled archive python environment \
--num-executors ${SPARK_NUM_EXECUTORS} \
--driver-memory ${SPARK_DRIVER_MEMORY} \
--executor-memory ${SPARK_EXECUTOR_MEMORY} \
--jars ${RAPIDS_JAR},${XGBOOST4J_JAR} \
--py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
--jars ${RAPIDS_JAR} \
--py-files ${SAMPLE_ZIP} \
${MAIN_PY} \
--mainClass=${EXAMPLE_CLASS} \
--dataPath=train::${DATA_PATH}/mortgage/out/train/ \
@@ -190,19 +221,24 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.cpu_main

# tree construction algorithm
export TREE_METHOD=hist

# if you enable the archive python environment
export PYSPARK_DRIVER_PYTHON=python
export PYSPARK_PYTHON=./environment/bin/python
```

This is the same command as for the GPU example, repeated for convenience:

``` bash
${SPARK_HOME}/bin/spark-submit \
--master yarn \
--archives your_pyspark_venv.tar.gz#environment #if you enabled archive python environment \
--deploy-mode ${SPARK_DEPLOY_MODE} \
--num-executors ${SPARK_NUM_EXECUTORS} \
--driver-memory ${SPARK_DRIVER_MEMORY} \
--executor-memory ${SPARK_EXECUTOR_MEMORY} \
--jars ${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR} \
--py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
--jars ${RAPIDS_JAR} \
--py-files ${SAMPLE_ZIP} \
${MAIN_PY} \
--mainClass=${EXAMPLE_CLASS} \
--dataPath=train::${DATA_PATH}/mortgage/output/train/ \