diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml
index f874c0ed34a..dedf2416987 100644
--- a/.github/workflows/auto-merge.yml
+++ b/.github/workflows/auto-merge.yml
@@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE
on:
pull_request_target:
branches:
- - branch-23.02
+ - branch-23.04
types: [closed]
jobs:
@@ -29,13 +29,13 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- ref: branch-23.02 # force to fetch from latest upstream instead of PR ref
+ ref: branch-23.04 # force to fetch from latest upstream instead of PR ref
- name: auto-merge job
uses: ./.github/workflows/auto-merge
env:
OWNER: NVIDIA
REPO_NAME: spark-rapids
- HEAD: branch-23.02
- BASE: branch-23.04
+ HEAD: branch-23.04
+ BASE: branch-23.06
AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR
diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
index fbba835e95b..e9bd75607dd 100644
--- a/.github/workflows/blossom-ci.yml
+++ b/.github/workflows/blossom-ci.yml
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -96,10 +96,10 @@ jobs:
java-version: 8
# add blackduck properties https://synopsys.atlassian.net/wiki/spaces/INTDOCS/pages/631308372/Methods+for+Configuring+Analysis#Using-a-configuration-file
+ # currently hardcode projects here to avoid intermittent mvn scan failures
- name: Setup blackduck properties
run: |
- PROJECTS=$(mvn -am dependency:tree | grep maven-dependency-plugin | awk '{ out="com.nvidia:"$(NF-1);print out }' | grep rapids | xargs | sed -e 's/ /,/g')
- echo detect.maven.build.command="-pl=$PROJECTS -am" >> application.properties
+ echo detect.maven.build.command="-pl=com.nvidia:rapids-4-spark-parent,com.nvidia:rapids-4-spark-sql_2.12 -am" >> application.properties
echo detect.maven.included.scopes=compile >> application.properties
- name: Run blossom action
diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml
index e781ed75758..3d034eeaa3c 100644
--- a/.github/workflows/mvn-verify-check.yml
+++ b/.github/workflows/mvn-verify-check.yml
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -46,7 +46,10 @@ jobs:
. jenkins/version-def.sh
svArrBodyNoSnapshot=$(printf ",{\"spark-version\":\"%s\",\"isSnapshot\":false}" "${SPARK_SHIM_VERSIONS_NOSNAPSHOTS_TAIL[@]}")
svArrBodyNoSnapshot=${svArrBodyNoSnapshot:1}
- svArrBodySnapshot=$(printf ",{\"spark-version\":\"%s\",\"isSnapshot\":true}" "${SPARK_SHIM_VERSIONS_SNAPSHOTS_ONLY[@]}")
+ # do not add empty snapshot versions
+ if [ ${#SPARK_SHIM_VERSIONS_SNAPSHOTS_ONLY[@]} -gt 0 ]; then
+ svArrBodySnapshot=$(printf ",{\"spark-version\":\"%s\",\"isSnapshot\":true}" "${SPARK_SHIM_VERSIONS_SNAPSHOTS_ONLY[@]}")
+ fi
# add snapshot versions which are not in snapshot property in pom file
svArrBodySnapshot+=$(printf ",{\"spark-version\":\"%s\",\"isSnapshot\":true}" 340)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 933ae8f52f9..0a328c47de7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,11 +1,223 @@
# Change log
-Generated on 2023-02-14
+Generated on 2023-04-18
+
+## Release 23.04
+
+### Features
+|||
+|:---|:---|
+|[#7985](https://github.com/NVIDIA/spark-rapids/issues/7985)|[FEA] Expose Alluxio master URL to support K8s Env|
+|[#7880](https://github.com/NVIDIA/spark-rapids/issues/7880)|[FEA] retry framework task level metrics|
+|[#7394](https://github.com/NVIDIA/spark-rapids/issues/7394)|[FEA] Support Delta Lake auto compaction|
+|[#7463](https://github.com/NVIDIA/spark-rapids/issues/7463)|[FEA] Drop support for Databricks-9.1 ML LTS|
+|[#7253](https://github.com/NVIDIA/spark-rapids/issues/7253)|[FEA] Implement OOM retry framework|
+|[#7042](https://github.com/NVIDIA/spark-rapids/issues/7042)|[FEA] Add support in the tools event parsing for ML functions, libraries, and expressions|
+
+### Performance
+|||
+|:---|:---|
+|[#7907](https://github.com/NVIDIA/spark-rapids/issues/7907)|[FEA] Optimize regexp_replace in multi-replace scenarios|
+|[#7691](https://github.com/NVIDIA/spark-rapids/issues/7691)|[FEA] Upgrade and document UCX 1.14|
+|[#6516](https://github.com/NVIDIA/spark-rapids/issues/6516)|[FEA] Enable RAPIDS Shuffle Manager smoke testing for the databricks environment|
+|[#7695](https://github.com/NVIDIA/spark-rapids/issues/7695)|[FEA] Transpile regexp_extract expression to only have the single capture group that is needed|
+|[#7393](https://github.com/NVIDIA/spark-rapids/issues/7393)|[FEA] Support Delta Lake optimized write|
+|[#6561](https://github.com/NVIDIA/spark-rapids/issues/6561)|[FEA] Make SpillableColumnarBatch inform Spill code of actual usage of the batch|
+|[#6864](https://github.com/NVIDIA/spark-rapids/issues/6864)|[BUG] Spilling logic can spill data that cannot be freed|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#8111](https://github.com/NVIDIA/spark-rapids/issues/8111)|[BUG] test_delta_delete_entire_table failed in databricks 10.4 runtime|
+|[#8074](https://github.com/NVIDIA/spark-rapids/issues/8074)|[BUG] test_parquet_read_nano_as_longs_31x failed on Dataproc|
+|[#7997](https://github.com/NVIDIA/spark-rapids/issues/7997)|[BUG] executors died with too much off heap in yarn UCX CI `udf_test`|
+|[#8067](https://github.com/NVIDIA/spark-rapids/issues/8067)|[BUG] extras jar sometimes fails to load|
+|[#8038](https://github.com/NVIDIA/spark-rapids/issues/8038)|[BUG] vector leaked when running NDS 3TB with memory restricted|
+|[#8030](https://github.com/NVIDIA/spark-rapids/issues/8030)|[BUG] test_re_replace_no_unicode_fallback test fails on integration tests Yarn|
+|[#7971](https://github.com/NVIDIA/spark-rapids/issues/7971)|[BUG] withRestoreOnRetry should look at Throwable causes in addition to retry OOMs|
+|[#6990](https://github.com/NVIDIA/spark-rapids/issues/6990)|[BUG] Several integration test failures in Spark-3.4 SNAPSHOT build|
+|[#7924](https://github.com/NVIDIA/spark-rapids/issues/7924)|[BUG] Physical plan for regexp_extract does not escape newlines|
+|[#7341](https://github.com/NVIDIA/spark-rapids/issues/7341)|[BUG] Leverage OOM retry framework for ORC writes|
+|[#7921](https://github.com/NVIDIA/spark-rapids/issues/7921)|[BUG] ORC writes with bloom filters enabled do not fall back to the CPU|
+|[#7818](https://github.com/NVIDIA/spark-rapids/issues/7818)|[BUG] Reuse of broadcast exchange can lead to unnecessary CPU fallback|
+|[#7904](https://github.com/NVIDIA/spark-rapids/issues/7904)|[BUG] test_write_sql_save_table sporadically fails on Pascal|
+|[#7922](https://github.com/NVIDIA/spark-rapids/issues/7922)|[BUG] YARN IT test test_optimized_hive_ctas_basic failures|
+|[#7933](https://github.com/NVIDIA/spark-rapids/issues/7933)|[BUG] NDS running hits DPP error on Databricks 10.4 when enable Alluxio cache.|
+|[#7850](https://github.com/NVIDIA/spark-rapids/issues/7850)|[BUG] nvcomp usage for the UCX mode of the shuffle manager is broken|
+|[#7927](https://github.com/NVIDIA/spark-rapids/issues/7927)|[BUG] Shimplify adding new shim layer fails|
+|[#6138](https://github.com/NVIDIA/spark-rapids/issues/6138)|[BUG] cast timezone-awareness check positive for date/time-unrelated types|
+|[#7914](https://github.com/NVIDIA/spark-rapids/issues/7914)|[BUG] Parquet read with integer upcast crashes|
+|[#6961](https://github.com/NVIDIA/spark-rapids/issues/6961)|[BUG] Using `\d` (or others) inside a character class results in "Unsupported escape character" |
+|[#7908](https://github.com/NVIDIA/spark-rapids/issues/7908)|[BUG] Interpolate spark.version.classifier into scala:compile `secondaryCacheDir`|
+|[#7707](https://github.com/NVIDIA/spark-rapids/issues/7707)|[BUG] IndexOutOfBoundsException when joining on 2 integer columns with DPP|
+|[#7892](https://github.com/NVIDIA/spark-rapids/issues/7892)|[BUG] Invalid or unsupported escape character `t` when trying to use tab in regexp_replace|
+|[#7640](https://github.com/NVIDIA/spark-rapids/issues/7640)|[BUG] GPU OOM using GpuRegExpExtract|
+|[#7814](https://github.com/NVIDIA/spark-rapids/issues/7814)|[BUG] GPU's output differs from CPU's for big decimals when joining by sub-partitioning algorithm|
+|[#7796](https://github.com/NVIDIA/spark-rapids/issues/7796)|[BUG] Parquet chunked reader size of output exceeds column size limit|
+|[#7833](https://github.com/NVIDIA/spark-rapids/issues/7833)|[BUG] run_pyspark_from_build computes 5 MiB per runner instead of 5 GiB|
+|[#7855](https://github.com/NVIDIA/spark-rapids/issues/7855)|[BUG] shuffle_test test_hash_grpby_sum failed OOM in premerge CI|
+|[#7858](https://github.com/NVIDIA/spark-rapids/issues/7858)|[BUG] HostToGpuCoalesceIterator leaks all host batches|
+|[#7826](https://github.com/NVIDIA/spark-rapids/issues/7826)|[BUG] buildall dist jar contains aggregator dependency|
+|[#7729](https://github.com/NVIDIA/spark-rapids/issues/7729)|[BUG] Active GPU thread not holding the semaphore|
+|[#7820](https://github.com/NVIDIA/spark-rapids/issues/7820)|[BUG] Restore pandas require_minimum_pandas_version() check|
+|[#7829](https://github.com/NVIDIA/spark-rapids/issues/7829)|[BUG] Parquet buffer time not correct with multithreaded combining reader|
+|[#7819](https://github.com/NVIDIA/spark-rapids/issues/7819)|[BUG] GpuDeviceManager allows setting UVM regardless of other RMM configs|
+|[#7643](https://github.com/NVIDIA/spark-rapids/issues/7643)|[BUG] Databricks init scripts can fail silently|
+|[#7799](https://github.com/NVIDIA/spark-rapids/issues/7799)|[BUG] Cannot lexicographic compare a table with a LIST of STRUCT column at ai.rapids.cudf.Table.sortOrder|
+|[#7767](https://github.com/NVIDIA/spark-rapids/issues/7767)|[BUG] VS Code / Metals / Bloop integration fails with java.lang.RuntimeException: 'boom' |
+|[#6383](https://github.com/NVIDIA/spark-rapids/issues/6383)|[SPARK-40066][SQL] ANSI mode: always return null on invalid access to map column|
+|[#7093](https://github.com/NVIDIA/spark-rapids/issues/7093)|[BUG] Spark-3.4 - Integration test failures in map_test|
+|[#7779](https://github.com/NVIDIA/spark-rapids/issues/7779)|[BUG] AlluxioUtilsSuite uses illegal character underscore in URI scheme|
+|[#7725](https://github.com/NVIDIA/spark-rapids/issues/7725)|[BUG] cache_test failed w/ ParquetCachedBatchSerializer in spark 3.3.2-SNAPSHOT|
+|[#7639](https://github.com/NVIDIA/spark-rapids/issues/7639)|[BUG] Databricks premerge failing with cannot find pytest|
+|[#7694](https://github.com/NVIDIA/spark-rapids/issues/7694)|[BUG] Spark-3.4 build breaks due to removing InternalRowSet|
+|[#6598](https://github.com/NVIDIA/spark-rapids/issues/6598)|[BUG] CUDA error when casting large column vector from long to string|
+|[#7739](https://github.com/NVIDIA/spark-rapids/issues/7739)|[BUG] udf_test failed in databricks 11.3 ENV|
+|[#5748](https://github.com/NVIDIA/spark-rapids/issues/5748)|[BUG] 3 cast tests fails on Spark 3.4.0|
+|[#7688](https://github.com/NVIDIA/spark-rapids/issues/7688)|[BUG] GpuParquetScan fails with NullPointerException - Delta CDF query|
+|[#7648](https://github.com/NVIDIA/spark-rapids/issues/7648)|[BUG] java.lang.ClassCastException: SerializeConcatHostBuffersDeserializeBatch cannot be cast to.HashedRelation|
+|[#6988](https://github.com/NVIDIA/spark-rapids/issues/6988)|[BUG] Integration test failures with DecimalType on Spark-3.4 SNAPSHOT build|
+|[#7615](https://github.com/NVIDIA/spark-rapids/issues/7615)|[BUG] Build fails on Spark 3.4|
+|[#7557](https://github.com/NVIDIA/spark-rapids/issues/7557)|[AUDIT][SPARK-41970] Introduce SparkPath for typesafety|
+|[#7617](https://github.com/NVIDIA/spark-rapids/issues/7617)|[BUG] Build 340 failed due to miss shim code for GpuShuffleMeta|
+
+### PRs
+|||
+|:---|:---|
+|[#8109](https://github.com/NVIDIA/spark-rapids/pull/8109)|Bump up JNI and private version to released 23.04.0|
+|[#7939](https://github.com/NVIDIA/spark-rapids/pull/7939)|[Doc]update download docs for 2304 version[skip ci]|
+|[#8127](https://github.com/NVIDIA/spark-rapids/pull/8127)|Avoid SQL result check of Delta Lake full delete on Databricks|
+|[#8098](https://github.com/NVIDIA/spark-rapids/pull/8098)|Fix loading of ORC files with missing column names|
+|[#8110](https://github.com/NVIDIA/spark-rapids/pull/8110)|Update ML integration page docs page [skip ci]|
+|[#8103](https://github.com/NVIDIA/spark-rapids/pull/8103)|Add license of spark-rapids private in NOTICE-binary[skip ci]|
+|[#8100](https://github.com/NVIDIA/spark-rapids/pull/8100)|Update/improve EMR getting started documentation [skip ci]|
+|[#8101](https://github.com/NVIDIA/spark-rapids/pull/8101)|Improve OOM exception messages|
+|[#8087](https://github.com/NVIDIA/spark-rapids/pull/8087)|Add an FAQ entry on encryption support [skip ci]|
+|[#8076](https://github.com/NVIDIA/spark-rapids/pull/8076)|Add in docs about RetryOOM [skip ci]|
+|[#8077](https://github.com/NVIDIA/spark-rapids/pull/8077)|Temporarily skip `test_parquet_read_nano_as_longs_31x` on dataproc|
+|[#8071](https://github.com/NVIDIA/spark-rapids/pull/8071)|Fix error in deploy script [skip ci]|
+|[#8070](https://github.com/NVIDIA/spark-rapids/pull/8070)|Fixes closed RapidsShuffleHandleImpl leak in ShuffleBufferCatalog|
+|[#8069](https://github.com/NVIDIA/spark-rapids/pull/8069)|Fix loading extra jar|
+|[#8044](https://github.com/NVIDIA/spark-rapids/pull/8044)|Fall back to CPU if `spark.sql.legacy.parquet.nanosAsLong` is set|
+|[#8049](https://github.com/NVIDIA/spark-rapids/pull/8049)|[DOC] Adding user tool info to main qualification docs page [skip ci]|
+|[#8040](https://github.com/NVIDIA/spark-rapids/pull/8040)|Fix device vector leak in RmmRetryIterator.splitSpillableInHalfByRows|
+|[#8031](https://github.com/NVIDIA/spark-rapids/pull/8031)|Fix regexp_replace integration test that should fallback when unicode is disabled|
+|[#7828](https://github.com/NVIDIA/spark-rapids/pull/7828)|Fallback to arena allocator if RMM failed to initialize with async allocator|
+|[#8006](https://github.com/NVIDIA/spark-rapids/pull/8006)|Handle caused-by retry exceptions in withRestoreOnRetry|
+|[#8013](https://github.com/NVIDIA/spark-rapids/pull/8013)|[Doc] Adding user tools info into EMR getting started guide [skip ci]|
+|[#8007](https://github.com/NVIDIA/spark-rapids/pull/8007)|Fix leak where RapidsShuffleIterator for a completed task was kept alive|
+|[#8010](https://github.com/NVIDIA/spark-rapids/pull/8010)|Specify that UCX should be 1.12.1 only [skip ci]|
+|[#7967](https://github.com/NVIDIA/spark-rapids/pull/7967)|Transpile simple choice-type regular expressions into lists of choices to use with string replace multi|
+|[#7902](https://github.com/NVIDIA/spark-rapids/pull/7902)|Add oom retry handling for createGatherer in gpu hash joins|
+|[#7986](https://github.com/NVIDIA/spark-rapids/pull/7986)|Provides a config to expose Alluxio master URL to support K8s Env|
+|[#7936](https://github.com/NVIDIA/spark-rapids/pull/7936)|Stop showing internal details of ternary expressions in SparkPlan.toString|
+|[#7972](https://github.com/NVIDIA/spark-rapids/pull/7972)|Add in retry for ORC writes|
+|[#7975](https://github.com/NVIDIA/spark-rapids/pull/7975)|Publish documentation for private configs|
+|[#7976](https://github.com/NVIDIA/spark-rapids/pull/7976)|Disable GPU write for ORC and Parquet, if bloom-filters are enabled.|
+|[#7925](https://github.com/NVIDIA/spark-rapids/pull/7925)|Inject RetryOOM in CI where retry iterator is used|
+|[#7970](https://github.com/NVIDIA/spark-rapids/pull/7970)|[DOCS] Updating qual tool docs from latest in tools repo|
+|[#7952](https://github.com/NVIDIA/spark-rapids/pull/7952)|Add in minimal retry metrics|
+|[#7884](https://github.com/NVIDIA/spark-rapids/pull/7884)|Add Python requirements file for integration tests|
+|[#7958](https://github.com/NVIDIA/spark-rapids/pull/7958)|Add CheckpointRestore trait and withRestoreOnRetry|
+|[#7849](https://github.com/NVIDIA/spark-rapids/pull/7849)|Fix CPU broadcast exchanges being left unreplaced due to AQE and reuse|
+|[#7944](https://github.com/NVIDIA/spark-rapids/pull/7944)|Fix issue with dynamicpruning filters used in converted GPU scans when S3 paths are replaced with alluxio|
+|[#7949](https://github.com/NVIDIA/spark-rapids/pull/7949)|Lazily unspill the stream batches for joins by sub-partitioning|
+|[#7951](https://github.com/NVIDIA/spark-rapids/pull/7951)|Fix PMD docs URL [skip ci]|
+|[#7945](https://github.com/NVIDIA/spark-rapids/pull/7945)|Enable automerge from 2304 to 2306 [skip ci]|
+|[#7935](https://github.com/NVIDIA/spark-rapids/pull/7935)|Add GPU level task metrics|
+|[#7930](https://github.com/NVIDIA/spark-rapids/pull/7930)|Add OOM Retry handling for join gather next|
+|[#7942](https://github.com/NVIDIA/spark-rapids/pull/7942)|Revert "Upgrade to UCX 1.14.0 (#7877)"|
+|[#7889](https://github.com/NVIDIA/spark-rapids/pull/7889)|Support auto-compaction for Delta tables on|
+|[#7937](https://github.com/NVIDIA/spark-rapids/pull/7937)|Support hashing different types for sub-partitioning|
+|[#7877](https://github.com/NVIDIA/spark-rapids/pull/7877)|Upgrade to UCX 1.14.0|
+|[#7926](https://github.com/NVIDIA/spark-rapids/pull/7926)|Fixes issue where UCX compressed tables would be decompressed multiple times|
+|[#7928](https://github.com/NVIDIA/spark-rapids/pull/7928)|Adjust assert for SparkShims: no longer a per-shim file [skip ci]|
+|[#7895](https://github.com/NVIDIA/spark-rapids/pull/7895)|Some refactor of shuffled hash join|
+|[#7894](https://github.com/NVIDIA/spark-rapids/pull/7894)|Support tagging `Cast` for timezone conditionally|
+|[#7915](https://github.com/NVIDIA/spark-rapids/pull/7915)|Fix upcast of signed integral values when reading from Parquet|
+|[#7879](https://github.com/NVIDIA/spark-rapids/pull/7879)|Retry for file read operations|
+|[#7905](https://github.com/NVIDIA/spark-rapids/pull/7905)|[Doc] Fix some documentation issue based on VPR feedback on 23.04 branch (new PR) [skip CI] |
+|[#7912](https://github.com/NVIDIA/spark-rapids/pull/7912)|[Doc] Hotfix gh-pages for compatibility page format issue [skip ci]|
+|[#7913](https://github.com/NVIDIA/spark-rapids/pull/7913)|Fix resolution of GpuRapidsProcessDeltaMergeJoinExec expressions|
+|[#7916](https://github.com/NVIDIA/spark-rapids/pull/7916)|Add clarification for Delta Lake optimized write fallback due to sorting [skip ci]|
+|[#7906](https://github.com/NVIDIA/spark-rapids/pull/7906)|ColumnarToRowIterator should release the semaphore if parent is empty|
+|[#7909](https://github.com/NVIDIA/spark-rapids/pull/7909)|Interpolate buildver into secondaryCacheDir|
+|[#7844](https://github.com/NVIDIA/spark-rapids/pull/7844)|Update alluxio version to 2.9.0|
+|[#7896](https://github.com/NVIDIA/spark-rapids/pull/7896)|Update regular expression parser to handle escape character sequences|
+|[#7885](https://github.com/NVIDIA/spark-rapids/pull/7885)|Add Join Reordering Integration Test|
+|[#7862](https://github.com/NVIDIA/spark-rapids/pull/7862)|Reduce shimming of GpuFlatMapGroupsInPandasExec|
+|[#7859](https://github.com/NVIDIA/spark-rapids/pull/7859)|Remove 3.1.4-SNAPSHOT shim code|
+|[#7835](https://github.com/NVIDIA/spark-rapids/pull/7835)|Update to pull the rapids spark extra plugin jar|
+|[#7863](https://github.com/NVIDIA/spark-rapids/pull/7863)|[Doc] Address document issues [skip ci]|
+|[#7794](https://github.com/NVIDIA/spark-rapids/pull/7794)|Implement sub partitioning for large/skewed hash joins|
+|[#7864](https://github.com/NVIDIA/spark-rapids/pull/7864)|Add in basic support for OOM retry for project and filter|
+|[#7878](https://github.com/NVIDIA/spark-rapids/pull/7878)|Fixing host memory calculation to properly be 5GiB|
+|[#7860](https://github.com/NVIDIA/spark-rapids/pull/7860)|Enable manual copy-and-paste code detection [skip ci]|
+|[#7852](https://github.com/NVIDIA/spark-rapids/pull/7852)|Use withRetry in GpuCoalesceBatches|
+|[#7857](https://github.com/NVIDIA/spark-rapids/pull/7857)|Unshim getSparkShimVersion|
+|[#7854](https://github.com/NVIDIA/spark-rapids/pull/7854)|Optimize `regexp_extract*` by transpiling capture groups to non-capturing groups so that only the required capturing group is manifested|
+|[#7853](https://github.com/NVIDIA/spark-rapids/pull/7853)|Remove support for Databricks-9.1 ML LTS|
+|[#7856](https://github.com/NVIDIA/spark-rapids/pull/7856)|Update references to reduced dependencies pom [skip ci]|
+|[#7848](https://github.com/NVIDIA/spark-rapids/pull/7848)|Initialize only sql-plugin to prevent missing submodule artifacts in buildall [skip ci]|
+|[#7839](https://github.com/NVIDIA/spark-rapids/pull/7839)|Add reduced pom to dist jar in the packaging phase|
+|[#7822](https://github.com/NVIDIA/spark-rapids/pull/7822)|Add in support for OOM retry|
+|[#7846](https://github.com/NVIDIA/spark-rapids/pull/7846)|Stop releasing semaphore in GpuUserDefinedFunction|
+|[#7840](https://github.com/NVIDIA/spark-rapids/pull/7840)|Execute mvn initialize before parallel build [skip ci]|
+|[#7222](https://github.com/NVIDIA/spark-rapids/pull/7222)|Automatic conversion to shimplified directory structure|
+|[#7824](https://github.com/NVIDIA/spark-rapids/pull/7824)|Use withRetryNoSplit in BasicWindowCalc|
+|[#7842](https://github.com/NVIDIA/spark-rapids/pull/7842)|Try fix broken blackduck scan [skip ci]|
+|[#7841](https://github.com/NVIDIA/spark-rapids/pull/7841)|Hardcode scan projects [skip ci]|
+|[#7830](https://github.com/NVIDIA/spark-rapids/pull/7830)|Fix buffer and Filter time with Parquet multithreaded combine reader|
+|[#7678](https://github.com/NVIDIA/spark-rapids/pull/7678)|Premerge CI to drop support for Databricks-9.1 ML LTS|
+|[#7823](https://github.com/NVIDIA/spark-rapids/pull/7823)|[BUG] Enable managed memory only if async allocator is not used|
+|[#7821](https://github.com/NVIDIA/spark-rapids/pull/7821)|Restore pandas import check in db113 runtime|
+|[#7810](https://github.com/NVIDIA/spark-rapids/pull/7810)|UnXfail large decimal window range queries|
+|[#7771](https://github.com/NVIDIA/spark-rapids/pull/7771)|Add withRetry and withRetryNoSplit and PoC with hash aggregate|
+|[#7815](https://github.com/NVIDIA/spark-rapids/pull/7815)|Fix the hyperlink to shimplify.py [skip ci]|
+|[#7812](https://github.com/NVIDIA/spark-rapids/pull/7812)|Fallback Delta Lake optimized writes if GPU cannot support partitioning|
+|[#7791](https://github.com/NVIDIA/spark-rapids/pull/7791)|Doc changes for new nested JSON reader [skip ci]|
+|[#7797](https://github.com/NVIDIA/spark-rapids/pull/7797)|Add GPU support for EphemeralSubstring|
+|[#7561](https://github.com/NVIDIA/spark-rapids/pull/7561)|Ant task to automatically convert to a simple shim layout|
+|[#7789](https://github.com/NVIDIA/spark-rapids/pull/7789)|Update script for integration tests on Databricks|
+|[#7798](https://github.com/NVIDIA/spark-rapids/pull/7798)|Do not error out DB IT test script when pytest code 5 [skip ci]|
+|[#7787](https://github.com/NVIDIA/spark-rapids/pull/7787)|Document a workaround to RuntimeException 'boom' [skip ci]|
+|[#7786](https://github.com/NVIDIA/spark-rapids/pull/7786)|Fix nested loop joins when there's no build-side columns|
+|[#7730](https://github.com/NVIDIA/spark-rapids/pull/7730)|[FEA] Switch to `regex_program` APIs|
+|[#7788](https://github.com/NVIDIA/spark-rapids/pull/7788)|Support released spark 3.3.2|
+|[#7095](https://github.com/NVIDIA/spark-rapids/pull/7095)|Fix the failure in `map_test.py` on Spark 3.4|
+|[#7769](https://github.com/NVIDIA/spark-rapids/pull/7769)|Fix issue where GpuSemaphore can throw NPE when logDebug is on|
+|[#7780](https://github.com/NVIDIA/spark-rapids/pull/7780)|Make AlluxioUtilsSuite pass for 340|
+|[#7772](https://github.com/NVIDIA/spark-rapids/pull/7772)|Fix cache test for Spark 3.3.2|
+|[#7717](https://github.com/NVIDIA/spark-rapids/pull/7717)|Move Databricks variables into blossom-lib|
+|[#7749](https://github.com/NVIDIA/spark-rapids/pull/7749)|Support Delta Lake optimized write on Databricks|
+|[#7696](https://github.com/NVIDIA/spark-rapids/pull/7696)|Create new version of GpuBatchScanExec to fix Spark-3.4 build|
+|[#7747](https://github.com/NVIDIA/spark-rapids/pull/7747)|batched full join tracking batch does not need to be lazy|
+|[#7758](https://github.com/NVIDIA/spark-rapids/pull/7758)|Hardcode python 3.8 to be used in databricks runtime for cudf_udf ENV|
+|[#7716](https://github.com/NVIDIA/spark-rapids/pull/7716)|Clean the code of `GpuMetrics`|
+|[#7746](https://github.com/NVIDIA/spark-rapids/pull/7746)|Merge branch-23.02 into branch-23.04 [skip ci]|
+|[#7740](https://github.com/NVIDIA/spark-rapids/pull/7740)|Revert 7737 workaround for cudf setup in databricks 11.3 runtime [skip ci]|
+|[#7737](https://github.com/NVIDIA/spark-rapids/pull/7737)|Workaround for cudf setup in databricks 11.3 runtime|
+|[#7734](https://github.com/NVIDIA/spark-rapids/pull/7734)|Temporarily skip the test_parquet_read_ignore_missing on Databricks|
+|[#7728](https://github.com/NVIDIA/spark-rapids/pull/7728)|Fix estimatedNumBatches in case of OOM for Full Outer Join|
+|[#7718](https://github.com/NVIDIA/spark-rapids/pull/7718)|GpuParquetScan fails with NullPointerException during combining|
+|[#7712](https://github.com/NVIDIA/spark-rapids/pull/7712)|Enable Dynamic File Pruning on|
+|[#7702](https://github.com/NVIDIA/spark-rapids/pull/7702)|Merge 23.02 into 23.04|
+|[#7572](https://github.com/NVIDIA/spark-rapids/pull/7572)|Enables spillable/unspillable state for RapidsBuffer and allow buffer sharing|
+|[#7687](https://github.com/NVIDIA/spark-rapids/pull/7687)|Fix window tests for Spark-3.4|
+|[#7667](https://github.com/NVIDIA/spark-rapids/pull/7667)|Reenable tests originally bypassed for 3.4|
+|[#7542](https://github.com/NVIDIA/spark-rapids/pull/7542)|Support WriteFilesExec in Spark-3.4 to fix several tests|
+|[#7673](https://github.com/NVIDIA/spark-rapids/pull/7673)|Add missing spark shim test suites |
+|[#7655](https://github.com/NVIDIA/spark-rapids/pull/7655)|Fix Spark 3.4 build|
+|[#7621](https://github.com/NVIDIA/spark-rapids/pull/7621)|Document GNU sed for macOS auto-copyrighter users [skip ci]|
+|[#7618](https://github.com/NVIDIA/spark-rapids/pull/7618)|Update JNI to 23.04.0-SNAPSHOT and update new delta-stub ver to 23.04|
+|[#7541](https://github.com/NVIDIA/spark-rapids/pull/7541)|Init version 23.04.0-SNAPSHOT|
## Release 23.02
### Features
|||
|:---|:---|
+|[#6420](https://github.com/NVIDIA/spark-rapids/issues/6420)|[FEA]Support HiveTableScanExec to scan a Hive text table|
+|[#4897](https://github.com/NVIDIA/spark-rapids/issues/4897)|Profiling tool: create a section to focus on I/O metrics|
|[#6419](https://github.com/NVIDIA/spark-rapids/issues/6419)|[FEA] Support write a Hive text table |
|[#7280](https://github.com/NVIDIA/spark-rapids/issues/7280)|[FEA] Support UpdateCommand for Delta Lake|
|[#7281](https://github.com/NVIDIA/spark-rapids/issues/7281)|[FEA] Support DeleteCommand for Delta Lake|
@@ -16,6 +228,7 @@ Generated on 2023-02-14
|[#6698](https://github.com/NVIDIA/spark-rapids/issues/6698)|[FEA] Support json_tuple|
|[#6885](https://github.com/NVIDIA/spark-rapids/issues/6885)|[FEA] Support reverse|
|[#6879](https://github.com/NVIDIA/spark-rapids/issues/6879)|[FEA] Support Databricks 11.3 ML LTS|
+|[#5618](https://github.com/NVIDIA/spark-rapids/issues/5618)|Qualification tool use expressions parsed in duration and speedup factors|
### Performance
|||
@@ -30,6 +243,8 @@ Generated on 2023-02-14
### Bugs Fixed
|||
|:---|:---|
+|[#7069](https://github.com/NVIDIA/spark-rapids/issues/7069)|[BUG] GPU Hive Text Reader reads empty strings as null|
+|[#7068](https://github.com/NVIDIA/spark-rapids/issues/7068)|[BUG] GPU Hive Text Reader skips empty lines|
|[#7448](https://github.com/NVIDIA/spark-rapids/issues/7448)|[BUG] GDS cufile test failed in elder cuda runtime|
|[#7686](https://github.com/NVIDIA/spark-rapids/issues/7686)|[BUG] Large floating point values written as `Inf` not `Infinity` in Hive text writer|
|[#7703](https://github.com/NVIDIA/spark-rapids/issues/7703)|[BUG] test_basic_hive_text_write fails|
@@ -104,6 +319,7 @@ Generated on 2023-02-14
### PRs
|||
|:---|:---|
+|[#7763](https://github.com/NVIDIA/spark-rapids/pull/7763)|23.02 changelog update 2/14 [skip ci]|
|[#7761](https://github.com/NVIDIA/spark-rapids/pull/7761)|[Doc] remove xgboost demo from aws-emr doc due to nccl issue [skip ci]|
|[#7760](https://github.com/NVIDIA/spark-rapids/pull/7760)|Add notice in gds to install cuda 11.8 [skip ci]|
|[#7570](https://github.com/NVIDIA/spark-rapids/pull/7570)|[Doc] 23.02 doc updates [skip ci]|
@@ -265,6 +481,7 @@ Generated on 2023-02-14
|[#7167](https://github.com/NVIDIA/spark-rapids/pull/7167)|Handle two changes related to `FileFormatWriter` since Spark 340|
|[#7194](https://github.com/NVIDIA/spark-rapids/pull/7194)|Skip tests that fail due to recent cuDF changes related to end of string/line anchors|
|[#7170](https://github.com/NVIDIA/spark-rapids/pull/7170)|Fix the `limit_test` failures on Spark 3.4|
+|[#7075](https://github.com/NVIDIA/spark-rapids/pull/7075)|Fix the failure of `test_array_element_at_zero_index_fail` on Spark3.4|
|[#7126](https://github.com/NVIDIA/spark-rapids/pull/7126)|Fix support for binary encoded decimal for parquet|
|[#7113](https://github.com/NVIDIA/spark-rapids/pull/7113)|Use an improved API for appending binary to host vector|
|[#7130](https://github.com/NVIDIA/spark-rapids/pull/7130)|Enable chunked parquet reads by default|
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index c48b76dd91e..24529414347 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -152,7 +152,7 @@ To this end in a pre-production build you can set the Boolean property
The time saved is more significant if you are merely changing
the `aggregator` module, or the `dist` module, or just incorporating changes from
-[spark-rapids-jni](https://github.com/NVIDIA/spark-rapids-jni/blob/branch-23.02/CONTRIBUTING.md#local-testing-of-cross-repo-contributions-cudf-spark-rapids-jni-and-spark-rapids)
+[spark-rapids-jni](https://github.com/NVIDIA/spark-rapids-jni/blob/branch-23.04/CONTRIBUTING.md#local-testing-of-cross-repo-contributions-cudf-spark-rapids-jni-and-spark-rapids)
For example, to quickly repackage `rapids-4-spark` after the
initial `./build/buildall` you can iterate by invoking
@@ -186,15 +186,20 @@ The following acronyms may appear in directory names:
|cdh |Cloudera CDH|321cdh |Cloudera CDH Spark based on Apache Spark 3.2.1|
The version-specific directory names have one of the following forms / use cases:
-- `src/main/312/scala` contains Scala source code for a single Spark version, 3.1.2 in this case
-- `src/main/312+-apache/scala`contains Scala source code for *upstream* **Apache** Spark builds,
+
+#### Version range directories
+
+The following source directory layout is deprecated. See the simplified structure below and [shimplify.md][1].
+
+* `src/main/312/scala` contains Scala source code for a single Spark version, 3.1.2 in this case
+* `src/main/312+-apache/scala` contains Scala source code for *upstream* **Apache** Spark builds,
only beginning with version Spark 3.1.2, and + signifies there is no upper version boundary
among the supported versions
-- `src/main/311until320-all` contains code that applies to all shims between 3.1.1 *inclusive*,
+* `src/main/311until320-all` contains code that applies to all shims between 3.1.1 *inclusive*,
3.2.0 *exclusive*
-- `src/main/pre320-treenode` contains shims for the Catalyst `TreeNode` class before the
+* `src/main/pre320-treenode` contains shims for the Catalyst `TreeNode` class before the
[children trait specialization in Apache Spark 3.2.0](https://issues.apache.org/jira/browse/SPARK-34906).
-- `src/main/post320-treenode` contains shims for the Catalyst `TreeNode` class after the
+* `src/main/post320-treenode` contains shims for the Catalyst `TreeNode` class after the
[children trait specialization in Apache Spark 3.2.0](https://issues.apache.org/jira/browse/SPARK-34906).
For each Spark shim, we use Ant path patterns to compute the property
@@ -202,6 +207,17 @@ For each Spark shim, we use Ant path patterns to compute the property
picked up as additional source code roots. When possible path patterns are reused using
the conventions outlined in the pom.
+#### Simplified version directory structure
+
+Going forward new shim files should be added under:
+
+* `src/main/spark${buildver}`, example: `src/main/spark330db`
+* `src/test/spark${buildver}`, example: `src/test/spark340`
+
+with a special shim descriptor as a Scala/Java comment. See [shimplify.md][1]
+
+[1]: ./docs/dev/shimplify.md
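+
+As a rough illustration (see [shimplify.md][1] for the authoritative format), the descriptor is a
+comment at the top of the shim file listing, as JSON lines, every `buildver` the file applies to.
+The package and object names below are placeholders:
+
+```scala
+/*** spark-rapids-shim-json-lines
+{"spark": "330"}
+{"spark": "340"}
+spark-rapids-shim-json-lines ***/
+package com.nvidia.spark.rapids.shims
+
+// the shim-specific code follows the descriptor comment
+object ExampleShim
+```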
+
### Setting up an Integrated Development Environment
Our project currently uses `build-helper-maven-plugin` for shimming against conflicting definitions of superclasses
@@ -238,7 +254,12 @@ Known Issues:
* There is a known issue that the test sources added via the `build-helper-maven-plugin` are not handled
[properly](https://youtrack.jetbrains.com/issue/IDEA-100532). The workaround is to `mark` the affected folders
-such as `tests/src/test/320+-noncdh-nondb` manually as `Test Sources Root`
+such as
+
+ * `tests/src/test/320+-noncdh-nondb`
+ * `tests/src/test/spark340`
+
+manually as `Test Sources Root`
* There is a known issue where, even after selecting a different Maven profile in the Maven submenu,
the source folders from a previously selected profile may remain active. As a workaround,
@@ -264,7 +285,7 @@ interested in. For example, to generate the Bloop projects for the Spark 3.2.0 d
just for the production code run:
```shell script
-mvn install ch.epfl.scala:maven-bloop_2.13:1.4.9:bloopInstall -pl aggregator -am \
+mvn install ch.epfl.scala:bloop-maven-plugin:bloopInstall -pl aggregator -am \
-DdownloadSources=true \
-Dbuildver=320 \
-DskipTests \
@@ -296,7 +317,7 @@ You can now open the spark-rapids as a
Read on for VS Code Scala Metals instructions.
-# Bloop, Scala Metals, and Visual Studio Code
+#### Bloop, Scala Metals, and Visual Studio Code
_Last tested with 1.63.0-insider (Universal) Commit: bedf867b5b02c1c800fbaf4d6ce09cefba_
@@ -338,6 +359,29 @@ jps -l
72349 scala.meta.metals.Main
```
+##### Known Issues
+
+###### java.lang.RuntimeException: boom
+
+If the Metals background compilation status appears to reset to 0% after reaching 99%
+and you see the peculiar error message [`java.lang.RuntimeException: boom`][1], you can
+work around it by making sure Metals Server (the Bloop client) and the Bloop Server are
+both running on Java 11+.
+
+1. To this end make sure that Bloop projects are generated using Java 11+
+
+ ```bash
+ JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 \
+ mvn install ch.epfl.scala:bloop-maven-plugin:bloopInstall \
+ -DdownloadSources=true \
+ -Dbuildver=331 \
+ -Dskip -DskipTests -Dmaven.javadoc.skip
+ ```
+
+1. Add [`metals.javaHome`][2] to VSCode preferences to point to Java 11+.
+
+[1]: https://github.com/sourcegraph/scip-java/blob/b7d268233f1a303f66b6d9804a68f64b1e5d7032/semanticdb-javac/src/main/java/com/sourcegraph/semanticdb_javac/SemanticdbTaskListener.java#L76
+
+[2]: https://github.com/scalameta/metals-vscode/pull/644/files#diff-04bba6a35cad1c794cbbe677678a51de13441b7a6ee8592b7b50be1f05c6f626R132
#### Other IDEs
We welcome pull requests with tips how to setup your favorite IDE!
@@ -481,6 +525,16 @@ You can confirm that the update actually has happened by either inspecting its e
`git diff` first or simply reexecuting `git commit` right away. The second time no file
modification should be triggered by the copyright year update hook and the commit should succeed.
+There is a known issue for macOS users who use the default version of `sed`: the copyright update
+script may fail and generate an unexpected file named `source-file-E`. As a workaround, please
+install GNU sed:
+
+```bash
+brew install gnu-sed
+# and add it to PATH to make it the default sed for your shell
+export PATH="/usr/local/opt/gnu-sed/libexec/gnubin:$PATH"
+```
+
### Pull request status checks
A pull request should pass all status checks before merged.
#### signoff check
diff --git a/NOTICE-binary b/NOTICE-binary
index 6d488b89a7b..52d0395f2a3 100644
--- a/NOTICE-binary
+++ b/NOTICE-binary
@@ -463,3 +463,110 @@ misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Mark Adler madler@alumni.caltech.edu
+
+--------------------------------------------------------------------------------
+This software includes the SPARK-RAPIDS PRIVATE jar with the following licenses:
+
+NVIDIA SPARK-RAPIDS PRIVATE LICENSE AGREEMENT
+
+IMPORTANT NOTICE – PLEASE READ AND AGREE BEFORE USING THE SOFTWARE.
+
+This license agreement (“Agreement”) is a legal agreement between you, whether an individual or entity ("you”) and NVIDIA Corporation ("NVIDIA") and governs the use of NVIDIA Spark-RAPIDS PRIVATE, including the software and materials provided hereunder (“SOFTWARE”).
+
+This Agreement can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this Agreement.
+
+If you don’t have the required age or authority to accept this Agreement, or if you don’t accept all the terms and conditions of this Agreement, do not use the SOFTWARE.
+
+You agree to use the SOFTWARE only for purposes that are permitted by this Agreement and any applicable law or regulation in the relevant jurisdictions.
+
+1. License.
+
+Subject to the terms of this Agreement, NVIDIA grants you a non-exclusive, revocable, non-transferable, non-sublicensable (except as expressly provided in this Agreement) license to install and use copies of the SOFTWARE in systems with NVIDIA GPUS.
+
+2. Limitations. Your license to use the SOFTWARE is restricted as follows:
+
+2.1 You may not reverse engineer, decompile, or disassemble the SOFTWARE components provided in binary form, nor attempt in any other manner to obtain source code of such SOFTWARE.
+
+2.2 You may not change or remove copyright or other proprietary notices in the SOFTWARE.
+
+2.3 Except as expressly granted in this Agreement, you may not copy, sell, rent, sublicense, transfer, distribute, modify or create derivative works of the SOFTWARE, or make its functionality available to others.
+
+2.4 You may not bypass, disable, or circumvent any technical limitation, encryption, security, digital rights management or authentication mechanism in the SOFTWARE.
+
+2.5 You may not use the SOFTWARE for the purpose of developing competing products or technologies or assisting a third party in such activities.
+
+2.6 You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license; subject to the terms in the “Components Under Other Licenses” section below.
+
+2.7 You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of use of the SOFTWARE outside of the scope of this Agreement or not in compliance with its terms.
+
+3. Authorized Users.
+
+You may allow employees and contractors of your entity or of your subsidiary(ies) to access and use the SOFTWARE from your secure network to perform the work authorized by this Agreement on your behalf. If you are an academic institution, you may allow users enrolled or employed by the academic institution to access and use the SOFTWARE as authorized by this Agreement from your secure network. You are responsible for the compliance with the terms of this Agreement by your authorized users. Any act or omission that if committed by you would constitute a breach of this Agreement will be deemed to constitute a breach of this Agreement if committed by your authorized users.
+
+4. Pre-Release Versions.
+
+SOFTWARE versions or specific features identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA offerings. You may use a pre-release SOFTWARE at your own risk, understanding that such versions are not intended for use in business-critical systems. NVIDIA may choose not to make available a commercial version of any pre-release SOFTWARE. NVIDIA may also choose to abandon development and terminate the availability of a pre-release SOFTWARE at any time without liability.
+
+5. Updates.
+
+NVIDIA may, at its option, make available patches, workarounds or other updates to the SOFTWARE. Unless the updates are provided with their separate governing terms, they are deemed part of the SOFTWARE licensed to you as provided in this Agreement.
+
+6. Components Under Other Licenses.
+
+The SOFTWARE may include or be distributed with components provided with separate legal notices or terms that accompany the components, such as open source software licenses and other license terms ("Other Licenses”). The components are subject to the applicable Other Licenses, including any proprietary notices, disclaimers, requirements and extended use rights; except that this Agreement will prevail regarding the use of third-party open source software, unless a third-party open source software license requires its license terms to prevail. Open source software license means any software, data or documentation subject to any license identified as an open source license by the Open Source Initiative (http://opensource.org), Free Software Foundation (http://www.fsf.org) or other similar open source organization or listed by the Software Package Data Exchange (SPDX) Workgroup under the Linux Foundation (http://www.spdx.org).
+
+7. Termination.
+
+This Agreement will automatically terminate without notice from NVIDIA if you fail to comply with any of the terms in this Agreement or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. Additionally, either party may terminate this Agreement at any time with prior written notice to the other party. Upon any termination, you must stop using and destroy all copies of the SOFTWARE. Upon written request, you will certify in writing that you have complied with your commitments under this section. All provisions will survive termination, except for the licenses granted to you.
+
+8. Ownership.
+
+The SOFTWARE, including all intellectual property rights, is and will remain the sole and exclusive property of NVIDIA or its licensors. Except as expressly granted in this Agreement, (i) NVIDIA reserves all rights, interests, and remedies in connection with the SOFTWARE, and (ii) no other license or right is granted to you by implication, estoppel or otherwise. You agree to cooperate with NVIDIA and provide reasonably requested information to verify your compliance with this Agreement.
+
+9. Feedback.
+
+You may, but you are not obligated to, provide suggestions, requests, fixes, modifications, enhancements, or other feedback regarding the SOFTWARE (collectively, “Feedback”). Feedback, even if designated as confidential by you, will not create any confidentiality obligation for NVIDIA or its affiliates. If you provide Feedback, you hereby grant NVIDIA, its affiliates and its designees a non-exclusive, perpetual, irrevocable, sublicensable, worldwide, royalty-free, fully paid-up and transferable license, under your intellectual property rights, to publicly perform, publicly display, reproduce, use, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution), import, create derivative works of and otherwise commercialize and exploit the Feedback at NVIDIA’s discretion. You will not give Feedback (i) that you have reason to believe is subject to any restriction that impairs the exercise of the grant stated in this section, such as third-party intellectual property rights; or (ii) subject to license terms which seek to require any product incorporating or developed using such Feedback, or other intellectual property of NVIDIA or its affiliates, to be licensed to or otherwise shared with any third party.
+
+10. Disclaimer of Warranties.
+
+THE SOFTWARE IS PROVIDED BY NVIDIA AS-IS AND WITH ALL FAULTS. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA DISCLAIMS ALL WARRANTIES AND REPRESENTATIONS OF ANY KIND, WHETHER EXPRESS, IMPLIED OR STATUTORY, RELATING TO OR ARISING UNDER THIS AGREEMENT, INCLUDING, WITHOUT LIMITATION, THE WARRANTIES OF TITLE, NONINFRINGEMENT, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, USAGE OF TRADE AND COURSE OF DEALING. WITHOUT LIMITING THE FOREGOING, NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS; THAT ANY DEFECTS OR ERRORS WILL BE CORRECTED; THAT ANY CERTAIN CONTENT WILL BE AVAILABLE; OR THAT THE SOFTWARE IS FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS. NO INFORMATION OR ADVICE GIVEN BY NVIDIA WILL IN ANY WAY INCREASE THE SCOPE OF ANY WARRANTY EXPRESSLY PROVIDED IN THIS AGREEMENT.
+
+11. Limitations of Liability.
+
+TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT WILL NVIDIA BE LIABLE FOR ANY (I) INDIRECT, PUNITIVE, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES, OR (II) DAMAGES FOR THE (A) COST OF PROCURING SUBSTITUTE GOODS, OR (B) LOSS OF PROFITS, REVENUES, USE, DATA OR GOODWILL ARISING OUT OF OR RELATED TO THIS AGREEMENT, WHETHER BASED ON BREACH OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY, OR OTHERWISE, AND EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES AND EVEN IF A PARTY'S REMEDIES FAIL THEIR ESSENTIAL PURPOSE.
+
+ADDITIONALLY, TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA’S TOTAL CUMULATIVE AGGREGATE LIABILITY FOR ANY AND ALL LIABILITIES, OBLIGATIONS OR CLAIMS ARISING OUT OF OR RELATED TO THIS AGREEMENT WILL NOT EXCEED FIVE U.S. DOLLARS (US$5).
+
+12. Governing Law and Jurisdiction.
+
+This Agreement will be governed in all respects by the laws of the United States and the laws of the State of Delaware, without regard to conflict of laws principles or the United Nations Convention on Contracts for the International Sale of Goods. The state and federal courts residing in Santa Clara County, California will have exclusive jurisdiction over any dispute or claim arising out of or related to this Agreement, and the parties irrevocably consent to personal jurisdiction and venue in those courts; except that either party may apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction.
+
+13. No Assignment.
+
+NVIDIA may assign, delegate or transfer its rights or obligations under this Agreement by any means or operation of law. You may not, without NVIDIA’s prior written consent, assign, delegate or transfer any of your rights or obligations under this Agreement by any means or operation of law, and any attempt to do so is null and void.
+
+14. Waiver.
+
+No failure or delay by a party to enforce any Agreement term or obligation will operate as a waiver by that party, nor prevent the enforcement of such term or obligation later.
+
+15. Export.
+
+You agree to comply with all applicable export, import, trade and economic sanctions laws and regulations, including U.S. Export Administration Regulations and Office of Foreign Assets Control regulations. These laws include restrictions on destinations, end-users and end-use.
+
+16. Government Use.
+
+The SOFTWARE, including related documentation (“Protected Items”) is a “Commercial product” as this term is defined at 48 C.F.R. 2.101, consisting of “commercial computer software” and “commercial computer software documentation” as such terms are used in, respectively, 48 C.F.R. 12.212 and 48 C.F.R. 227.7202 & 252.227-7014(a)(1). Before any Protected Items are supplied to the U.S. Government, you will (i) inform the U.S. Government in writing that the Protected Items are and must be treated as commercial computer software and commercial computer software documentation developed at private expense; (ii) inform the U.S. Government that the Protected Items are provided subject to the terms of this Agreement; and (iii) mark the Protected Items as commercial computer software and commercial computer software documentation developed at private expense. In no event will you permit the U.S. Government to acquire rights in Protected Items beyond those specified in 48 C.F.R. 52.227-19(b)(1)-(2) or 252.227-7013(c) except as expressly approved by NVIDIA in writing.
+
+17. Notices.
+
+Please direct your legal notices or other correspondence to NVIDIA Corporation, 2788 San Tomas Expressway, Santa Clara, California 95051, United States of America, Attention: Legal Department. If NVIDIA needs to contact you about the SOFTWARE, you consent to receive the notices by email and that such notices will satisfy any legal communication requirements.
+
+18. Force Majeure.
+
+Neither party will be liable during any period where an event or circumstance prevents or delays that party from performing its obligations under this Agreement and that event or circumstance: (i) is not within the reasonable control of that party and is not the result of that party’s negligence, and (ii) cannot be overcome or avoided by that party using reasonably diligent efforts.
+
+19. Entire Agreement.
+
+Regarding the subject matter of this Agreement, the parties agree that (i) this Agreement constitutes the entire and exclusive agreement between the parties and supersedes all prior and contemporaneous communications and (ii) any additional or different terms or conditions, whether contained in purchase orders, order acknowledgments, invoices or otherwise, will not be binding on the receiving party and are null and void. If a court of competent jurisdiction rules that a provision of this Agreement is unenforceable, that provision will be deemed modified to the extent necessary to make it enforceable and the remainder of this Agreement will continue in full force and effect. Any amendment to this Agreement must be in writing and signed by authorized representatives of both parties.
+
+(v. April 10, 2023)
diff --git a/README.md b/README.md
index 97e754dddd8..7e905afac06 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ as a `provided` dependency.
    <groupId>com.nvidia</groupId>
    <artifactId>rapids-4-spark_2.12</artifactId>
-    <version>23.02.0</version>
+    <version>23.04.0</version>
    <scope>provided</scope>
```
diff --git a/aggregator/pom.xml b/aggregator/pom.xml
index 0af849ce3d2..2b090d31517 100644
--- a/aggregator/pom.xml
+++ b/aggregator/pom.xml
@@ -22,12 +22,12 @@
        <groupId>com.nvidia</groupId>
        <artifactId>rapids-4-spark-parent</artifactId>
-        <version>23.02.0</version>
+        <version>23.04.0</version>
    </parent>
    <artifactId>rapids-4-spark-aggregator_2.12</artifactId>
    <name>RAPIDS Accelerator for Apache Spark Aggregator</name>
    <description>Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark</description>
-    <version>23.02.0</version>
+    <version>23.04.0</version>
- META-INF/maven/**
-
@@ -418,7 +415,7 @@
@@ -428,12 +425,28 @@
run
- reduce-pom-deps-in-the-jar
+ check-pom-dependencies-empty
-
+
+
+
+
+
+
+
@@ -536,14 +549,14 @@
                <groupId>com.nvidia</groupId>
                <artifactId>spark-rapids-jni</artifactId>
                <classifier>${cuda.version}</classifier>
-                <excludes>META-INF</excludes>
+                <excludes>META-INF/**</excludes>
                <outputDirectory>${project.build.directory}/parallel-world</outputDirectory>
                <overWrite>true</overWrite>
            </artifactItem>
            <artifactItem>
                <groupId>org.openucx</groupId>
                <artifactId>jucx</artifactId>
-                <excludes>META-INF</excludes>
+                <excludes>META-INF/**</excludes>
                <outputDirectory>${project.build.directory}/parallel-world</outputDirectory>
                <overWrite>true</overWrite>
diff --git a/dist/unshimmed-common-from-spark311.txt b/dist/unshimmed-common-from-spark311.txt
index 66c4d7a46aa..c1c4e4fdb1f 100644
--- a/dist/unshimmed-common-from-spark311.txt
+++ b/dist/unshimmed-common-from-spark311.txt
@@ -1,7 +1,6 @@
META-INF/DEPENDENCIES
META-INF/LICENSE
META-INF/NOTICE
-META-INF/maven/**
com/nvidia/spark/ExclusiveModeGpuDiscoveryPlugin*
com/nvidia/spark/GpuCachedBatchSerializer*
com/nvidia/spark/ParquetCachedBatchSerializer*
@@ -22,10 +21,12 @@ com/nvidia/spark/rapids/RapidsExecutorUpdateMsg*
com/nvidia/spark/rapids/RapidsShuffleHeartbeatHandler*
com/nvidia/spark/rapids/SQLExecPlugin*
com/nvidia/spark/rapids/ShimLoader*
+com/nvidia/spark/rapids/ShimReflectionUtils*
com/nvidia/spark/rapids/ShimVersion*
com/nvidia/spark/rapids/SparkShimServiceProvider*
com/nvidia/spark/rapids/SparkShimVersion*
com/nvidia/spark/rapids/SparkShims*
+com/nvidia/spark/rapids/optimizer/SQLOptimizerPlugin*
com/nvidia/spark/udf/Plugin*
org/apache/spark/sql/rapids/ExecutionPlanCaptureCallback*
org/apache/spark/sql/rapids/ProxyRapidsShuffleInternalManagerBase*
diff --git a/docs/FAQ.md b/docs/FAQ.md
index 1cf25e96b5f..def8e8b5c8b 100644
--- a/docs/FAQ.md
+++ b/docs/FAQ.md
@@ -10,7 +10,7 @@ nav_order: 12
### What versions of Apache Spark does the RAPIDS Accelerator for Apache Spark support?
-The RAPIDS Accelerator for Apache Spark requires version 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.3.0 or 3.3.1 of
+The RAPIDS Accelerator for Apache Spark requires version 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.3.0, 3.3.1 or 3.3.2 of
Apache Spark. Because the plugin replaces parts of the physical plan that Apache Spark considers to
be internal the code for those plans can change even between bug fix releases. As a part of our
process, we try to stay on top of these changes and release updates as quickly as possible.
@@ -20,7 +20,7 @@ process, we try to stay on top of these changes and release updates as quickly a
The RAPIDS Accelerator for Apache Spark officially supports:
- [Apache Spark](get-started/getting-started-on-prem.md)
- [AWS EMR 6.2+](get-started/getting-started-aws-emr.md)
-- [Databricks Runtime 9.1, 10.4, 11.3](get-started/getting-started-databricks.md)
+- [Databricks Runtime 10.4, 11.3](get-started/getting-started-databricks.md)
- [Google Cloud Dataproc 2.0](get-started/getting-started-gcp.md)
- [Azure Synapse](get-started/getting-started-azure-synapse-analytics.md)
- Cloudera provides the plugin packaged through
@@ -39,7 +39,7 @@ release.
### What hardware is supported?
-The plugin is tested and supported on P100, V100, T4, A2, A10, A30 and A100 datacenter GPUs. It is possible
+The plugin is tested and supported on P100, V100, T4, A2, A10, A30, A100 and L4 datacenter GPUs. It is possible
to run the plugin on GeForce desktop hardware with Volta or better architectures. GeForce hardware
does not support [CUDA forward
compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#forward-compatibility-title),
@@ -403,7 +403,7 @@ There are multiple reasons why this a problematic configuration:
- CUDA context switches between processes sharing a single GPU can be expensive
- Each executor would have a fraction of the GPU memory available for processing
-### Is [Multi-Instance GPU (MIG)](https://docs.nvidia.com/cuda/mig/index.html) supported?
+### Is [Multi-Instance GPU (MIG)](https://www.nvidia.com/en-gb/technologies/multi-instance-gpu/) supported?
Yes, but it requires support from the underlying cluster manager to isolate the MIG GPU instance
for each executor (e.g.: by setting `CUDA_VISIBLE_DEVICES`,
@@ -540,7 +540,7 @@ Below are some troubleshooting tips on GPU query performance issue:
`spark.sql.files.maxPartitionBytes` and `spark.rapids.sql.concurrentGpuTasks` as these configurations can affect performance of queries significantly.
Please refer to [Tuning Guide](./tuning-guide.md) for more details.
-### Why is Avro library not found by RAPIDS?
+### Why is the Avro library not found by RAPIDS?
If you are getting a warning `Avro library not found by the RAPIDS plugin.` or if you are getting the
`java.lang.NoClassDefFoundError: org/apache/spark/sql/v2/avro/AvroScan` error, make sure you ran the
@@ -561,6 +561,51 @@ use the RAPIDS Shuffle Manager, your deployment option may be limited to the ext
Starting from 22.06, the default value for `spark.rapids.memory.gpu.pool` is changed to `ASYNC` from
`ARENA` for CUDA 11.5+. For CUDA 11.4 and older, it will fall back to `ARENA`.
+### What is a `RetryOOM` or `SplitAndRetryOOM` exception?
+
+In the 23.04 release of the accelerator, two new exceptions were added to replace the
+regular `OutOfMemoryError` that was previously thrown when the GPU ran out of memory.
+Originally we used `OutOfMemoryError`, as on the CPU, on the theory that it would help
+trigger GC in case handles pointing to GPU memory were leaked in the JVM heap. But
+`OutOfMemoryError` is technically a fatal exception and recovering from it is not
+strictly supported, so Apache Spark treats it as fatal and kills the process that sees
+it. This can result in a lot of tasks being rerun if the GPU runs out of memory. The new
+exceptions prevent that. They also indicate to GPU operators that the GPU ran out of
+memory and how the operator might be able to recover. `RetryOOM` indicates that the
+operator should roll back to a known good spot and then wait until the memory allocation
+framework decides that it should be retried. `SplitAndRetryOOM` is used when only one
+task is unblocked and the only way to recover is to roll back to a good spot and split
+the input so that less total GPU memory is needed.
+
+These exceptions are not handled by all GPU operations. A number of GPU operations that
+use a significant amount of memory have been updated to handle `RetryOOM`, but fewer
+have been updated to handle `SplitAndRetryOOM`. If you do run into these exceptions
+it is an indication that you are using too much GPU memory, and the tuning guide can
+help you reduce that usage. Be aware that some algorithms, such as window operations
+over very large windows, have no way to split their memory usage. If tuning does not
+fix the problem, please file an issue to help us understand which operators may need
+better out-of-core algorithm support.
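+
+As a sketch only, lowering the configurations below (all documented on the configs page of this
+documentation) is a common first step to reduce GPU memory pressure; the values and application
+path are placeholders, and the [Tuning Guide](./tuning-guide.md) is the authoritative reference.
+
+```bash
+# Sketch: reduce GPU memory pressure by lowering concurrency and reader batch sizes.
+# The values and application path are illustrative placeholders.
+${SPARK_HOME}/bin/spark-submit \
+  --conf spark.plugins=com.nvidia.spark.SQLPlugin \
+  --conf spark.rapids.sql.concurrentGpuTasks=1 \
+  --conf spark.rapids.sql.reader.batchSizeBytes=536870912 \
+  --conf spark.rapids.sql.reader.batchSizeRows=1000000 \
+  my-application.py
+```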
+
+### Encryption Support
+
+The RAPIDS Accelerator for Apache Spark has several components that may or may not honor the
+encryption configurations that Apache Spark provides. The exceptions known at the time of writing
+this FAQ entry are documented below:
+
+Local storage encryption (`spark.io.encryption.enabled`) is not supported for spilled buffers that the
+plugin uses to help with GPU out-of-memory situations. The RAPIDS Shuffle Manager does not implement
+local storage encryption for shuffle blocks when configured for UCX, but it does when configured in
+MULTITHREADED mode.
+
+Network encryption (`spark.network.crypto.enabled`) is not supported in the RAPIDS Shuffle Manager
+when configured for UCX, but it is supported when configured in MULTITHREADED mode.
+
+If your environment has specific encryption requirements for network or IO, please make sure
+that the RAPIDS Accelerator suits your needs, and file an issue or discussion if you have doubts
+or would like expanded encryption support.
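+
+As an illustration only, a deployment that needs Spark network encryption together with the RAPIDS
+Shuffle Manager could select MULTITHREADED mode instead of UCX. The shuffle manager class below
+assumes Spark 3.3.0 (see the table in the RAPIDS Shuffle Manager documentation), and the
+application path is a placeholder.
+
+```bash
+# Sketch: MULTITHREADED shuffle mode works with spark.network.crypto.enabled; UCX mode does not.
+${SPARK_HOME}/bin/spark-submit \
+  --conf spark.plugins=com.nvidia.spark.SQLPlugin \
+  --conf spark.shuffle.manager=com.nvidia.spark.rapids.spark330.RapidsShuffleManager \
+  --conf spark.rapids.shuffle.mode=MULTITHREADED \
+  --conf spark.network.crypto.enabled=true \
+  my-application.py
+```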
+
### I have more questions, where do I go?
We use github to track bugs, feature requests, and answer questions. File an
[issue](https://github.com/NVIDIA/spark-rapids/issues/new/choose) for a bug or feature request. Ask
diff --git a/docs/additional-functionality/delta-lake-support.md b/docs/additional-functionality/delta-lake-support.md
index e55acff1e7f..a0a68587df1 100644
--- a/docs/additional-functionality/delta-lake-support.md
+++ b/docs/additional-functionality/delta-lake-support.md
@@ -50,9 +50,36 @@ operation which is typically triggered via the DataFrame `write` API, e.g.:
Table creation from selection, table insertion from SQL, and table merges are not currently
GPU accelerated. These operations will fallback to the CPU.
-[Automatic optimization](https://docs.databricks.com/optimizations/auto-optimize.html)
-during Delta Lake writes is not supported. Write operations that are configured to
-automatically optimize or automatically compact will fallback to the CPU.
+#### Automatic Optimization of Writes
+
+Delta Lake on Databricks has
+[automatic optimization](https://docs.databricks.com/optimizations/auto-optimize.html)
+features for optimized writes and automatic compaction.
+
+Optimized writes are supported only on Databricks platforms. The algorithm used is similar but
+not identical to the Databricks version. The following table describes configuration settings
+that control the operation of the optimized write.
+
+| Configuration | Default | Description |
+|-------------------------------------------------------------|---------|--------------------------------------------------------------------------------------------|
+| spark.databricks.delta.optimizeWrite.binSize | 512 | Target uncompressed partition size in megabytes |
+| spark.databricks.delta.optimizeWrite.smallPartitionFactor | 0.5 | Merge partitions smaller than this factor multiplied by the target partition size |
+| spark.databricks.delta.optimizeWrite.mergedPartitionFactor | 1.2 | Avoid combining partitions larger than this factor multiplied by the target partition size |
+
+Automatic compaction is supported only on Databricks platforms. The algorithm is similar but
+not identical to the Databricks version. The following table describes configuration settings
+that control the operation of automatic compaction.
+
+| Configuration | Default | Description |
+|---------------------------------------------------------------------|---------|--------------------------------------------------------------------------------------------------------|
+| spark.databricks.delta.autoCompact.enabled | false | Enable/disable auto compaction for writes to Delta directories |
+| spark.databricks.delta.properties.defaults.autoOptimize.autoCompact | false | Whether to enable auto compaction by default, if spark.databricks.delta.autoCompact.enabled is not set |
+| spark.databricks.delta.autoCompact.minNumFiles | 50 | Minimum number of files in the Delta directory before which auto optimize does not begin compaction |
+
+Note that optimized write support requires round-robin partitioning of the data, and round-robin
+partitioning requires sorting across all columns for deterministic operation. If the GPU cannot
+sort a particular column type as needed for the round-robin partitioning, the Delta Lake write
+will fall back to the CPU.
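+
+As a sketch only, the settings above can be supplied like any other Spark configuration, for
+example as `--conf` options or in the Databricks cluster Spark config; the values shown are the
+defaults from the tables, except for enabling auto compaction.
+
+```bash
+# Sketch: tune optimized writes and enable auto compaction (non-default) for Delta Lake writes.
+--conf spark.databricks.delta.optimizeWrite.binSize=512 \
+--conf spark.databricks.delta.optimizeWrite.smallPartitionFactor=0.5 \
+--conf spark.databricks.delta.optimizeWrite.mergedPartitionFactor=1.2 \
+--conf spark.databricks.delta.autoCompact.enabled=true \
+--conf spark.databricks.delta.autoCompact.minNumFiles=50
+```
+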
### RapidsDeltaWrite Node in Query Plans
@@ -98,6 +125,16 @@ spark.rapids.sql.command.DeleteCommandEdge=true on Databricks platforms.
Deleting data from Delta Lake tables via the SQL `DELETE FROM` statement or via the DeltaTable
`delete` API is supported.
+### num_affected_rows Difference with Databricks
+
+The Delta Lake delete command returns a single row result with a `num_affected_rows` column.
+When entire partition files in the table are deleted, the open source Delta Lake and RAPIDS
+Accelerator implementations of delete can return -1 for `num_affected_rows` since it could be
+expensive to open the files and produce an accurate row count. Databricks changed the behavior
+of delete operations that delete entire partition files to return the actual row count.
+This is only a difference in the statistics of the operation, and the table contents will still
+be accurately deleted with the RAPIDS Accelerator.
+
## Update Operations on Delta Lake Tables
Delta Lake update acceleration is experimental and is disabled by default. To enable acceleration
diff --git a/docs/additional-functionality/ml-integration.md b/docs/additional-functionality/ml-integration.md
index 35bd5970b45..bc5ba67bee3 100644
--- a/docs/additional-functionality/ml-integration.md
+++ b/docs/additional-functionality/ml-integration.md
@@ -6,19 +6,53 @@ nav_order: 1
---
# RAPIDS Accelerator for Apache Spark ML Library Integration
-There are cases where you may want to get access to the raw data on the GPU, preferably without
-copying it. One use case for this is exporting the data to an ML framework after doing feature
-extraction. To do this we provide a simple Scala utility `com.nvidia.spark.rapids.ColumnarRdd` that can
-be used to convert a `DataFrame` to an `RDD[ai.rapids.cudf.Table]`. Each `Table` will have the same
-schema as the `DataFrame` passed in.
-
-`Table` is not a typical thing in an `RDD` so special care needs to be taken when working with it.
-By default, it is not serializable so repartitioning the `RDD` or any other operator that involves
-a shuffle will not work. This is because it is relatively expensive to serialize and
+## Existing ML Libraries
+
+The RAPIDS Accelerator for Apache Spark can be used to accelerate the ETL portions (e.g., loading
+training data from parquet files) of applications using ML libraries with Spark DataFrame APIs.
+Examples of such libraries include the original [Apache Spark
+MLlib](https://spark.apache.org/mllib/), [XGBoost](https://xgboost.readthedocs.io/en/stable/),
+[Spark RAPIDS ML](https://nvidia.github.io/spark-rapids-ml/), and the [DL inference UDF
+function](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.functions.predict_batch_udf.html)
+introduced in Spark 3.4. The latter three also enable leveraging GPUs (in the case of the DL
+inference UDF, indirectly via the underlying DL framework) to accelerate the core ML algorithms, and
+thus, in conjunction with the RAPIDS Accelerator for Apache Spark for ETL, can further enhance the
+cost-benefit of GPU accelerated Spark clusters.
+
+For Spark API compatible ML libraries that implement their core ML computations inside pandas UDFs,
+such as XGBoost’s pySpark API, the Spark RAPIDS ML pySpark API, and the DL inference UDF, it is
+recommended to enable the RAPIDS Accelerator for Apache Spark’s [support for GPU accelerated pandas
+UDFs](https://nvidia.github.io/spark-rapids/docs/additional-functionality/rapids-udfs.html#gpu-support-for-pandas-udf).
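+
+As a minimal sketch, this support can be turned on with the configuration below; see the linked
+pandas UDF documentation for the complete and current set of recommended settings.
+
+```bash
+# Sketch: enable GPU-aware scheduling of pandas UDFs (experimental).
+--conf spark.rapids.sql.python.gpu.enabled=true
+```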
+
+### RMM
+
+One consideration when using the RAPIDS Accelerator for Apache Spark with a GPU accelerated ML
+library is the sharing of GPU memory between the two, as the ML library would typically have a
+distinct GPU memory manager from the RAPIDS Accelerator’s RMM instance. Accordingly, you may need
+to disable RMM pooling in the RAPIDS Accelerator via the config `spark.rapids.memory.gpu.pool` when
+exporting data to an ML library since that library will likely not have access to any of the memory
+that the RAPIDS Accelerator’s RMM instance is holding. Similarly, aggressive GPU memory reservation
+on the side of the ML library may also need to be disabled, for example via these steps in the case of
+[Tensorflow](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth).
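+
+As a minimal sketch, assuming the ML library manages its own GPU memory, RMM pooling can be
+disabled as follows; whether this is needed depends on the library and workload.
+
+```bash
+# Sketch: stop the RAPIDS Accelerator from pooling GPU memory so the ML library can allocate it.
+--conf spark.rapids.memory.gpu.pool=NONE
+```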
+
+## GPU accelerated ML Library development
+
+### ColumnarRdd
+
+When developing a GPU accelerated ML library for Spark, there are cases where you may want to get
+access to the raw data on the GPU, preferably without copying it. One use case for this is exporting
+the data to the ML library after doing feature extraction. To enable this for Scala development, the
+RAPIDS Accelerator for Apache Spark provides a simple utility `com.nvidia.spark.rapids.ColumnarRdd`
+that can be used to convert a `DataFrame` to an `RDD[ai.rapids.cudf.Table]`. Each `Table` will have
+the same schema as the `DataFrame` passed in.
+
+Note that `Table` is not a typical thing in an `RDD` so special care needs to be taken when working
+with it. By default, it is not serializable so repartitioning the `RDD` or any other operator that
+involves a shuffle will not work. This is because it is relatively expensive to serialize and
deserialize GPU data using a conventional Spark shuffle. In addition, most of the memory associated
with the `Table` is on the GPU itself. So, each `Table` must be closed when it is no longer needed
-to avoid running out of GPU memory. By convention, it is the responsibility of the one consuming
-the data to close it when they no longer need it.
+to avoid running out of GPU memory. By convention, it is the responsibility of the one consuming the
+data to close it when they no longer need it.
```scala
val df = spark.sql("""select my_column from my_table""")
@@ -32,17 +66,13 @@ val maxValue = rdd.map(table => {
}).max()
```
-## RMM
-You may need to disable RMM caching when exporting data to an ML library as that library
-will likely want to use all of the GPU's memory and if it is not aware of RMM it will not have
-access to any of the memory that RMM is holding.
-
-## Spark ML Algorithms Supported by RAPIDS Accelerator
+### Examples of Spark ML Implementations leveraging ColumnarRdd
-The [spark-rapids-examples repository](https://github.com/NVIDIA/spark-rapids-examples) provides a
-[working example](https://github.com/NVIDIA/spark-rapids-examples/tree/main/examples/ML+DL-Examples/Spark-cuML/pca)
-of accelerating the `transform` API for
-[Principal Component Analysis (PCA)](https://spark.apache.org/docs/latest/mllib-dimensionality-reduction#principal-component-analysis-pca).
-The example leverages the [RAPIDS accelerated UDF interface](rapids-udfs.md) to provide a native
-implementation of the algorithm. The details of the UDF implementation can be found in the
-[spark-rapids-ml repository](https://github.com/NVIDIA/spark-rapids-ml).
+Both the Scala Spark PCA
+[implementation](https://github.com/NVIDIA/spark-rapids-ml/blob/ab575bc46e55f38ee52906b3c3b55b75f2418459/jvm/src/main/scala/org/apache/spark/ml/linalg/distributed/RapidsRowMatrix.scala)
+in Spark RAPIDS ML and XGBoost’s [GPU accelerated Scala
+SparkAPI](https://github.com/dmlc/xgboost/blob/f1e9bbcee52159d4bd5f7d25ef539777ceac147c/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala)
+leverage ColumnarRdd (search for ColumnarRdd in these files) to accelerate data transfer between the
+RAPIDS Accelerator for Apache Spark and the respective core ML algorithm computations. XGBoost in
+particular enables this when detecting that the RAPIDS Accelerator for Apache Spark is present and
+enabled.
diff --git a/docs/additional-functionality/rapids-shuffle.md b/docs/additional-functionality/rapids-shuffle.md
index 6924ae415fe..046a850e2e2 100644
--- a/docs/additional-functionality/rapids-shuffle.md
+++ b/docs/additional-functionality/rapids-shuffle.md
@@ -26,13 +26,13 @@ in our plugin:
| 3.2.3 | com.nvidia.spark.rapids.spark323.RapidsShuffleManager |
| 3.3.0 | com.nvidia.spark.rapids.spark330.RapidsShuffleManager |
| 3.3.1 | com.nvidia.spark.rapids.spark331.RapidsShuffleManager |
-| Databricks 9.1 | com.nvidia.spark.rapids.spark312db.RapidsShuffleManager |
+| 3.3.2 | com.nvidia.spark.rapids.spark332.RapidsShuffleManager |
| Databricks 10.4 | com.nvidia.spark.rapids.spark321db.RapidsShuffleManager |
| Databricks 11.3 | com.nvidia.spark.rapids.spark330db.RapidsShuffleManager |
## Multi-Threaded Mode
-Mult-threaded mode (default) is similar to the built-in Spark shuffle, but it attempts to use
+Multi-threaded mode (default) is similar to the built-in Spark shuffle, but it attempts to use
more CPU threads for compute-intensive tasks, such as compression and decompression.
Minimum configuration:
@@ -93,8 +93,8 @@ In order to enable the RAPIDS Shuffle Manager, UCX user-space libraries and its
be installed on the host and inside Docker containers (if not baremetal). A host has additional
requirements, like the MLNX_OFED driver and `nv_peer_mem` kernel module.
-The minimum UCX requirement for the RAPIDS Shuffle Manager is
-[UCX 1.12.1](https://github.com/openucx/ucx/releases/tag/v1.12.1).
+The required UCX version for the RAPIDS Shuffle Manager is
+[UCX 1.12.1](https://github.com/openucx/ucx/releases/tag/v1.12.1). Versions higher than 1.12.1 have not been tested.
#### Baremetal
@@ -330,7 +330,7 @@ In this section, we are using a docker container built using the sample dockerfi
1. Choose the version of the shuffle manager that matches your Spark version. Please refer to
the table at the top of this document for `spark.shuffle.manager` values.
-2. Settings for UCX 1.12.1+:
+2. Settings for UCX 1.12.1:
Minimum configuration:
@@ -392,10 +392,10 @@ Save the script in DBFS and add it to the "Init Scripts" list:
2) Add the UCX minimum configuration for your Cluster.
-Databricks 9.1:
+Databricks 10.4:
```
-spark.shuffle.manager com.nvidia.spark.rapids.spark312db.RapidsShuffleManager
+spark.shuffle.manager com.nvidia.spark.rapids.spark321db.RapidsShuffleManager
spark.rapids.shuffle.mode UCX
spark.shuffle.service.enabled false
spark.executorEnv.UCX_MEMTYPE_CACHE n
diff --git a/docs/archive.md b/docs/archive.md
index 37b5c676f9d..1ac82e30bba 100644
--- a/docs/archive.md
+++ b/docs/archive.md
@@ -5,6 +5,66 @@ nav_order: 15
---
Below are archived releases for RAPIDS Accelerator for Apache Spark.
+## Release v23.02.0
+Hardware Requirements:
+
+The plugin is tested on the following architectures:
+
+ GPU Models: NVIDIA P100, V100, T4 and A2/A10/A30/A100 GPUs
+
+Software Requirements:
+
+ OS: Ubuntu 18.04, Ubuntu 20.04 or CentOS 7, Rocky Linux 8
+
+ CUDA & NVIDIA Drivers*: 11.x & v450.80.02+
+
+ Apache Spark 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.3.0, 3.3.1, Databricks 10.4 ML LTS or 11.3 ML LTS Runtime and GCP Dataproc 2.0
+
+ Python 3.6+, Scala 2.12, Java 8
+
+*Some hardware may have a minimum driver version greater than v450.80.02+. Check the GPU spec sheet
+for your hardware's minimum driver version.
+
+*For Cloudera and EMR support, please refer to the
+[Distributions](./FAQ.md#which-distributions-are-supported) section of the FAQ.
+
+### Download v23.02.0
+* Download the [RAPIDS
+ Accelerator for Apache Spark 23.02.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar)
+
+This package is built against CUDA 11.8 and all CUDA 11.x versions are supported through [CUDA forward
+compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/index.html). It is tested
+on V100, T4, A2, A10, A30 and A100 GPUs with CUDA 11.0-11.5. For those using other types of GPUs which
+do not have CUDA forward compatibility (for example, GeForce), CUDA 11.5 or later is required. Users will
+need to ensure the minimum driver (450.80.02) and CUDA toolkit are installed on each Spark node.
+
+### Verify signature
+* Download the [RAPIDS Accelerator for Apache Spark 23.02.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar)
+ and [RAPIDS Accelerator for Apache Spark 23.02.0 jars.asc](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar.asc)
+* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com).
+* Import the public key: `gpg --import PUB_KEY`
+* Verify the signature: `gpg --verify rapids-4-spark_2.12-23.02.0.jar.asc rapids-4-spark_2.12-23.02.0.jar`
+
+The output of a successful signature verification is:
+
+ gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) "
+
+### Release Notes
+New functionality and performance improvements for this release include:
+* Delta Lake MERGE/DELETE/UPDATE (experimental feature, can be enabled with a config flag)
+* Function `from_json`
+* Hive text table write
+* Databricks 11.3 ML LTS support
+* Support batched full join to improve full join's performance
+* Qualification and Profiling tool:
+ * EMR user tools support for qualification
+ * EMR user tools support for bootstrap
+ * Updated estimated speedup factors for on-prem, Dataproc, and EMR environments for qualification
+
+
+For a detailed list of changes, please refer to the
+[CHANGELOG](https://github.com/NVIDIA/spark-rapids/blob/main/CHANGELOG.md).
+
## Release v22.12.0
Hardware Requirements:
diff --git a/docs/compatibility.md b/docs/compatibility.md
index 31d084a9a46..638d1c6be91 100644
--- a/docs/compatibility.md
+++ b/docs/compatibility.md
@@ -296,38 +296,21 @@ The JSON format read is a very experimental feature which is expected to have so
it by default. If you would like to test it, you need to enable `spark.rapids.sql.format.json.enabled` and
`spark.rapids.sql.format.json.read.enabled`.
-Currently, the GPU accelerated JSON reader doesn't support column pruning, which will likely make
-this difficult to use or even test. The user must specify the full schema or just let Spark infer
-the schema from the JSON file. eg,
-
-We have a `people.json` file with below content
-
+Reading input containing invalid JSON format (in any row) will throw a runtime exception.
+An example of valid input is as follows:
``` console
-{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}
```
-Both below ways will work
-
-- Inferring the schema
-
- ``` scala
- val df = spark.read.json("people.json")
- ```
-
-- Specifying the full schema
-
- ``` scala
- val schema = StructType(Seq(StructField("name", StringType), StructField("age", IntegerType)))
- val df = spark.read.schema(schema).json("people.json")
- ```
-
-While the below code will not work in the current version,
+The following inputs are invalid and will cause errors:
+```console
+{"name":"Andy", "age":30} ,,,,
+{"name":"Justin", "age":19}
+```
-``` scala
-val schema = StructType(Seq(StructField("name", StringType)))
-val df = spark.read.schema(schema).json("people.json")
+```console
+{"name": Justin", "age":19}
```
### JSON supporting types
@@ -344,7 +327,6 @@ Due to such limitations, the input JSON schema must be `MAP` and
```
scala> val df = Seq("{}", "BAD", "{\"A\": 100}").toDF
df: org.apache.spark.sql.DataFrame = [value: string]
-
scala> df.selectExpr("from_json(value, 'MAP')").show()
+----------+
| entries|
diff --git a/docs/configs.md b/docs/configs.md
index 9105b31d8a4..95484b96cb9 100644
--- a/docs/configs.md
+++ b/docs/configs.md
@@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports.
On startup use: `--conf [conf key]=[conf value]`. For example:
```
-${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-23.02.0-cuda11.jar \
+${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-23.04.0-cuda11.jar \
--conf spark.plugins=com.nvidia.spark.SQLPlugin \
--conf spark.rapids.sql.concurrentGpuTasks=2
```
@@ -31,9 +31,12 @@ scala> spark.conf.set("spark.rapids.sql.concurrentGpuTasks", 2)
Name | Description | Default Value | Applicable at
-----|-------------|--------------|--------------
-spark.rapids.alluxio.automount.enabled|Enable the feature of auto mounting the cloud storage to Alluxio. It requires the Alluxio master is the same node of Spark driver node. When it's true, it requires an environment variable ALLUXIO_HOME be set properly. The default value of ALLUXIO_HOME is "/opt/alluxio-2.8.0". You can set it as an environment variable when running a spark-submit or you can use spark.yarn.appMasterEnv.ALLUXIO_HOME to set it on Yarn. The Alluxio master's host and port will be read from alluxio.master.hostname and alluxio.master.rpc.port(default: 19998) from ALLUXIO_HOME/conf/alluxio-site.properties, then replace a cloud path which matches spark.rapids.alluxio.bucket.regex like "s3://bar/b.csv" to "alluxio://0.1.2.3:19998/bar/b.csv", and the bucket "s3://bar" will be mounted to "/bar" in Alluxio automatically.|false|Runtime
+spark.rapids.alluxio.automount.enabled|Enable the feature of auto mounting the cloud storage to Alluxio. It requires the Alluxio master is the same node of Spark driver node. The Alluxio master's host and port will be read from alluxio.master.hostname and alluxio.master.rpc.port(default: 19998) from ALLUXIO_HOME/conf/alluxio-site.properties, then replace a cloud path which matches spark.rapids.alluxio.bucket.regex like "s3://bar/b.csv" to "alluxio://0.1.2.3:19998/bar/b.csv", and the bucket "s3://bar" will be mounted to "/bar" in Alluxio automatically.|false|Runtime
spark.rapids.alluxio.bucket.regex|A regex to decide which bucket should be auto-mounted to Alluxio. E.g. when setting as "^s3://bucket.*", the bucket which starts with "s3://bucket" will be mounted to Alluxio and the path "s3://bucket-foo/a.csv" will be replaced to "alluxio://0.1.2.3:19998/bucket-foo/a.csv". It's only valid when setting spark.rapids.alluxio.automount.enabled=true. The default value matches all the buckets in "s3://" or "s3a://" scheme.|^s3a{0,1}://.*|Runtime
+spark.rapids.alluxio.home|The Alluxio installation home path or link to the installation home path. |/opt/alluxio|Startup
spark.rapids.alluxio.large.file.threshold|The threshold is used to identify whether average size of files is large when reading from S3. If reading large files from S3 and the disks used by Alluxio are slow, directly reading from S3 is better than reading caches from Alluxio, because S3 network bandwidth is faster than local disk. This improvement takes effect when spark.rapids.alluxio.slow.disk is enabled.|67108864|Runtime
+spark.rapids.alluxio.master|The Alluxio master hostname. If not set, read Alluxio master URL from spark.rapids.alluxio.home locally. This config is useful when Alluxio master and Spark driver are not co-located.||Startup
+spark.rapids.alluxio.master.port|The Alluxio master port. If not set, read Alluxio master port from spark.rapids.alluxio.home locally. This config is useful when Alluxio master and Spark driver are not co-located.|19998|Startup
spark.rapids.alluxio.pathsToReplace|List of paths to be replaced with corresponding Alluxio scheme. E.g. when configure is set to "s3://foo->alluxio://0.1.2.3:19998/foo,gs://bar->alluxio://0.1.2.3:19998/bar", it means: "s3://foo/a.csv" will be replaced to "alluxio://0.1.2.3:19998/foo/a.csv" and "gs://bar/b.csv" will be replaced to "alluxio://0.1.2.3:19998/bar/b.csv". To use this config, you have to mount the buckets to Alluxio by yourself. If you set this config, spark.rapids.alluxio.automount.enabled won't be valid.|None|Startup
spark.rapids.alluxio.replacement.algo|The algorithm used when replacing the UFS path with the Alluxio path. CONVERT_TIME and TASK_TIME are the valid options. CONVERT_TIME indicates that we do it when we convert it to a GPU file read, this has extra overhead of creating an entirely new file index, which requires listing the files and getting all new file info from Alluxio. TASK_TIME replaces the path as late as possible inside of the task. By waiting and replacing it at task time, it just replaces the path without fetching the file information again, this is faster but doesn't update locality information if that has a bit impact on performance.|TASK_TIME|Runtime
spark.rapids.alluxio.slow.disk|Indicates whether the disks used by Alluxio are slow. If it's true and reading S3 large files, Rapids Accelerator reads from S3 directly instead of reading from Alluxio caches. Refer to spark.rapids.alluxio.large.file.threshold which defines a threshold that identifying whether files are large. Typically, it's slow disks if speed is less than 300M/second. If using convert time spark.rapids.alluxio.replacement.algo, this may not apply to all file types like Delta files|true|Runtime
@@ -50,6 +53,7 @@ Name | Description | Default Value | Applicable at
spark.rapids.memory.gpu.pool|Select the RMM pooling allocator to use. Valid values are "DEFAULT", "ARENA", "ASYNC", and "NONE". With "DEFAULT", the RMM pool allocator is used; with "ARENA", the RMM arena allocator is used; with "ASYNC", the new CUDA stream-ordered memory allocator in CUDA 11.2+ is used. If set to "NONE", pooling is disabled and RMM just passes through to CUDA memory allocation directly.|ASYNC|Startup
spark.rapids.memory.gpu.pooling.enabled|Should RMM act as a pooling allocator for GPU memory, or should it just pass through to CUDA memory allocation directly. DEPRECATED: please use spark.rapids.memory.gpu.pool instead.|true|Startup
spark.rapids.memory.gpu.reserve|The amount of GPU memory that should remain unallocated by RMM and left for system use such as memory needed for kernels and kernel launches.|671088640|Startup
+spark.rapids.memory.gpu.state.debug|To better recover from out of memory errors, RMM will track several states for the threads that interact with the GPU. This provides a log of those state transitions to aid in debugging it. STDOUT or STDERR will have the logging go there; an empty string will disable logging; and anything else will be treated as a file to write the logs to.||Startup
spark.rapids.memory.gpu.unspill.enabled|When a spilled GPU buffer is needed again, should it be unspilled, or only copied back into GPU memory temporarily. Unspilling may be useful for GPU buffers that are needed frequently, for example, broadcast variables; however, it may also increase GPU memory usage|false|Startup
spark.rapids.memory.host.pageablePool.size|The size of the pageable memory pool in bytes unless otherwise specified. Use 0 to disable the pool.|1073741824|Startup
spark.rapids.memory.host.spillStorageSize|Amount of off-heap host memory to use for buffering spilled GPU data before spilling to local disk. Use -1 to set the amount to the combined size of pinned and pageable memory pools.|-1|Startup
@@ -146,6 +150,7 @@ Name | Description | Default Value | Applicable at
spark.rapids.sql.metrics.level|GPU plans can produce a lot more metrics than CPU plans do. In very large queries this can sometimes result in going over the max result size limit for the driver. Supported values include DEBUG which will enable all metrics supported and typically only needs to be enabled when debugging the plugin. MODERATE which should output enough metrics to understand how long each part of the query is taking and how much data is going to each part of the query. ESSENTIAL which disables most metrics except those Apache Spark CPU plans will also report or their equivalents.|MODERATE|Runtime
spark.rapids.sql.mode|Set the mode for the Rapids Accelerator. The supported modes are explainOnly and executeOnGPU. This config can not be changed at runtime, you must restart the application for it to take affect. The default mode is executeOnGPU, which means the RAPIDS Accelerator plugin convert the Spark operations and execute them on the GPU when possible. The explainOnly mode allows running queries on the CPU and the RAPIDS Accelerator will evaluate the queries as if it was going to run on the GPU. The explanations of what would have run on the GPU and why are output in log messages. When using explainOnly mode, the default explain output is ALL, this can be changed by setting spark.rapids.sql.explain. See that config for more details.|executeongpu|Startup
spark.rapids.sql.multiThreadedRead.numThreads|The maximum number of threads on each executor to use for reading small files in parallel. This can not be changed at runtime after the executor has started. Used with COALESCING and MULTITHREADED readers, see spark.rapids.sql.format.parquet.reader.type, spark.rapids.sql.format.orc.reader.type, or spark.rapids.sql.format.avro.reader.type for a discussion of reader types. If it is not set explicitly and spark.executor.cores is set, it will be tried to assign value of `max(MULTITHREAD_READ_NUM_THREADS_DEFAULT, spark.executor.cores)`, where MULTITHREAD_READ_NUM_THREADS_DEFAULT = 20.|20|Startup
+spark.rapids.sql.optimizer.joinReorder.enabled|When enabled, joins may be reordered for improved query performance|true|Runtime
spark.rapids.sql.python.gpu.enabled|This is an experimental feature and is likely to change in the future. Enable (true) or disable (false) support for scheduling Python Pandas UDFs with GPU resources. When enabled, pandas UDFs are assumed to share the same GPU that the RAPIDs accelerator uses and will honor the python GPU configs|false|Runtime
spark.rapids.sql.reader.batchSizeBytes|Soft limit on the maximum number of bytes the reader reads per batch. The readers will read chunks of data until this limit is met or exceeded. Note that the reader may estimate the number of bytes that will be used on the GPU in some cases based on the schema and number of rows in each batch.|2147483647|Runtime
spark.rapids.sql.reader.batchSizeRows|Soft limit on the maximum number of rows the reader will read per batch. The orc and parquet readers will read row groups until this limit is met or exceeded. The limit is respected by the csv reader.|2147483647|Runtime
diff --git a/docs/demo/Databricks/generate-init-script.ipynb b/docs/demo/Databricks/generate-init-script.ipynb
index bbb6809cc98..4c8ba857469 100644
--- a/docs/demo/Databricks/generate-init-script.ipynb
+++ b/docs/demo/Databricks/generate-init-script.ipynb
@@ -3,7 +3,7 @@
{
"cell_type":"code",
"source":[
- "dbutils.fs.mkdirs(\"dbfs:/databricks/init_scripts/\")\n \ndbutils.fs.put(\"/databricks/init_scripts/init.sh\",\"\"\"\n#!/bin/bash\nsudo wget -O /databricks/jars/rapids-4-spark_2.12-23.02.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar\n\"\"\", True)"
+ "dbutils.fs.mkdirs(\"dbfs:/databricks/init_scripts/\")\n \ndbutils.fs.put(\"/databricks/init_scripts/init.sh\",\"\"\"\n#!/bin/bash\nsudo wget -O /databricks/jars/rapids-4-spark_2.12-23.04.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.04.0/rapids-4-spark_2.12-23.04.0.jar\n\"\"\", True)"
],
"metadata":{
diff --git a/docs/dev/shimplify.md b/docs/dev/shimplify.md
new file mode 100644
index 00000000000..9f1ca589c84
--- /dev/null
+++ b/docs/dev/shimplify.md
@@ -0,0 +1,266 @@
+---
+layout: page
+title: Shim Source Code Layout Simplification with Shimplify
+nav_order: 8
+parent: Developer Overview
+---
+
+# Shim Source Code Layout Simplification with Shimplify
+
+This document describes the next iteration of shim source code maintenance. It addresses a
+drawback introduced with the [shim layer rework][1], which guaranteed ABI-compatible bytecode
+for each of the 14 currently supported Spark builds but at the expense of maintaining 50+
+directories. Many shims are spread over an overlapping set of directories, making it hard to
+determine where to make additions while keeping code duplication in check.
+
+[shimplify.py][2] is the new goal in the Maven build, bound to the `generate-sources` phase.
+
+* It defines a new, simpler shim directory structure with only a single directory per shim, and a
+special comment injected into each source file records the metadata of all the shim builds the
+file participates in.
+* It can convert all or a subset of existing shims to the new structure. The build can support
+partially converted shims if a longer transition is desired.
+
+## Simplified Shim Source Directory Structure
+
+In our build each supported Apache Spark build and its corresponding shim is identified by its
+[`buildver`][3] property. Every Maven submodule requiring shimming (`sql-plugin`, `tests` as of the
+time of this writing) has a new set of special sibling directories
+`src/(main|test)/spark${buildver}`.
+
+Previous `src/(main|test)/${buildver}` and
+version-range-with-exceptions directories such as `src/main/311until340-non330db` are deprecated and
+will be removed soon as a result of the conversion to the new structure.
+
+`shimplify` changes the way the source code is shared among shims by using an explicit
+lexicographically sorted list of `buildver` property values
+in a source-code level comment instead of the shared directories.
+
+```scala
+/*** spark-rapids-shim-json-lines
+{"spark": "312"}
+{"spark": "323"}
+spark-rapids-shim-json-lines ***/
+```
+
+The content between the `spark-rapids-shim-json-lines` tags is in the [JSON Lines][4] format, where
+each line is an extensible object whose shim metadata currently consists of just the Spark build
+dependency version. The top object in the comment, i.e. the minimum version, intuitively
+represents the first version of Spark requiring shimming in the plugin, although it might not be
+the original one because support for older Spark releases is eventually dropped. This `buildver`
+is called the *owner shim*.
+
+On the default, read-only invocation path of the Maven build, shimplify does not make any changes
+to shim source code files or their locations.
+
+* It analyzes the pre-shimplify directory structure and identifies shims that, through code
+evolution, ended up using more dedicated directories than necessary, adding avoidable complexity
+on top of an already inherently complex directory structure. Here is an example of such a warning:
+
+```text
+shimplify - WARNING - Consider consolidating 312db, it spans multiple dedicated directories ['/home/user/gits/NVIDIA/spark-rapids/sql-plugin/src/main/312db/scala', '/home/user/gits/NVIDIA/spark-rapids/sql-plugin/src/main/31xdb/scala']
+```
+
+* For the shimplify directory structure, all files under the `src/(main|test)/spark*` directories
+are read to parse the `spark-rapids-shim-json-lines` comments. The following validations are
+performed:
+
+ * It makes sure that the comment is present and can be parsed.
+ * The list of shims is non-empty (i.e., it has not been orphaned by dropping shims) and sorted.
+ * The file is stored under the *owner shim* directory.
+
+* All files whose comment lists the `buildver` of the current Maven build session are symlinked to
+`target/${buildver}/generated/src/(main|test)/(scala|java)`. Thus, instead of hardcoding a distinct
+list of directories for the `build-helper` Maven plugin to add for each shim, after the full
+transition to shimplify the pom will have only four add-source statements, independent of the
+number of supported shims.
+
+With the shimplify format in place it is easy to review all the files for a single shim without
+relying on Maven:
+
+```bash
+git grep '{"spark": "323"}' '*.scala' '*.java'
+```
+
+## Conversion to the Shimplify-based Directory Structure
+
+Shimplify can automatically convert the prior version-range-with-exceptions directory structure to
+the simplified version. This allows the transition to be made atomically, without having to resolve
+the almost unavoidable merge conflicts that such a sweeping change would otherwise cause while shim
+development is ongoing. Converting the shim source code and changing the regular build should not
+be done simultaneously, so that potential bugs in the conversion code can be isolated and corrected
+more quickly.
+
+Prior to invoking the conversion standalone, you first run
+
+```bash
+mvn clean install -DskipTests
+```
+
+on the current state of the `spark-rapids` repo.
+
+After that you can execute the conversion in one or more iterations, depending on the specified `-D` parameters:
+
+```bash
+mvn generate-sources -Dshimplify=true [-D...]
+```
+
+With `-Dshimplify=true`, shimplify is put on the write call path to generate and inject
+spark-rapids-shim-json-lines comments into all shim source files. The files are not yet moved to
+their owner shim directory, so it is easy to verify the injected comments with `git diff`. If you
+see any issue you can fix it and re-execute the command with `-Dshimplify.overwrite=true` added.
+However, it is usually easier to just have git restore the previous state:
+
+```bash
+git restore sql-plugin tests
+```
+
+Once the shim comments look good (as expected, since the injection has been tested), you can repeat
+the command and actually move the files to their designated locations by invoking
+
+```bash
+mvn generate-sources -Dshimplify=true -Dshimplify.move=true
+```
+
+Now you can run a package build with the simplified directory structure and run a few integration
+tests, preferably in the standalone test mode with the RAPIDS Shuffle Manager enabled for increased
+coverage:
+
+```bash
+mvn clean package -DskipTests -Dbuildver=331
+SPARK_HOME=~/dist/spark-3.3.1-bin-hadoop3 \
+ NUM_LOCAL_EXECS=2 \
+ PYSP_TEST_spark_rapids_shuffle_mode=MULTITHREADED \
+ PYSP_TEST_spark_rapids_shuffle_multiThreaded_writer_threads=2 \
+ PYSP_TEST_spark_rapids_shuffle_multiThreaded_reader_threads=2 \
+ PYSP_TEST_spark_shuffle_manager=com.nvidia.spark.rapids.spark331.RapidsShuffleManager \
+ PYSP_TEST_spark_rapids_memory_gpu_minAllocFraction=0 \
+ PYSP_TEST_spark_rapids_memory_gpu_maxAllocFraction=0.1 \
+ PYSP_TEST_spark_rapids_memory_gpu_allocFraction=0.1 \
+ ./integration_tests/run_pyspark_from_build.sh -k test_hash_grpby_sum
+```
+
+If smoke testing does not reveal any issues, proceed to committing the change. If there are issues,
+you can undo the changes with
+
+```bash
+git restore --staged sql-plugin tests
+git restore sql-plugin tests
+```
+
+and by reviewing and removing the new directories with
+
+```bash
+git clean -f -d --dry-run
+```
+
+### Partial Conversion
+
+Although it is not expected to be necessary very often, it is possible to convert only a subset of
+the shims:
+
+* either by adding `-Dshimplify.shims=buildver1,buildver2,...` to the commands above,
+* or by specifying a list of directories you would like to eliminate in favor of the simpler
+structure, e.g. `-Dshimplify.dirs=311until340-non330db,320until330-noncdh`.
+
+The latter is just a minor twist on the former. Instead of taking an explicit list of shims, it
+first computes the list of all `buildver` values covered by the provided directories. After this,
+*all* files for those shims are converted, not just the files under the specified directories.
+
+In both cases the conversion may also touch shims outside the list when they share common files
+with a specified shim. However, it is guaranteed to leave the previous dedicated files under
+`src/(main|test)/${buildver}` in place for shims outside the list. This is useful when developers
+of a certain shim would like to continue working on it without adopting the new method. However,
+for the simplicity of future refactoring the full transition is preferred.
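+
+As an illustration only (the `buildver` values here are placeholders), a partial conversion could
+be invoked as:
+
+```bash
+# Sketch: convert only the shims listed in -Dshimplify.shims.
+mvn generate-sources -Dshimplify=true -Dshimplify.move=true \
+    -Dshimplify.shims=320,321
+```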
+
+### Evolving shims without automatic conversion
+
+Suppose a bulk conversion of the existing shims is not an option, but the next shimming issue would
+require a difficult refactoring of version ranges and the addition of more exception directories.
+This can now be resolved easily by placing just the affected files into their owner shim
+directories and adding the shim JSON lines comments by hand.
+
+## Adding a new shim
+
+Shimplify can clone an existing shim as the basis for a new shim. For example, when adding
+support for a new [maintenance][5] version of Spark, say 3.2.4, it is expected to be similar to 3.2.3.
+
+If just 3.2.3, or all shims after the full transition, have already been converted, you can execute
+
+```bash
+mvn generate-sources -Dshimplify=true \
+ -Dshimplify.move=true -Dshimplify.overwrite=true \
+ -Dshimplify.add.shim=324 -Dshimplify.add.base=323
+```
+
+to clone 323 as 324. This will add `{"spark": "324"}` to every shared file constituting the 323
+shim. Moreover, for each file dedicated to 323 it will
+
+* create a copy of the file under the spark324 shim directory,
+* substitute spark324 for spark323 in the package name and path,
+* and change the comment from `{"spark": "323"}` to `{"spark": "324"}`.
+
+Review the new repo state, e.g., using `git grep '{"spark": "324"}'`.
+Besides having to add the `release324` profile to various pom.xml files as before, this alone is
+unlikely to be sufficient to complete the work on 324. You should expect to resolve any remaining
+compilation failures manually.
+
+## Deleting a Shim
+
+Every Spark build is eventually de-supported. To drop a build, say 311, you can run
+
+```bash
+mvn generate-sources -Dshimplify=true -Dshimplify.move=true \
+ -Dshimplify.remove.shim=311
+```
+
+This command will remove the comment line `{"spark": "311"}` from all source files contributing to
+the 311 shim. If a file belongs exclusively to 311 it will be removed.
+
+After adding or deleting shims you should sanity-check the diff in the local git repo and
+run the integration tests above.
+
+## Symlinks & IDE
+
+IDEs may or may not reveal whether a file is accessed via a symlink. IntelliJ IDEA treats the
+original file path and a path via a symlink to the same file as two independent files by default.
+
+In the context of shimplify, only the generated symlink path is part of the project, because the
+owner shim path is not `add-source`d during the build and therefore not during IDEA Project
+Import. You can install the [Resolve Symlinks][6] plugin to prevent IDEA from opening multiple
+windows for the same physical source file. As of the time of this writing it works seamlessly,
+except when the file is opened via the debugger, either on a breakpoint hit or by subsequently
+clicking on the affected stack frame, in which case you will see an extra editor tab being added.
+
+Whether or not you use the [Resolve Symlinks][6] plugin, IDEA is able to handle a breakpoint set
+directly via either the original physical file or a symlink path.
+
+## Reducing Code Duplication
+
+You can help reduce code complexity by consolidating copy-and-pasted shim code that accumulated
+because it was hard to fit into the less flexible shim inheritance hierarchy based on versions
+with exceptions.
+
+You can use the CPD tool that is integrated into our Maven build to find duplicate code in the
+shim code and in the regular code base. It is not ready for automation and has to be invoked
+manually, separately for Java and Scala, e.g.:
+
+```bash
+mvn antrun:run@duplicate-code-detector \
+ -Dcpd.argLine='--minimum-tokens 50 --language scala --skip-blocks-pattern /*|*/' \
+ -Dcpd.sourceType='main' \
+ > target/cpd.scala.txt
+```
+
+Delete duplicate methods and move a single copy into an object such as `SomethingShim` and annotate
+its file with the list of buildvers.
+
+See [CPD user doc][7] for more details about the options you can pass inside `cpd.argLine`.
+
+[1]: https://github.com/NVIDIA/spark-rapids/issues/3223
+[2]: https://github.com/NVIDIA/spark-rapids/blob/b7b1a5d544b6a3ac35ed064b5c32ee0d63c78845/build/shimplify.py#L15-L79
+[3]: https://github.com/NVIDIA/spark-rapids/blob/74ce729ca1306db01359e68f7f0b7cc31cd3d850/pom.xml#L494-L500
+[4]: https://jsonlines.org/
+[5]: https://spark.apache.org/versioning-policy.html
+[6]: https://plugins.jetbrains.com/plugin/16429-idea-resolve-symlinks
+[7]: https://docs.pmd-code.org/latest/pmd_userdocs_cpd.html
diff --git a/docs/dev/shims.md b/docs/dev/shims.md
index 00125f7a9c3..2d4dcdc25ac 100644
--- a/docs/dev/shims.md
+++ b/docs/dev/shims.md
@@ -68,17 +68,17 @@ Using JarURLConnection URLs we create a Parallel World of the current version wi
Spark 3.0.2's URLs:
```text
-jar:file:/home/spark/rapids-4-spark_2.12-23.02.0.jar!/
-jar:file:/home/spark/rapids-4-spark_2.12-23.02.0.jar!/spark3xx-common/
-jar:file:/home/spark/rapids-4-spark_2.12-23.02.0.jar!/spark302/
+jar:file:/home/spark/rapids-4-spark_2.12-23.04.0.jar!/
+jar:file:/home/spark/rapids-4-spark_2.12-23.04.0.jar!/spark3xx-common/
+jar:file:/home/spark/rapids-4-spark_2.12-23.04.0.jar!/spark302/
```
Spark 3.2.0's URLs :
```text
-jar:file:/home/spark/rapids-4-spark_2.12-23.02.0.jar!/
-jar:file:/home/spark/rapids-4-spark_2.12-23.02.0.jar!/spark3xx-common/
-jar:file:/home/spark/rapids-4-spark_2.12-23.02.0.jar!/spark320/
+jar:file:/home/spark/rapids-4-spark_2.12-23.04.0.jar!/
+jar:file:/home/spark/rapids-4-spark_2.12-23.04.0.jar!/spark3xx-common/
+jar:file:/home/spark/rapids-4-spark_2.12-23.04.0.jar!/spark320/
```
### Late Inheritance in Public Classes
diff --git a/docs/dev/testing.md b/docs/dev/testing.md
index 2d2c51a961c..5ba180a836f 100644
--- a/docs/dev/testing.md
+++ b/docs/dev/testing.md
@@ -5,5 +5,5 @@ nav_order: 2
parent: Developer Overview
---
An overview of testing can be found within the repository at:
-* [Unit tests](https://github.com/NVIDIA/spark-rapids/tree/branch-23.02/tests#readme)
-* [Integration testing](https://github.com/NVIDIA/spark-rapids/tree/branch-23.02/integration_tests#readme)
+* [Unit tests](https://github.com/NVIDIA/spark-rapids/tree/branch-23.04/tests#readme)
+* [Integration testing](https://github.com/NVIDIA/spark-rapids/tree/branch-23.04/integration_tests#readme)
diff --git a/docs/download.md b/docs/download.md
index b1141037138..4eb66a9f037 100644
--- a/docs/download.md
+++ b/docs/download.md
@@ -18,7 +18,7 @@ cuDF jar, that is either preinstalled in the Spark classpath on all nodes or sub
that uses the RAPIDS Accelerator For Apache Spark. See the [getting-started
guide](https://nvidia.github.io/spark-rapids/Getting-Started/) for more details.
-## Release v23.02.0
+## Release v23.04.0
Hardware Requirements:
The plugin is tested on the following architectures:
@@ -27,11 +27,11 @@ The plugin is tested on the following architectures:
Software Requirements:
- OS: Ubuntu 18.04, Ubuntu 20.04 or CentOS 7, Rocky Linux 8
+ OS: Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8
CUDA & NVIDIA Drivers*: 11.x & v450.80.02+
- Apache Spark 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.3.0, 3.3.1, Databricks 9.1 ML LTS, 10.4 ML LTS or 11.3 ML LTS Runtime and GCP Dataproc 2.0
+ Apache Spark 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.3.0, 3.3.1, 3.3.2, Databricks 10.4 ML LTS or 11.3 ML LTS Runtime and GCP Dataproc 2.0, Dataproc 2.1
Python 3.6+, Scala 2.12, Java 8
@@ -41,22 +41,22 @@ for your hardware's minimum driver version.
*For Cloudera and EMR support, please refer to the
[Distributions](./FAQ.md#which-distributions-are-supported) section of the FAQ.
-### Download v23.02.0
+### Download v23.04.0
* Download the [RAPIDS
- Accelerator for Apache Spark 23.02.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar)
+ Accelerator for Apache Spark 23.04.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.04.0/rapids-4-spark_2.12-23.04.0.jar)
-This package is built against CUDA 11.5 and all CUDA 11.x versions are supported through [CUDA forward
+This package is built against CUDA 11.8 and all CUDA 11.x versions are supported through [CUDA forward
compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/index.html). It is tested
on V100, T4, A2, A10, A30 and A100 GPUs with CUDA 11.0-11.5. For those using other types of GPUs which
do not have CUDA forward compatibility (for example, GeForce), CUDA 11.5 or later is required. Users will
need to ensure the minimum driver (450.80.02) and CUDA toolkit are installed on each Spark node.
### Verify signature
-* Download the [RAPIDS Accelerator for Apache Spark 23.02.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar)
- and [RAPIDS Accelerator for Apache Spark 23.02.0 jars.asc](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar.asc)
+* Download the [RAPIDS Accelerator for Apache Spark 23.04.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.04.0/rapids-4-spark_2.12-23.04.0.jar)
+ and [RAPIDS Accelerator for Apache Spark 23.04.0 jars.asc](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.04.0/rapids-4-spark_2.12-23.04.0.jar.asc)
* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com).
* Import the public key: `gpg --import PUB_KEY`
-* Verify the signature: `gpg --verify rapids-4-spark_2.12-23.02.0.jar.asc rapids-4-spark_2.12-23.02.0.jar`
+* Verify the signature: `gpg --verify rapids-4-spark_2.12-23.04.0.jar.asc rapids-4-spark_2.12-23.04.0.jar`
The output if signature verify:
@@ -64,15 +64,15 @@ The output if signature verify:
### Release Notes
New functionality and performance improvements for this release include:
-* Delta Lake MERGE/DELETE/UPDATE (experimental feature, can be enabled with a config flag)
-* Function `from_json`
-* Hive text table write
-* Databricks 11.3 ML LTS support
-* Support batched full join to improve full join's performance
+* Introduces an OOM retry framework for automatic OOM handling in memory-intensive operators such as joins, aggregates and windows, coalescing, projections and filters
+* Support dynamic repartitioning in large/skewed hash joins
+* Optimize the transpilation in `regexp_extract` function
+* Support Delta Lake write with auto-optimization and auto-compaction on Databricks platforms
* Qualification and Profiling tool:
- * EMR user tools support for qualification
- * EMR user tools support for bootstrap
- * Updated estimated speedup factors for on-prem, Dataproc, and EMR environments for qualification
+ * Add support to recommend cluster shape options on Dataproc and EMR
+ * Add support for Databricks local mode with cost savings based on cluster metadata
+ * Add TCO calculator to estimate annualized cost savings, including estimated frequency for applications
+ * Add support in the qualification tool to generate estimated speed-up for ML functionality in Spark applications
For a detailed list of changes, please refer to the
diff --git a/docs/get-started/getting-started-alluxio.md b/docs/get-started/getting-started-alluxio.md
index d93316a0797..5a1b7e4fe13 100644
--- a/docs/get-started/getting-started-alluxio.md
+++ b/docs/get-started/getting-started-alluxio.md
@@ -198,8 +198,12 @@ NM_hostname_2
so local data access speed may vary depending on the local storage media. To learn
more about this topic, please refer to the
[tiered storage document](https://docs.alluxio.io/os/user/stable/en/core-services/Caching.html#multiple-tier-storage).
-
-3. Start Alluxio cluster
+3. Create a link to ALLUXIO_HOME
+ Execute the following command to create a link `/opt/alluxio` to the actual Alluxio home path:
+ ```bash
+ ln -s ${ALLUXIO_HOME} /opt/alluxio
+ ```
+4. Start Alluxio cluster
- Format Alluxio
@@ -225,7 +229,7 @@ NM_hostname_2
To verify that Alluxio is running, visit `http://RM_hostname:19999`
to see the status page of the Alluxio master.
-4. Mount an existing data storage to Alluxio
+5. Mount an existing data storage to Alluxio
- Mount S3 bucket
@@ -337,7 +341,6 @@ without setting `spark.rapids.alluxio.pathsToReplace`, which takes precedence ov
``` shell
--conf spark.rapids.alluxio.automount.enabled=true
```
-If Alluxio is not installed in /opt/alluxio-2.8.0, you should set the environment variable `ALLUXIO_HOME`.
Additional configs:
``` shell
@@ -347,14 +350,6 @@ The regex is used to match the s3 URI, to decide which bucket we should auto mou
The default value is to match all the URIs which start with `s3://` or `s3a://`.
For example, `^s3a{1,1}://foo.*` will match the buckets which start with `foo`.
-```shell
---conf spark.rapids.alluxio.cmd="su,ubuntu,-c,/opt/alluxio-2.8.0/bin/alluxio"
-```
-This cmd config defines a sequence to be used run the Alluxio command by a specific user,
-mostly the user with Alluxio permission. We run the command by user `ubuntu` as default.
-If you have a different user and command path, you can redefine it.
-The default value is suitable for the case of running Alluxio with RAPIDS on Databricks.
-
## Configure whether the disks used by Alluxio are fast
The default value of config `spark.rapids.alluxio.slow.disk` is true, indicating the disks used by Alluxio are slow.
The true value enables an improvement which reads from S3 directly to get better performance when the files being read are large.
diff --git a/docs/get-started/getting-started-aws-emr.md b/docs/get-started/getting-started-aws-emr.md
index 8a2d1755dff..3b085768b12 100644
--- a/docs/get-started/getting-started-aws-emr.md
+++ b/docs/get-started/getting-started-aws-emr.md
@@ -7,13 +7,13 @@ parent: Getting-Started
# Get Started with RAPIDS on AWS EMR
This is a getting started guide for the RAPIDS Accelerator for Apache Spark on AWS EMR. At the end
-of this guide, the user will be able to run a sample Apache Spark application that runs on NVIDIA
-GPUs on AWS EMR.
+of this guide, the user will be able to run a sample Apache Spark application on NVIDIA GPUs on AWS EMR.
Different versions of EMR ship with different versions of Spark, RAPIDS Accelerator, cuDF and xgboost4j-spark:
| EMR | Spark | RAPIDS Accelerator jar | cuDF jar | xgboost4j-spark jar
| --- | --- | --- | ---| --- |
+| 6.10 | 3.3.1 | rapids-4-spark_2.12-22.12.0.jar | Bundled with rapids-4-spark | xgboost4j-spark_3.0-1.4.2-0.3.0.jar |
| 6.9 | 3.3.0 | rapids-4-spark_2.12-22.08.0.jar | Bundled with rapids-4-spark | xgboost4j-spark_3.0-1.4.2-0.3.0.jar |
| 6.8 | 3.3.0 | rapids-4-spark_2.12-22.06.0.jar | Bundled with rapids-4-spark | xgboost4j-spark_3.0-1.4.2-0.3.0.jar |
| 6.7 | 3.2.1 | rapids-4-spark_2.12-22.02.0.jar | cudf-22.02.0-cuda11.jar | xgboost4j-spark_3.0-1.2.0-0.1.0.jar |
@@ -23,46 +23,109 @@ Different versions of EMR ship with different versions of Spark, RAPIDS Accelera
| 6.3 | 3.1.1 | rapids-4-spark_2.12-0.4.1.jar | cudf-0.18.1-cuda10-1.jar | xgboost4j-spark_3.0-1.2.0-0.1.0.jar |
| 6.2 | 3.0.1 | rapids-4-spark_2.12-0.2.0.jar | cudf-0.15-cuda10-1.jar | xgboost4j-spark_3.0-1.0.0-0.2.0.jar |
-For more details of supported applications, please see the [EMR release
+For more details about each EMR release, please see the [EMR release
notes](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-release-6x.html).
For more information on AWS EMR, please see the [AWS
documentation](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-what-is-emr.html).
-## Configure and Launch AWS EMR with GPU Nodes
+## Leveraging Spark RAPIDS User Tools for Qualification and Bootstrap
-The following steps are based on the AWS EMR document ["Using the NVIDIA Spark-RAPIDS Accelerator
-for Spark"](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-rapids.html)
+To use the qualification and bootstrap tools for EMR, you will want to install the Spark RAPIDS user tools package.
+Instructions for installing and setting up the Spark RAPIDS user tools package for EMR can be found here:
+[link](https://github.com/NVIDIA/spark-rapids-tools/blob/main/user_tools/docs/user-tools-aws-emr.md).
-### Launch an EMR Cluster using AWS CLI
+## Qualify CPU Workloads for GPU Acceleration
+
+The [qualification tool](https://nvidia.github.io/spark-rapids/docs/spark-qualification-tool.html) is launched to analyze CPU applications
+that have already run. The tool will output the applications recommended for acceleration along with estimated speed-up
+and cost saving metrics. Additionally, it will provide information on how to launch a GPU-accelerated cluster to take
+advantage of the speed-up and cost savings.
-You can use the AWS CLI to launch a cluster with one Master node (m5.xlarge) and two
-g4dn.2xlarge nodes:
+Usage: `spark_rapids_user_tools emr qualification --eventlogs <eventlog-paths> --cpu_cluster <cluster-name>`
+Help (to see all options available): `spark_rapids_user_tools emr qualification --help`
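+
+A hypothetical invocation against event logs stored in S3 might look like the following; the bucket path and cluster name are placeholders only:
+
+```bash
+# Illustrative only: substitute your own event log location and CPU cluster name
+spark_rapids_user_tools emr qualification \
+  --eventlogs s3://my-bucket/eventlogs/ \
+  --cpu_cluster my-cpu-cluster
+```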
+
+Example output:
```
-aws emr create-cluster \
---release-label emr-6.9.0 \
---applications Name=Hadoop Name=Spark Name=Livy Name=JupyterEnterpriseGateway \
---service-role EMR_DefaultRole \
---ec2-attributes KeyName=my-key-pair,InstanceProfile=EMR_EC2_DefaultRole \
---instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m4.4xlarge \
- InstanceGroupType=CORE,InstanceCount=1,InstanceType=g4dn.2xlarge \
- InstanceGroupType=TASK,InstanceCount=1,InstanceType=g4dn.2xlarge \
---configurations file:///my-configurations.json \
---bootstrap-actions Name='My Spark Rapids Bootstrap action',Path=s3://my-bucket/my-bootstrap-action.sh
++----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
+| | App Name | App ID | Recommendation | Estimated GPU | Estimated GPU | App | Estimated GPU |
+| | | | | Speedup | Duration(s) | Duration(s) | Savings(%) |
+|----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------|
+| 0 | query24 | application_1664888311321_0011 | Strongly Recommended | 3.49 | 257.18 | 897.68 | 59.70 |
+| 1 | query78 | application_1664888311321_0009 | Strongly Recommended | 3.35 | 113.89 | 382.35 | 58.10 |
+| 2 | query23 | application_1664888311321_0010 | Strongly Recommended | 3.08 | 325.77 | 1004.28 | 54.37 |
+| 3 | query64 | application_1664888311321_0008 | Strongly Recommended | 2.91 | 150.81 | 440.30 | 51.82 |
+| 4 | query50 | application_1664888311321_0003 | Recommended | 2.47 | 101.54 | 250.95 | 43.08 |
+| 5 | query16 | application_1664888311321_0005 | Recommended | 2.36 | 106.33 | 251.95 | 40.63 |
+| 6 | query38 | application_1664888311321_0004 | Recommended | 2.29 | 67.37 | 154.33 | 38.59 |
+| 7 | query87 | application_1664888311321_0006 | Recommended | 2.25 | 75.67 | 170.69 | 37.64 |
+| 8 | query51 | application_1664888311321_0002 | Recommended | 1.53 | 53.94 | 82.63 | 8.18 |
++----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
+
+Instance types conversions:
+----------- -- ------------
+m5d.8xlarge to g4dn.8xlarge
+----------- -- ------------
+To support acceleration with T4 GPUs, switch the worker node instance types
```
-Please fill with actual value for `KeyName` and file paths. You can further customize SubnetId,
-EmrManagedSlaveSecurityGroup, EmrManagedMasterSecurityGroup, name and region etc.
+## Configure and Launch AWS EMR with GPU Nodes
+
+Please follow the AWS EMR document ["Using the NVIDIA Spark-RAPIDS Accelerator
+for Spark"](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-rapids.html).
+Below is an example.
+
+### Launch an EMR Cluster using AWS Console (GUI)
+
+Go to the AWS Management Console and select the `EMR` service from the "Analytics" section. Choose
+the region you want to launch your cluster in, e.g. US West (Oregon), using the dropdown menu in the
+top right corner. Click `Create cluster`, which will bring up a detailed cluster configuration page.
+
+#### Step 1: EMR Release and Application Bundle Selection
-The `my-configurations.json` installs the spark-rapids plugin on your cluster, configures YARN to use
+Enter a custom "Cluster name" for your cluster.
+
+Select **emr-6.10.0** for the release and pick "Custom" for the "Application bundle". Uncheck all the
+software options, and then check **Hadoop 3.3.3**, **Spark 3.3.1**, **Livy 0.7.1** and
+**JupyterEnterpriseGateway 2.6.0**.
+
+Optionally pick Amazon Linux Release or configure a "Custom AMI".
+
+![Step 1: Software, Configuration and Steps](../img/AWS-EMR/name-and-applications.png)
+
+#### Step 2: Hardware
-GPUs, configures Spark to use RAPIDS, and configures the YARN capacity scheduler. An example JSON
+Keep the default "Primary" node instance type of **m5.xlarge**.
-configuration can be found in the section on launching in the GUI below.
+Change the "Core" node "Instance type" to **g4dn.xlarge**, **g4dn.2xlarge**, or
+**p3.2xlarge**.
-The `my-boostrap-action.sh` script referenced in the above script opens cgroup permissions to YARN
-on your cluster. This is required for YARN to use GPUs. An example script is as follows:
+An optional step is to have "Task" nodes. These nodes can run a Spark executor but they do not run
+the HDFS Data Node service. You can click on "Remove instance group" if you would like to only run
+"Core" nodes with the Data Node and Spark executors. If you want to add extra "Task" nodes, make sure
+that the instance type matches what you selected for "Core".
+
+Under "Cluster scaling and provisioning potion", verify that the instance count for the "Core" instance group
+is at least 1.
+
+![Step 2: Cluster Configuration](../img/AWS-EMR/cluster-configuration.png)
+
+Under "Networking", select the desired VPC and subnet. You can also create a new VPC and subnet for the cluster.
+
+*Optionally* set custom security groups in the "EC2 security groups" tab.
+
+In the "EC2 security groups" section, confirm that the security group chosen for the "Primary" node
+allows for SSH access. Follow these instructions to [allow inbound SSH
+traffic](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/authorizing-access-to-an-instance.html)
+if the security group does not allow it yet.
+
+![Step 2: Cluster Configuration](../img/AWS-EMR/networking.png)
+
+#### Step 3: General Cluster Settings
+
+Add a custom bootstrap action under "Bootstrap Actions" to allow cgroup permissions to YARN on your cluster.
+An example bootstrap script is as follows:
```bash
#!/bin/bash
@@ -72,24 +135,17 @@ sudo chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct
sudo chmod a+rwx -R /sys/fs/cgroup/devices
```
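+
+Because EMR reads bootstrap actions from S3, the script is typically uploaded to a bucket first; a sketch of that step is shown below (the bucket name is illustrative):
+
+```bash
+# Upload the cgroup bootstrap script so EMR can reference it at cluster creation time
+aws s3 cp cgroup-bootstrap-action.sh s3://demo-bucket/cgroup-bootstrap-action.sh
+```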
-### Launch an EMR Cluster using AWS Console (GUI)
+![Step 3: General Cluster Settings](../img/AWS-EMR/bootstrap-action.png)
-Go to the AWS Management Console and select the `EMR` service from the "Analytics" section. Choose
-the region you want to launch your cluster in, e.g. US West (Oregon), using the dropdown menu in the
-top right corner. Click `Create cluster` and select `Go to advanced options`, which will bring up a
-detailed cluster configuration page.
-
-#### Step 1: Software Configuration and Steps
+#### Step 4: Edit Software Configuration
-Select **emr-6.9.0** for the release, uncheck all the software options, and then check **Hadoop
-3.3.3**, **Spark 3.3.0**, **Livy 0.7.1** and **JupyterEnterpriseGateway 2.6.0**.
-
-In the "Edit software settings" field, copy and paste the configuration from the [EMR
-document](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-rapids.html). You can also
-create a JSON file on you own S3 bucket.
+In the "Software settings" field, copy and paste the configuration from the [EMR
+document](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-rapids.html) in the textbox provided
+under "Enter configuration". You can also create a JSON file on you own S3 bucket when selecting
+"Load JSON from Amazon S3".
For clusters with 2x g4dn.2xlarge GPU instances as worker nodes, we recommend the following
-default settings:
+default settings:
```json
[
{
@@ -162,84 +218,107 @@ default settings:
```
Adjust the settings as appropriate for your cluster. For example, setting the appropriate
number of cores based on the node type. The `spark.task.resource.gpu.amount` should be set to
-1/(number of cores per executor) which will allow multiple tasks to run in parallel on the GPU.
+1/(number of cores per executor) which will allow multiple tasks to run in parallel on the GPU.
-For example, for clusters with 2x g4dn.12xlarge as core nodes, use the following:
+For example, for clusters with 2x g4dn.12xlarge as core nodes, use the following:
```json
"spark.executor.cores":"12",
"spark.task.resource.gpu.amount":"0.0833",
```
-More configuration details can be found in the [configuration](../configs.md) documentation.
-
-![Step 1: Step 1: Software, Configuration and Steps](../img/AWS-EMR/RAPIDS_EMR_GUI_1.png)
+More configuration details can be found in the [configuration](../configs.md) documentation.
-#### Step 2: Hardware
+#### Step 5: Security
-Select the desired VPC and availability zone in the "Network" and "EC2 Subnet" fields
-respectively. (Default network and subnet are ok)
+Select an existing "EC2 key pair" that will be used to authenticate SSH access to the cluster's
+nodes. If you do not have access to an EC2 key pair, follow these instructions to [create an EC2 key
+pair](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html#having-ec2-create-your-key-pair).
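+
+A key pair can also be created from the AWS CLI; a minimal sketch, assuming the illustrative key name `demo-key-pair`, is:
+
+```bash
+# Create a new EC2 key pair and store the private key locally with restricted permissions
+aws ec2 create-key-pair --key-name demo-key-pair \
+  --query 'KeyMaterial' --output text > demo-key-pair.pem
+chmod 400 demo-key-pair.pem
+```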
-In the "Core" node row, change the "Instance type" to **g4dn.xlarge**, **g4dn.2xlarge**, or
-**p3.2xlarge** and ensure "Instance count" is set to **1** or any higher number. Keep the default
-"Master" node instance type of **m5.xlarge**.
+![Step 5: SSH Key Pair](../img/AWS-EMR/ssh-key-pair.png)
-![Step 2: Hardware](../img/AWS-EMR/RAPIDS_EMR_GUI_2.png)
+#### Finish Cluster Configuration
-#### Step 3: General Cluster Settings
+The EMR cluster management page displays the status of multiple clusters or detailed information
+about a chosen cluster. In the detailed cluster view, the "Instances" and "Monitoring" tabs can be used
+to monitor the status of the various cluster nodes.
-Enter a custom "Cluster name" and make a note of the s3 folder that cluster logs will be written to.
+When the cluster is ready, a green-dot will appear next to the cluster name and the "Status" column
+will display **Waiting, cluster ready**.
-Add a custom "Bootstrap Actions" to allow cgroup permissions to YARN on your cluster. An example
-bootstrap script is as follows:
-```bash
-#!/bin/bash
-
-set -ex
-
-sudo chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct
-sudo chmod a+rwx -R /sys/fs/cgroup/devices
-```
+In the cluster's "Summary" tab, find the "Primary node public DNS" field and click on
+"Connect to the Primary Node using SSH". Follow the instructions to SSH to the new cluster's primary node.
-*Optionally* add key-value "Tags", configure a "Custom AMI" for the EMR cluster on this page.
+### Launch an EMR Cluster using AWS CLI
-![Step 3: General Cluster Settings](../img/AWS-EMR/RAPIDS_EMR_GUI_3.png)
+In this example, we will use the AWS CLI to launch a cluster with one Primary node (m5.xlarge) and two
+g4dn.2xlarge nodes.
-#### Step 4: Security
+You will need:
+- an SSH key-pair already registered in the AWS console
+- a subnet and VPC configuration (default or a custom configuration)
-Select an existing "EC2 key pair" that will be used to authenticate SSH access to the cluster's
-nodes. If you do not have access to an EC2 key pair, follow these instructions to [create an EC2 key
-pair](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html#having-ec2-create-your-key-pair).
+```bash
+aws emr create-cluster \
+--release-label emr-6.10.0 \
+--applications Name=Hadoop Name=Spark Name=Livy Name=JupyterEnterpriseGateway \
+--service-role DemoServiceRole \
+--ec2-attributes KeyName=demo-key-pair,SubnetId=demo-subnet,InstanceProfile=DemoInstanceProfile \
+--instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m4.4xlarge \
+ InstanceGroupType=CORE,InstanceCount=1,InstanceType=g4dn.2xlarge
+--configurations file://config.json \
+--bootstrap-actions Name='Setup cgroups bootstrap',Path=s3://demo-bucket/cgroup-bootstrap-action.sh
+```
-*Optionally* set custom security groups in the "EC2 security groups" tab.
+Please fill in actual values for `KeyName`, `SubnetId`, `service-role`, and `InstanceProfile`.
+The service role and instance profile are AWS IAM roles associated with your cluster, which allow
+the EMR cluster to access services provided by AWS.
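+
+If you have never created EMR roles in this account, the default service role and instance profile can usually be generated with the AWS CLI as sketched below; your organization may require custom IAM roles instead:
+
+```bash
+# Creates EMR_DefaultRole and EMR_EC2_DefaultRole if they do not already exist
+aws emr create-default-roles
+```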
-In the "EC2 security groups" tab, confirm that the security group chosen for the "Master" node
-allows for SSH access. Follow these instructions to [allow inbound SSH
-traffic](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/authorizing-access-to-an-instance.html)
-if the security group does not allow it yet.
+The `config.json` installs the spark-rapids plugin on your cluster, configures YARN to use
+GPUs, configures Spark to use RAPIDS, and configures the YARN capacity scheduler. An [example JSON
+configuration](#step-4--edit-software-configuration) can be found in the section on
+launching in the GUI above.
-![Step 4: Security](../img/AWS-EMR/RAPIDS_EMR_GUI_4.png)
+The `cgroup-bootstrap-action.sh` script referenced in the above command opens cgroup permissions to YARN
+on your cluster. You can find an example of
+the [cgroup bootstrap action](#step-3--general-cluster-settings) above.
-#### Finish Cluster Configuration
+### Running the Spark RAPIDS User Tools Bootstrap for Optimal Cluster Spark Settings
-The EMR cluster management page displays the status of multiple clusters or detailed information
-about a chosen cluster. In the detailed cluster view, the "Summary" and "Hardware" tabs can be used
-to monitor the status of master and core nodes as they provision and initialize.
+The bootstrap tool will generate optimized settings for the RAPIDS Accelerator on Apache Spark on a
+GPU cluster for EMR. The tool will fetch the characteristics of the cluster -- including
+number of workers, worker cores, worker memory, and GPU accelerator type and count. It will use
+the cluster properties to then determine the optimal settings for running GPU-accelerated Spark
+applications.
-When the cluster is ready, a green-dot will appear next to the cluster name and the "Status" column
-will display **Waiting, cluster ready**.
+Usage: `spark_rapids_user_tools emr bootstrap --cluster <cluster-name>`
-In the cluster's "Summary" tab, find the "Master public DNS" field and click the `SSH`
-button. Follow the instructions to SSH to the new cluster's master node.
+Help (to see all options available): `spark_rapids_user_tools emr bootstrap --help`
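+
+A hypothetical invocation against a running GPU cluster might be:
+
+```bash
+# Illustrative only: substitute the name of your EMR GPU cluster
+spark_rapids_user_tools emr bootstrap --cluster my-gpu-cluster
+```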
-![Finish Cluster Configuration](../img/AWS-EMR/RAPIDS_EMR_GUI_5.png)
+Example output:
+```
+##### BEGIN : RAPIDS bootstrap settings for gpu-cluster
+spark.executor.cores=16
+spark.executor.memory=32768m
+spark.executor.memoryOverhead=7372m
+spark.rapids.sql.concurrentGpuTasks=2
+spark.rapids.memory.pinnedPool.size=4096m
+spark.sql.files.maxPartitionBytes=512m
+spark.task.resource.gpu.amount=0.0625
+##### END : RAPIDS bootstrap settings for gpu-cluster
+```
+A detailed description for bootstrap settings with usage information is available in the [RAPIDS Accelerator for Apache Spark Configuration](https://nvidia.github.io/spark-rapids/docs/configs.html) and [Spark Configuration](https://spark.apache.org/docs/latest/configuration.html) page.
-### Running an example joint operation using Spark Shell
+### Running an Example Join Operation Using Spark Shell
-SSH to the EMR cluster's master node, get into sparks shell and run the sql join example to verify
+Please follow the EMR doc [Connect to the primary node using
+SSH](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-connect-master-node-ssh.html) to SSH
+to the EMR cluster's primary node. Then launch the Spark shell and run the SQL join example to verify
GPU operation.
+Note: Use the `hadoop` user for SSH and for the command below.
+
```bash
spark-shell
```
@@ -259,7 +338,7 @@ out.explain()
Similar to spark-submit for on-prem clusters, AWS EMR supports a Spark application job to be
submitted. The mortgage examples we use are also available as a spark application. You can also use
-**spark shell** to run the scala code or **pyspark** to run the python code on master node through
+**spark shell** to run the scala code or **pyspark** to run the python code on the primary node through
CLI.
### Running GPU Accelerated Mortgage ETL Example using EMR Notebook
@@ -277,11 +356,17 @@ ETL](https://github.com/NVIDIA/spark-rapids/tree/main/docs/demo)
#### Create EMR Notebook and Connect to EMR GPU Cluster
-Go to the AWS Management Console and select Notebooks on the left column. Click the Create notebook
-button. You can then click "Choose an existing cluster" and pick the right cluster after click
-Choose button. Once the instance is ready, launch the Jupyter from EMR Notebook instance.
+Go to the Amazon EMR page and select "Studios" under "EMR Studios". You can create a Studio if
+you haven't already.
+
+Create a notebook by clicking on "Workspaces (Notebooks)" on the left column and then clicking
+on the "Create Workspace" button. Select the studio you selected in the prior step.
+
+Enter a Workspace name, description, and a location (which should be set by default to the studio
+S3 path). Under "Advanced configuration", you can pick an EMR cluster that you have already
+launched.
-![Create EMR Notebook](../img/AWS-EMR/EMR_notebook_1.png)
+![Create EMR Notebook](../img/AWS-EMR/notebook-workspace-creation.png)
#### Run Mortgage ETL PySpark Notebook on EMR GPU Cluster
@@ -292,4 +377,4 @@ cluster. You can adjust settings in the notebook for full mortgage dataset ETL.
When executing the ETL code, you can also see the Spark Job Progress within the notebook and the
code will also display how long it takes to run the query
-![Create EMR Notebook](../img/AWS-EMR/EMR_notebook_3.png)
\ No newline at end of file
+![Create EMR Notebook](../img/AWS-EMR/EMR_notebook_3.png)
diff --git a/docs/get-started/getting-started-databricks.md b/docs/get-started/getting-started-databricks.md
index db03abdb3a6..0e1aee6ddfe 100644
--- a/docs/get-started/getting-started-databricks.md
+++ b/docs/get-started/getting-started-databricks.md
@@ -11,9 +11,9 @@ At the end of this guide, the reader will be able to run a sample Apache Spark a
on NVIDIA GPUs on Databricks.
## Prerequisites
- * Apache Spark 3.x running in Databricks Runtime 9.1 ML, 10.4 ML or 11.3 ML with GPU
- * AWS: 9.1 LTS ML (GPU, Scala 2.12, Spark 3.1.2), 10.4 LTS ML (GPU, Scala 2.12, Spark 3.2.1) or 11.3 LTS ML (GPU, Scala 2.12, Spark 3.3.0)
- * Azure: 9.1 LTS ML (GPU, Scala 2.12, Spark 3.1.2) or 10.4 LTS ML (GPU, Scala 2.12, Spark 3.2.1) or 11.3 LTS ML (GPU, Scala 2.12, Spark 3.3.0)
+ * Apache Spark 3.x running in Databricks Runtime 10.4 ML or 11.3 ML with GPU
+ * AWS: 10.4 LTS ML (GPU, Scala 2.12, Spark 3.2.1) or 11.3 LTS ML (GPU, Scala 2.12, Spark 3.3.0)
+ * Azure: 10.4 LTS ML (GPU, Scala 2.12, Spark 3.2.1) or 11.3 LTS ML (GPU, Scala 2.12, Spark 3.3.0)
Databricks may do [maintenance
releases](https://docs.databricks.com/release-notes/runtime/maintenance-updates.html) for their
@@ -58,10 +58,6 @@ The number of GPUs per node dictates the number of Spark executors that can run
of DecimalTypes with precision greater than 38. There is a bug filed in Apache Spark for it
[here](https://issues.apache.org/jira/browse/SPARK-41793), whereas when using the plugin the
correct result will be returned.
-
-6. A query may fail when Dynamic File Pruning is enabled. As a workaround, please
- disable the feature by setting `spark.databricks.optimizer.dynamicFilePruning false`. More details
- are in [issue-7648](https://github.com/NVIDIA/spark-rapids/issues/7648).
## Start a Databricks Cluster
Create a Databricks cluster by going to "Compute", then clicking `+ Create compute`. Ensure the
@@ -70,9 +66,7 @@ cluster meets the prerequisites above by configuring it as follows:
Prerequisites section.
2. Choose the number of workers that matches the number of GPUs you want to use.
3. Select a worker type. On AWS, use nodes with 1 GPU each such as `p3.2xlarge` or `g4dn.xlarge`.
- p2 nodes do not meet the architecture requirements (Pascal or higher) for the Spark worker
- (although they can be used for the driver node). For Azure, choose GPU nodes such as
- Standard_NC6s_v3. For GCP, choose N1 or A2 instance types with GPUs.
+ For Azure, choose GPU nodes such as Standard_NC6s_v3. For GCP, choose N1 or A2 instance types with GPUs.
4. Select the driver type. Generally this can be set to be the same as the worker.
5. Start the cluster.
@@ -87,9 +81,6 @@ cluster.
how to import a notebook.
Select the version of the RAPIDS Accelerator for Apache Spark based on the Databricks runtime
version:
- - [Databricks 9.1 LTS
- ML](https://docs.databricks.com/release-notes/runtime/9.1ml.html#system-environment) has CUDA 11
- installed. Users will need to use 21.12.0 or later on Databricks 9.1 LTS ML.
- [Databricks 10.4 LTS
ML](https://docs.databricks.com/release-notes/runtime/10.4ml.html#system-environment) has CUDA 11
installed. Users will need to use 22.04.0 or later on Databricks 10.4 LTS ML.
@@ -131,7 +122,6 @@ cluster.
spark.task.resource.gpu.amount 0.1
spark.rapids.memory.pinnedPool.size 2G
spark.rapids.sql.concurrentGpuTasks 2
- spark.databricks.optimizer.dynamicFilePruning false
```
![Spark Config](../img/Databricks/sparkconfig.png)
@@ -144,13 +134,16 @@ cluster.
[`spark.rapids.sql.python.gpu.enabled`](../configs.md#sql.python.gpu.enabled) to `true` to
enable GPU support for python. Add the path of the plugin jar (supposing it is placed under
`/databricks/jars/`) to the `spark.executorEnv.PYTHONPATH` option. For more details please go to
- [GPU Scheduling For Pandas UDF](../additional-functionality/rapids-udfs.md#gpu-scheduling-for-pandas-udf)
+ [GPU Scheduling For Pandas UDF](../additional-functionality/rapids-udfs.md#gpu-support-for-pandas-udf)
```bash
spark.rapids.sql.python.gpu.enabled true
spark.python.daemon.module rapids.daemon_databricks
- spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.02.0.jar:/databricks/spark/python
+ spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.04.0.jar:/databricks/spark/python
```
+   Note that the Python memory pool requires the cudf library, so you need to either install the cudf library on
+   each worker node (`pip install cudf-cu11 --extra-index-url=https://pypi.nvidia.com`) or disable the Python
+   memory pool (`spark.rapids.python.memory.gpu.pooling.enabled=false`).
7. Once you’ve added the Spark config, click “Confirm and Restart”.
8. Once the cluster comes back up, it is now enabled for GPU-accelerated Spark.
diff --git a/docs/get-started/getting-started-gcp.md b/docs/get-started/getting-started-gcp.md
index d70cbeca497..a1aeeaba494 100644
--- a/docs/get-started/getting-started-gcp.md
+++ b/docs/get-started/getting-started-gcp.md
@@ -7,222 +7,38 @@ parent: Getting-Started
# Getting started with RAPIDS Accelerator on GCP Dataproc
[Google Cloud Dataproc](https://cloud.google.com/dataproc) is Google Cloud's fully managed Apache
- Spark and Hadoop service. The quick start guide will go through:
-
-* [Quick Start Prerequisites](#quick-start-prerequisites)
-* [Qualify CPU workloads for GPU acceleration](#qualify-cpu-workloads-for-gpu-acceleration)
-* [Bootstrap GPU cluster with optimized settings](#bootstrap-gpu-cluster-with-optimized-settings)
-* [Tune applications on GPU cluster](#tune-applications-on-gpu-cluster)
-* [Diagnose GPU Cluster](#diagnose-gpu-cluster)
-
-The advanced guide will walk through the steps to:
-
+ Spark and Hadoop service. The quick start guide will go through:
+
* [Create a Dataproc Cluster Accelerated by GPUs](#create-a-dataproc-cluster-accelerated-by-gpus)
+ * [Create a Dataproc Cluster using T4's](#create-a-dataproc-cluster-using-t4s)
+ * [Build custom Dataproc image to accelerate cluster initialization time](#build-custom-dataproc-image-to-accelerate-cluster-init-time)
+ * [Create a Dataproc Cluster using MIG with A100's](#create-a-dataproc-cluster-using-mig-with-a100s)
+ * [Cluster creation troubleshooting](#cluster-creation-troubleshooting)
* [Run Pyspark or Scala ETL and XGBoost training Notebook on a Dataproc Cluster Accelerated by
GPUs](#run-pyspark-or-scala-notebook-on-a-dataproc-cluster-accelerated-by-gpus)
* [Submit the same sample ETL application as a Spark job to a Dataproc Cluster Accelerated by
GPUs](#submit-spark-jobs-to-a-dataproc-cluster-accelerated-by-gpus)
-* [Build custom Dataproc image to accelerate cluster initialization time](#build-custom-dataproc-image-to-accelerate-cluster-init-time)
-## Quick Start Prerequisites
+We provide RAPIDS tools to analyze clusters and the applications running on [Google Cloud Dataproc](https://cloud.google.com/dataproc), including:
+* [Diagnose GPU Cluster](#diagnose-gpu-cluster)
+* [Bootstrap GPU cluster with optimized settings](#bootstrap-gpu-cluster-with-optimized-settings)
+* [Qualify CPU workloads for GPU acceleration](#qualify-cpu-workloads-for-gpu-acceleration)
+* [Tune applications on GPU cluster](#tune-applications-on-gpu-cluster)
+The prerequisites for the RAPIDS tools include:
* gcloud CLI is installed: https://cloud.google.com/sdk/docs/install
* python 3.8+
* `pip install spark-rapids-user-tools`
-## Qualify CPU Workloads for GPU Acceleration
-
-The [qualification tool](https://nvidia.github.io/spark-rapids/docs/spark-qualification-tool.html) is launched on a Dataproc cluster that has applications that have already run.
-The tool will output the applications recommended for acceleration along with estimated speed-up
-and cost saving metrics. Additionally, it will provide information on how to launch a GPU-
-accelerated cluster to take advantage of the speed-up and cost savings.
-
-Usage: `spark_rapids_dataproc qualification --cluster --region `
-
-Help (to see all options available): `spark_rapids_dataproc qualification --help`
-
-Example output:
-```
-+----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
-| | App Name | App ID | Recommendation | Estimated GPU | Estimated GPU | App | Estimated GPU |
-| | | | | Speedup | Duration(s) | Duration(s) | Savings(%) |
-|----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------|
-| 0 | query24 | application_1664888311321_0011 | Strongly Recommended | 3.49 | 257.18 | 897.68 | 59.70 |
-| 1 | query78 | application_1664888311321_0009 | Strongly Recommended | 3.35 | 113.89 | 382.35 | 58.10 |
-| 2 | query23 | application_1664888311321_0010 | Strongly Recommended | 3.08 | 325.77 | 1004.28 | 54.37 |
-| 3 | query64 | application_1664888311321_0008 | Strongly Recommended | 2.91 | 150.81 | 440.30 | 51.82 |
-| 4 | query50 | application_1664888311321_0003 | Recommended | 2.47 | 101.54 | 250.95 | 43.08 |
-| 5 | query16 | application_1664888311321_0005 | Recommended | 2.36 | 106.33 | 251.95 | 40.63 |
-| 6 | query38 | application_1664888311321_0004 | Recommended | 2.29 | 67.37 | 154.33 | 38.59 |
-| 7 | query87 | application_1664888311321_0006 | Recommended | 2.25 | 75.67 | 170.69 | 37.64 |
-| 8 | query51 | application_1664888311321_0002 | Recommended | 1.53 | 53.94 | 82.63 | 8.18 |
-+----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
-To launch a GPU-accelerated cluster with Spark RAPIDS, add the following to your cluster creation script:
- --initialization-actions=gs://goog-dataproc-initialization-actions-us-central1/gpu/install_gpu_driver.sh,gs://goog-dataproc-initialization-actions-us-central1/rapids/rapids.sh \
- --worker-accelerator type=nvidia-tesla-t4,count=2 \
- --metadata gpu-driver-provider="NVIDIA" \
- --metadata rapids-runtime=SPARK \
- --cuda-version=11.5
-```
-
-## Bootstrap GPU Cluster with Optimized Settings
-
-The bootstrap tool will apply optimized settings for the RAPIDS Accelerator on Apache Spark on a
-GPU cluster for Dataproc. The tool will fetch the characteristics of the cluster -- including
-number of workers, worker cores, worker memory, and GPU accelerator type and count. It will use
-the cluster properties to then determine the optimal settings for running GPU-accelerated Spark
-applications.
-
-Usage: `spark_rapids_dataproc bootstrap --cluster --region `
-
-Help (to see all options available): `spark_rapids_dataproc bootstrap --help`
-
-Example output:
-```
-##### BEGIN : RAPIDS bootstrap settings for gpu-cluster
-spark.executor.cores=16
-spark.executor.memory=32768m
-spark.executor.memoryOverhead=7372m
-spark.rapids.sql.concurrentGpuTasks=2
-spark.rapids.memory.pinnedPool.size=4096m
-spark.sql.files.maxPartitionBytes=512m
-spark.task.resource.gpu.amount=0.0625
-##### END : RAPIDS bootstrap settings for gpu-cluster
-```
-
-A detailed description for bootstrap settings with usage information is available in the [RAPIDS Accelerator for Apache Spark Configuration](https://nvidia.github.io/spark-rapids/docs/configs.html) and [Spark Configuration](https://spark.apache.org/docs/latest/configuration.html) page.
-
-## Tune Applications on GPU Cluster
-
-Once Spark applications have been run on the GPU cluster, the [profiling tool](https://nvidia.github.io/spark-rapids/docs/spark-profiling-tool.html) can be run to
-analyze the event logs of the applications to determine if more optimal settings should be
-configured. The tool will output a per-application set of config settings to be adjusted for
-enhanced performance.
-
-Usage: `spark_rapids_dataproc profiling --cluster --region `
-
-Help (to see all options available): `spark_rapids_dataproc profiling --help`
-
-Example output:
-```
-+--------------------------------+--------------------------------------------------+--------------------------------------------------------------------------------------------------+
-| App ID | Recommendations | Comments |
-+================================+==================================================+==================================================================================================+
-| application_1664894105643_0011 | --conf spark.executor.cores=16 | - 'spark.task.resource.gpu.amount' was not set. |
-| | --conf spark.executor.memory=32768m | - 'spark.rapids.sql.concurrentGpuTasks' was not set. |
-| | --conf spark.executor.memoryOverhead=7372m | - 'spark.rapids.memory.pinnedPool.size' was not set. |
-| | --conf spark.rapids.memory.pinnedPool.size=4096m | - 'spark.executor.memoryOverhead' was not set. |
-| | --conf spark.rapids.sql.concurrentGpuTasks=2 | - 'spark.sql.files.maxPartitionBytes' was not set. |
-| | --conf spark.sql.files.maxPartitionBytes=1571m | - 'spark.sql.shuffle.partitions' was not set. |
-| | --conf spark.sql.shuffle.partitions=200 | |
-| | --conf spark.task.resource.gpu.amount=0.0625 | |
-+--------------------------------+--------------------------------------------------+--------------------------------------------------------------------------------------------------+
-| application_1664894105643_0002 | --conf spark.executor.cores=16 | - 'spark.task.resource.gpu.amount' was not set. |
-| | --conf spark.executor.memory=32768m | - 'spark.rapids.sql.concurrentGpuTasks' was not set. |
-| | --conf spark.executor.memoryOverhead=7372m | - 'spark.rapids.memory.pinnedPool.size' was not set. |
-| | --conf spark.rapids.memory.pinnedPool.size=4096m | - 'spark.executor.memoryOverhead' was not set. |
-| | --conf spark.rapids.sql.concurrentGpuTasks=2 | - 'spark.sql.files.maxPartitionBytes' was not set. |
-| | --conf spark.sql.files.maxPartitionBytes=3844m | - 'spark.sql.shuffle.partitions' was not set. |
-| | --conf spark.sql.shuffle.partitions=200 | |
-| | --conf spark.task.resource.gpu.amount=0.0625 | |
-+--------------------------------+--------------------------------------------------+--------------------------------------------------------------------------------------------------+
-```
-
-## Diagnose GPU Cluster
-
-The diagnostic tool can be run to check a GPU cluster with RAPIDS Accelerator for Apache Spark
-is healthy and ready for Spark jobs, such as checking the version of installed NVIDIA driver,
-cuda-toolkit, RAPIDS Accelerator and running Spark test jobs etc. This tool also can
-be used by the frontline support team for basic diagnostic and troubleshooting before escalating
-to NVIDIA RAPIDS Accelerator for Apache Spark engineering team.
-
-Usage: `spark_rapids_dataproc diagnostic --cluster --region `
-
-Help (to see all options available): `spark_rapids_dataproc diagnostic --help`
-
-Example output:
-
-```text
-*** Running diagnostic function "nv_driver" ***
-Warning: Permanently added 'compute.9009746126288801979' (ECDSA) to the list of known hosts.
-Fri Oct 14 05:17:55 2022
-+-----------------------------------------------------------------------------+
-| NVIDIA-SMI 460.106.00 Driver Version: 460.106.00 CUDA Version: 11.2 |
-|-------------------------------+----------------------+----------------------+
-| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
-| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
-| | | MIG M. |
-|===============================+======================+======================|
-| 0 Tesla T4 On | 00000000:00:04.0 Off | 0 |
-| N/A 48C P8 10W / 70W | 0MiB / 15109MiB | 0% Default |
-| | | N/A |
-+-------------------------------+----------------------+----------------------+
-
-+-----------------------------------------------------------------------------+
-| Processes: |
-| GPU GI CI PID Type Process name GPU Memory |
-| ID ID Usage |
-|=============================================================================|
-| No running processes found |
-+-----------------------------------------------------------------------------+
-NVRM version: NVIDIA UNIX x86_64 Kernel Module 460.106.00 Tue Sep 28 12:05:58 UTC 2021
-GCC version: gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04)
-Connection to 34.68.242.247 closed.
-*** Check "nv_driver": PASS ***
-*** Running diagnostic function "nv_driver" ***
-Warning: Permanently added 'compute.6788823627063447738' (ECDSA) to the list of known hosts.
-Fri Oct 14 05:18:02 2022
-+-----------------------------------------------------------------------------+
-| NVIDIA-SMI 460.106.00 Driver Version: 460.106.00 CUDA Version: 11.2 |
-|-------------------------------+----------------------+----------------------+
-| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
-| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
-| | | MIG M. |
-|===============================+======================+======================|
-| 0 Tesla T4 On | 00000000:00:04.0 Off | 0 |
-| N/A 35C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |
-| | | N/A |
-+-------------------------------+----------------------+----------------------+
-
-+-----------------------------------------------------------------------------+
-| Processes: |
-| GPU GI CI PID Type Process name GPU Memory |
-| ID ID Usage |
-|=============================================================================|
-| No running processes found |
-+-----------------------------------------------------------------------------+
-NVRM version: NVIDIA UNIX x86_64 Kernel Module 460.106.00 Tue Sep 28 12:05:58 UTC 2021
-GCC version: gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04)
-Connection to 34.123.223.104 closed.
-*** Check "nv_driver": PASS ***
-*** Running diagnostic function "cuda_version" ***
-Connection to 34.68.242.247 closed.
-found cuda major version: 11
-*** Check "cuda_version": PASS ***
-*** Running diagnostic function "cuda_version" ***
-Connection to 34.123.223.104 closed.
-found cuda major version: 11
-*** Check "cuda_version": PASS ***
-...
-********************************************************************************
-Overall check result: PASS
-```
-
-Please note that the diagnostic tool supports the following:
-
-* Dataproc 2.0 with image of Debian 10 or Ubuntu 18.04 (Rocky8 support is coming soon)
-* GPU cluster that must have 1 worker node at least. Single node cluster (1 master, 0 workers) is
- not supported
-
## Create a Dataproc Cluster Accelerated by GPUs
-
- You can use [Cloud Shell](https://cloud.google.com/shell) to execute shell commands that will
- create a Dataproc cluster. Cloud Shell contains command line tools for interacting with Google
- Cloud Platform, including gcloud and gsutil. Alternatively, you can install [GCloud
- SDK](https://cloud.google.com/sdk/install) on your machine. From the Cloud Shell, users will need
- to enable services within your project. Enable the Compute and Dataproc APIs in order to access
- Dataproc, and enable the Storage API as you’ll need a Google Cloud Storage bucket to house your
- data. This may take several minutes.
+
+You can use [Cloud Shell](https://cloud.google.com/shell) to execute shell commands that will
+create a Dataproc cluster. Cloud Shell contains command line tools for interacting with Google
+Cloud Platform, including gcloud and gsutil. Alternatively, you can install [GCloud
+SDK](https://cloud.google.com/sdk/install) on your machine. From the Cloud Shell, users will need
+to enable services within your project. Enable the Compute and Dataproc APIs in order to access
+Dataproc, and enable the Storage API as you’ll need a Google Cloud Storage bucket to house your
+data. This may take several minutes.
```bash
gcloud services enable compute.googleapis.com
@@ -235,7 +51,7 @@ Dataproc cluster. Dataproc supports multiple different GPU types depending on yo
Generally, T4 is a good option for use with the RAPIDS Accelerator for Spark. We also support
MIG on the Ampere architecture GPUs like the A100. Using
[MIG](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/) you can request an A100 and split
-it up into multiple different compute instances and it runs like you have multiple separate GPUs.
+it up into multiple different compute instances, and it runs like you have multiple separate GPUs.
The example configurations below will allow users to run any of the [notebook
demos](https://github.com/NVIDIA/spark-rapids/tree/main/docs/demo/GCP) on GCP. Adjust the sizes and
@@ -257,22 +73,22 @@ The script below will initialize with the following:
### Create a Dataproc Cluster using T4's
* One 16-core master node and 5 32-core worker nodes
-* Four NVIDIA T4 for each worker node
+* Two NVIDIA T4 for each worker node
```bash
export REGION=[Your Preferred GCP Region]
export GCS_BUCKET=[Your GCS Bucket]
export CLUSTER_NAME=[Your Cluster Name]
export NUM_GPUS=2
- export NUM_WORKERS=4
+ export NUM_WORKERS=5
gcloud dataproc clusters create $CLUSTER_NAME \
--region=$REGION \
--image-version=2.0-ubuntu18 \
- --master-machine-type=n1-standard-16 \
+ --master-machine-type=n2-standard-16 \
--num-workers=$NUM_WORKERS \
--worker-accelerator=type=nvidia-tesla-t4,count=$NUM_GPUS \
- --worker-machine-type=n1-highmem-32\
+ --worker-machine-type=n2-highmem-32\
--num-worker-local-ssds=4 \
--initialization-actions=gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \
--optional-components=JUPYTER,ZEPPELIN \
@@ -294,6 +110,88 @@ Google Cloud Console to see the progress.
If you'd like to further accelerate init time to 4-5 minutes, create a custom Dataproc image using
[this](#build-custom-dataproc-image-to-accelerate-cluster-init-time) guide.
+### Build custom dataproc image to accelerate cluster init time
+In order to accelerate cluster init time to 3-4 minutes, we need to build a custom Dataproc image
+that already has NVIDIA drivers and CUDA toolkit installed, with RAPIDS deployed. The custom image
+could also be used in an air gap environment. In this section, we will be using [these instructions
+from GCP](https://cloud.google.com/dataproc/docs/guides/dataproc-images) to create a custom image.
+
+Currently, we can directly download the [spark-rapids.sh](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/spark-rapids)
+script to create the Dataproc image:
+
+Google provides a `generate_custom_image.py` script that:
+- Launches a temporary Compute Engine VM instance with the specified Dataproc base image.
+- Then runs the customization script inside the VM instance to install custom packages and/or
+update configurations.
+- After the customization script finishes, it shuts down the VM instance and creates a Dataproc
+ custom image from the disk of the VM instance.
+- The temporary VM is deleted after the custom image is created.
+- The custom image is saved and can be used to create Dataproc clusters.
+
+Download `spark-rapids.sh` in this repo. The script uses
+Google's `generate_custom_image.py` script. This step may take 20-25 minutes to complete.
+
+```bash
+git clone https://github.com/GoogleCloudDataproc/custom-images
+cd custom-images
+
+export CUSTOMIZATION_SCRIPT=/path/to/spark-rapids.sh
+export ZONE=[Your Preferred GCP Zone]
+export GCS_BUCKET=[Your GCS Bucket]
+export IMAGE_NAME=sample-20-ubuntu18-gpu-t4
+export DATAPROC_VERSION=2.0-ubuntu18
+export GPU_NAME=nvidia-tesla-t4
+export GPU_COUNT=1
+
+python generate_custom_image.py \
+ --image-name $IMAGE_NAME \
+ --dataproc-version $DATAPROC_VERSION \
+ --customization-script $CUSTOMIZATION_SCRIPT \
+ --no-smoke-test \
+ --zone $ZONE \
+ --gcs-bucket $GCS_BUCKET \
+ --machine-type n2-standard-4 \
+ --accelerator type=$GPU_NAME,count=$GPU_COUNT \
+ --disk-size 200 \
+ --subnet default
+```
+
+See [here](https://cloud.google.com/dataproc/docs/guides/dataproc-images#running_the_code) for more
+details on `generate_custom_image.py` script arguments and
+[here](https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-versions) for dataproc
+version description.
+
+The image `sample-20-ubuntu18-gpu-t4` is now ready and can be viewed in the GCP console under
+`Compute Engine > Storage > Images`. The next step is to launch the cluster using this new image
+and new initialization actions (that do not install NVIDIA drivers since we are already past that
+step).
+
+Move this to your own bucket. Let's launch the cluster:
+
+```bash
+export REGION=[Your Preferred GCP Region]
+export GCS_BUCKET=[Your GCS Bucket]
+export CLUSTER_NAME=[Your Cluster Name]
+export NUM_GPUS=1
+export NUM_WORKERS=2
+
+gcloud dataproc clusters create $CLUSTER_NAME \
+ --region=$REGION \
+ --image=sample-20-ubuntu18-gpu-t4 \
+ --master-machine-type=n2-standard-4 \
+ --num-workers=$NUM_WORKERS \
+ --worker-accelerator=type=nvidia-tesla-t4,count=$NUM_GPUS \
+ --worker-machine-type=n2-standard-4 \
+ --num-worker-local-ssds=1 \
+ --optional-components=JUPYTER,ZEPPELIN \
+ --metadata=rapids-runtime=SPARK \
+ --bucket=$GCS_BUCKET \
+ --enable-component-gateway \
+ --subnet=default
+```
+
+The new cluster should be up and running within 3-4 minutes!
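+
+To confirm the cluster reached the `RUNNING` state, you can describe it with gcloud; this simple check reuses the variables exported above:
+
+```bash
+# Check the provisioning status of the new cluster
+gcloud dataproc clusters describe $CLUSTER_NAME --region=$REGION \
+  --format="value(status.state)"
+```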
+
### Create a Dataproc Cluster using MIG with A100's
* One 16-core master node and 5 12-core worker nodes
* 1 NVIDIA A100 for each worker node, split into 2 MIG instances using
@@ -311,13 +209,13 @@ gcloud dataproc clusters create $CLUSTER_NAME \
--region=$REGION \
--zone=$ZONE \
--image-version=2.0-ubuntu18 \
- --master-machine-type=n1-standard-16 \
+ --master-machine-type=n2-standard-16 \
--num-workers=$NUM_WORKERS \
--worker-accelerator=type=nvidia-tesla-a100,count=$NUM_GPUS \
--worker-machine-type=a2-highgpu-1g \
--num-worker-local-ssds=4 \
--initialization-actions=gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \
- --metadata=startup-script-url=gs://goog-dataproc-initialization-actions-${REGION}/gpu/mig.sh \
+ --metadata=startup-script-url=gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/mig.sh \
--optional-components=JUPYTER,ZEPPELIN \
--metadata=rapids-runtime=SPARK \
--bucket=$GCS_BUCKET \
@@ -337,18 +235,18 @@ metadata parameter `MIG_CGI`. Below is an example of using a profile name and a
```
This may take around 10-15 minutes to complete. You can navigate to the Dataproc clusters tab in
-the Google Cloud Console to see the progress.
+the Google Cloud Console to see the progress.
![Dataproc Cluster](../img/GCP/dataproc-cluster.png)
If you'd like to further accelerate init time to 4-5 minutes, create a custom Dataproc image using
-[this](#build-custom-dataproc-image-to-accelerate-cluster-init-time) guide.
+[this](#build-custom-dataproc-image-to-accelerate-cluster-init-time) guide.
### Cluster creation troubleshooting
-If you encounter an error related to GPUs not being available because of your account quotas, please
+If you encounter an error related to GPUs not being available because of your account quotas, please
go to this page for updating your quotas: [Quotas and limits](https://cloud.google.com/compute/quotas).
-If you encounter an error related to GPUs not available in the specific region or zone, you will
+If you encounter an error related to GPUs not available in the specific region or zone, you will
need to update the REGION or ZONE parameter in the cluster creation command.
## Run PySpark or Scala Notebook on a Dataproc Cluster Accelerated by GPUs
@@ -377,7 +275,7 @@ Once the data is prepared, we use the [Mortgage XGBoost4j Scala
Notebook](../demo/GCP/mortgage-xgboost4j-gpu-scala.ipynb) in Dataproc's jupyter notebook to execute
the training job on GPUs. Scala based XGBoost examples use [DLMC
XGBoost](https://github.com/dmlc/xgboost). For a PySpark based XGBoost example, please refer to
-[Spark-RAPIDS-examples](https://github.com/NVIDIA/spark-rapids-examples/blob/main/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md) that
+[Spark-RAPIDS-examples](https://github.com/NVIDIA/spark-rapids-examples/blob/main/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md) that
make sure the required libraries are installed.
The training time should be around 680 seconds (1/7 of CPU execution time with same config). This
@@ -434,92 +332,190 @@ gcloud dataproc jobs submit spark \
-maxDepth=8
```
-## Dataproc Hub in AI Platform Notebook to Dataproc cluster
-With the integration between AI Platform Notebooks and Dataproc, users can create a [Dataproc Hub
-notebook](https://cloud.google.com/blog/products/data-analytics/administering-jupyter-notebooks-for-spark-workloads-on-dataproc).
-The AI platform will connect to a Dataproc cluster through a yaml configuration.
+## Diagnose GPU Cluster
-In the future, users will be able to provision a Dataproc cluster through DataprocHub notebook. You
-can use example [pyspark notebooks](../demo/GCP/Mortgage-ETL.ipynb) to experiment.
+The diagnostic tool can be run to check a GPU cluster with RAPIDS Accelerator for Apache Spark
+is healthy and ready for Spark jobs, such as checking the version of installed NVIDIA driver,
+cuda-toolkit, RAPIDS Accelerator and running Spark test jobs etc. This tool also can
+be used by the front line support team for basic diagnostic and troubleshooting before escalating
+to NVIDIA RAPIDS Accelerator for Apache Spark engineering team.
-## Build custom dataproc image to accelerate cluster init time
-In order to accelerate cluster init time to 3-4 minutes, we need to build a custom Dataproc image
-that already has NVIDIA drivers and CUDA toolkit installed, with RAPIDS deployed. The custom image
-could also be used in an air gap environment. In this section, we will be using [these instructions
-from GCP](https://cloud.google.com/dataproc/docs/guides/dataproc-images) to create a custom image.
+Usage: `spark_rapids_dataproc diagnostic --cluster <cluster-name> --region <region>`
-Currently, we can directly download the [spark-rapids.sh](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/spark-rapids)
-script to create the Dataproc image:
+Help (to see all options available): `spark_rapids_dataproc diagnostic --help`
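+
+For example, with placeholder cluster and region values:
+
+```bash
+# Illustrative only: substitute your Dataproc cluster name and region
+spark_rapids_dataproc diagnostic --cluster my-gpu-cluster --region us-central1
+```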
-Google provides a `generate_custom_image.py` script that:
-- Launches a temporary Compute Engine VM instance with the specified Dataproc base image.
-- Then runs the customization script inside the VM instance to install custom packages and/or
-update configurations.
-- After the customization script finishes, it shuts down the VM instance and creates a Dataproc
- custom image from the disk of the VM instance.
-- The temporary VM is deleted after the custom image is created.
-- The custom image is saved and can be used to create Dataproc clusters.
+Example output:
-Download `spark-rapids.sh` in this repo. The script uses
-Google's `generate_custom_image.py` script. This step may take 20-25 minutes to complete.
+```text
+*** Running diagnostic function "nv_driver" ***
+Warning: Permanently added 'compute.9009746126288801979' (ECDSA) to the list of known hosts.
+Fri Oct 14 05:17:55 2022
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 460.106.00 Driver Version: 460.106.00 CUDA Version: 11.2 |
+|-------------------------------+----------------------+----------------------+
+| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|===============================+======================+======================|
+| 0 Tesla T4 On | 00000000:00:04.0 Off | 0 |
+| N/A 48C P8 10W / 70W | 0MiB / 15109MiB | 0% Default |
+| | | N/A |
++-------------------------------+----------------------+----------------------+
-```bash
-git clone https://github.com/GoogleCloudDataproc/custom-images
-cd custom-images
++-----------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=============================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------+
+NVRM version: NVIDIA UNIX x86_64 Kernel Module 460.106.00 Tue Sep 28 12:05:58 UTC 2021
+GCC version: gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04)
+Connection to 34.68.242.247 closed.
+*** Check "nv_driver": PASS ***
+*** Running diagnostic function "nv_driver" ***
+Warning: Permanently added 'compute.6788823627063447738' (ECDSA) to the list of known hosts.
+Fri Oct 14 05:18:02 2022
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 460.106.00 Driver Version: 460.106.00 CUDA Version: 11.2 |
+|-------------------------------+----------------------+----------------------+
+| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|===============================+======================+======================|
+| 0 Tesla T4 On | 00000000:00:04.0 Off | 0 |
+| N/A 35C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |
+| | | N/A |
++-------------------------------+----------------------+----------------------+
-export CUSTOMIZATION_SCRIPT=/path/to/spark-rapids.sh
-export ZONE=[Your Preferred GCP Zone]
-export GCS_BUCKET=[Your GCS Bucket]
-export IMAGE_NAME=sample-20-ubuntu18-gpu-t4
-export DATAPROC_VERSION=2.0-ubuntu18
-export GPU_NAME=nvidia-tesla-t4
-export GPU_COUNT=1
++-----------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=============================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------+
+NVRM version: NVIDIA UNIX x86_64 Kernel Module 460.106.00 Tue Sep 28 12:05:58 UTC 2021
+GCC version: gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04)
+Connection to 34.123.223.104 closed.
+*** Check "nv_driver": PASS ***
+*** Running diagnostic function "cuda_version" ***
+Connection to 34.68.242.247 closed.
+found cuda major version: 11
+*** Check "cuda_version": PASS ***
+*** Running diagnostic function "cuda_version" ***
+Connection to 34.123.223.104 closed.
+found cuda major version: 11
+*** Check "cuda_version": PASS ***
+...
+********************************************************************************
+Overall check result: PASS
+```
-python generate_custom_image.py \
- --image-name $IMAGE_NAME \
- --dataproc-version $DATAPROC_VERSION \
- --customization-script $CUSTOMIZATION_SCRIPT \
- --no-smoke-test \
- --zone $ZONE \
- --gcs-bucket $GCS_BUCKET \
- --machine-type n1-standard-4 \
- --accelerator type=$GPU_NAME,count=$GPU_COUNT \
- --disk-size 200 \
- --subnet default
+Please note that the diagnostic tool supports the following:
+
+* Dataproc 2.0 with image of Debian 10 or Ubuntu 18.04 (Rocky8 support is coming soon)
+* GPU clusters must have at least 1 worker node. Single node clusters (1 master, 0 workers) are
+  not supported
+
+## Bootstrap GPU Cluster with Optimized Settings
+
+The bootstrap tool will apply optimized settings for the RAPIDS Accelerator on Apache Spark on a
+GPU cluster for Dataproc. The tool will fetch the characteristics of the cluster -- including
+number of workers, worker cores, worker memory, and GPU accelerator type and count. It will use
+the cluster properties to then determine the optimal settings for running GPU-accelerated Spark
+applications.
+
+Usage: `spark_rapids_dataproc bootstrap --cluster <cluster-name> --region <region>`
+
+Help (to see all options available): `spark_rapids_dataproc bootstrap --help`
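+
+A concrete call might look like the following sketch, again with placeholder values:
+
+```bash
+# Illustrative only: substitute your Dataproc cluster name and region
+spark_rapids_dataproc bootstrap --cluster my-gpu-cluster --region us-central1
+```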
+
+Example output:
+```
+##### BEGIN : RAPIDS bootstrap settings for gpu-cluster
+spark.executor.cores=16
+spark.executor.memory=32768m
+spark.executor.memoryOverhead=7372m
+spark.rapids.sql.concurrentGpuTasks=2
+spark.rapids.memory.pinnedPool.size=4096m
+spark.sql.files.maxPartitionBytes=512m
+spark.task.resource.gpu.amount=0.0625
+##### END : RAPIDS bootstrap settings for gpu-cluster
```
-See [here](https://cloud.google.com/dataproc/docs/guides/dataproc-images#running_the_code) for more
-details on `generate_custom_image.py` script arguments and
-[here](https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-versions) for dataproc
-version description.
+A detailed description for bootstrap settings with usage information is available in the
+[RAPIDS Accelerator for Apache Spark Configuration](https://nvidia.github.io/spark-rapids/docs/configs.html)
+and [Spark Configuration](https://spark.apache.org/docs/latest/configuration.html) page.
-The image `sample-20-ubuntu18-gpu-t4` is now ready and can be viewed in the GCP console under
-`Compute Engine > Storage > Images`. The next step is to launch the cluster using this new image
-and new initialization actions (that do not install NVIDIA drivers since we are already past that
-step).
+## Qualify CPU Workloads for GPU Acceleration
-Move this to your own bucket. Let's launch the cluster:
+The [qualification tool](https://pypi.org/project/spark-rapids-user-tools/) is launched on a Dataproc cluster on which applications have already run.
+The tool will output the applications recommended for acceleration along with estimated speed-up
+and cost saving metrics. Additionally, it will provide information on how to launch a GPU-
+accelerated cluster to take advantage of the speed-up and cost savings.
-```bash
-export REGION=[Your Preferred GCP Region]
-export GCS_BUCKET=[Your GCS Bucket]
-export CLUSTER_NAME=[Your Cluster Name]
-export NUM_GPUS=1
-export NUM_WORKERS=2
+Usage: `spark_rapids_dataproc qualification --cluster <cluster-name> --region <region>`
-gcloud dataproc clusters create $CLUSTER_NAME \
- --region=$REGION \
- --image=sample-20-ubuntu18-gpu-t4 \
- --master-machine-type=n1-standard-4 \
- --num-workers=$NUM_WORKERS \
- --worker-accelerator=type=nvidia-tesla-t4,count=$NUM_GPUS \
- --worker-machine-type=n1-standard-4 \
- --num-worker-local-ssds=1 \
- --optional-components=JUPYTER,ZEPPELIN \
- --metadata=rapids-runtime=SPARK \
- --bucket=$GCS_BUCKET \
- --enable-component-gateway \
- --subnet=default
+Help (to see all options available): `spark_rapids_dataproc qualification --help`
+
+Example output:
+```
++----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
+| | App Name | App ID | Recommendation | Estimated GPU | Estimated GPU | App | Estimated GPU |
+| | | | | Speedup | Duration(s) | Duration(s) | Savings(%) |
+|----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------|
+| 0 | query24 | application_1664888311321_0011 | Strongly Recommended | 3.49 | 257.18 | 897.68 | 59.70 |
+| 1 | query78 | application_1664888311321_0009 | Strongly Recommended | 3.35 | 113.89 | 382.35 | 58.10 |
+| 2 | query23 | application_1664888311321_0010 | Strongly Recommended | 3.08 | 325.77 | 1004.28 | 54.37 |
+| 3 | query64 | application_1664888311321_0008 | Strongly Recommended | 2.91 | 150.81 | 440.30 | 51.82 |
+| 4 | query50 | application_1664888311321_0003 | Recommended | 2.47 | 101.54 | 250.95 | 43.08 |
+| 5 | query16 | application_1664888311321_0005 | Recommended | 2.36 | 106.33 | 251.95 | 40.63 |
+| 6 | query38 | application_1664888311321_0004 | Recommended | 2.29 | 67.37 | 154.33 | 38.59 |
+| 7 | query87 | application_1664888311321_0006 | Recommended | 2.25 | 75.67 | 170.69 | 37.64 |
+| 8 | query51 | application_1664888311321_0002 | Recommended | 1.53 | 53.94 | 82.63 | 8.18 |
++----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
+To launch a GPU-accelerated cluster with Spark RAPIDS, add the following to your cluster creation script:
+ --initialization-actions=gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \
+ --worker-accelerator type=nvidia-tesla-t4,count=2 \
+ --metadata gpu-driver-provider="NVIDIA" \
+ --metadata rapids-runtime=SPARK \
+ --cuda-version=11.5
```
-The new cluster should be up and running within 3-4 minutes!
+Please refer to the [Qualification Tool](https://nvidia.github.io/spark-rapids/docs/spark-qualification-tool.html) guide for running the qualification tool in other environments.
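+
+For example, a minimal invocation against an existing cluster might look like the following
+(the cluster name and region below are illustrative placeholders only):
+
+```bash
+# Analyze the Spark applications that have already run on this Dataproc cluster.
+spark_rapids_dataproc qualification \
+    --cluster my-dataproc-cluster \
+    --region us-central1
+```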
+
+## Tune Applications on GPU Cluster
+
+Once Spark applications have been run on the GPU cluster, the [profiling tool](https://nvidia.github.io/spark-rapids/docs/spark-profiling-tool.html) can be run to
+analyze the event logs of the applications to determine if more optimal settings should be
+configured. The tool will output a per-application set of config settings to be adjusted for
+enhanced performance.
+
+Usage: `spark_rapids_dataproc profiling --cluster <CLUSTER_NAME> --region <REGION>`
+
+Help (to see all options available): `spark_rapids_dataproc profiling --help`
+
+Example output:
+```
++--------------------------------+--------------------------------------------------+--------------------------------------------------------------------------------------------------+
+| App ID | Recommendations | Comments |
++================================+==================================================+==================================================================================================+
+| application_1664894105643_0011 | --conf spark.executor.cores=16 | - 'spark.task.resource.gpu.amount' was not set. |
+| | --conf spark.executor.memory=32768m | - 'spark.rapids.sql.concurrentGpuTasks' was not set. |
+| | --conf spark.executor.memoryOverhead=7372m | - 'spark.rapids.memory.pinnedPool.size' was not set. |
+| | --conf spark.rapids.memory.pinnedPool.size=4096m | - 'spark.executor.memoryOverhead' was not set. |
+| | --conf spark.rapids.sql.concurrentGpuTasks=2 | - 'spark.sql.files.maxPartitionBytes' was not set. |
+| | --conf spark.sql.files.maxPartitionBytes=1571m | - 'spark.sql.shuffle.partitions' was not set. |
+| | --conf spark.sql.shuffle.partitions=200 | |
+| | --conf spark.task.resource.gpu.amount=0.0625 | |
++--------------------------------+--------------------------------------------------+--------------------------------------------------------------------------------------------------+
+| application_1664894105643_0002 | --conf spark.executor.cores=16 | - 'spark.task.resource.gpu.amount' was not set. |
+| | --conf spark.executor.memory=32768m | - 'spark.rapids.sql.concurrentGpuTasks' was not set. |
+| | --conf spark.executor.memoryOverhead=7372m | - 'spark.rapids.memory.pinnedPool.size' was not set. |
+| | --conf spark.rapids.memory.pinnedPool.size=4096m | - 'spark.executor.memoryOverhead' was not set. |
+| | --conf spark.rapids.sql.concurrentGpuTasks=2 | - 'spark.sql.files.maxPartitionBytes' was not set. |
+| | --conf spark.sql.files.maxPartitionBytes=3844m | - 'spark.sql.shuffle.partitions' was not set. |
+| | --conf spark.sql.shuffle.partitions=200 | |
+| | --conf spark.task.resource.gpu.amount=0.0625 | |
++--------------------------------+--------------------------------------------------+--------------------------------------------------------------------------------------------------+
+```
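+
+As a minimal sketch, the recommended settings from the sample output above could be applied when
+re-submitting the application (the application class and jar below are hypothetical):
+
+```bash
+spark-submit \
+  --conf spark.executor.cores=16 \
+  --conf spark.executor.memory=32768m \
+  --conf spark.executor.memoryOverhead=7372m \
+  --conf spark.rapids.memory.pinnedPool.size=4096m \
+  --conf spark.rapids.sql.concurrentGpuTasks=2 \
+  --conf spark.sql.files.maxPartitionBytes=1571m \
+  --conf spark.sql.shuffle.partitions=200 \
+  --conf spark.task.resource.gpu.amount=0.0625 \
+  --class com.example.MyEtlJob my-application.jar
+```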
diff --git a/docs/get-started/getting-started-on-prem.md b/docs/get-started/getting-started-on-prem.md
index 5f6f0ce0616..31e8e242d20 100644
--- a/docs/get-started/getting-started-on-prem.md
+++ b/docs/get-started/getting-started-on-prem.md
@@ -53,13 +53,13 @@ CUDA and will not run on other versions. The jars use a classifier to keep them
- CUDA 11.x => classifier cuda11
For example, here is a sample version of the jar with CUDA 11.x support:
-- rapids-4-spark_2.12-23.02.0-cuda11.jar
+- rapids-4-spark_2.12-23.04.0-cuda11.jar
For simplicity export the location to this jar. This example assumes the sample jar above has
been placed in the `/opt/sparkRapidsPlugin` directory:
```shell
export SPARK_RAPIDS_DIR=/opt/sparkRapidsPlugin
-export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_DIR}/rapids-4-spark_2.12-23.02.0-cuda11.jar
+export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_DIR}/rapids-4-spark_2.12-23.04.0-cuda11.jar
```
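+
+For example, the exported variable can later be used to put the plugin on the classpath and enable
+it when launching Spark (a minimal sketch; adjust the master and other settings for your cluster):
+
+```shell
+$SPARK_HOME/bin/spark-shell \
+  --jars ${SPARK_RAPIDS_PLUGIN_JAR} \
+  --conf spark.plugins=com.nvidia.spark.SQLPlugin
+```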
## Install the GPU Discovery Script
diff --git a/docs/get-started/getting-started-workload-qualification.md b/docs/get-started/getting-started-workload-qualification.md
index 2a5125303ca..624ca29349d 100644
--- a/docs/get-started/getting-started-workload-qualification.md
+++ b/docs/get-started/getting-started-workload-qualification.md
@@ -39,8 +39,8 @@ you focus on the Spark applications which are best suited for the GPU.
The profiling tool outputs SQL plan metrics and also prints out actual query plans to provide more
insights. In the following example the profiling tool output for a specific Spark application shows
-that it has a query with a large `HashAggregate` and `SortMergeJoin`. Those are indicators for a
-good candidate application for the RAPIDS Accelerator.
+that it has a query with a large (processing millions of rows) `HashAggregate` and `SortMergeJoin`.
+Those are indicators for a good candidate application for the RAPIDS Accelerator.
```
+--------+-----+------+----------------------------------------------------+-------------+------------------------------------+-------------+----------+
diff --git a/docs/img/AWS-EMR/EMR_notebook_1.png b/docs/img/AWS-EMR/EMR_notebook_1.png
deleted file mode 100644
index 18dc7a95921..00000000000
Binary files a/docs/img/AWS-EMR/EMR_notebook_1.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_1.png b/docs/img/AWS-EMR/RAPIDS_EMR_GUI_1.png
deleted file mode 100644
index ec6e3eab036..00000000000
Binary files a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_1.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_2.png b/docs/img/AWS-EMR/RAPIDS_EMR_GUI_2.png
deleted file mode 100644
index 83d0b577af0..00000000000
Binary files a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_2.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_2b.png b/docs/img/AWS-EMR/RAPIDS_EMR_GUI_2b.png
deleted file mode 100644
index ffd1253b974..00000000000
Binary files a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_2b.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_3.png b/docs/img/AWS-EMR/RAPIDS_EMR_GUI_3.png
deleted file mode 100644
index 5ac22ee1583..00000000000
Binary files a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_3.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_4.png b/docs/img/AWS-EMR/RAPIDS_EMR_GUI_4.png
deleted file mode 100644
index 1953bf68b30..00000000000
Binary files a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_4.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_5.png b/docs/img/AWS-EMR/RAPIDS_EMR_GUI_5.png
deleted file mode 100644
index 8e0e04671c1..00000000000
Binary files a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_5.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/bootstrap-action.png b/docs/img/AWS-EMR/bootstrap-action.png
new file mode 100644
index 00000000000..b4eaf85f882
Binary files /dev/null and b/docs/img/AWS-EMR/bootstrap-action.png differ
diff --git a/docs/img/AWS-EMR/cluster-configuration.png b/docs/img/AWS-EMR/cluster-configuration.png
new file mode 100644
index 00000000000..136ffc191bf
Binary files /dev/null and b/docs/img/AWS-EMR/cluster-configuration.png differ
diff --git a/docs/img/AWS-EMR/name-and-applications.png b/docs/img/AWS-EMR/name-and-applications.png
new file mode 100644
index 00000000000..1003b7697af
Binary files /dev/null and b/docs/img/AWS-EMR/name-and-applications.png differ
diff --git a/docs/img/AWS-EMR/networking.png b/docs/img/AWS-EMR/networking.png
new file mode 100644
index 00000000000..36acf522fab
Binary files /dev/null and b/docs/img/AWS-EMR/networking.png differ
diff --git a/docs/img/AWS-EMR/notebook-workspace-creation.png b/docs/img/AWS-EMR/notebook-workspace-creation.png
new file mode 100644
index 00000000000..edef7276911
Binary files /dev/null and b/docs/img/AWS-EMR/notebook-workspace-creation.png differ
diff --git a/docs/img/AWS-EMR/ssh-key-pair.png b/docs/img/AWS-EMR/ssh-key-pair.png
new file mode 100644
index 00000000000..fa75588e3ff
Binary files /dev/null and b/docs/img/AWS-EMR/ssh-key-pair.png differ
diff --git a/docs/img/Databricks/sparkconfig.png b/docs/img/Databricks/sparkconfig.png
index d5c1070c4d0..f05b7d632fb 100644
Binary files a/docs/img/Databricks/sparkconfig.png and b/docs/img/Databricks/sparkconfig.png differ
diff --git a/docs/index.md b/docs/index.md
index b2bf634617b..0e099a609a8 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -9,7 +9,10 @@ description: This site serves as a collection of documentation about the RAPIDS
The RAPIDS Accelerator for Apache Spark leverages GPUs to accelerate processing via the
[RAPIDS libraries](http://rapids.ai).
-As data scientists shift from using traditional analytics to leveraging AI applications that better model complex market demands, traditional CPU-based processing can no longer keep up without compromising either speed or cost. The growing adoption of AI in analytics has created the need for a new framework to process data quickly and cost efficiently with GPUs.
+As data scientists shift from using traditional analytics to leveraging AI (DL/ML) applications that
+better model complex market demands, traditional CPU-based processing can no longer keep up without
+compromising either speed or cost. The growing adoption of AI in analytics has created the need for
+a new framework to process data quickly and cost-efficiently with GPUs.
The RAPIDS Accelerator for Apache Spark combines the power of the RAPIDS cuDF library and the scale of the Spark distributed computing framework. The RAPIDS Accelerator library also has a built-in accelerated shuffle based on UCX that can be configured to leverage GPU-to-GPU communication and RDMA capabilities.
@@ -20,6 +23,8 @@ Rapids Accelerator for Apache Spark reaps the benefit of GPU performance while s
[demo](https://databricks.com/session_na20/deep-dive-into-gpu-support-in-apache-spark-3-x). Costs
based on Cloud T4 GPU instance market price.
+Please refer to the [spark-rapids-examples repo](https://github.com/NVIDIA/spark-rapids-examples/tree/main/examples/XGBoost-Examples)
+for details of this example job.
## Ease of Use
Run your existing Apache Spark applications with no code change. Launch Spark with the RAPIDS Accelerator for Apache Spark plugin jar and enable a configuration setting:
diff --git a/docs/spark-profiling-tool.md b/docs/spark-profiling-tool.md
index 5f5b6a28b90..943b90c7323 100644
--- a/docs/spark-profiling-tool.md
+++ b/docs/spark-profiling-tool.md
@@ -33,7 +33,7 @@ more information.
The Profiling tool requires the Spark 3.x jars to be able to run but does not need an Apache Spark runtime.
If you do not already have Spark 3.x installed,
you can download the Spark distribution to any machine and include the jars in the classpath.
-- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.02.0/)
+- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.04.0/)
- [Download Apache Spark 3.x](http://spark.apache.org/downloads.html) - Spark 3.1.1 for Apache Hadoop is recommended
If you want to compile the jars, please refer to the instructions [here](./spark-qualification-tool.md#How-to-compile-the-tools-jar).
@@ -54,7 +54,7 @@ There are 3 modes of operation for the Profiling tool:
on each application individually and outputs a file per application
```bash
- Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
+ Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.profiling.ProfileMain [options]
```
@@ -66,7 +66,7 @@ There are 3 modes of operation for the Profiling tool:
together and you get one file for all applications.
```bash
- Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
+ Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.profiling.ProfileMain --combined
```
@@ -76,7 +76,7 @@ There are 3 modes of operation for the Profiling tool:
The Compare mode will use more memory if comparing lots of applications.
```bash
- Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
+ Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.profiling.ProfileMain --compare
```
@@ -583,7 +583,7 @@ The _Auto-Tuner_ output has 2 main sections:
```
Profiling tool for the RAPIDS Accelerator and Apache Spark
-Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
+Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.profiling.ProfileMain [options]
diff --git a/docs/spark-qualification-tool.md b/docs/spark-qualification-tool.md
index 330d6f8331e..54c346a15c5 100644
--- a/docs/spark-qualification-tool.md
+++ b/docs/spark-qualification-tool.md
@@ -18,11 +18,24 @@ This tool is intended to give the users a starting point and does not guarantee
queries or applications with the highest _recommendation_ will actually be accelerated the most. Currently,
it reports by looking at the amount of time spent in tasks of SQL Dataframe operations.
+The estimates for GPU duration are available for different environments and are based on benchmarks run in the
+applicable environments. Here is the cluster information for the ETL benchmarks used for the estimates:
+
+| Environment | CPU Cluster | GPU Cluster |
+|------------------|-------------------|--------------------------------|
+| On-prem | 8x 128-core | 8x 128-core + 8x A100 40 GB |
+| Dataproc | 4x n1-standard-32 | 4x n1-standard-32 + 8x T4 16GB |
+| EMR | 8x m5d.8xlarge | 4x g4dn.12xlarge |
+| Databricks AWS   | 8x m6gd.8xlarge   | 8x g5.8xlarge                  |
+| Databricks Azure | 8x E8ds_v4 | 8x NC8as_T4_v3 |
+
+Note that all benchmarks were run using the [NDS benchmark](https://github.com/NVIDIA/spark-rapids-benchmarks/tree/dev/nds) at SF3K (3 TB).
+
> **Disclaimer!**
> Estimates provided by the Qualification tool are based on the currently supported "_SparkPlan_" or "_Executor Nodes_"
> used in the application. It currently does not handle all the expressions or datatypes used.
> Please refer to "[Understanding Execs report](#execs-report)" section and the
-> "[Supported Operators](./supported_ops.md)" guide to check the types and expressions you are using are supported.
+> "[Supported Operators](https://github.com/NVIDIA/spark-rapids/blob/main/docs/supported_ops.md)" guide to check the types and expressions you are using are supported.
This document covers below topics:
@@ -36,6 +49,49 @@ Spark event logs after the application(s) have run, the second is to be integrat
application using explicit API calls, and the third is to install a Spark listener which can output
results on a per SQL query basis.
+When running the qualification tool standalone on Spark event logs, it can be run as a user tool command
+via a [pip package](https://pypi.org/project/spark-rapids-user-tools/) for CSP environments (Google Dataproc,
+AWS EMR, Databricks AWS) or as a Java application for other environments.
+
+## Running the Qualification tool standalone for CSP environments on Spark event logs
+### User Tools Prerequisites and Setup for CSP environments
+
+* [Dataproc](https://github.com/NVIDIA/spark-rapids-tools/blob/main/user_tools/docs/user-tools-dataproc.md)
+* [EMR](https://github.com/NVIDIA/spark-rapids-tools/blob/main/user_tools/docs/user-tools-aws-emr.md)
+* [Databricks AWS](https://github.com/NVIDIA/spark-rapids-tools/blob/main/user_tools/docs/user-tools-databricks-aws.md)
+
+### Qualify CPU Workloads for Potential Cost Savings and Acceleration with GPUs
+
+The qualification tool will run against logs from your CSP environment and output the applications
+recommended for acceleration along with estimated speed-up and cost-saving metrics.
+
+Usage: `spark_rapids_user_tools <CSP> qualification --cpu_cluster <CLUSTER> --eventlogs <EVENTLOGS-PATH>`
+
+The supported CSPs are *dataproc*, *emr*, and *databricks-aws*. The EVENTLOGS-PATH should be the storage location
+for your eventlogs. For Dataproc, it should be set to the GCS path. For EMR and Databricks-AWS, it should be set to
+the S3 path. The CLUSTER can be a live cluster or a configuration file representing the cluster instances and size.
+More details are in the documentation links above for each CSP environment. The user tools only show recommended applications in the output.
+
+Help (to see all options available): `spark_rapids_user_tools qualification --help`
+
+Example output:
+```
++----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
+| | App Name | App ID | Recommendation | Estimated GPU | Estimated GPU | App | Estimated GPU |
+| | | | | Speedup | Duration(s) | Duration(s) | Savings(%) |
+|----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------|
+| 0 | query24 | application_1664888311321_0011 | Strongly Recommended | 3.49 | 257.18 | 897.68 | 59.70 |
+| 1 | query78 | application_1664888311321_0009 | Strongly Recommended | 3.35 | 113.89 | 382.35 | 58.10 |
+| 2 | query23 | application_1664888311321_0010 | Strongly Recommended | 3.08 | 325.77 | 1004.28 | 54.37 |
+| 3 | query64 | application_1664888311321_0008 | Strongly Recommended | 2.91 | 150.81 | 440.30 | 51.82 |
+| 4 | query50 | application_1664888311321_0003 | Recommended | 2.47 | 101.54 | 250.95 | 43.08 |
+| 5 | query16 | application_1664888311321_0005 | Recommended | 2.36 | 106.33 | 251.95 | 40.63 |
+| 6 | query38 | application_1664888311321_0004 | Recommended | 2.29 | 67.37 | 154.33 | 38.59 |
+| 7 | query87 | application_1664888311321_0006 | Recommended | 2.25 | 75.67 | 170.69 | 37.64 |
+| 8 | query51 | application_1664888311321_0002 | Recommended | 1.53 | 53.94 | 82.63 | 8.18 |
++----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
+```
+
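+For example, a hypothetical Dataproc invocation (the cluster name and GCS path below are
+illustrative placeholders) could look like:
+
+```bash
+spark_rapids_user_tools dataproc qualification \
+    --cpu_cluster my-cpu-cluster \
+    --eventlogs gs://my-bucket/eventlogs/
+```
+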
## Running the Qualification tool standalone on Spark event logs
### Prerequisites
@@ -55,7 +111,7 @@ more information.
The Qualification tool requires the Spark 3.x jars to be able to run but does not need an Apache Spark runtime.
If you do not already have Spark 3.x installed, you can download the Spark distribution to
any machine and include the jars in the classpath.
-- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.02.0/)
+- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.04.0/)
- [Download Apache Spark 3.x](http://spark.apache.org/downloads.html) - Spark 3.1.1 for Apache Hadoop is recommended
### Step 2 Run the Qualification tool
@@ -84,7 +140,7 @@ any machine and include the jars in the classpath.
```bash
Sample: java ${QUALIFICATION_HEAP} \
- -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
+ -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.qualification.QualificationMain /usr/logs/app-name1
```
@@ -110,7 +166,7 @@ java -cp ~/rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/*:$HADOOP_CO
RAPIDS Accelerator Qualification tool for Apache Spark
-Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
+Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.qualification.QualificationMain [options]
@@ -165,6 +221,8 @@ Usage: java -cp rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/*
--max-sql-desc-length Maximum length of the SQL description
string output with the per sql output.
Default is 100.
+ --ml-functions Report if there are any SparkML or Spark XGBoost
+ functions in the eventlog.
-n, --num-output-rows Number of output rows in the summary report.
Default is 1000.
--num-threads Number of thread to use for parallel
@@ -183,6 +241,10 @@ Usage: java -cp rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/*
It will overwrite any existing directory with
the same name.
-p, --per-sql Report at the individual SQL query level.
+ --platform Cluster platform where Spark CPU workloads were
+                                executed. Options include onprem, dataproc, emr,
+ databricks-aws, and databricks-azure.
+ Default is onprem.
-r, --report-read-schema Whether to output the read formats and
datatypes to the CSV file. This can be very
long. Default is false.
@@ -245,6 +307,14 @@ java ${QUALIFICATION_HEAP} \
com.nvidia.spark.rapids.tool.qualification.QualificationMain -f 1-newest-per-app-name /eventlogDir
```
+- Parse ML functions from the eventlog:
+
+```bash
+java ${QUALIFICATION_HEAP} \
+ -cp ~/rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*:$HADOOP_CONF_DIR/ \
+ com.nvidia.spark.rapids.tool.qualification.QualificationMain --ml-functions /eventlogDir
+```
+
Note: the “regular expression” used by `-a` option is based on
[java.util.regex.Pattern](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html).
@@ -274,6 +344,8 @@ The tree structure of the output directory `${OUTPUT_FOLDER}/rapids_4_spark_qual
├── rapids_4_spark_qualification_output_persql.csv
├── rapids_4_spark_qualification_output_execs.csv
├── rapids_4_spark_qualification_output_stages.csv
+ ├── rapids_4_spark_qualification_output_mlfunctions.csv
+ ├── rapids_4_spark_qualification_output_mlfunctions_totalduration.csv
└── ui
├── assets
│ ├── bootstrap/
@@ -308,7 +380,7 @@ to [Understanding the Qualification tool output](#understanding-the-qualificatio
- Java 8 or above, Spark 3.0.1+
### Download the tools jar
-- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.02.0/)
+- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.04.0/)
### Modify your application code to call the api's
@@ -395,7 +467,7 @@ with the Rapids Accelerator for Spark.
- Java 8 or above, Spark 3.0.1+
### Download the tools jar
-- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.02.0/)
+- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.04.0/)
### Configuration
@@ -451,13 +523,14 @@ section on the file contents details.
For each processed Spark application, the Qualification tool generates two main fields to help quantify the expected
acceleration of migrating a Spark application or query to GPU.
-1. `Estimated GPU Duration`: predicted runtime of the app if it was run on GPU. It is the sum add of the accelerated
- operator durations along with durations that could not run on GPU because they are unsupported operators or not SQL/Dataframe.
-2. `Estimated Speed-up factor`: the estimated speed-up factor is simply the original CPU duration of the app divided by the
+1. `Estimated GPU Duration`: predicted runtime of the app if it was run on GPU. It is the sum of the accelerated
+ operator durations and ML function durations (if applicable), along with durations that could not run on GPU because
+ they are unsupported operators or not SQL/Dataframe.
+2. `Estimated Speed-up`: the estimated speed-up is simply the original CPU duration of the app divided by the
estimated GPU duration. That will estimate how much faster the application would run on GPU.
The lower the estimated GPU duration, the higher the "_Estimated Speed-up_".
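+
+For example, using the first row of the sample output above, an app duration of 897.68 seconds
+and an estimated GPU duration of 257.18 seconds yield an estimated speed-up of
+897.68 / 257.18 ≈ 3.49.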
-The processed applications or queries are ranked by the "_Estimated Speed-up_". Based on how high the speed-up factor,
+The processed applications or queries are ranked by the "_Estimated Speed-up_". Based on how high the estimated speed-up is,
the tool classifies the applications into the following different categories:
- `Strongly Recommended`
@@ -466,7 +539,7 @@ the tool classifies the applications into the following different categories:
- `Not Applicable`: indicates that the app has job or stage failures.
As mentioned before, the tool does not guarantee the applications or queries with the highest _recommendation_ will actually be
-accelerated the most. Please refer to [Supported Operators](./supported_ops.md) section.
+accelerated the most. Please refer to [Supported Operators](https://github.com/NVIDIA/spark-rapids/blob/main/docs/supported_ops.md) section.
In addition to the _recommendation_, the Qualification tool reports a set of metrics in tasks of SQL Dataframe operations
within the scope of: "_Entire App_"; "_Stages_"; and "_Execs_". The report is divided into three main levels. The fields
@@ -488,10 +561,10 @@ The report represents the entire app execution, including unsupported operators
4. _App Duration_: wall-Clock time measured since the application starts till it is completed.
If an app is not completed an estimated completion time would be computed.
5. _SQL DF duration_: wall-Clock time duration that includes only SQL-Dataframe queries.
-6. _GPU Opportunity_: wall-Clock time that shows how much of the SQL duration can be accelerated on the GPU.
+6. _GPU Opportunity_: wall-Clock time that shows how much of the SQL duration and ML functions (if applicable) can be accelerated on the GPU.
7. _Estimated GPU Duration_: predicted runtime of the app if it was run on GPU. It is the sum of the accelerated
- operator durations along with durations that could not run on GPU because they are unsupported operators or not SQL/Dataframe.
-8. _Estimated GPU Speed-up_: the speed-up factor is simply the original CPU duration of the app divided by the
+ operator durations and ML function durations (if applicable) along with durations that could not run on GPU because they are unsupported operators or not SQL/Dataframe.
+8. _Estimated GPU Speed-up_: the speed-up is simply the original CPU duration of the app divided by the
estimated GPU duration. That will estimate how much faster the application would run on GPU.
9. _Estimated GPU Time Saved_: estimated wall-Clock time saved if it was run on the GPU.
10. _SQL Dataframe Task Duration_: amount of time spent in tasks of SQL Dataframe operations.
@@ -528,7 +601,7 @@ The report represents the entire app execution, including unsupported operators
is passed to the CLI.
**Note:** the Qualification tool won't catch all UDFs, and some of the UDFs can be handled with additional steps.
-Please refer to [Supported Operators](./supported_ops.md) for more details on UDF.
+Please refer to [Supported Operators](https://github.com/NVIDIA/spark-rapids/blob/main/docs/supported_ops.md) for more details on UDF.
By default, the applications and queries are sorted in descending order by the following fields:
- _Recommendation_;
@@ -545,13 +618,13 @@ For each stage used in SQL operations, the Qualification tool generates the foll
3. _Average Speedup Factor_: the average estimated speed-up of all the operators in the given stage.
4. _Stage Task Duration_: amount of time spent in tasks of SQL Dataframe operations for the given stage.
5. _Unsupported Task Duration_: sum of task durations for the unsupported operators. For more details,
- see [Supported Operators](./supported_ops.md).
+ see [Supported Operators](https://github.com/NVIDIA/spark-rapids/blob/main/docs/supported_ops.md).
6. _Stage Estimated_: True or False indicates if we had to estimate the stage duration.
### Execs report
The Qualification tool generates a report of the "Exec" in the "_SparkPlan_" or "_Executor Nodes_" along with the estimated
-acceleration on the GPU. Please refer to the [Supported Operators](./supported_ops.md) guide for more
+acceleration on the GPU. Please refer to the [Supported Operators](https://github.com/NVIDIA/spark-rapids/blob/main/docs/supported_ops.md) guide for more
details on limitations on UDFs and unsupported operators.
1. _App ID_
@@ -564,7 +637,7 @@ details on limitations on UDFs and unsupported operators.
6. _Exec Duration_: wall-Clock time measured since the operator starts till it is completed.
7. _SQL Node Id_
8. _Exec Is Supported_: whether the Exec is supported by RAPIDS or not. Please refer to the
- [Supported Operators](./supported_ops.md) section.
+ [Supported Operators](https://github.com/NVIDIA/spark-rapids/blob/main/docs/supported_ops.md) section.
9. _Exec Stages_: an array of stage IDs
10. _Exec Children_
11. _Exec Children Node Ids_
@@ -625,6 +698,24 @@ The following table lists the exec's name and the status of parsing their expres
| WindowExec | - | x | - |
| WindowInPandasExec | - | - | x |
+### MLFunctions report
+The Qualification tool generates a report if there are SparkML or Spark XGBoost functions used in the eventlog.
+The functions in "*spark.ml.*" or "*spark.XGBoost.*" packages are displayed in the report.
+
+1. _App ID_
+2. _Stage ID_
+3. _ML Functions_: List of ML functions used in the corresponding stage.
+4. _Stage Task Duration_: amount of time spent in tasks containing ML functions for the given stage.
+
+### MLFunctions total duration report
+The Qualification tool generates a report of total duration across all stages for ML functions which
+are supported on GPU.
+
+1. _App ID_
+2. _Stage Ids_: stage IDs corresponding to the given ML function.
+3. _ML Function Name_: ML function name supported on GPU.
+4. _Total Duration_: total duration across all stages for the corresponding ML function.
+
## Output Formats
The Qualification tool generates the output as CSV/log files. Starting from "_22.06_", the default
@@ -711,7 +802,7 @@ It contains the following main components:
There are three searchPanes:
1. "_Is Stage Estimated_": it splits the stages into two groups based on whether the stage duration time was estimated
or not.
- 2. "_Speed-up_": groups the stages by their "average speed-up factor". Each stage can belong to one of the following
+ 2. "_Speed-up_": groups the stages by their "average speed-up". Each stage can belong to one of the following
predefined speed-up ranges: `1.0 (No Speed-up)`; `]1.0, 1.3[`; `[1.3, 2.5[`; `[2.5, 5[`; and `[5, _]`. The
search-pane does not show a range bucket if its count is 0.
3. "_Tasks GPU Support_": this filter can be used to find stages having all their execs supported by the GPU.
@@ -724,7 +815,7 @@ It contains the following main components:
There are three _searchPanes_:
1. "_Exec_": filters the rows by exec name. This filter also allows text searching by typing into the filter-title as
a text input.
- 2. "_Speed-up_": groups the stages by their "average speed-up factor". Each stage can belong to one of the following
+ 2. "_Speed-up_": groups the stages by their "average speed-up". Each stage can belong to one of the following
predefined speed-up ranges: `1.0 (No Speed-up)`; `]1.0, 1.3[`; `[1.3, 2.5[`; `[2.5, 5[`; and `[5, _]`. The
search-pane does not show a range bucket if its count is 0.
3. "_GPU Support_": filters the execs whether an exec is supported by GPU or not.
diff --git a/docs/tuning-guide.md b/docs/tuning-guide.md
index 69b450331de..5657ee90f81 100644
--- a/docs/tuning-guide.md
+++ b/docs/tuning-guide.md
@@ -152,6 +152,13 @@ performance. Running multiple tasks concurrently on the GPU will reduce the memo
to each task as they will be sharing the GPU's total memory. As a result, some queries that fail
to run with a higher concurrent task setting may run successfully with a lower setting.
+As of the 23.04 release of the RAPIDS Accelerator for Apache Spark,
+many out-of-memory errors result in parts of the query being rolled back and retried instead
+of a task failure. When this happens it will show up in the task metrics.
+These metrics include `gpuRetryCount`, which is the number of times that a retry was attempted.
+As part of this change the normal `OutOfMemoryError` is thrown much less often; instead a `RetryOOM`
+or `SplitAndRetryOOM` exception is thrown.
+
To mitigate the out of memory errors you can often reduce the batch size, which will keep less
data active in a batch at a time, but can increase the overall runtime as less data is being
processed per batch.
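+
+For example, one hypothetical way to experiment with this from the command line (the values below
+are illustrative starting points, not recommendations) is:
+
+```shell
+spark-shell \
+  --conf spark.rapids.sql.concurrentGpuTasks=1 \
+  --conf spark.rapids.sql.batchSizeBytes=536870912
+```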
@@ -297,6 +304,8 @@ partition sizes to avoid GPU out of memory errors.
## Metrics
+### SQL
+
Custom Spark SQL Metrics are available which can help identify performance bottlenecks in a query.
| Key | Name | Description |
@@ -322,11 +331,7 @@ Custom Spark SQL Metrics are available which can help identify performance bottl
| opTime | op time | Time that an operator takes, exclusive of the time for executing or fetching results from child operators, and typically outside of the time it takes to acquire the GPU semaphore. Note: Sometimes contains CPU times, e.g.: concatTime |
| partitionSize | partition data size | Total size in bytes of output partitions. |
| peakDevMemory | peak device memory | Peak GPU memory used during execution of an operator. |
-| semaphoreWaitTime | GPU semaphore wait time | Time spent waiting for the GPU semaphore. |
-| sortTime | sort time | Time spent in sort operations in GpuSortExec and GpuTopN. |
-| spillData | bytes spilled from GPU | Total bytes spilled from GPU. |
-| spillDisk | bytes spilled to disk | Total bytes spilled from GPU to disk. |
-| spillHost | bytes spilled to host | Total bytes spilled from GPU to host memory. |
+| sortTime | sort time | Time spent in sort operations in GpuSortExec and GpuTopN. |
| streamTime | stream time | Time spent reading data from a child. This generally happens for the stream side of a hash join or for columnar to row and row to columnar operations. |
Not all metrics are enabled by default. The configuration setting `spark.rapids.sql.metrics.level` can be set
@@ -344,6 +349,32 @@ Many of the questions people really want to answer with the metrics are around h
operators take. Where is the bottleneck in my query? How much of my query is executing on the GPU?
How long does operator X take on the GPU vs the CPU?
+### Task
+
+Custom Task level accumulators are also included. These metrics are not for individual
+operators in the SQL plan, but are per task and roll up to stages in the plan. Timing metrics
+are reported in the format of HH:MM:SS.sss. Note that the spill metrics,
+including the spill-to-host-memory and spill-to-disk sizes, are not isolated to a single
+task, or even a single stage in the plan. The amount of data spilled is the amount of
+data that this particular task needed to spill in order to make room for the task to
+allocate new memory. The spill time metric is how long it took that task to spill
+that memory. It could have spilled memory associated with a different task,
+or even a different stage or job in the plan. The spill read time metric is how
+long it took to read back in the data it needed to complete the task. This does not
+correspond to the data that was spilled by this task.
+
+| Name | Description |
+|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| gpuSemaphoreWait | The time the task spent waiting on the GPU semaphore. |
+| gpuSpillBlockTime | The time that this task was blocked spilling data from the GPU. |
+| gpuSpillReadTime | The time that this task was blocked reading data to the GPU that was spilled previously. |
+| gpuRetryCount | The number of times that a retry exception was thrown in an attempt to roll back processing to free memory. |
+| gpuSplitAndRetryCount | The number of times that a split and retry exception was thrown in an attempt to roll back processing to free memory, and split the input to make more room. |
+| gpuRetryBlockTime | The amount of time that this task was blocked either hoping that other tasks will free up more memory or after a retry exception was thrown to wait until the task can go on. |
+
+The spill data sizes going to host/CPU memory and disk are the same as used by Spark task level
+metrics.
+
### Time taken on the GPU
`opTime` mainly convey the GPU time.
@@ -365,10 +396,9 @@ Some operators provide out of core algorithms, or algorithms that can process da
than can fit in GPU memory. This is often done by breaking the problem up into smaller pieces and
letting some of those pieces be moved out of GPU memory when not being worked on. Apache Spark does
similar things when processing data on the CPU. When these types of algorithms are used
-`bytes spilled from GPU` will show up as a metric to indicate how much data was transferred off of
-the GPU to either host memory or disk to make room for more data to be processed. Generally this
-spilling happens while the GPU semaphore is held, and can really slow down processing. Details
-about how much data was spilled to host memory vs spilled to disk show up in `DEBUG` mode for the
+the task level spill metrics will indicate that spilling happened. Be aware that
+the same metrics are used for both the GPU code and the original Spark CPU code. The
+GPU spills will always be timed and show up as `gpuSpillBlockTime` in the task level
metrics.
### Time taken on the CPU
@@ -461,4 +491,4 @@ column/value, `lead` or `lag`. These allow us to compute the result in approxima
For all other cases large windows, including skewed values in partition by and order by data, can
result in slow performance. If you do run into one of these situations please file an
[issue](https://github.com/NVIDIA/spark-rapids/issues/new/choose) so we can properly prioritize
-our work to support more optimizations.
\ No newline at end of file
+our work to support more optimizations.
diff --git a/integration_tests/README.md b/integration_tests/README.md
index 95f36ffc256..00932389040 100644
--- a/integration_tests/README.md
+++ b/integration_tests/README.md
@@ -105,12 +105,7 @@ For manual installation, you need to setup your environment:
You can install all the dependencies using `pip` by running the following command:
```shell script
- pip install pytest \
- sre_yield \
- pandas \
- pyarrow \
- pytest-xdist \
- findspark
+ pip install -r requirements.txt
```
### Installing Spark
@@ -255,7 +250,7 @@ individually, so you don't risk running unit tests along with the integration te
http://www.scalatest.org/user_guide/using_the_scalatest_shell
```shell
-spark-shell --jars rapids-4-spark-tests_2.12-23.02.0-tests.jar,rapids-4-spark-integration-tests_2.12-23.02.0-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
+spark-shell --jars rapids-4-spark-tests_2.12-23.04.0-tests.jar,rapids-4-spark-integration-tests_2.12-23.04.0-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
```
First you import the `scalatest_shell` and tell the tests where they can find the test files you
@@ -278,7 +273,7 @@ If you just want to verify the SQL replacement is working you will need to add t
assumes CUDA 11.0 is being used.
```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.02.0-cuda11.jar" ./runtests.py
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.04.0-cuda11.jar" ./runtests.py
```
You don't have to enable the plugin for this to work, the test framework will do that for you.
@@ -377,7 +372,7 @@ To run cudf_udf tests, need following configuration changes:
As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 11.0:
```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.02.0-cuda11.jar,rapids-4-spark-tests_2.12-23.02.0.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-23.02.0-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-23.02.0-cuda11.jar" ./runtests.py --cudf_udf
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.04.0-cuda11.jar,rapids-4-spark-tests_2.12-23.04.0.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-23.04.0-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-23.04.0-cuda11.jar" ./runtests.py --cudf_udf
```
### Enabling fuzz tests
diff --git a/integration_tests/conftest.py b/integration_tests/conftest.py
index cc9805ad0a5..13292e6bda5 100644
--- a/integration_tests/conftest.py
+++ b/integration_tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -48,3 +48,7 @@ def pytest_addoption(parser):
parser.addoption(
"--delta_lake", action="store_true", default=False, help="if true enable Delta Lake tests"
)
+ parser.addoption(
+ "--test_oom_injection_mode", action='store', default="random",
+ help="in what way, if any, should the tests inject OOMs at test time. Valid options are: random, always, or never"
+ )
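+    # Example (hypothetical usage): like the other pytest options defined in this file, this one
+    # can be passed through the integration test launcher, e.g.:
+    #   ./integration_tests/run_pyspark_from_build.sh --test_oom_injection_mode=always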
diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml
index 938f4bee4fa..812d7eeca2e 100644
--- a/integration_tests/pom.xml
+++ b/integration_tests/pom.xml
@@ -1,6 +1,6 @@
/dbfs/path/foo.sh,/dbfs/path/bar.sh
-String getInitScripts(String rootDir, String files) {
- return rootDir + '/' + files.replace(',', ',' + rootDir + '/')
-}
-
void databricksBuild() {
def CLUSTER_ID = ''
def SPARK_MAJOR = BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS.replace('.', '')
+ def dbfs_path = "$INIT_SCRIPTS_DIR-$DB_TYPE"
try {
stage("Create $SPARK_MAJOR DB") {
script {
@@ -488,7 +390,18 @@ void databricksBuild() {
sh "tar -zcf spark-rapids-ci.tgz *"
def CREATE_PARAMS = " -r $DATABRICKS_RUNTIME -w $DATABRICKS_HOST -t $DATABRICKS_TOKEN" +
" -s $DB_TYPE -n CI-${BUILD_TAG}-${BASE_SPARK_VERSION} -k \"$DATABRICKS_PUBKEY\" -i $IDLE_TIMEOUT" +
- " -d $DATABRICKS_DRIVER -o $DATABRICKS_WORKER -e $NUM_WORKERS -f $INIT_SCRIPTS"
+ " -d $DATABRICKS_DRIVER -o $DATABRICKS_WORKER -e $NUM_WORKERS"
+
+ // handle init scripts if exist
+ if (env.INIT_SCRIPTS) {
+ sh "bash -c 'dbfs mkdirs $dbfs_path'"
+ env.INIT_SCRIPTS.split(',').each {
+ sh "bash -c 'dbfs cp --overwrite jenkins/databricks/${it} $dbfs_path'"
+ }
+ // foo.sh,bar.sh --> dbfs:/path/foo.sh,dbfs:/path/bar.sh
+ CREATE_PARAMS += " -f $dbfs_path/" + env.INIT_SCRIPTS.replace(',', ",$dbfs_path/")
+ }
+
CLUSTER_ID = sh(script: "python3 ./jenkins/databricks/create.py $CREATE_PARAMS",
returnStdout: true).trim()
echo CLUSTER_ID
@@ -532,6 +445,9 @@ void databricksBuild() {
if (CLUSTER_ID) {
container('cpu') {
retry(3) {
+ if (env.INIT_SCRIPTS) {
+ sh "bash -c 'dbfs rm -r $dbfs_path'"
+ }
sh "python3 ./jenkins/databricks/shutdown.py -s $DATABRICKS_HOST -t $DATABRICKS_TOKEN -c $CLUSTER_ID -d"
}
}
diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh
index 5be253e198b..87c78ddcf3e 100755
--- a/jenkins/databricks/build.sh
+++ b/jenkins/databricks/build.sh
@@ -18,7 +18,7 @@
# This script installs dependencies required to build RAPIDS Accelerator for Apache Spark on DB.
# All the environments can be overwritten by shell variables:
# SPARKSRCTGZ: Archive file location of the plugin repository. Default is empty.
-# BASE_SPARK_VERSION: Spark version [3.1.2, 3.2.1, 3.3.0]. Default is pulled from current instance.
+# BASE_SPARK_VERSION: Spark version [3.2.1, 3.3.0]. Default is pulled from current instance.
# BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS: The version of Spark used when we install the
# Databricks jars in .m2. Default is {BASE_SPARK_VERSION}.
# MVN_OPT: Options to be passed to the MVN commands. Note that "-DskipTests" is hardcoded in the
@@ -114,7 +114,7 @@ initialize()
}
# Sets the JAR files prefixes based on the build version.
-# DB9.1 and 10.4 uses ----workspace as a prefix.
+# DB 10.4 uses ----workspace as a prefix.
# DB 11.3 uses more abbreviations (i.e., workspace becomes ws).
set_jars_prefixes()
{
@@ -124,7 +124,7 @@ set_jars_prefixes()
# get the hive prefix. something like hive-2.3
HIVE_VER_STRING=hive-$(echo ${sw_versions[HIVE_FULL]} | cut -d. -f 1,2)
- # defaults are for 3.1.2, and 3.2.1
+ # defaults are for 3.2.1
PREFIX_WS=----workspace
SPARK_MAJOR_VERSION_STRING=spark_${SPARK_MAJOR_VERSION_NUM_STRING}
PREFIX_SPARK=${PREFIX_WS}_${SPARK_MAJOR_VERSION_STRING}
@@ -186,24 +186,6 @@ set_sw_versions()
sw_versions[PARQUET]="1.12.0"
sw_versions[PROTOBUF]="2.6.1"
;;
- "3.1.2")
- sw_versions[COMMONS_LANG3]="3.10"
- sw_versions[COMMONS_IO]="2.4"
- sw_versions[DB]="9"
- sw_versions[FASTERXML_JACKSON]="2.10.0"
- sw_versions[HADOOP]="2.7"
- sw_versions[HIVE_FULL]="2.3.7"
- sw_versions[JSON4S_AST]="3.7.0-M5"
- sw_versions[JSON4S_CORE]="3.7.0-M5"
- sw_versions[ORC]="1.5.12"
- sw_versions[PARQUET]="1.10.1"
- sw_versions[HIVESTORAGE_API]="2.7.2"
- sw_versions[PROTOBUF]="2.6.1"
- sw_versions[KRYO]="4.0.2"
- sw_versions[ARROW]="2.0.0"
- sw_versions[JAVAASSIST]="3.25.0-GA"
- sw_versions[AVRO]="1.8.2"
- ;;
*) echo "Unexpected Spark version: $BASE_SPARK_VERSION"; exit 1;;
esac
}
@@ -290,17 +272,6 @@ set_dep_jars()
artifacts[LOG4JCORE]="-DgroupId=org.apache.logging.log4j -DartifactId=log4j-core"
dep_jars[LOG4JCORE]=${PREFIX_WS_SP_MVN_HADOOP}--org.apache.logging.log4j--log4j-core--org.apache.logging.log4j__log4j-core__${sw_versions[LOG4JCORE]}.jar
fi
-
- # spark-3.1.2 overrides some jar naming conventions
- if [[ $BASE_SPARK_VERSION == "3.1.2" ]]
- then
- dep_jars[HIVE]=${PREFIX_SPARK}--sql--hive--hive_${SCALA_VERSION}_deploy_shaded.jar
- dep_jars[HIVEMETASTORECLIENTPATCHED]=${PREFIX_SPARK}--patched-hive-with-glue--hive-12679-patch_deploy.jar
- dep_jars[PARQUETFORMAT]=${PREFIX_WS_SP_MVN_HADOOP}--org.apache.parquet--parquet-format--org.apache.parquet__parquet-format__2.4.0.jar
- dep_jars[AVROSPARK]=${PREFIX_SPARK}--vendor--avro--avro_${SCALA_VERSION}_deploy_shaded.jar
- dep_jars[AVROMAPRED]=${PREFIX_WS_SP_MVN_HADOOP}--org.apache.avro--avro-mapred-hadoop2--org.apache.avro__avro-mapred-hadoop2__${sw_versions[AVRO]}.jar
- dep_jars[AVRO]=${PREFIX_WS_SP_MVN_HADOOP}--org.apache.avro--avro--org.apache.avro__avro__${sw_versions[AVRO]}.jar
- fi
}
# Install dependency jars to MVN repository.
@@ -337,7 +308,7 @@ else
fi
if [[ "$WITH_BLOOP" == "1" ]]; then
- MVN_OPT="ch.epfl.scala:maven-bloop_2.13:bloopInstall $MVN_OPT"
+ MVN_OPT="ch.epfl.scala:bloop-maven-plugin:bloopInstall $MVN_OPT"
fi
# Build the RAPIDS plugin by running package command for databricks
diff --git a/jenkins/databricks/create.py b/jenkins/databricks/create.py
index 4e6dfee840e..8f6e66c8877 100644
--- a/jenkins/databricks/create.py
+++ b/jenkins/databricks/create.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@ def main():
workspace = 'https://dbc-9ff9942e-a9c4.cloud.databricks.com'
token = ''
sshkey = ''
- cluster_name = 'CI-GPU-databricks-23.02.0'
+ cluster_name = 'CI-GPU-databricks-23.04.0'
idletime = 240
runtime = '7.0.x-gpu-ml-scala2.12'
num_workers = 1
diff --git a/jenkins/databricks/cudf_udf_test.sh b/jenkins/databricks/cudf_udf_test.sh
new file mode 100644
index 00000000000..87439c358b3
--- /dev/null
+++ b/jenkins/databricks/cudf_udf_test.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+#
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script sets the environment to run cudf_udf tests of RAPIDS Accelerator for Apache Spark on DB.
+# cudf conda packages need to be installed in advance, please refer to
+# './jenkins/databricks/init_cudf_udf.sh' to install.
+# All the environments can be overwritten by shell variables:
+# LOCAL_JAR_PATH: Location of the RAPIDS jars
+# SPARK_CONF: Spark configuration parameters
+
+# Usage:
+# - Running tests on Databricks:
+#     `./jenkins/databricks/cudf_udf_test.sh`
+# To add support of a new runtime:
+# 1. Check if any more dependencies need to be added to the apt/conda install commands.
+# 2. If you had to go beyond the above steps to support the new runtime, then update the
+# instructions accordingly.
+set -ex
+
+# Map of software versions for each dependency.
+
+LOCAL_JAR_PATH=${LOCAL_JAR_PATH:-''}
+SPARK_CONF=${SPARK_CONF:-''}
+
+# Try to use "cudf-udf" conda environment for the python cudf-udf tests.
+CONDA_HOME=${CONDA_HOME:-"/databricks/conda"}
+if [ ! -d "${CONDA_HOME}/envs/cudf-udf" ]; then
+    echo "Error: cudf conda packages not found! Please refer to './jenkins/databricks/init_cudf_udf.sh' to install them."
+ exit -1
+fi
+export PATH=${CONDA_HOME}/envs/cudf-udf/bin:$PATH
+export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python
+# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
+PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
+
+# Install required packages
+sudo apt -y install zip unzip
+
+export SPARK_HOME=/databricks/spark
+# Change to not point at Databricks confs so we don't conflict with their settings.
+export SPARK_CONF_DIR=$PWD
+
+# Get the correct py4j file.
+PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
+# Set the path of python site-packages.
+PYTHON_SITE_PACKAGES="${CONDA_HOME}/envs/cudf-udf/lib/${PYTHON_VERSION}/site-packages"
+# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
+# Note that Koala is deprecated for DB10.4+ and it is recommended to use Pandas API on Spark instead.
+export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
+sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
+sudo chmod 777 /databricks/data/logs/
+sudo chmod 777 /databricks/data/logs/*
+echo { \"port\":\"15002\" } > ~/.databricks-connect
+
+CUDF_UDF_TEST_ARGS="--conf spark.python.daemon.module=rapids.daemon_databricks \
+ --conf spark.rapids.memory.gpu.minAllocFraction=0 \
+ --conf spark.rapids.memory.gpu.allocFraction=0.1 \
+ --conf spark.rapids.python.memory.gpu.allocFraction=0.1 \
+ --conf spark.rapids.python.concurrentPythonWorkers=2"
+
+## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
+if [ -n "$SPARK_CONF" ]; then
+ CONF_LIST=${SPARK_CONF//','/' '}
+ for CONF in ${CONF_LIST}; do
+ KEY=${CONF%%=*}
+ VALUE=${CONF#*=}
+ ## run_pyspark_from_build.sh requires 'export PYSP_TEST_spark_foo=1' as the spark configs
+ export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
+ done
+
+ ## 'spark.foo=1,spark.bar=2,...' to '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
+    SPARK_CONF="--conf ${SPARK_CONF//','/' --conf '}"
+fi
+
+TEST_TYPE="nightly"
+PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"
+
+# Enable event log for qualification & profiling tools testing
+export PYSP_TEST_spark_eventLog_enabled=true
+mkdir -p /tmp/spark-events
+
+if [ -d "$LOCAL_JAR_PATH" ]; then
+ ## Run cudf-udf tests.
+ CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
+ LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
+ bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
+else
+ ## Run cudf-udf tests.
+ CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
+ SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=0 \
+ bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
+fi
diff --git a/jenkins/databricks/deploy.sh b/jenkins/databricks/deploy.sh
index 437a57631cb..064bfd71bd0 100755
--- a/jenkins/databricks/deploy.sh
+++ b/jenkins/databricks/deploy.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@ echo "Maven mirror is $MVN_URM_MIRROR"
SERVER_ID='snapshots'
SERVER_URL="$URM_URL-local"
SCALA_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=scala.binary.version -DforceStdout`
-# remove the periods so change something like 3.1.1 to 311
+# remove the periods so change something like 3.2.1 to 321
VERSION_NUM=${BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS//.}
SPARK_VERSION_STR=spark$VERSION_NUM
SPARK_PLUGIN_JAR_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=project.version -DforceStdout`
diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh
index 254cf0540b6..191b9a9c33d 100755
--- a/jenkins/databricks/init_cudf_udf.sh
+++ b/jenkins/databricks/init_cudf_udf.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -18,25 +18,30 @@
# The initscript to set up environment for the cudf_udf tests on Databricks
# Will be automatically pushed into the dbfs:/databricks/init_scripts once it is updated.
-set -x
+set -ex
-CUDF_VER=${CUDF_VER:-23.02}
+CUDF_VER=${CUDF_VER:-23.04}
CUDA_VER=${CUDA_VER:-11.0}
# Need to explicitly add conda into PATH environment, to activate conda environment.
export PATH=/databricks/conda/bin:$PATH
# Set Python for the running instance
+export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("{}.{}".format(sys.version_info.major, sys.version_info.minor))')
+# cudf 23.02+ does not support python 3.9. ref: https://docs.rapids.ai/notices/rsn0022/
+[[ "$PYTHON_VERSION" == '3.9' ]] && PYTHON_VERSION='3.8'
base=$(conda info --base)
# Create and activate 'cudf-udf' conda env for cudf-udf tests
+sudo chmod a+w ${base}/envs && conda config --add envs_dirs ${base}/envs
conda create -y -n cudf-udf -c conda-forge python=$PYTHON_VERSION mamba && \
source activate && \
conda activate cudf-udf
# Use mamba to install cudf-udf packages to speed up conda resolve time
conda install -y -c conda-forge mamba python=$PYTHON_VERSION
-${base}/envs/cudf-udf/bin/mamba remove -y c-ares zstd libprotobuf pandas
+# Do not error out "This operation will remove conda without replacing it with another version of conda." for now
+${base}/envs/cudf-udf/bin/mamba remove -y c-ares zstd libprotobuf pandas || true
REQUIRED_PACKAGES=(
cudatoolkit=$CUDA_VER
@@ -55,4 +60,4 @@ ${base}/envs/cudf-udf/bin/mamba install -y \
-c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults \
"${REQUIRED_PACKAGES[@]}"
-source deactivate && conda deactivate
\ No newline at end of file
+source deactivate && conda deactivate
diff --git a/jenkins/databricks/params.py b/jenkins/databricks/params.py
index c97fe9ede57..22a36fdf7c8 100644
--- a/jenkins/databricks/params.py
+++ b/jenkins/databricks/params.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -23,7 +23,7 @@
script_dest = '/home/ubuntu/build.sh'
source_tgz = 'spark-rapids-ci.tgz'
tgz_dest = '/home/ubuntu/spark-rapids-ci.tgz'
-base_spark_pom_version = '3.1.1'
+base_spark_pom_version = '3.2.1'
base_spark_version_to_install_databricks_jars = base_spark_pom_version
clusterid = ''
# can take comma seperated maven options, e.g., -Pfoo=1,-Dbar=2,...
diff --git a/jenkins/databricks/run_it.sh b/jenkins/databricks/run_it.sh
index 620c2ecdf57..c3f34e7dff7 100755
--- a/jenkins/databricks/run_it.sh
+++ b/jenkins/databricks/run_it.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -33,29 +33,19 @@ if [[ -z "$SPARK_HOME" ]]; then
fi
SCALA_BINARY_VER=${SCALA_BINARY_VER:-'2.12'}
-CONDA_HOME=${CONDA_HOME:-"/databricks/conda"}
-
-# Try to use "cudf-udf" conda environment for the python cudf-udf tests.
-if [ -d "${CONDA_HOME}/envs/cudf-udf" ]; then
- export PATH=${CONDA_HOME}/envs/cudf-udf/bin:${CONDA_HOME}/bin:$PATH
- export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python
-fi
+# Set PYSPARK_PYTHON to keep the driver and worker python versions consistent.
+export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
-python_version=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
-
-# override incompatible versions between databricks and cudf
-if [ -d "${CONDA_HOME}/envs/cudf-udf" ]; then
- PATCH_PACKAGES_PATH="$PWD/package-overrides/${python_version}"
-fi
+PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
+# Set the path of the python site-packages; packages are installed there by 'jenkins/databricks/setup.sh'.
+PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"
# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
-# Set the path of python site-packages
-PYTHON_SITE_PACKAGES=/databricks/python3/lib/${python_version}/site-packages
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koalas is deprecated for DB10.4+ and it is recommended to use Pandas API on Spark instead.
-export PYTHONPATH=$PATCH_PACKAGES_PATH:$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
+export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
# Disable parallel test as multiple tests would be executed by leveraging external parallelism, e.g. Jenkins parallelism
export TEST_PARALLEL=${TEST_PARALLEL:-0}
@@ -73,7 +63,7 @@ if [[ "$TEST_TAGS" == "iceberg" ]]; then
"3.3.0")
ICEBERG_VERSION=${ICEBERG_VERSION:-0.14.1}
;;
- "3.2.1" | "3.1.2")
+ "3.2.1")
ICEBERG_VERSION=${ICEBERG_VERSION:-0.13.2}
;;
*) echo "Unexpected Spark version: $SPARK_VER"; exit 1;;
@@ -94,5 +84,14 @@ if [[ -n "$LOCAL_JAR_PATH" ]]; then
export LOCAL_JAR_PATH=$LOCAL_JAR_PATH
fi
+set +e
# Run integration testing
./integration_tests/run_pyspark_from_build.sh --runtime_env='databricks' --test_type=$TEST_TYPE
+ret=$?
+set -e
+if [ "$ret" = 5 ]; then
+ # avoid exiting the script with code 5 when all cases in a specific test are skipped
+ echo "Suppress Exit code 5: No tests were collected"
+ exit 0
+fi
+exit "$ret"
diff --git a/jenkins/databricks/setup.sh b/jenkins/databricks/setup.sh
index 274181055b3..a1a9d03c900 100755
--- a/jenkins/databricks/setup.sh
+++ b/jenkins/databricks/setup.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -36,32 +36,17 @@ if [ -f $SPARK_HOME/conf/spark-env.sh ]; then
sudo chmod 777 `echo $local_dir | xargs`
fi
-CONDA_HOME=${CONDA_HOME:-"/databricks/conda"}
-
-# Try to use "cudf-udf" conda environment for the python cudf-udf tests.
-if [ -d "${CONDA_HOME}/envs/cudf-udf" ]; then
- export PATH=${CONDA_HOME}/envs/cudf-udf/bin:${CONDA_HOME}/bin:$PATH
- export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python
+# Set PYSPARK_PYTHON to keep the driver and worker python versions consistent.
+export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
+# Install pip if this python does not already have it.
+if [ -z "$($PYSPARK_PYTHON -m pip --version || true)" ]; then
+ curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+ $PYSPARK_PYTHON get-pip.py && rm get-pip.py
fi
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
-python_version=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
-
-# override incompatible versions between databricks and cudf
-if [ -d "${CONDA_HOME}/envs/cudf-udf" ]; then
- CONDA_SITE_PATH="${CONDA_HOME}/envs/cudf-udf/lib/${python_version}/site-packages"
- PATCH_PACKAGES_PATH="$PWD/package-overrides/${python_version}"
- mkdir -p ${PATCH_PACKAGES_PATH}
- TO_PATCH=(
- google
- llvmlite
- numba
- numpy
- pyarrow
- )
-
- echo creating symlinks to override conflicting packages
- for p in "${TO_PATCH[@]}"; do
- ln -f -s ${CONDA_SITE_PATH}/${p} ${PATCH_PACKAGES_PATH}
- done
-fi
+PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
+# Set the path of python site-packages, and install packages here.
+PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"
+# Use "python -m pip install" to make sure pip matches with python.
+$PYSPARK_PYTHON -m pip install --target $PYTHON_SITE_PACKAGES pytest sre_yield requests pandas pyarrow findspark pytest-xdist pytest-order
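
The new setup flow drops the Databricks conda environment entirely: it bootstraps pip for whatever interpreter PYSPARK_PYTHON points at and installs the test dependencies into a user-writable directory that run_it.sh later puts on PYTHONPATH. A condensed sketch of that pattern follows; the package list is trimmed for brevity and the PYTHONPATH export stands in for the fuller path assembled by run_it.sh.

```bash
#!/bin/bash
set -ex

# Use the same interpreter for the driver and the workers
export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}

# Bootstrap pip only when the interpreter does not already ship it
if [ -z "$($PYSPARK_PYTHON -m pip --version || true)" ]; then
    curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
    $PYSPARK_PYTHON get-pip.py && rm get-pip.py
fi

# Install test dependencies into a user-writable target directory...
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"
$PYSPARK_PYTHON -m pip install --target "$PYTHON_SITE_PACKAGES" pytest pandas pyarrow

# ...and expose it to the interpreter at run time
export PYTHONPATH="$PYTHON_SITE_PACKAGES:$PYTHONPATH"
```
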
diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh
index b5dc2bbaee7..c8a8cff7633 100755
--- a/jenkins/databricks/test.sh
+++ b/jenkins/databricks/test.sh
@@ -19,7 +19,7 @@
# All the environments can be overwritten by shell variables:
# LOCAL_JAR_PATH: Location of the RAPIDS jars
# SPARK_CONF: Spark configuration parameters
-# BASE_SPARK_VERSION: Spark version [3.1.2, 3.2.1, 3.3.0]. Default is pulled from current instance.
+# BASE_SPARK_VERSION: Spark version [3.2.1, 3.3.0]. Default is pulled from current instance.
# SHUFFLE_SPARK_SHIM: Set the default value for the shuffle shim. For databricks versions, append
# db. Example: spark330 => spark330db
# ICEBERG_VERSION: The iceberg version. To find the list of supported ICEBERG versions,
@@ -27,7 +27,6 @@
# SCALA_BINARY_VER: Scala version of the provided binaries. Default is 2.12.
# TEST_MODE: Can be one of the following (`DEFAULT` is the default value):
# - DEFAULT: all tests except cudf_udf tests
-# - CUDF_UDF_ONLY: cudf_udf tests only, requires extra conda cudf-py lib
# - ICEBERG_ONLY: iceberg tests only
# - DELTA_LAKE_ONLY: delta_lake tests only
# - MULTITHREADED_SHUFFLE: shuffle tests only
@@ -59,33 +58,20 @@ SCALA_BINARY_VER=${SCALA_BINARY_VER:-'2.12'}
# install required packages
sudo apt -y install zip unzip
-# Try to use "cudf-udf" conda environment for the python cudf-udf tests.
-if [ -d "${CONDA_HOME}/envs/cudf-udf" ]; then
- export PATH=${CONDA_HOME}/envs/cudf-udf/bin:${CONDA_HOME}/bin:$PATH
- export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python
+# Set PYSPARK_PYTHON to keep the driver and worker python versions consistent.
+export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
+# Install pip if this python does not already have it.
+if [ -z "$($PYSPARK_PYTHON -m pip --version || true)" ]; then
+ curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+ $PYSPARK_PYTHON get-pip.py && rm get-pip.py
fi
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
-sw_versions[PYTHON]=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
-
-# override incompatible versions between databricks and cudf
-if [ -d "${CONDA_HOME}/envs/cudf-udf" ]; then
- CONDA_SITE_PATH="${CONDA_HOME}/envs/cudf-udf/lib/${sw_versions[PYTHON]}/site-packages"
- PATCH_PACKAGES_PATH="$PWD/package-overrides/${sw_versions[PYTHON]}"
- mkdir -p ${PATCH_PACKAGES_PATH}
- TO_PATCH=(
- google
- llvmlite
- numba
- numpy
- pyarrow
- )
-
- echo creating symlinks to override conflicting packages
- for p in "${TO_PATCH[@]}"; do
- ln -f -s ${CONDA_SITE_PATH}/${p} ${PATCH_PACKAGES_PATH}
- done
-fi
+PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
+# Set the path of python site-packages, and install packages here.
+PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"
+# Use "python -m pip install" to make sure pip matches with python.
+$PYSPARK_PYTHON -m pip install --target $PYTHON_SITE_PACKAGES pytest sre_yield requests pandas pyarrow findspark pytest-xdist pytest-order
export SPARK_HOME=/databricks/spark
# change to not point at databricks confs so we don't conflict with their settings
@@ -100,32 +86,20 @@ case "$BASE_SPARK_VERSION" in
# Available versions https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/
sw_versions[ICEBERG]=${ICEBERG_VERSION:-'0.13.2'}
;;
- "3.1.2")
- # Available versions https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/
- sw_versions[ICEBERG]=${ICEBERG_VERSION:-'0.13.2'}
- ;;
*) echo "Unexpected Spark version: $BASE_SPARK_VERSION"; exit 1;;
esac
# Set the iceberg_spark to something like 3.3 for DB11.3, 3.2 for DB10.4
sw_versions[ICEBERG_SPARK]=$(echo $BASE_SPARK_VERSION | cut -d. -f1,2)
# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
-# Set the path of python site-packages
-PYTHON_SITE_PACKAGES=/databricks/python3/lib/${sw_versions[PYTHON]}/site-packages
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koalas is deprecated for DB10.4+ and it is recommended to use Pandas API on Spark instead.
-export PYTHONPATH=$PATCH_PACKAGES_PATH:$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
+export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
sudo chmod 777 /databricks/data/logs/
sudo chmod 777 /databricks/data/logs/*
echo { \"port\":\"15002\" } > ~/.databricks-connect
-CUDF_UDF_TEST_ARGS="--conf spark.python.daemon.module=rapids.daemon_databricks \
- --conf spark.rapids.memory.gpu.minAllocFraction=0 \
- --conf spark.rapids.memory.gpu.allocFraction=0.1 \
- --conf spark.rapids.python.memory.gpu.allocFraction=0.1 \
- --conf spark.rapids.python.concurrentPythonWorkers=2"
-
## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
if [ -n "$SPARK_CONF" ]; then
CONF_LIST=${SPARK_CONF//','/' '}
@@ -140,13 +114,12 @@ if [ -n "$SPARK_CONF" ]; then
SPARK_CONF="--conf ${SPARK_CONF/','/' --conf '}"
fi
-IS_SPARK_311_OR_LATER=0
-[[ "$(printf '%s\n' "3.1.1" "$BASE_SPARK_VERSION" | sort -V | head -n1)" = "3.1.1" ]] && IS_SPARK_311_OR_LATER=1
+IS_SPARK_321_OR_LATER=0
+[[ "$(printf '%s\n' "3.2.1" "$BASE_SPARK_VERSION" | sort -V | head -n1)" = "3.2.1" ]] && IS_SPARK_321_OR_LATER=1
# TEST_MODE
# - DEFAULT: all tests except cudf_udf tests
-# - CUDF_UDF_ONLY: cudf_udf tests only, requires extra conda cudf-py lib
# - ICEBERG_ONLY: iceberg tests only
# - DELTA_LAKE_ONLY: delta_lake tests only
# - MULTITHREADED_SHUFFLE: shuffle tests only
@@ -190,19 +163,12 @@ if [ -d "$LOCAL_JAR_PATH" ]; then
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE
## Run cache tests
- if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
+ if [[ "$IS_SPARK_321_OR_LATER" -eq "1" ]]; then
PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
fi
fi
- if [[ "$TEST_MODE" == "CUDF_UDF_ONLY" ]]; then
- ## Run cudf-udf tests
- CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
- LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
- bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
- fi
-
if [[ "$TEST_MODE" == "DEFAULT" || "$TEST_MODE" == "ICEBERG_ONLY" ]]; then
## Run Iceberg tests
LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS="$SPARK_CONF $ICEBERG_CONFS" TEST_PARALLEL=1 \
@@ -214,19 +180,12 @@ else
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE
## Run cache tests
- if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
+ if [[ "$IS_SPARK_321_OR_LATER" -eq "1" ]]; then
PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
fi
fi
- if [[ "$TEST_MODE" == "CUDF_UDF_ONLY" ]]; then
- ## Run cudf-udf tests
- CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
- SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=0 \
- bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
- fi
-
if [[ "$TEST_MODE" == "DEFAULT" || "$TEST_MODE" == "ICEBERG_ONLY" ]]; then
## Run Iceberg tests
SPARK_SUBMIT_FLAGS="$SPARK_CONF $ICEBERG_CONFS" TEST_PARALLEL=1 \
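
The IS_SPARK_321_OR_LATER switch introduced above relies on `sort -V`, which orders version strings numerically: if the minimum version sorts first, the version under test is at least that minimum. A small self-contained sketch of the same check, with an illustrative default for BASE_SPARK_VERSION:

```bash
#!/bin/bash
# Returns success when $2 is greater than or equal to $1 in version order
version_at_least() {
    local min="$1" actual="$2"
    [[ "$(printf '%s\n' "$min" "$actual" | sort -V | head -n1)" == "$min" ]]
}

BASE_SPARK_VERSION=${BASE_SPARK_VERSION:-"3.3.0"}   # illustrative default
IS_SPARK_321_OR_LATER=0
version_at_least "3.2.1" "$BASE_SPARK_VERSION" && IS_SPARK_321_OR_LATER=1
echo "IS_SPARK_321_OR_LATER=$IS_SPARK_321_OR_LATER"
```
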
diff --git a/jenkins/deploy.sh b/jenkins/deploy.sh
index 10ce355718c..d6511eec9db 100755
--- a/jenkins/deploy.sh
+++ b/jenkins/deploy.sh
@@ -36,17 +36,20 @@ set -ex
SIGN_FILE=${1:-"false"}
DIST_PL=${DIST_PL:-"dist"}
-SQL_PL=${SQL_PL:-"sql-plugin"}
-POM_FILE=${POM_FILE:-`find "$DIST_PL/target/extra-resources/" -name pom.xml`}
-OUT_PATH=${OUT_PATH:-"$DIST_PL/target"}
-SIGN_TOOL=${SIGN_TOOL:-"gpg"}
-MVN_SETTINGS=${MVN_SETTINGS:-"jenkins/settings.xml"}
-MVN="mvn -B -Dmaven.wagon.http.retryHandler.count=3 -DretryFailedDeploymentCount=3 -s $MVN_SETTINGS"
###### Build the path of jar(s) to be deployed ######
+MVN_SETTINGS=${MVN_SETTINGS:-"jenkins/settings.xml"}
+MVN="mvn -B -Dmaven.wagon.http.retryHandler.count=3 -DretryFailedDeploymentCount=3 -s $MVN_SETTINGS"
ART_ID=`$MVN help:evaluate -q -pl $DIST_PL -Dexpression=project.artifactId -DforceStdout`
+ART_GROUP_ID=`$MVN help:evaluate -q -pl $DIST_PL -Dexpression=project.groupId -DforceStdout`
ART_VER=`$MVN help:evaluate -q -f $DIST_PL -Dexpression=project.version -DforceStdout`
CUDA_CLASSIFIER=`mvn help:evaluate -q -pl $DIST_PL -Dexpression=cuda.version -DforceStdout`
+
+SQL_PL=${SQL_PL:-"sql-plugin"}
+POM_FILE=${POM_FILE:-"$DIST_PL/target/parallel-world/META-INF/maven/${ART_GROUP_ID}/${ART_ID}/pom.xml"}
+OUT_PATH=${OUT_PATH:-"$DIST_PL/target"}
+SIGN_TOOL=${SIGN_TOOL:-"gpg"}
+
FPATH="$OUT_PATH/$ART_ID-$ART_VER"
cp $FPATH-$CUDA_CLASSIFIER.jar $FPATH.jar
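
deploy.sh now derives every coordinate it needs, including the group id, from the build itself via `mvn help:evaluate`, and points POM_FILE at the reduced pom that the dist build places under target/parallel-world. The sketch below shows that lookup pattern; it assumes it is run from the repository root with Maven configured, and omits the settings-file and retry flags used by the real script.

```bash
#!/bin/bash
set -e

DIST_PL=${DIST_PL:-"dist"}
MVN="mvn -B"

# -q -DforceStdout makes help:evaluate print only the evaluated expression
ART_ID=$($MVN help:evaluate -q -pl $DIST_PL -Dexpression=project.artifactId -DforceStdout)
ART_GROUP_ID=$($MVN help:evaluate -q -pl $DIST_PL -Dexpression=project.groupId -DforceStdout)
ART_VER=$($MVN help:evaluate -q -pl $DIST_PL -Dexpression=project.version -DforceStdout)
CUDA_CLASSIFIER=$($MVN help:evaluate -q -pl $DIST_PL -Dexpression=cuda.version -DforceStdout)

# The pom deployed with the dist jar now lives in the parallel-world layout
POM_FILE="$DIST_PL/target/parallel-world/META-INF/maven/${ART_GROUP_ID}/${ART_ID}/pom.xml"
echo "Deploying $ART_GROUP_ID:$ART_ID:$ART_VER ($CUDA_CLASSIFIER) using $POM_FILE"
```
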
diff --git a/jenkins/spark-nightly-build.sh b/jenkins/spark-nightly-build.sh
index c507088194d..b058c9b9746 100755
--- a/jenkins/spark-nightly-build.sh
+++ b/jenkins/spark-nightly-build.sh
@@ -35,7 +35,7 @@ ART_GROUP_ID=$(mvnEval project.groupId)
ART_VER=$(mvnEval project.version)
DIST_FPATH="$DIST_PL/target/$ART_ID-$ART_VER-$CUDA_CLASSIFIER"
-DIST_POM_FPATH="$DIST_PL/target/extra-resources/META-INF/maven/$ART_GROUP_ID/$ART_ID/pom.xml"
+DIST_POM_FPATH="$DIST_PL/target/parallel-world/META-INF/maven/$ART_GROUP_ID/$ART_ID/pom.xml"
DIST_PROFILE_OPT=-Dincluded_buildvers=$(IFS=,; echo "${SPARK_SHIM_VERSIONS[*]}")
DIST_INCLUDES_DATABRICKS=${DIST_INCLUDES_DATABRICKS:-"true"}
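
The DIST_PROFILE_OPT line above uses a compact bash idiom: setting IFS inside the command substitution's subshell makes `"${array[*]}"` expand comma-separated, so the whole shim list becomes one `-Dincluded_buildvers` property. A sketch with an illustrative version list; the commented mvn invocation is hypothetical, not the exact nightly command.

```bash
#!/bin/bash
# Illustrative shim list; the real one comes from jenkins/version-def.sh
SPARK_SHIM_VERSIONS=(311 320 321 330 331)

# Join the array with commas without touching the caller's IFS
DIST_PROFILE_OPT=-Dincluded_buildvers=$(IFS=,; echo "${SPARK_SHIM_VERSIONS[*]}")
echo "$DIST_PROFILE_OPT"   # -Dincluded_buildvers=311,320,321,330,331

# Hypothetical use: build the dist module with exactly those shims included
# mvn -B install -pl dist -am "$DIST_PROFILE_OPT" -DskipTests
```
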
diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh
index 6d9e8548ce0..083ede715a5 100755
--- a/jenkins/spark-premerge-build.sh
+++ b/jenkins/spark-premerge-build.sh
@@ -99,6 +99,13 @@ rapids_shuffle_smoke_test() {
$SPARK_HOME/sbin/spark-daemon.sh start org.apache.spark.deploy.worker.Worker 1 $SPARK_MASTER
invoke_shuffle_integration_test() {
+ # check out what else is on the GPU
+ nvidia-smi
+
+ # Because the RapidsShuffleManager smoke tests run against a standalone cluster,
+ # we do not want the integration tests to launch N different applications;
+ # a single application is what is expected.
+ TEST_PARALLEL=0 \
PYSP_TEST_spark_master=$SPARK_MASTER \
PYSP_TEST_spark_cores_max=2 \
PYSP_TEST_spark_executor_cores=1 \
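
The TEST_PARALLEL=0 addition keeps pytest-xdist from spawning one Spark application per worker, so the RapidsShuffleManager smoke test runs as a single application against the standalone cluster started just above. A condensed sketch of that invocation follows; the master URL and the `-k` filter are illustrative assumptions, not the exact values used by the premerge script.

```bash
#!/bin/bash
set -e

# Assume a standalone master/worker pair is already running
SPARK_MASTER=${SPARK_MASTER:-"spark://$(hostname):7077"}

# Check what else is currently using the GPU before starting
nvidia-smi

# One application only: TEST_PARALLEL=0 disables pytest-xdist parallelism
TEST_PARALLEL=0 \
PYSP_TEST_spark_master=$SPARK_MASTER \
PYSP_TEST_spark_cores_max=2 \
PYSP_TEST_spark_executor_cores=1 \
    ./integration_tests/run_pyspark_from_build.sh -k 'shuffle'
```
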
diff --git a/jenkins/version-def.sh b/jenkins/version-def.sh
index 0ac39ca96e0..9cd5524bff1 100755
--- a/jenkins/version-def.sh
+++ b/jenkins/version-def.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -26,10 +26,10 @@ for VAR in $OVERWRITE_PARAMS; do
done
IFS=$PRE_IFS
-CUDF_VER=${CUDF_VER:-"23.02.0"}
+CUDF_VER=${CUDF_VER:-"23.04.0"}
CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda11"}
-PROJECT_VER=${PROJECT_VER:-"23.02.0"}
-PROJECT_TEST_VER=${PROJECT_TEST_VER:-"23.02.0"}
+PROJECT_VER=${PROJECT_VER:-"23.04.0"}
+PROJECT_TEST_VER=${PROJECT_TEST_VER:-"23.04.0"}
SPARK_VER=${SPARK_VER:-"3.1.1"}
# Make a best attempt to set the default value for the shuffle shim.
# Note that SPARK_VER for non-Apache Spark flavors (i.e. databricks,
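
version-def.sh centralizes the CI version defaults with the `${VAR:-default}` idiom so any of them can be overridden from the job environment, and the loop over OVERWRITE_PARAMS shown above lets a single delimited string override several at once. Below is a minimal sketch of that pattern; the comma delimiter and the eval-based export are assumptions made for illustration, not a copy of the script's exact mechanics.

```bash
#!/bin/bash
# e.g. OVERWRITE_PARAMS="CUDF_VER=23.04.0,CUDA_CLASSIFIER=cuda11"
OVERWRITE_PARAMS=${OVERWRITE_PARAMS:-""}

PRE_IFS=$IFS
IFS=","
for VAR in $OVERWRITE_PARAMS; do
    eval "export $VAR"      # assumption: each entry is KEY=VALUE
done
IFS=$PRE_IFS

# Anything not overridden falls back to the release defaults
CUDF_VER=${CUDF_VER:-"23.04.0"}
CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda11"}
PROJECT_VER=${PROJECT_VER:-"23.04.0"}
SPARK_VER=${SPARK_VER:-"3.1.1"}
echo "CUDF_VER=$CUDF_VER CUDA_CLASSIFIER=$CUDA_CLASSIFIER PROJECT_VER=$PROJECT_VER SPARK_VER=$SPARK_VER"
```
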
diff --git a/pom.xml b/pom.xml
index 9688c7acbf9..296692d64f3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -23,7 +23,7 @@
    <artifactId>rapids-4-spark-parent</artifactId>
    <name>RAPIDS Accelerator for Apache Spark Root Project</name>
    <description>The root project of the RAPIDS Accelerator for Apache Spark</description>
-   <version>23.02.0</version>
+   <version>23.04.0</version>
    <packaging>pom</packaging>
    <url>https://nvidia.github.io/spark-rapids/</url>
@@ -99,46 +99,6 @@
aggregator
-
-
- release312db
-
-
- buildver
- 312db
-
-
-
- 312db
-
- 3.4.4
- spark312db
-
- ${spark312db.version}
- ${spark312db.version}
- 2.7.4
- true
- 1.10.1
- ${spark312db.sources}
- ${spark312db.test.sources}
-
-
-
- delta-lake/delta-stub
- dist
- integration_tests
- shuffle-plugin
- sql-plugin
- tests
- udf-compiler
- aggregator
-
- release312
@@ -195,34 +155,6 @@
api_validation
-
- release314
-
-
- buildver
- 314
-
-
-
- 314
- ${spark314.version}
- ${spark314.version}
- 1.10.1
- ${spark314.sources}
- ${spark314.test.sources}
-
-
- delta-lake/delta-stub
- dist
- integration_tests
- shuffle-plugin
- sql-plugin
- tests
- udf-compiler
- aggregator
- api_validation
-
- release320
@@ -654,7 +586,8 @@
1.10.1spark${buildver}cuda11
- 23.02.0
+ 23.04.0
+ 23.04.02.122.8.0incremental
@@ -686,9 +619,7 @@
please update the snapshot-shims profile as well so it is accurate -->
3.1.13.1.2
- 3.1.2-databricks3.1.3
- 3.1.4-SNAPSHOT3.2.03.2.13.2.1.3.2.7171000.0-3
@@ -697,7 +628,7 @@
3.2.33.3.03.3.1
- 3.3.2-SNAPSHOT
+ 3.3.23.4.0-SNAPSHOT3.3.0.3.3.7180.0-2743.3.0-databricks
@@ -725,6 +656,8 @@
with the ones deployed to a remote Maven repo
-->
false
+
+ ${project.basedir}/target/${spark.version.classifier}/generated/src
311,
312,
@@ -736,13 +669,12 @@
323,
330,
331,
+ 332,
330cdh
- 332
- 312db,
321db,
330db
@@ -772,6 +704,8 @@
${databricks.buildvers},
340
+ ${all.buildvers}
+ main
@@ -907,6 +841,8 @@
true
+
+
-
-
-
-
-
-
-
-
-
-
-
@@ -1177,11 +1096,6 @@
-
-
-
-
-
@@ -1202,6 +1116,19 @@
+
+ shimplify-shim-sources
+ run
+ generate-sources
+
+
+
+
+
+
+
+
+ generate-build-infogenerate-resources
@@ -1233,6 +1160,21 @@
run
+
+ duplicate-code-detector
+
+ run
+
+ none
+
+
+
+
+
+
+
+
+
@@ -1245,6 +1187,16 @@
          <artifactId>ant-contrib</artifactId>
          <version>1.0b3</version>
        </dependency>
+       <dependency>
+         <groupId>org.python</groupId>
+         <artifactId>jython-standalone</artifactId>
+         <version>2.7.2</version>
+       </dependency>
+       <dependency>
+         <groupId>net.sourceforge.pmd</groupId>
+         <artifactId>pmd-dist</artifactId>
+         <version>6.55.0</version>
+       </dependency>
@@ -1331,6 +1283,7 @@
-Xmx1024m${scala.javac.args}
+ ${spark.rapids.source.basedir}/target/${spark.version.classifier}/.sbt/1.0/zinc/org.scala-sbt
@@ -1460,7 +1413,6 @@
-
org.apache.maven.pluginsmaven-antrun-plugin
@@ -1538,6 +1490,8 @@
org.codehaus.mojobuild-helper-maven-plugin
+
+
add-shim-sourcesgenerate-sources
@@ -1554,6 +1508,28 @@
${spark.shim.test.sources}
+         <execution>
+           <id>add-shimple-sources</id>
+           <phase>generate-sources</phase>
+           <goals><goal>add-source</goal></goals>
+           <configuration>
+             <sources>
+               <source>${spark.shim.dest}/main/scala</source>
+               <source>${spark.shim.dest}/main/java</source>
+             </sources>
+           </configuration>
+         </execution>
+         <execution>
+           <id>add-shimple-test-sources</id>
+           <phase>generate-test-sources</phase>
+           <goals><goal>add-test-source</goal></goals>
+           <configuration>
+             <sources>
+               <source>${spark.shim.dest}/test/scala</source>
+               <source>${spark.shim.dest}/test/java</source>
+             </sources>
+           </configuration>
+         </execution>
diff --git a/scripts/generate-changelog b/scripts/generate-changelog
index 8e48dc68e4e..c6f2d5c65ef 100755
--- a/scripts/generate-changelog
+++ b/scripts/generate-changelog
@@ -44,13 +44,13 @@ Github personal access token: https://github.com/settings/tokens, and make you h
Usage:
cd spark-rapids/
- # generate changelog for releases 23.02 to 23.02
+ # generate changelog for releases 23.02 to 23.04
scripts/generate-changelog --token= \
- --releases=23.02
+ --releases=23.02,23.04
# To a separate file like /tmp/CHANGELOG.md
GITHUB_TOKEN= scripts/generate-changelog \
- --releases=23.02 \
+ --releases=23.02,23.04 \
--path=/tmp/CHANGELOG.md
"""
import os
diff --git a/shuffle-plugin/pom.xml b/shuffle-plugin/pom.xml
index 91a3cdf6e98..c5dbebaf80c 100644
--- a/shuffle-plugin/pom.xml
+++ b/shuffle-plugin/pom.xml
@@ -1,6 +1,6 @@