diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml
index f874c0ed34a..dedf2416987 100644
--- a/.github/workflows/auto-merge.yml
+++ b/.github/workflows/auto-merge.yml
@@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE
on:
pull_request_target:
branches:
- - branch-23.02
+ - branch-23.04
types: [closed]
jobs:
@@ -29,13 +29,13 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- ref: branch-23.02 # force to fetch from latest upstream instead of PR ref
+ ref: branch-23.04 # force to fetch from latest upstream instead of PR ref
- name: auto-merge job
uses: ./.github/workflows/auto-merge
env:
OWNER: NVIDIA
REPO_NAME: spark-rapids
- HEAD: branch-23.02
- BASE: branch-23.04
+ HEAD: branch-23.04
+ BASE: branch-23.06
AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR
diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
index fbba835e95b..e9bd75607dd 100644
--- a/.github/workflows/blossom-ci.yml
+++ b/.github/workflows/blossom-ci.yml
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -96,10 +96,10 @@ jobs:
java-version: 8
# add blackduck properties https://synopsys.atlassian.net/wiki/spaces/INTDOCS/pages/631308372/Methods+for+Configuring+Analysis#Using-a-configuration-file
+ # currently hardcode projects here to avoid intermittent mvn scan failures
- name: Setup blackduck properties
run: |
- PROJECTS=$(mvn -am dependency:tree | grep maven-dependency-plugin | awk '{ out="com.nvidia:"$(NF-1);print out }' | grep rapids | xargs | sed -e 's/ /,/g')
- echo detect.maven.build.command="-pl=$PROJECTS -am" >> application.properties
+ echo detect.maven.build.command="-pl=com.nvidia:rapids-4-spark-parent,com.nvidia:rapids-4-spark-sql_2.12 -am" >> application.properties
echo detect.maven.included.scopes=compile >> application.properties
- name: Run blossom action
diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml
index e781ed75758..3d034eeaa3c 100644
--- a/.github/workflows/mvn-verify-check.yml
+++ b/.github/workflows/mvn-verify-check.yml
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -46,7 +46,10 @@ jobs:
. jenkins/version-def.sh
svArrBodyNoSnapshot=$(printf ",{\"spark-version\":\"%s\",\"isSnapshot\":false}" "${SPARK_SHIM_VERSIONS_NOSNAPSHOTS_TAIL[@]}")
svArrBodyNoSnapshot=${svArrBodyNoSnapshot:1}
- svArrBodySnapshot=$(printf ",{\"spark-version\":\"%s\",\"isSnapshot\":true}" "${SPARK_SHIM_VERSIONS_SNAPSHOTS_ONLY[@]}")
+ # do not add empty snapshot versions
+ if [ ${#SPARK_SHIM_VERSIONS_SNAPSHOTS_ONLY[@]} -gt 0 ]; then
+ svArrBodySnapshot=$(printf ",{\"spark-version\":\"%s\",\"isSnapshot\":true}" "${SPARK_SHIM_VERSIONS_SNAPSHOTS_ONLY[@]}")
+ fi
# add snapshot versions which are not in snapshot property in pom file
svArrBodySnapshot+=$(printf ",{\"spark-version\":\"%s\",\"isSnapshot\":true}" 340)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 933ae8f52f9..0a328c47de7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,11 +1,223 @@
# Change log
-Generated on 2023-02-14
+Generated on 2023-04-18
+
+## Release 23.04
+
+### Features
+|||
+|:---|:---|
+|[#7985](https://github.com/NVIDIA/spark-rapids/issues/7985)|[FEA] Expose Alluxio master URL to support K8s Env|
+|[#7880](https://github.com/NVIDIA/spark-rapids/issues/7880)|[FEA] retry framework task level metrics|
+|[#7394](https://github.com/NVIDIA/spark-rapids/issues/7394)|[FEA] Support Delta Lake auto compaction|
+|[#7463](https://github.com/NVIDIA/spark-rapids/issues/7463)|[FEA] Drop support for Databricks-9.1 ML LTS|
+|[#7253](https://github.com/NVIDIA/spark-rapids/issues/7253)|[FEA] Implement OOM retry framework|
+|[#7042](https://github.com/NVIDIA/spark-rapids/issues/7042)|[FEA] Add support in the tools event parsing for ML functions, libraries, and expressions|
+
+### Performance
+|||
+|:---|:---|
+|[#7907](https://github.com/NVIDIA/spark-rapids/issues/7907)|[FEA] Optimize regexp_replace in multi-replace scenarios|
+|[#7691](https://github.com/NVIDIA/spark-rapids/issues/7691)|[FEA] Upgrade and document UCX 1.14|
+|[#6516](https://github.com/NVIDIA/spark-rapids/issues/6516)|[FEA] Enable RAPIDS Shuffle Manager smoke testing for the databricks environment|
+|[#7695](https://github.com/NVIDIA/spark-rapids/issues/7695)|[FEA] Transpile regexp_extract expression to only have the single capture group that is needed|
+|[#7393](https://github.com/NVIDIA/spark-rapids/issues/7393)|[FEA] Support Delta Lake optimized write|
+|[#6561](https://github.com/NVIDIA/spark-rapids/issues/6561)|[FEA] Make SpillableColumnarBatch inform Spill code of actual usage of the batch|
+|[#6864](https://github.com/NVIDIA/spark-rapids/issues/6864)|[BUG] Spilling logic can spill data that cannot be freed|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#8111](https://github.com/NVIDIA/spark-rapids/issues/8111)|[BUG] test_delta_delete_entire_table failed in databricks 10.4 runtime|
+|[#8074](https://github.com/NVIDIA/spark-rapids/issues/8074)|[BUG] test_parquet_read_nano_as_longs_31x failed on Dataproc|
+|[#7997](https://github.com/NVIDIA/spark-rapids/issues/7997)|[BUG] executors died with too much off heap in yarn UCX CI `udf_test`|
+|[#8067](https://github.com/NVIDIA/spark-rapids/issues/8067)|[BUG] extras jar sometimes fails to load|
+|[#8038](https://github.com/NVIDIA/spark-rapids/issues/8038)|[BUG] vector leaked when running NDS 3TB with memory restricted|
+|[#8030](https://github.com/NVIDIA/spark-rapids/issues/8030)|[BUG] test_re_replace_no_unicode_fallback test fails on integration tests Yarn|
+|[#7971](https://github.com/NVIDIA/spark-rapids/issues/7971)|[BUG] withRestoreOnRetry should look at Throwable causes in addition to retry OOMs|
+|[#6990](https://github.com/NVIDIA/spark-rapids/issues/6990)|[BUG] Several integration test failures in Spark-3.4 SNAPSHOT build|
+|[#7924](https://github.com/NVIDIA/spark-rapids/issues/7924)|[BUG] Physical plan for regexp_extract does not escape newlines|
+|[#7341](https://github.com/NVIDIA/spark-rapids/issues/7341)|[BUG] Leverage OOM retry framework for ORC writes|
+|[#7921](https://github.com/NVIDIA/spark-rapids/issues/7921)|[BUG] ORC writes with bloom filters enabled do not fall back to the CPU|
+|[#7818](https://github.com/NVIDIA/spark-rapids/issues/7818)|[BUG] Reuse of broadcast exchange can lead to unnecessary CPU fallback|
+|[#7904](https://github.com/NVIDIA/spark-rapids/issues/7904)|[BUG] test_write_sql_save_table sporadically fails on Pascal|
+|[#7922](https://github.com/NVIDIA/spark-rapids/issues/7922)|[BUG] YARN IT test test_optimized_hive_ctas_basic failures|
+|[#7933](https://github.com/NVIDIA/spark-rapids/issues/7933)|[BUG] NDS running hits DPP error on Databricks 10.4 when enable Alluxio cache.|
+|[#7850](https://github.com/NVIDIA/spark-rapids/issues/7850)|[BUG] nvcomp usage for the UCX mode of the shuffle manager is broken|
+|[#7927](https://github.com/NVIDIA/spark-rapids/issues/7927)|[BUG] Shimplify adding new shim layer fails|
+|[#6138](https://github.com/NVIDIA/spark-rapids/issues/6138)|[BUG] cast timezone-awareness check positive for date/time-unrelated types|
+|[#7914](https://github.com/NVIDIA/spark-rapids/issues/7914)|[BUG] Parquet read with integer upcast crashes|
+|[#6961](https://github.com/NVIDIA/spark-rapids/issues/6961)|[BUG] Using `\d` (or others) inside a character class results in "Unsupported escape character" |
+|[#7908](https://github.com/NVIDIA/spark-rapids/issues/7908)|[BUG] Interpolate spark.version.classifier into scala:compile `secondaryCacheDir`|
+|[#7707](https://github.com/NVIDIA/spark-rapids/issues/7707)|[BUG] IndexOutOfBoundsException when joining on 2 integer columns with DPP|
+|[#7892](https://github.com/NVIDIA/spark-rapids/issues/7892)|[BUG] Invalid or unsupported escape character `t` when trying to use tab in regexp_replace|
+|[#7640](https://github.com/NVIDIA/spark-rapids/issues/7640)|[BUG] GPU OOM using GpuRegExpExtract|
+|[#7814](https://github.com/NVIDIA/spark-rapids/issues/7814)|[BUG] GPU's output differs from CPU's for big decimals when joining by sub-partitioning algorithm|
+|[#7796](https://github.com/NVIDIA/spark-rapids/issues/7796)|[BUG] Parquet chunked reader size of output exceeds column size limit|
+|[#7833](https://github.com/NVIDIA/spark-rapids/issues/7833)|[BUG] run_pyspark_from_build computes 5 MiB per runner instead of 5 GiB|
+|[#7855](https://github.com/NVIDIA/spark-rapids/issues/7855)|[BUG] shuffle_test test_hash_grpby_sum failed OOM in premerge CI|
+|[#7858](https://github.com/NVIDIA/spark-rapids/issues/7858)|[BUG] HostToGpuCoalesceIterator leaks all host batches|
+|[#7826](https://github.com/NVIDIA/spark-rapids/issues/7826)|[BUG] buildall dist jar contains aggregator dependency|
+|[#7729](https://github.com/NVIDIA/spark-rapids/issues/7729)|[BUG] Active GPU thread not holding the semaphore|
+|[#7820](https://github.com/NVIDIA/spark-rapids/issues/7820)|[BUG] Restore pandas require_minimum_pandas_version() check|
+|[#7829](https://github.com/NVIDIA/spark-rapids/issues/7829)|[BUG] Parquet buffer time not correct with multithreaded combining reader|
+|[#7819](https://github.com/NVIDIA/spark-rapids/issues/7819)|[BUG] GpuDeviceManager allows setting UVM regardless of other RMM configs|
+|[#7643](https://github.com/NVIDIA/spark-rapids/issues/7643)|[BUG] Databricks init scripts can fail silently|
+|[#7799](https://github.com/NVIDIA/spark-rapids/issues/7799)|[BUG] Cannot lexicographic compare a table with a LIST of STRUCT column at ai.rapids.cudf.Table.sortOrder|
+|[#7767](https://github.com/NVIDIA/spark-rapids/issues/7767)|[BUG] VS Code / Metals / Bloop integration fails with java.lang.RuntimeException: 'boom' |
+|[#6383](https://github.com/NVIDIA/spark-rapids/issues/6383)|[SPARK-40066][SQL] ANSI mode: always return null on invalid access to map column|
+|[#7093](https://github.com/NVIDIA/spark-rapids/issues/7093)|[BUG] Spark-3.4 - Integration test failures in map_test|
+|[#7779](https://github.com/NVIDIA/spark-rapids/issues/7779)|[BUG] AlluxioUtilsSuite uses illegal character underscore in URI scheme|
+|[#7725](https://github.com/NVIDIA/spark-rapids/issues/7725)|[BUG] cache_test failed w/ ParquetCachedBatchSerializer in spark 3.3.2-SNAPSHOT|
+|[#7639](https://github.com/NVIDIA/spark-rapids/issues/7639)|[BUG] Databricks premerge failing with cannot find pytest|
+|[#7694](https://github.com/NVIDIA/spark-rapids/issues/7694)|[BUG] Spark-3.4 build breaks due to removing InternalRowSet|
+|[#6598](https://github.com/NVIDIA/spark-rapids/issues/6598)|[BUG] CUDA error when casting large column vector from long to string|
+|[#7739](https://github.com/NVIDIA/spark-rapids/issues/7739)|[BUG] udf_test failed in databricks 11.3 ENV|
+|[#5748](https://github.com/NVIDIA/spark-rapids/issues/5748)|[BUG] 3 cast tests fails on Spark 3.4.0|
+|[#7688](https://github.com/NVIDIA/spark-rapids/issues/7688)|[BUG] GpuParquetScan fails with NullPointerException - Delta CDF query|
+|[#7648](https://github.com/NVIDIA/spark-rapids/issues/7648)|[BUG] java.lang.ClassCastException: SerializeConcatHostBuffersDeserializeBatch cannot be cast to.HashedRelation|
+|[#6988](https://github.com/NVIDIA/spark-rapids/issues/6988)|[BUG] Integration test failures with DecimalType on Spark-3.4 SNAPSHOT build|
+|[#7615](https://github.com/NVIDIA/spark-rapids/issues/7615)|[BUG] Build fails on Spark 3.4|
+|[#7557](https://github.com/NVIDIA/spark-rapids/issues/7557)|[AUDIT][SPARK-41970] Introduce SparkPath for typesafety|
+|[#7617](https://github.com/NVIDIA/spark-rapids/issues/7617)|[BUG] Build 340 failed due to miss shim code for GpuShuffleMeta|
+
+### PRs
+|||
+|:---|:---|
+|[#8109](https://github.com/NVIDIA/spark-rapids/pull/8109)|Bump up JNI and private version to released 23.04.0|
+|[#7939](https://github.com/NVIDIA/spark-rapids/pull/7939)|[Doc]update download docs for 2304 version[skip ci]|
+|[#8127](https://github.com/NVIDIA/spark-rapids/pull/8127)|Avoid SQL result check of Delta Lake full delete on Databricks|
+|[#8098](https://github.com/NVIDIA/spark-rapids/pull/8098)|Fix loading of ORC files with missing column names|
+|[#8110](https://github.com/NVIDIA/spark-rapids/pull/8110)|Update ML integration page docs page [skip ci]|
+|[#8103](https://github.com/NVIDIA/spark-rapids/pull/8103)|Add license of spark-rapids private in NOTICE-binary[skip ci]|
+|[#8100](https://github.com/NVIDIA/spark-rapids/pull/8100)|Update/improve EMR getting started documentation [skip ci]|
+|[#8101](https://github.com/NVIDIA/spark-rapids/pull/8101)|Improve OOM exception messages|
+|[#8087](https://github.com/NVIDIA/spark-rapids/pull/8087)|Add an FAQ entry on encryption support [skip ci]|
+|[#8076](https://github.com/NVIDIA/spark-rapids/pull/8076)|Add in docs about RetryOOM [skip ci]|
+|[#8077](https://github.com/NVIDIA/spark-rapids/pull/8077)|Temporarily skip `test_parquet_read_nano_as_longs_31x` on dataproc|
+|[#8071](https://github.com/NVIDIA/spark-rapids/pull/8071)|Fix error in deploy script [skip ci]|
+|[#8070](https://github.com/NVIDIA/spark-rapids/pull/8070)|Fixes closed RapidsShuffleHandleImpl leak in ShuffleBufferCatalog|
+|[#8069](https://github.com/NVIDIA/spark-rapids/pull/8069)|Fix loading extra jar|
+|[#8044](https://github.com/NVIDIA/spark-rapids/pull/8044)|Fall back to CPU if `spark.sql.legacy.parquet.nanosAsLong` is set|
+|[#8049](https://github.com/NVIDIA/spark-rapids/pull/8049)|[DOC] Adding user tool info to main qualification docs page [skip ci]|
+|[#8040](https://github.com/NVIDIA/spark-rapids/pull/8040)|Fix device vector leak in RmmRetryIterator.splitSpillableInHalfByRows|
+|[#8031](https://github.com/NVIDIA/spark-rapids/pull/8031)|Fix regexp_replace integration test that should fallback when unicode is disabled|
+|[#7828](https://github.com/NVIDIA/spark-rapids/pull/7828)|Fallback to arena allocator if RMM failed to initialize with async allocator|
+|[#8006](https://github.com/NVIDIA/spark-rapids/pull/8006)|Handle caused-by retry exceptions in withRestoreOnRetry|
+|[#8013](https://github.com/NVIDIA/spark-rapids/pull/8013)|[Doc] Adding user tools info into EMR getting started guide [skip ci]|
+|[#8007](https://github.com/NVIDIA/spark-rapids/pull/8007)|Fix leak where RapidsShuffleIterator for a completed task was kept alive|
+|[#8010](https://github.com/NVIDIA/spark-rapids/pull/8010)|Specify that UCX should be 1.12.1 only [skip ci]|
+|[#7967](https://github.com/NVIDIA/spark-rapids/pull/7967)|Transpile simple choice-type regular expressions into lists of choices to use with string replace multi|
+|[#7902](https://github.com/NVIDIA/spark-rapids/pull/7902)|Add oom retry handling for createGatherer in gpu hash joins|
+|[#7986](https://github.com/NVIDIA/spark-rapids/pull/7986)|Provides a config to expose Alluxio master URL to support K8s Env|
+|[#7936](https://github.com/NVIDIA/spark-rapids/pull/7936)|Stop showing internal details of ternary expressions in SparkPlan.toString|
+|[#7972](https://github.com/NVIDIA/spark-rapids/pull/7972)|Add in retry for ORC writes|
+|[#7975](https://github.com/NVIDIA/spark-rapids/pull/7975)|Publish documentation for private configs|
+|[#7976](https://github.com/NVIDIA/spark-rapids/pull/7976)|Disable GPU write for ORC and Parquet, if bloom-filters are enabled.|
+|[#7925](https://github.com/NVIDIA/spark-rapids/pull/7925)|Inject RetryOOM in CI where retry iterator is used|
+|[#7970](https://github.com/NVIDIA/spark-rapids/pull/7970)|[DOCS] Updating qual tool docs from latest in tools repo|
+|[#7952](https://github.com/NVIDIA/spark-rapids/pull/7952)|Add in minimal retry metrics|
+|[#7884](https://github.com/NVIDIA/spark-rapids/pull/7884)|Add Python requirements file for integration tests|
+|[#7958](https://github.com/NVIDIA/spark-rapids/pull/7958)|Add CheckpointRestore trait and withRestoreOnRetry|
+|[#7849](https://github.com/NVIDIA/spark-rapids/pull/7849)|Fix CPU broadcast exchanges being left unreplaced due to AQE and reuse|
+|[#7944](https://github.com/NVIDIA/spark-rapids/pull/7944)|Fix issue with dynamicpruning filters used in converted GPU scans when S3 paths are replaced with alluxio|
+|[#7949](https://github.com/NVIDIA/spark-rapids/pull/7949)|Lazily unspill the stream batches for joins by sub-partitioning|
+|[#7951](https://github.com/NVIDIA/spark-rapids/pull/7951)|Fix PMD docs URL [skip ci]|
+|[#7945](https://github.com/NVIDIA/spark-rapids/pull/7945)|Enable automerge from 2304 to 2306 [skip ci]|
+|[#7935](https://github.com/NVIDIA/spark-rapids/pull/7935)|Add GPU level task metrics|
+|[#7930](https://github.com/NVIDIA/spark-rapids/pull/7930)|Add OOM Retry handling for join gather next|
+|[#7942](https://github.com/NVIDIA/spark-rapids/pull/7942)|Revert "Upgrade to UCX 1.14.0 (#7877)"|
+|[#7889](https://github.com/NVIDIA/spark-rapids/pull/7889)|Support auto-compaction for Delta tables on|
+|[#7937](https://github.com/NVIDIA/spark-rapids/pull/7937)|Support hashing different types for sub-partitioning|
+|[#7877](https://github.com/NVIDIA/spark-rapids/pull/7877)|Upgrade to UCX 1.14.0|
+|[#7926](https://github.com/NVIDIA/spark-rapids/pull/7926)|Fixes issue where UCX compressed tables would be decompressed multiple times|
+|[#7928](https://github.com/NVIDIA/spark-rapids/pull/7928)|Adjust assert for SparkShims: no longer a per-shim file [skip ci]|
+|[#7895](https://github.com/NVIDIA/spark-rapids/pull/7895)|Some refactor of shuffled hash join|
+|[#7894](https://github.com/NVIDIA/spark-rapids/pull/7894)|Support tagging `Cast` for timezone conditionally|
+|[#7915](https://github.com/NVIDIA/spark-rapids/pull/7915)|Fix upcast of signed integral values when reading from Parquet|
+|[#7879](https://github.com/NVIDIA/spark-rapids/pull/7879)|Retry for file read operations|
+|[#7905](https://github.com/NVIDIA/spark-rapids/pull/7905)|[Doc] Fix some documentation issue based on VPR feedback on 23.04 branch (new PR) [skip CI] |
+|[#7912](https://github.com/NVIDIA/spark-rapids/pull/7912)|[Doc] Hotfix gh-pages for compatibility page format issue [skip ci]|
+|[#7913](https://github.com/NVIDIA/spark-rapids/pull/7913)|Fix resolution of GpuRapidsProcessDeltaMergeJoinExec expressions|
+|[#7916](https://github.com/NVIDIA/spark-rapids/pull/7916)|Add clarification for Delta Lake optimized write fallback due to sorting [skip ci]|
+|[#7906](https://github.com/NVIDIA/spark-rapids/pull/7906)|ColumnarToRowIterator should release the semaphore if parent is empty|
+|[#7909](https://github.com/NVIDIA/spark-rapids/pull/7909)|Interpolate buildver into secondaryCacheDir|
+|[#7844](https://github.com/NVIDIA/spark-rapids/pull/7844)|Update alluxio version to 2.9.0|
+|[#7896](https://github.com/NVIDIA/spark-rapids/pull/7896)|Update regular expression parser to handle escape character sequences|
+|[#7885](https://github.com/NVIDIA/spark-rapids/pull/7885)|Add Join Reordering Integration Test|
+|[#7862](https://github.com/NVIDIA/spark-rapids/pull/7862)|Reduce shimming of GpuFlatMapGroupsInPandasExec|
+|[#7859](https://github.com/NVIDIA/spark-rapids/pull/7859)|Remove 3.1.4-SNAPSHOT shim code|
+|[#7835](https://github.com/NVIDIA/spark-rapids/pull/7835)|Update to pull the rapids spark extra plugin jar|
+|[#7863](https://github.com/NVIDIA/spark-rapids/pull/7863)|[Doc] Address document issues [skip ci]|
+|[#7794](https://github.com/NVIDIA/spark-rapids/pull/7794)|Implement sub partitioning for large/skewed hash joins|
+|[#7864](https://github.com/NVIDIA/spark-rapids/pull/7864)|Add in basic support for OOM retry for project and filter|
+|[#7878](https://github.com/NVIDIA/spark-rapids/pull/7878)|Fixing host memory calculation to properly be 5GiB|
+|[#7860](https://github.com/NVIDIA/spark-rapids/pull/7860)|Enable manual copy-and-paste code detection [skip ci]|
+|[#7852](https://github.com/NVIDIA/spark-rapids/pull/7852)|Use withRetry in GpuCoalesceBatches|
+|[#7857](https://github.com/NVIDIA/spark-rapids/pull/7857)|Unshim getSparkShimVersion|
+|[#7854](https://github.com/NVIDIA/spark-rapids/pull/7854)|Optimize `regexp_extract*` by transpiling capture groups to non-capturing groups so that only the required capturing group is manifested|
+|[#7853](https://github.com/NVIDIA/spark-rapids/pull/7853)|Remove support for Databricks-9.1 ML LTS|
+|[#7856](https://github.com/NVIDIA/spark-rapids/pull/7856)|Update references to reduced dependencies pom [skip ci]|
+|[#7848](https://github.com/NVIDIA/spark-rapids/pull/7848)|Initialize only sql-plugin to prevent missing submodule artifacts in buildall [skip ci]|
+|[#7839](https://github.com/NVIDIA/spark-rapids/pull/7839)|Add reduced pom to dist jar in the packaging phase|
+|[#7822](https://github.com/NVIDIA/spark-rapids/pull/7822)|Add in support for OOM retry|
+|[#7846](https://github.com/NVIDIA/spark-rapids/pull/7846)|Stop releasing semaphore in GpuUserDefinedFunction|
+|[#7840](https://github.com/NVIDIA/spark-rapids/pull/7840)|Execute mvn initialize before parallel build [skip ci]|
+|[#7222](https://github.com/NVIDIA/spark-rapids/pull/7222)|Automatic conversion to shimplified directory structure|
+|[#7824](https://github.com/NVIDIA/spark-rapids/pull/7824)|Use withRetryNoSplit in BasicWindowCalc|
+|[#7842](https://github.com/NVIDIA/spark-rapids/pull/7842)|Try fix broken blackduck scan [skip ci]|
+|[#7841](https://github.com/NVIDIA/spark-rapids/pull/7841)|Hardcode scan projects [skip ci]|
+|[#7830](https://github.com/NVIDIA/spark-rapids/pull/7830)|Fix buffer and Filter time with Parquet multithreaded combine reader|
+|[#7678](https://github.com/NVIDIA/spark-rapids/pull/7678)|Premerge CI to drop support for Databricks-9.1 ML LTS|
+|[#7823](https://github.com/NVIDIA/spark-rapids/pull/7823)|[BUG] Enable managed memory only if async allocator is not used|
+|[#7821](https://github.com/NVIDIA/spark-rapids/pull/7821)|Restore pandas import check in db113 runtime|
+|[#7810](https://github.com/NVIDIA/spark-rapids/pull/7810)|UnXfail large decimal window range queries|
+|[#7771](https://github.com/NVIDIA/spark-rapids/pull/7771)|Add withRetry and withRetryNoSplit and PoC with hash aggregate|
+|[#7815](https://github.com/NVIDIA/spark-rapids/pull/7815)|Fix the hyperlink to shimplify.py [skip ci]|
+|[#7812](https://github.com/NVIDIA/spark-rapids/pull/7812)|Fallback Delta Lake optimized writes if GPU cannot support partitioning|
+|[#7791](https://github.com/NVIDIA/spark-rapids/pull/7791)|Doc changes for new nested JSON reader [skip ci]|
+|[#7797](https://github.com/NVIDIA/spark-rapids/pull/7797)|Add GPU support for EphemeralSubstring|
+|[#7561](https://github.com/NVIDIA/spark-rapids/pull/7561)|Ant task to automatically convert to a simple shim layout|
+|[#7789](https://github.com/NVIDIA/spark-rapids/pull/7789)|Update script for integration tests on Databricks|
+|[#7798](https://github.com/NVIDIA/spark-rapids/pull/7798)|Do not error out DB IT test script when pytest code 5 [skip ci]|
+|[#7787](https://github.com/NVIDIA/spark-rapids/pull/7787)|Document a workaround to RuntimeException 'boom' [skip ci]|
+|[#7786](https://github.com/NVIDIA/spark-rapids/pull/7786)|Fix nested loop joins when there's no build-side columns|
+|[#7730](https://github.com/NVIDIA/spark-rapids/pull/7730)|[FEA] Switch to `regex_program` APIs|
+|[#7788](https://github.com/NVIDIA/spark-rapids/pull/7788)|Support released spark 3.3.2|
+|[#7095](https://github.com/NVIDIA/spark-rapids/pull/7095)|Fix the failure in `map_test.py` on Spark 3.4|
+|[#7769](https://github.com/NVIDIA/spark-rapids/pull/7769)|Fix issue where GpuSemaphore can throw NPE when logDebug is on|
+|[#7780](https://github.com/NVIDIA/spark-rapids/pull/7780)|Make AlluxioUtilsSuite pass for 340|
+|[#7772](https://github.com/NVIDIA/spark-rapids/pull/7772)|Fix cache test for Spark 3.3.2|
+|[#7717](https://github.com/NVIDIA/spark-rapids/pull/7717)|Move Databricks variables into blossom-lib|
+|[#7749](https://github.com/NVIDIA/spark-rapids/pull/7749)|Support Delta Lake optimized write on Databricks|
+|[#7696](https://github.com/NVIDIA/spark-rapids/pull/7696)|Create new version of GpuBatchScanExec to fix Spark-3.4 build|
+|[#7747](https://github.com/NVIDIA/spark-rapids/pull/7747)|batched full join tracking batch does not need to be lazy|
+|[#7758](https://github.com/NVIDIA/spark-rapids/pull/7758)|Hardcode python 3.8 to be used in databricks runtime for cudf_udf ENV|
+|[#7716](https://github.com/NVIDIA/spark-rapids/pull/7716)|Clean the code of `GpuMetrics`|
+|[#7746](https://github.com/NVIDIA/spark-rapids/pull/7746)|Merge branch-23.02 into branch-23.04 [skip ci]|
+|[#7740](https://github.com/NVIDIA/spark-rapids/pull/7740)|Revert 7737 workaround for cudf setup in databricks 11.3 runtime [skip ci]|
+|[#7737](https://github.com/NVIDIA/spark-rapids/pull/7737)|Workaround for cudf setup in databricks 11.3 runtime|
+|[#7734](https://github.com/NVIDIA/spark-rapids/pull/7734)|Temporarily skip the test_parquet_read_ignore_missing on Databricks|
+|[#7728](https://github.com/NVIDIA/spark-rapids/pull/7728)|Fix estimatedNumBatches in case of OOM for Full Outer Join|
+|[#7718](https://github.com/NVIDIA/spark-rapids/pull/7718)|GpuParquetScan fails with NullPointerException during combining|
+|[#7712](https://github.com/NVIDIA/spark-rapids/pull/7712)|Enable Dynamic File Pruning on|
+|[#7702](https://github.com/NVIDIA/spark-rapids/pull/7702)|Merge 23.02 into 23.04|
+|[#7572](https://github.com/NVIDIA/spark-rapids/pull/7572)|Enables spillable/unspillable state for RapidsBuffer and allow buffer sharing|
+|[#7687](https://github.com/NVIDIA/spark-rapids/pull/7687)|Fix window tests for Spark-3.4|
+|[#7667](https://github.com/NVIDIA/spark-rapids/pull/7667)|Reenable tests originally bypassed for 3.4|
+|[#7542](https://github.com/NVIDIA/spark-rapids/pull/7542)|Support WriteFilesExec in Spark-3.4 to fix several tests|
+|[#7673](https://github.com/NVIDIA/spark-rapids/pull/7673)|Add missing spark shim test suites |
+|[#7655](https://github.com/NVIDIA/spark-rapids/pull/7655)|Fix Spark 3.4 build|
+|[#7621](https://github.com/NVIDIA/spark-rapids/pull/7621)|Document GNU sed for macOS auto-copyrighter users [skip ci]|
+|[#7618](https://github.com/NVIDIA/spark-rapids/pull/7618)|Update JNI to 23.04.0-SNAPSHOT and update new delta-stub ver to 23.04|
+|[#7541](https://github.com/NVIDIA/spark-rapids/pull/7541)|Init version 23.04.0-SNAPSHOT|
## Release 23.02
### Features
|||
|:---|:---|
+|[#6420](https://github.com/NVIDIA/spark-rapids/issues/6420)|[FEA]Support HiveTableScanExec to scan a Hive text table|
+|[#4897](https://github.com/NVIDIA/spark-rapids/issues/4897)|Profiling tool: create a section to focus on I/O metrics|
|[#6419](https://github.com/NVIDIA/spark-rapids/issues/6419)|[FEA] Support write a Hive text table |
|[#7280](https://github.com/NVIDIA/spark-rapids/issues/7280)|[FEA] Support UpdateCommand for Delta Lake|
|[#7281](https://github.com/NVIDIA/spark-rapids/issues/7281)|[FEA] Support DeleteCommand for Delta Lake|
@@ -16,6 +228,7 @@ Generated on 2023-02-14
|[#6698](https://github.com/NVIDIA/spark-rapids/issues/6698)|[FEA] Support json_tuple|
|[#6885](https://github.com/NVIDIA/spark-rapids/issues/6885)|[FEA] Support reverse|
|[#6879](https://github.com/NVIDIA/spark-rapids/issues/6879)|[FEA] Support Databricks 11.3 ML LTS|
+|[#5618](https://github.com/NVIDIA/spark-rapids/issues/5618)|Qualification tool use expressions parsed in duration and speedup factors|
### Performance
|||
@@ -30,6 +243,8 @@ Generated on 2023-02-14
### Bugs Fixed
|||
|:---|:---|
+|[#7069](https://github.com/NVIDIA/spark-rapids/issues/7069)|[BUG] GPU Hive Text Reader reads empty strings as null|
+|[#7068](https://github.com/NVIDIA/spark-rapids/issues/7068)|[BUG] GPU Hive Text Reader skips empty lines|
|[#7448](https://github.com/NVIDIA/spark-rapids/issues/7448)|[BUG] GDS cufile test failed in elder cuda runtime|
|[#7686](https://github.com/NVIDIA/spark-rapids/issues/7686)|[BUG] Large floating point values written as `Inf` not `Infinity` in Hive text writer|
|[#7703](https://github.com/NVIDIA/spark-rapids/issues/7703)|[BUG] test_basic_hive_text_write fails|
@@ -104,6 +319,7 @@ Generated on 2023-02-14
### PRs
|||
|:---|:---|
+|[#7763](https://github.com/NVIDIA/spark-rapids/pull/7763)|23.02 changelog update 2/14 [skip ci]|
|[#7761](https://github.com/NVIDIA/spark-rapids/pull/7761)|[Doc] remove xgboost demo from aws-emr doc due to nccl issue [skip ci]|
|[#7760](https://github.com/NVIDIA/spark-rapids/pull/7760)|Add notice in gds to install cuda 11.8 [skip ci]|
|[#7570](https://github.com/NVIDIA/spark-rapids/pull/7570)|[Doc] 23.02 doc updates [skip ci]|
@@ -265,6 +481,7 @@ Generated on 2023-02-14
|[#7167](https://github.com/NVIDIA/spark-rapids/pull/7167)|Handle two changes related to `FileFormatWriter` since Spark 340|
|[#7194](https://github.com/NVIDIA/spark-rapids/pull/7194)|Skip tests that fail due to recent cuDF changes related to end of string/line anchors|
|[#7170](https://github.com/NVIDIA/spark-rapids/pull/7170)|Fix the `limit_test` failures on Spark 3.4|
+|[#7075](https://github.com/NVIDIA/spark-rapids/pull/7075)|Fix the failure of `test_array_element_at_zero_index_fail` on Spark3.4|
|[#7126](https://github.com/NVIDIA/spark-rapids/pull/7126)|Fix support for binary encoded decimal for parquet|
|[#7113](https://github.com/NVIDIA/spark-rapids/pull/7113)|Use an improved API for appending binary to host vector|
|[#7130](https://github.com/NVIDIA/spark-rapids/pull/7130)|Enable chunked parquet reads by default|
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index c48b76dd91e..24529414347 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -152,7 +152,7 @@ To this end in a pre-production build you can set the Boolean property
The time saved is more significant if you are merely changing
the `aggregator` module, or the `dist` module, or just incorporating changes from
-[spark-rapids-jni](https://github.com/NVIDIA/spark-rapids-jni/blob/branch-23.02/CONTRIBUTING.md#local-testing-of-cross-repo-contributions-cudf-spark-rapids-jni-and-spark-rapids)
+[spark-rapids-jni](https://github.com/NVIDIA/spark-rapids-jni/blob/branch-23.04/CONTRIBUTING.md#local-testing-of-cross-repo-contributions-cudf-spark-rapids-jni-and-spark-rapids)
For example, to quickly repackage `rapids-4-spark` after the
initial `./build/buildall` you can iterate by invoking
@@ -186,15 +186,20 @@ The following acronyms may appear in directory names:
|cdh |Cloudera CDH|321cdh |Cloudera CDH Spark based on Apache Spark 3.2.1|
The version-specific directory names have one of the following forms / use cases:
-- `src/main/312/scala` contains Scala source code for a single Spark version, 3.1.2 in this case
-- `src/main/312+-apache/scala`contains Scala source code for *upstream* **Apache** Spark builds,
+
+#### Version range directories
+
+The following source directory layout is deprecated. See the simplified structure below and [shimplify.md][1].
+
+* `src/main/312/scala` contains Scala source code for a single Spark version, 3.1.2 in this case
+* `src/main/312+-apache/scala` contains Scala source code for *upstream* **Apache** Spark builds,
only beginning with version Spark 3.1.2, and + signifies there is no upper version boundary
among the supported versions
-- `src/main/311until320-all` contains code that applies to all shims between 3.1.1 *inclusive*,
+* `src/main/311until320-all` contains code that applies to all shims between 3.1.1 *inclusive*,
3.2.0 *exclusive*
-- `src/main/pre320-treenode` contains shims for the Catalyst `TreeNode` class before the
+* `src/main/pre320-treenode` contains shims for the Catalyst `TreeNode` class before the
[children trait specialization in Apache Spark 3.2.0](https://issues.apache.org/jira/browse/SPARK-34906).
-- `src/main/post320-treenode` contains shims for the Catalyst `TreeNode` class after the
+* `src/main/post320-treenode` contains shims for the Catalyst `TreeNode` class after the
[children trait specialization in Apache Spark 3.2.0](https://issues.apache.org/jira/browse/SPARK-34906).
For each Spark shim, we use Ant path patterns to compute the property
@@ -202,6 +207,17 @@ For each Spark shim, we use Ant path patterns to compute the property
picked up as additional source code roots. When possible path patterns are reused using
the conventions outlined in the pom.
+#### Simplified version directory structure
+
+Going forward new shim files should be added under:
+
+* `src/main/spark${buildver}`, example: `src/main/spark330db`
+* `src/test/spark${buildver}`, example: `src/test/spark340`
+
+with a special shim descriptor as a Scala/Java comment. See [shimplify.md][1]
+
+[1]: ./docs/dev/shimplify.md
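+
+As a rough illustration (see [shimplify.md][1] for the authoritative format), the descriptor is a
+comment at the top of the shim file listing, as JSON lines, every `buildver` the file applies to.
+The package and object names below are placeholders:
+
+```scala
+/*** spark-rapids-shim-json-lines
+{"spark": "330"}
+{"spark": "340"}
+spark-rapids-shim-json-lines ***/
+package com.nvidia.spark.rapids.shims
+
+// the shim-specific code follows the descriptor comment
+object ExampleShim
+```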
+
### Setting up an Integrated Development Environment
Our project currently uses `build-helper-maven-plugin` for shimming against conflicting definitions of superclasses
@@ -238,7 +254,12 @@ Known Issues:
* There is a known issue that the test sources added via the `build-helper-maven-plugin` are not handled
[properly](https://youtrack.jetbrains.com/issue/IDEA-100532). The workaround is to `mark` the affected folders
-such as `tests/src/test/320+-noncdh-nondb` manually as `Test Sources Root`
+such as
+
+ * `tests/src/test/320+-noncdh-nondb`
+ * `tests/src/test/spark340`
+
+manually as `Test Sources Root`
* There is a known issue where, even after selecting a different Maven profile in the Maven submenu,
the source folders from a previously selected profile may remain active. As a workaround,
@@ -264,7 +285,7 @@ interested in. For example, to generate the Bloop projects for the Spark 3.2.0 d
just for the production code run:
```shell script
-mvn install ch.epfl.scala:maven-bloop_2.13:1.4.9:bloopInstall -pl aggregator -am \
+mvn install ch.epfl.scala:bloop-maven-plugin:bloopInstall -pl aggregator -am \
-DdownloadSources=true \
-Dbuildver=320 \
-DskipTests \
@@ -296,7 +317,7 @@ You can now open the spark-rapids as a
Read on for VS Code Scala Metals instructions.
-# Bloop, Scala Metals, and Visual Studio Code
+#### Bloop, Scala Metals, and Visual Studio Code
_Last tested with 1.63.0-insider (Universal) Commit: bedf867b5b02c1c800fbaf4d6ce09cefba_
@@ -338,6 +359,29 @@ jps -l
72349 scala.meta.metals.Main
```
+##### Known Issues
+
+###### java.lang.RuntimeException: boom
+
+If the Metals background compilation status appears to reset to 0% after reaching 99%
+and you see the peculiar error message [`java.lang.RuntimeException: boom`][1], you can
+work around it by making sure Metals Server (the Bloop client) and the Bloop Server are
+both running on Java 11+.
+
+1. To this end make sure that Bloop projects are generated using Java 11+
+
+ ```bash
+ JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 \
+ mvn install ch.epfl.scala:bloop-maven-plugin:bloopInstall \
+ -DdownloadSources=true \
+ -Dbuildver=331 \
+ -Dskip -DskipTests -Dmaven.javadoc.skip
+ ```
+
+1. Add [`metals.javaHome`][2] to VSCode preferences to point to Java 11+.
+
+[1]: https://github.com/sourcegraph/scip-java/blob/b7d268233f1a303f66b6d9804a68f64b1e5d7032/semanticdb-javac/src/main/java/com/sourcegraph/semanticdb_javac/SemanticdbTaskListener.java#L76
+
+[2]: https://github.com/scalameta/metals-vscode/pull/644/files#diff-04bba6a35cad1c794cbbe677678a51de13441b7a6ee8592b7b50be1f05c6f626R132
#### Other IDEs
We welcome pull requests with tips how to setup your favorite IDE!
@@ -481,6 +525,16 @@ You can confirm that the update actually has happened by either inspecting its e
`git diff` first or simply reexecuting `git commit` right away. The second time no file
modification should be triggered by the copyright year update hook and the commit should succeed.
+There is a known issue for macOS users who use the default version of `sed`: the copyright update
+script may fail and generate an unexpected file named `source-file-E`. As a workaround, please
+install GNU sed:
+
+```bash
+brew install gnu-sed
+# and add it to PATH to make it the default sed for your shell
+export PATH="/usr/local/opt/gnu-sed/libexec/gnubin:$PATH"
+```
+
### Pull request status checks
A pull request should pass all status checks before merged.
#### signoff check
diff --git a/NOTICE-binary b/NOTICE-binary
index 6d488b89a7b..52d0395f2a3 100644
--- a/NOTICE-binary
+++ b/NOTICE-binary
@@ -463,3 +463,110 @@ misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Mark Adler madler@alumni.caltech.edu
+
+--------------------------------------------------------------------------------
+This software includes the SPARK-RAPIDS PRIVATE jar with the following licenses:
+
+NVIDIA SPARK-RAPIDS PRIVATE LICENSE AGREEMENT
+
+IMPORTANT NOTICE – PLEASE READ AND AGREE BEFORE USING THE SOFTWARE.
+
+This license agreement (“Agreement”) is a legal agreement between you, whether an individual or entity ("you”) and NVIDIA Corporation ("NVIDIA") and governs the use of NVIDIA Spark-RAPIDS PRIVATE, including the software and materials provided hereunder (“SOFTWARE”).
+
+This Agreement can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this Agreement.
+
+If you don’t have the required age or authority to accept this Agreement, or if you don’t accept all the terms and conditions of this Agreement, do not use the SOFTWARE.
+
+You agree to use the SOFTWARE only for purposes that are permitted by this Agreement and any applicable law or regulation in the relevant jurisdictions.
+
+1. License.
+
+Subject to the terms of this Agreement, NVIDIA grants you a non-exclusive, revocable, non-transferable, non-sublicensable (except as expressly provided in this Agreement) license to install and use copies of the SOFTWARE in systems with NVIDIA GPUS.
+
+2. Limitations. Your license to use the SOFTWARE is restricted as follows:
+
+2.1 You may not reverse engineer, decompile, or disassemble the SOFTWARE components provided in binary form, nor attempt in any other manner to obtain source code of such SOFTWARE.
+
+2.2 You may not change or remove copyright or other proprietary notices in the SOFTWARE.
+
+2.3 Except as expressly granted in this Agreement, you may not copy, sell, rent, sublicense, transfer, distribute, modify or create derivative works of the SOFTWARE, or make its functionality available to others.
+
+2.4 You may not bypass, disable, or circumvent any technical limitation, encryption, security, digital rights management or authentication mechanism in the SOFTWARE.
+
+2.5 You may not use the SOFTWARE for the purpose of developing competing products or technologies or assisting a third party in such activities.
+
+2.6 You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license; subject to the terms in the “Components Under Other Licenses” section below.
+
+2.7 You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of use of the SOFTWARE outside of the scope of this Agreement or not in compliance with its terms.
+
+3. Authorized Users.
+
+You may allow employees and contractors of your entity or of your subsidiary(ies) to access and use the SOFTWARE from your secure network to perform the work authorized by this Agreement on your behalf. If you are an academic institution, you may allow users enrolled or employed by the academic institution to access and use the SOFTWARE as authorized by this Agreement from your secure network. You are responsible for the compliance with the terms of this Agreement by your authorized users. Any act or omission that if committed by you would constitute a breach of this Agreement will be deemed to constitute a breach of this Agreement if committed by your authorized users.
+
+4. Pre-Release Versions.
+
+SOFTWARE versions or specific features identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA offerings. You may use a pre-release SOFTWARE at your own risk, understanding that such versions are not intended for use in business-critical systems. NVIDIA may choose not to make available a commercial version of any pre-release SOFTWARE. NVIDIA may also choose to abandon development and terminate the availability of a pre-release SOFTWARE at any time without liability.
+
+5. Updates.
+
+NVIDIA may, at its option, make available patches, workarounds or other updates to the SOFTWARE. Unless the updates are provided with their separate governing terms, they are deemed part of the SOFTWARE licensed to you as provided in this Agreement.
+
+6. Components Under Other Licenses.
+
+The SOFTWARE may include or be distributed with components provided with separate legal notices or terms that accompany the components, such as open source software licenses and other license terms ("Other Licenses”). The components are subject to the applicable Other Licenses, including any proprietary notices, disclaimers, requirements and extended use rights; except that this Agreement will prevail regarding the use of third-party open source software, unless a third-party open source software license requires its license terms to prevail. Open source software license means any software, data or documentation subject to any license identified as an open source license by the Open Source Initiative (http://opensource.org), Free Software Foundation (http://www.fsf.org) or other similar open source organization or listed by the Software Package Data Exchange (SPDX) Workgroup under the Linux Foundation (http://www.spdx.org).
+
+7. Termination.
+
+This Agreement will automatically terminate without notice from NVIDIA if you fail to comply with any of the terms in this Agreement or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. Additionally, either party may terminate this Agreement at any time with prior written notice to the other party. Upon any termination, you must stop using and destroy all copies of the SOFTWARE. Upon written request, you will certify in writing that you have complied with your commitments under this section. All provisions will survive termination, except for the licenses granted to you.
+
+8. Ownership.
+
+The SOFTWARE, including all intellectual property rights, is and will remain the sole and exclusive property of NVIDIA or its licensors. Except as expressly granted in this Agreement, (i) NVIDIA reserves all rights, interests, and remedies in connection with the SOFTWARE, and (ii) no other license or right is granted to you by implication, estoppel or otherwise. You agree to cooperate with NVIDIA and provide reasonably requested information to verify your compliance with this Agreement.
+
+9. Feedback.
+
+You may, but you are not obligated to, provide suggestions, requests, fixes, modifications, enhancements, or other feedback regarding the SOFTWARE (collectively, “Feedback”). Feedback, even if designated as confidential by you, will not create any confidentiality obligation for NVIDIA or its affiliates. If you provide Feedback, you hereby grant NVIDIA, its affiliates and its designees a non-exclusive, perpetual, irrevocable, sublicensable, worldwide, royalty-free, fully paid-up and transferable license, under your intellectual property rights, to publicly perform, publicly display, reproduce, use, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution), import, create derivative works of and otherwise commercialize and exploit the Feedback at NVIDIA’s discretion. You will not give Feedback (i) that you have reason to believe is subject to any restriction that impairs the exercise of the grant stated in this section, such as third-party intellectual property rights; or (ii) subject to license terms which seek to require any product incorporating or developed using such Feedback, or other intellectual property of NVIDIA or its affiliates, to be licensed to or otherwise shared with any third party.
+
+10. Disclaimer of Warranties.
+
+THE SOFTWARE IS PROVIDED BY NVIDIA AS-IS AND WITH ALL FAULTS. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA DISCLAIMS ALL WARRANTIES AND REPRESENTATIONS OF ANY KIND, WHETHER EXPRESS, IMPLIED OR STATUTORY, RELATING TO OR ARISING UNDER THIS AGREEMENT, INCLUDING, WITHOUT LIMITATION, THE WARRANTIES OF TITLE, NONINFRINGEMENT, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, USAGE OF TRADE AND COURSE OF DEALING. WITHOUT LIMITING THE FOREGOING, NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS; THAT ANY DEFECTS OR ERRORS WILL BE CORRECTED; THAT ANY CERTAIN CONTENT WILL BE AVAILABLE; OR THAT THE SOFTWARE IS FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS. NO INFORMATION OR ADVICE GIVEN BY NVIDIA WILL IN ANY WAY INCREASE THE SCOPE OF ANY WARRANTY EXPRESSLY PROVIDED IN THIS AGREEMENT.
+
+11. Limitations of Liability.
+
+TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT WILL NVIDIA BE LIABLE FOR ANY (I) INDIRECT, PUNITIVE, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES, OR (II) DAMAGES FOR THE (A) COST OF PROCURING SUBSTITUTE GOODS, OR (B) LOSS OF PROFITS, REVENUES, USE, DATA OR GOODWILL ARISING OUT OF OR RELATED TO THIS AGREEMENT, WHETHER BASED ON BREACH OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY, OR OTHERWISE, AND EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES AND EVEN IF A PARTY'S REMEDIES FAIL THEIR ESSENTIAL PURPOSE.
+
+ADDITIONALLY, TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA’S TOTAL CUMULATIVE AGGREGATE LIABILITY FOR ANY AND ALL LIABILITIES, OBLIGATIONS OR CLAIMS ARISING OUT OF OR RELATED TO THIS AGREEMENT WILL NOT EXCEED FIVE U.S. DOLLARS (US$5).
+
+12. Governing Law and Jurisdiction.
+
+This Agreement will be governed in all respects by the laws of the United States and the laws of the State of Delaware, without regard to conflict of laws principles or the United Nations Convention on Contracts for the International Sale of Goods. The state and federal courts residing in Santa Clara County, California will have exclusive jurisdiction over any dispute or claim arising out of or related to this Agreement, and the parties irrevocably consent to personal jurisdiction and venue in those courts; except that either party may apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction.
+
+13. No Assignment.
+
+NVIDIA may assign, delegate or transfer its rights or obligations under this Agreement by any means or operation of law. You may not, without NVIDIA’s prior written consent, assign, delegate or transfer any of your rights or obligations under this Agreement by any means or operation of law, and any attempt to do so is null and void.
+
+14. Waiver.
+
+No failure or delay by a party to enforce any Agreement term or obligation will operate as a waiver by that party, nor prevent the enforcement of such term or obligation later.
+
+15. Export.
+
+You agree to comply with all applicable export, import, trade and economic sanctions laws and regulations, including U.S. Export Administration Regulations and Office of Foreign Assets Control regulations. These laws include restrictions on destinations, end-users and end-use.
+
+16. Government Use.
+
+The SOFTWARE, including related documentation (“Protected Items”) is a “Commercial product” as this term is defined at 48 C.F.R. 2.101, consisting of “commercial computer software” and “commercial computer software documentation” as such terms are used in, respectively, 48 C.F.R. 12.212 and 48 C.F.R. 227.7202 & 252.227-7014(a)(1). Before any Protected Items are supplied to the U.S. Government, you will (i) inform the U.S. Government in writing that the Protected Items are and must be treated as commercial computer software and commercial computer software documentation developed at private expense; (ii) inform the U.S. Government that the Protected Items are provided subject to the terms of this Agreement; and (iii) mark the Protected Items as commercial computer software and commercial computer software documentation developed at private expense. In no event will you permit the U.S. Government to acquire rights in Protected Items beyond those specified in 48 C.F.R. 52.227-19(b)(1)-(2) or 252.227-7013(c) except as expressly approved by NVIDIA in writing.
+
+17. Notices.
+
+Please direct your legal notices or other correspondence to NVIDIA Corporation, 2788 San Tomas Expressway, Santa Clara, California 95051, United States of America, Attention: Legal Department. If NVIDIA needs to contact you about the SOFTWARE, you consent to receive the notices by email and that such notices will satisfy any legal communication requirements.
+
+18. Force Majeure.
+
+Neither party will be liable during any period where an event or circumstance prevents or delays that party from performing its obligations under this Agreement and that event or circumstance: (i) is not within the reasonable control of that party and is not the result of that party’s negligence, and (ii) cannot be overcome or avoided by that party using reasonably diligent efforts.
+
+19. Entire Agreement.
+
+Regarding the subject matter of this Agreement, the parties agree that (i) this Agreement constitutes the entire and exclusive agreement between the parties and supersedes all prior and contemporaneous communications and (ii) any additional or different terms or conditions, whether contained in purchase orders, order acknowledgments, invoices or otherwise, will not be binding on the receiving party and are null and void. If a court of competent jurisdiction rules that a provision of this Agreement is unenforceable, that provision will be deemed modified to the extent necessary to make it enforceable and the remainder of this Agreement will continue in full force and effect. Any amendment to this Agreement must be in writing and signed by authorized representatives of both parties.
+
+(v. April 10, 2023)
diff --git a/README.md b/README.md
index 97e754dddd8..7e905afac06 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ as a `provided` dependency.
    <groupId>com.nvidia</groupId>
    <artifactId>rapids-4-spark_2.12</artifactId>
-    <version>23.02.0</version>
+    <version>23.04.0</version>
    <scope>provided</scope>
```
diff --git a/aggregator/pom.xml b/aggregator/pom.xml
index 0af849ce3d2..2b090d31517 100644
--- a/aggregator/pom.xml
+++ b/aggregator/pom.xml
@@ -22,12 +22,12 @@
        <groupId>com.nvidia</groupId>
        <artifactId>rapids-4-spark-parent</artifactId>
-        <version>23.02.0</version>
+        <version>23.04.0</version>
    </parent>
    <artifactId>rapids-4-spark-aggregator_2.12</artifactId>
    <name>RAPIDS Accelerator for Apache Spark Aggregator</name>
    <description>Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark</description>
-    <version>23.02.0</version>
+    <version>23.04.0</version>
- META-INF/maven/**
-
@@ -418,7 +415,7 @@
@@ -428,12 +425,28 @@
run
- reduce-pom-deps-in-the-jar
+ check-pom-dependencies-empty
-
+
+
+
+
+
+
+
@@ -536,14 +549,14 @@
                <groupId>com.nvidia</groupId>
                <artifactId>spark-rapids-jni</artifactId>
                <classifier>${cuda.version}</classifier>
-                <excludes>META-INF</excludes>
+                <excludes>META-INF/**</excludes>
                <outputDirectory>${project.build.directory}/parallel-world</outputDirectory>
                <overWrite>true</overWrite>
            </artifactItem>
            <artifactItem>
                <groupId>org.openucx</groupId>
                <artifactId>jucx</artifactId>
-                <excludes>META-INF</excludes>
+                <excludes>META-INF/**</excludes>
                <outputDirectory>${project.build.directory}/parallel-world</outputDirectory>
                <overWrite>true</overWrite>
diff --git a/dist/unshimmed-common-from-spark311.txt b/dist/unshimmed-common-from-spark311.txt
index 66c4d7a46aa..c1c4e4fdb1f 100644
--- a/dist/unshimmed-common-from-spark311.txt
+++ b/dist/unshimmed-common-from-spark311.txt
@@ -1,7 +1,6 @@
META-INF/DEPENDENCIES
META-INF/LICENSE
META-INF/NOTICE
-META-INF/maven/**
com/nvidia/spark/ExclusiveModeGpuDiscoveryPlugin*
com/nvidia/spark/GpuCachedBatchSerializer*
com/nvidia/spark/ParquetCachedBatchSerializer*
@@ -22,10 +21,12 @@ com/nvidia/spark/rapids/RapidsExecutorUpdateMsg*
com/nvidia/spark/rapids/RapidsShuffleHeartbeatHandler*
com/nvidia/spark/rapids/SQLExecPlugin*
com/nvidia/spark/rapids/ShimLoader*
+com/nvidia/spark/rapids/ShimReflectionUtils*
com/nvidia/spark/rapids/ShimVersion*
com/nvidia/spark/rapids/SparkShimServiceProvider*
com/nvidia/spark/rapids/SparkShimVersion*
com/nvidia/spark/rapids/SparkShims*
+com/nvidia/spark/rapids/optimizer/SQLOptimizerPlugin*
com/nvidia/spark/udf/Plugin*
org/apache/spark/sql/rapids/ExecutionPlanCaptureCallback*
org/apache/spark/sql/rapids/ProxyRapidsShuffleInternalManagerBase*
diff --git a/docs/FAQ.md b/docs/FAQ.md
index 1cf25e96b5f..def8e8b5c8b 100644
--- a/docs/FAQ.md
+++ b/docs/FAQ.md
@@ -10,7 +10,7 @@ nav_order: 12
### What versions of Apache Spark does the RAPIDS Accelerator for Apache Spark support?
-The RAPIDS Accelerator for Apache Spark requires version 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.3.0 or 3.3.1 of
+The RAPIDS Accelerator for Apache Spark requires version 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.3.0, 3.3.1 or 3.3.2 of
Apache Spark. Because the plugin replaces parts of the physical plan that Apache Spark considers to
be internal the code for those plans can change even between bug fix releases. As a part of our
process, we try to stay on top of these changes and release updates as quickly as possible.
@@ -20,7 +20,7 @@ process, we try to stay on top of these changes and release updates as quickly a
The RAPIDS Accelerator for Apache Spark officially supports:
- [Apache Spark](get-started/getting-started-on-prem.md)
- [AWS EMR 6.2+](get-started/getting-started-aws-emr.md)
-- [Databricks Runtime 9.1, 10.4, 11.3](get-started/getting-started-databricks.md)
+- [Databricks Runtime 10.4, 11.3](get-started/getting-started-databricks.md)
- [Google Cloud Dataproc 2.0](get-started/getting-started-gcp.md)
- [Azure Synapse](get-started/getting-started-azure-synapse-analytics.md)
- Cloudera provides the plugin packaged through
@@ -39,7 +39,7 @@ release.
### What hardware is supported?
-The plugin is tested and supported on P100, V100, T4, A2, A10, A30 and A100 datacenter GPUs. It is possible
+The plugin is tested and supported on P100, V100, T4, A2, A10, A30, A100 and L4 datacenter GPUs. It is possible
to run the plugin on GeForce desktop hardware with Volta or better architectures. GeForce hardware
does not support [CUDA forward
compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#forward-compatibility-title),
@@ -403,7 +403,7 @@ There are multiple reasons why this a problematic configuration:
- CUDA context switches between processes sharing a single GPU can be expensive
- Each executor would have a fraction of the GPU memory available for processing
-### Is [Multi-Instance GPU (MIG)](https://docs.nvidia.com/cuda/mig/index.html) supported?
+### Is [Multi-Instance GPU (MIG)](https://www.nvidia.com/en-gb/technologies/multi-instance-gpu/) supported?
Yes, but it requires support from the underlying cluster manager to isolate the MIG GPU instance
for each executor (e.g.: by setting `CUDA_VISIBLE_DEVICES`,
@@ -540,7 +540,7 @@ Below are some troubleshooting tips on GPU query performance issue:
`spark.sql.files.maxPartitionBytes` and `spark.rapids.sql.concurrentGpuTasks` as these configurations can affect performance of queries significantly.
Please refer to [Tuning Guide](./tuning-guide.md) for more details.
-### Why is Avro library not found by RAPIDS?
+### Why is the Avro library not found by RAPIDS?
If you are getting a warning `Avro library not found by the RAPIDS plugin.` or if you are getting the
`java.lang.NoClassDefFoundError: org/apache/spark/sql/v2/avro/AvroScan` error, make sure you ran the
@@ -561,6 +561,51 @@ use the RAPIDS Shuffle Manager, your deployment option may be limited to the ext
Starting from 22.06, the default value for `spark.rapids.memory.gpu.pool` is changed to `ASYNC` from
`ARENA` for CUDA 11.5+. For CUDA 11.4 and older, it will fall back to `ARENA`.
+### What is a `RetryOOM` or `SplitAndRetryOOM` exception?
+
+In the 23.04 release of the accelerator, two new exceptions were added to replace the
+regular `OutOfMemoryError` that was previously thrown when the GPU ran out of memory.
+Originally we used `OutOfMemoryError`, as on the CPU, on the theory that it would help
+trigger GC in case handles pointing to GPU memory were leaked in the JVM heap. But
+`OutOfMemoryError` is technically a fatal exception and recovering from it is not
+strictly supported, so Apache Spark treats it as fatal and kills the process that sees
+it. This can result in a lot of tasks being rerun if the GPU runs out of memory. The new
+exceptions prevent that. They also indicate to GPU operators that the GPU ran out of
+memory and how the operator might be able to recover. `RetryOOM` indicates that the
+operator should roll back to a known good spot and then wait until the memory allocation
+framework decides that it should be retried. `SplitAndRetryOOM` is used when only one
+task is unblocked and the only way to recover is to roll back to a good spot and split
+the input so that less total GPU memory is needed.
+
+These exceptions are not handled by all GPU operations. A number of GPU operations that
+use a significant amount of memory have been updated to handle `RetryOOM`, but fewer
+have been updated to handle `SplitAndRetryOOM`. If you do run into these exceptions
+it is an indication that you are using too much GPU memory, and the tuning guide can
+help you reduce that usage. Be aware that some algorithms, such as window operations
+over very large windows, have no way to split their memory usage. If tuning does not
+fix the problem, please file an issue to help us understand which operators may need
+better out-of-core algorithm support.
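+
+As a sketch only, lowering the configurations below (all documented on the configs page of this
+documentation) is a common first step to reduce GPU memory pressure; the values and application
+path are placeholders, and the [Tuning Guide](./tuning-guide.md) is the authoritative reference.
+
+```bash
+# Sketch: reduce GPU memory pressure by lowering concurrency and reader batch sizes.
+# The values and application path are illustrative placeholders.
+${SPARK_HOME}/bin/spark-submit \
+  --conf spark.plugins=com.nvidia.spark.SQLPlugin \
+  --conf spark.rapids.sql.concurrentGpuTasks=1 \
+  --conf spark.rapids.sql.reader.batchSizeBytes=536870912 \
+  --conf spark.rapids.sql.reader.batchSizeRows=1000000 \
+  my-application.py
+```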
+
+### Encryption Support
+
+The RAPIDS Accelerator for Apache Spark has several components that may or may not honor the
+encryption configurations that Apache Spark provides. The exceptions known at the time of writing
+this FAQ entry are documented below:
+
+Local storage encryption (`spark.io.encryption.enabled`) is not supported for spilled buffers that the
+plugin uses to help with GPU out-of-memory situations. The RAPIDS Shuffle Manager does not implement
+local storage encryption for shuffle blocks when configured for UCX, but it does when configured in
+MULTITHREADED mode.
+
+Network encryption (`spark.network.crypto.enabled`) is not supported in the RAPIDS Shuffle Manager
+when configured for UCX, but it is supported when configured in MULTITHREADED mode.
+
+If your environment has specific encryption requirements for network or IO, please make sure
+that the RAPIDS Accelerator suits your needs, and file an issue or discussion if you have doubts
+or would like expanded encryption support.
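+
+As an illustration only, a deployment that needs Spark network encryption together with the RAPIDS
+Shuffle Manager could select MULTITHREADED mode instead of UCX. The shuffle manager class below
+assumes Spark 3.3.0 (see the table in the RAPIDS Shuffle Manager documentation), and the
+application path is a placeholder.
+
+```bash
+# Sketch: MULTITHREADED shuffle mode works with spark.network.crypto.enabled; UCX mode does not.
+${SPARK_HOME}/bin/spark-submit \
+  --conf spark.plugins=com.nvidia.spark.SQLPlugin \
+  --conf spark.shuffle.manager=com.nvidia.spark.rapids.spark330.RapidsShuffleManager \
+  --conf spark.rapids.shuffle.mode=MULTITHREADED \
+  --conf spark.network.crypto.enabled=true \
+  my-application.py
+```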
+
### I have more questions, where do I go?
We use github to track bugs, feature requests, and answer questions. File an
[issue](https://github.com/NVIDIA/spark-rapids/issues/new/choose) for a bug or feature request. Ask
diff --git a/docs/additional-functionality/delta-lake-support.md b/docs/additional-functionality/delta-lake-support.md
index e55acff1e7f..a0a68587df1 100644
--- a/docs/additional-functionality/delta-lake-support.md
+++ b/docs/additional-functionality/delta-lake-support.md
@@ -50,9 +50,36 @@ operation which is typically triggered via the DataFrame `write` API, e.g.:
Table creation from selection, table insertion from SQL, and table merges are not currently
GPU accelerated. These operations will fallback to the CPU.
-[Automatic optimization](https://docs.databricks.com/optimizations/auto-optimize.html)
-during Delta Lake writes is not supported. Write operations that are configured to
-automatically optimize or automatically compact will fallback to the CPU.
+#### Automatic Optimization of Writes
+
+Delta Lake on Databricks has
+[automatic optimization](https://docs.databricks.com/optimizations/auto-optimize.html)
+features for optimized writes and automatic compaction.
+
+Optimized writes are supported only on Databricks platforms. The algorithm used is similar but
+not identical to the Databricks version. The following table describes configuration settings
+that control the operation of the optimized write.
+
+| Configuration | Default | Description |
+|-------------------------------------------------------------|---------|--------------------------------------------------------------------------------------------|
+| spark.databricks.delta.optimizeWrite.binSize | 512 | Target uncompressed partition size in megabytes |
+| spark.databricks.delta.optimizeWrite.smallPartitionFactor | 0.5 | Merge partitions smaller than this factor multiplied by the target partition size |
+| spark.databricks.delta.optimizeWrite.mergedPartitionFactor | 1.2 | Avoid combining partitions larger than this factor multiplied by the target partition size |
+
+Automatic compaction is supported only on Databricks platforms. The algorithm is similar but
+not identical to the Databricks version. The following table describes configuration settings
+that control the operation of automatic compaction.
+
+| Configuration | Default | Description |
+|---------------------------------------------------------------------|---------|--------------------------------------------------------------------------------------------------------|
+| spark.databricks.delta.autoCompact.enabled | false | Enable/disable auto compaction for writes to Delta directories |
+| spark.databricks.delta.properties.defaults.autoOptimize.autoCompact | false | Whether to enable auto compaction by default, if spark.databricks.delta.autoCompact.enabled is not set |
+| spark.databricks.delta.autoCompact.minNumFiles | 50 | Minimum number of files in the Delta directory before which auto optimize does not begin compaction |
+
+Note that optimized write support requires round-robin partitioning of the data, and round-robin
+partitioning requires sorting across all columns for deterministic operation. If the GPU cannot
+sort a particular column type as needed for the round-robin partitioning, the Delta Lake write
+will fall back to the CPU.
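+
+As a sketch only, the settings above can be supplied like any other Spark configuration, for
+example as `--conf` options or in the Databricks cluster Spark config; the values shown are the
+defaults from the tables, except for enabling auto compaction.
+
+```bash
+# Sketch: tune optimized writes and enable auto compaction (non-default) for Delta Lake writes.
+--conf spark.databricks.delta.optimizeWrite.binSize=512 \
+--conf spark.databricks.delta.optimizeWrite.smallPartitionFactor=0.5 \
+--conf spark.databricks.delta.optimizeWrite.mergedPartitionFactor=1.2 \
+--conf spark.databricks.delta.autoCompact.enabled=true \
+--conf spark.databricks.delta.autoCompact.minNumFiles=50
+```
+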
### RapidsDeltaWrite Node in Query Plans
@@ -98,6 +125,16 @@ spark.rapids.sql.command.DeleteCommandEdge=true on Databricks platforms.
Deleting data from Delta Lake tables via the SQL `DELETE FROM` statement or via the DeltaTable
`delete` API is supported.
+### num_affected_rows Difference with Databricks
+
+The Delta Lake delete command returns a single row result with a `num_affected_rows` column.
+When entire partition files in the table are deleted, the open source Delta Lake and RAPIDS
+Accelerator implementations of delete can return -1 for `num_affected_rows` since it could be
+expensive to open the files and produce an accurate row count. Databricks changed the behavior
+of delete operations that delete entire partition files to return the actual row count.
+This is only a difference in the statistics of the operation, and the table contents will still
+be accurately deleted with the RAPIDS Accelerator.
+
## Update Operations on Delta Lake Tables
Delta Lake update acceleration is experimental and is disabled by default. To enable acceleration
diff --git a/docs/additional-functionality/ml-integration.md b/docs/additional-functionality/ml-integration.md
index 35bd5970b45..bc5ba67bee3 100644
--- a/docs/additional-functionality/ml-integration.md
+++ b/docs/additional-functionality/ml-integration.md
@@ -6,19 +6,53 @@ nav_order: 1
---
# RAPIDS Accelerator for Apache Spark ML Library Integration
-There are cases where you may want to get access to the raw data on the GPU, preferably without
-copying it. One use case for this is exporting the data to an ML framework after doing feature
-extraction. To do this we provide a simple Scala utility `com.nvidia.spark.rapids.ColumnarRdd` that can
-be used to convert a `DataFrame` to an `RDD[ai.rapids.cudf.Table]`. Each `Table` will have the same
-schema as the `DataFrame` passed in.
-
-`Table` is not a typical thing in an `RDD` so special care needs to be taken when working with it.
-By default, it is not serializable so repartitioning the `RDD` or any other operator that involves
-a shuffle will not work. This is because it is relatively expensive to serialize and
+## Existing ML Libraries
+
+The RAPIDS Accelerator for Apache Spark can be used to accelerate the ETL portions (e.g., loading
+training data from parquet files) of applications using ML libraries with Spark DataFrame APIs.
+Examples of such libraries include the original [Apache Spark
+MLlib](https://spark.apache.org/mllib/), [XGBoost](https://xgboost.readthedocs.io/en/stable/),
+[Spark RAPIDS ML](https://nvidia.github.io/spark-rapids-ml/), and the [DL inference UDF
+function](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.functions.predict_batch_udf.html)
+introduced in Spark 3.4. The latter three also enable leveraging GPUs (in the case of the DL
+inference UDF, indirectly via the underlying DL framework) to accelerate the core ML algorithms, and
+thus, in conjunction with the RAPIDS Accelerator for Apache Spark for ETL, can further enhance the
+cost-benefit of GPU accelerated Spark clusters.
+
+For Spark API compatible ML libraries that implement their core ML computations inside pandas UDFs,
+such as XGBoost’s pySpark API, the Spark RAPIDS ML pySpark API, and the DL inference UDF, it is
+recommended to enable the RAPIDS Accelerator for Apache Spark’s [support for GPU accelerated pandas
+UDFs](https://nvidia.github.io/spark-rapids/docs/additional-functionality/rapids-udfs.html#gpu-support-for-pandas-udf).
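+
+As a minimal sketch, this support can be turned on with the configuration below; see the linked
+pandas UDF documentation for the complete and current set of recommended settings.
+
+```bash
+# Sketch: enable GPU-aware scheduling of pandas UDFs (experimental).
+--conf spark.rapids.sql.python.gpu.enabled=true
+```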
+
+### RMM
+
+One consideration when using the RAPIDS Accelerator for Apache Spark with a GPU accelerated ML
+library is the sharing of GPU memory between the two, as the ML library would typically have a
+distinct GPU memory manager from the RAPIDS Accelerator’s RMM instance. Accordingly, you may need
+to disable RMM pooling in the RAPIDS Accelerator via the config `spark.rapids.memory.gpu.pool` when
+exporting data to an ML library since that library will likely not have access to any of the memory
+that the RAPIDS Accelerator’s RMM instance is holding. Similarly, aggressive GPU memory reservation
+on the side of the ML library may also need to be disabled, for example via these steps in the case of
+[Tensorflow](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth).
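+
+As a minimal sketch, assuming the ML library manages its own GPU memory, RMM pooling can be
+disabled as follows; whether this is needed depends on the library and workload.
+
+```bash
+# Sketch: stop the RAPIDS Accelerator from pooling GPU memory so the ML library can allocate it.
+--conf spark.rapids.memory.gpu.pool=NONE
+```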
+
+## GPU accelerated ML Library development
+
+### ColumnarRdd
+
+When developing a GPU accelerated ML library for Spark, there are cases where you may want to get
+access to the raw data on the GPU, preferably without copying it. One use case for this is exporting
+the data to the ML library after doing feature extraction. To enable this for Scala development, the
+RAPIDS Accelerator for Apache Spark provides a simple utility `com.nvidia.spark.rapids.ColumnarRdd`
+that can be used to convert a `DataFrame` to an `RDD[ai.rapids.cudf.Table]`. Each `Table` will have
+the same schema as the `DataFrame` passed in.
+
+Note that `Table` is not a typical thing in an `RDD` so special care needs to be taken when working
+with it. By default, it is not serializable so repartitioning the `RDD` or any other operator that
+involves a shuffle will not work. This is because it is relatively expensive to serialize and
deserialize GPU data using a conventional Spark shuffle. In addition, most of the memory associated
with the `Table` is on the GPU itself. So, each `Table` must be closed when it is no longer needed
-to avoid running out of GPU memory. By convention, it is the responsibility of the one consuming
-the data to close it when they no longer need it.
+to avoid running out of GPU memory. By convention, it is the responsibility of the one consuming the
+data to close it when they no longer need it.
```scala
val df = spark.sql("""select my_column from my_table""")
@@ -32,17 +66,13 @@ val maxValue = rdd.map(table => {
}).max()
```
-## RMM
-You may need to disable RMM caching when exporting data to an ML library as that library
-will likely want to use all of the GPU's memory and if it is not aware of RMM it will not have
-access to any of the memory that RMM is holding.
-
-## Spark ML Algorithms Supported by RAPIDS Accelerator
+### Examples of Spark ML Implementations leveraging ColumnarRdd
-The [spark-rapids-examples repository](https://github.com/NVIDIA/spark-rapids-examples) provides a
-[working example](https://github.com/NVIDIA/spark-rapids-examples/tree/main/examples/ML+DL-Examples/Spark-cuML/pca)
-of accelerating the `transform` API for
-[Principal Component Analysis (PCA)](https://spark.apache.org/docs/latest/mllib-dimensionality-reduction#principal-component-analysis-pca).
-The example leverages the [RAPIDS accelerated UDF interface](rapids-udfs.md) to provide a native
-implementation of the algorithm. The details of the UDF implementation can be found in the
-[spark-rapids-ml repository](https://github.com/NVIDIA/spark-rapids-ml).
+Both the Scala Spark PCA
+[implementation](https://github.com/NVIDIA/spark-rapids-ml/blob/ab575bc46e55f38ee52906b3c3b55b75f2418459/jvm/src/main/scala/org/apache/spark/ml/linalg/distributed/RapidsRowMatrix.scala)
+in Spark RAPIDS ML and XGBoost’s [GPU accelerated Scala
+SparkAPI](https://github.com/dmlc/xgboost/blob/f1e9bbcee52159d4bd5f7d25ef539777ceac147c/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala)
+leverage ColumnarRdd (search for ColumnarRdd in these files) to accelerate data transfer between the
+RAPIDS Accelerator for Apache Spark and the respective core ML algorithm computations. XGBoost in
+particular enables this when detecting that the RAPIDS Accelerator for Apache Spark is present and
+enabled.
diff --git a/docs/additional-functionality/rapids-shuffle.md b/docs/additional-functionality/rapids-shuffle.md
index 6924ae415fe..046a850e2e2 100644
--- a/docs/additional-functionality/rapids-shuffle.md
+++ b/docs/additional-functionality/rapids-shuffle.md
@@ -26,13 +26,13 @@ in our plugin:
| 3.2.3 | com.nvidia.spark.rapids.spark323.RapidsShuffleManager |
| 3.3.0 | com.nvidia.spark.rapids.spark330.RapidsShuffleManager |
| 3.3.1 | com.nvidia.spark.rapids.spark331.RapidsShuffleManager |
-| Databricks 9.1 | com.nvidia.spark.rapids.spark312db.RapidsShuffleManager |
+| 3.3.2 | com.nvidia.spark.rapids.spark332.RapidsShuffleManager |
| Databricks 10.4 | com.nvidia.spark.rapids.spark321db.RapidsShuffleManager |
| Databricks 11.3 | com.nvidia.spark.rapids.spark330db.RapidsShuffleManager |
## Multi-Threaded Mode
-Mult-threaded mode (default) is similar to the built-in Spark shuffle, but it attempts to use
+Multi-threaded mode (default) is similar to the built-in Spark shuffle, but it attempts to use
more CPU threads for compute-intensive tasks, such as compression and decompression.
Minimum configuration:
@@ -93,8 +93,8 @@ In order to enable the RAPIDS Shuffle Manager, UCX user-space libraries and its
be installed on the host and inside Docker containers (if not baremetal). A host has additional
requirements, like the MLNX_OFED driver and `nv_peer_mem` kernel module.
-The minimum UCX requirement for the RAPIDS Shuffle Manager is
-[UCX 1.12.1](https://github.com/openucx/ucx/releases/tag/v1.12.1).
+The required UCX version for the RAPIDS Shuffle Manager is
+[UCX 1.12.1](https://github.com/openucx/ucx/releases/tag/v1.12.1). Versions higher than 1.12.1 have not been tested.
#### Baremetal
@@ -330,7 +330,7 @@ In this section, we are using a docker container built using the sample dockerfi
1. Choose the version of the shuffle manager that matches your Spark version. Please refer to
the table at the top of this document for `spark.shuffle.manager` values.
-2. Settings for UCX 1.12.1+:
+2. Settings for UCX 1.12.1:
Minimum configuration:
@@ -392,10 +392,10 @@ Save the script in DBFS and add it to the "Init Scripts" list:
2) Add the UCX minimum configuration for your Cluster.
-Databricks 9.1:
+Databricks 10.4:
```
-spark.shuffle.manager com.nvidia.spark.rapids.spark312db.RapidsShuffleManager
+spark.shuffle.manager com.nvidia.spark.rapids.spark321db.RapidsShuffleManager
spark.rapids.shuffle.mode UCX
spark.shuffle.service.enabled false
spark.executorEnv.UCX_MEMTYPE_CACHE n
diff --git a/docs/archive.md b/docs/archive.md
index 37b5c676f9d..1ac82e30bba 100644
--- a/docs/archive.md
+++ b/docs/archive.md
@@ -5,6 +5,66 @@ nav_order: 15
---
Below are archived releases for RAPIDS Accelerator for Apache Spark.
+## Release v23.02.0
+Hardware Requirements:
+
+The plugin is tested on the following architectures:
+
+ GPU Models: NVIDIA P100, V100, T4 and A2/A10/A30/A100 GPUs
+
+Software Requirements:
+
+ OS: Ubuntu 18.04, Ubuntu 20.04 or CentOS 7, Rocky Linux 8
+
+ CUDA & NVIDIA Drivers*: 11.x & v450.80.02+
+
+ Apache Spark 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.3.0, 3.3.1, Databricks 10.4 ML LTS or 11.3 ML LTS Runtime and GCP Dataproc 2.0
+
+ Python 3.6+, Scala 2.12, Java 8
+
+*Some hardware may have a minimum driver version greater than v450.80.02+. Check the GPU spec sheet
+for your hardware's minimum driver version.
+
+*For Cloudera and EMR support, please refer to the
+[Distributions](./FAQ.md#which-distributions-are-supported) section of the FAQ.
+
+### Download v23.02.0
+* Download the [RAPIDS
+ Accelerator for Apache Spark 23.02.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar)
+
+This package is built against CUDA 11.8 and all CUDA 11.x versions are supported through [CUDA forward
+compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/index.html). It is tested
+on V100, T4, A2, A10, A30 and A100 GPUs with CUDA 11.0-11.5. For those using other types of GPUs which
+do not have CUDA forward compatibility (for example, GeForce), CUDA 11.5 or later is required. Users will
+need to ensure the minimum driver (450.80.02) and CUDA toolkit are installed on each Spark node.
+
+### Verify signature
+* Download the [RAPIDS Accelerator for Apache Spark 23.02.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar)
+ and [RAPIDS Accelerator for Apache Spark 23.02.0 jars.asc](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar.asc)
+* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com).
+* Import the public key: `gpg --import PUB_KEY`
+* Verify the signature: `gpg --verify rapids-4-spark_2.12-23.02.0.jar.asc rapids-4-spark_2.12-23.02.0.jar`
+
+The output of a successful signature verification is:
+
+ gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) "
+
+### Release Notes
+New functionality and performance improvements for this release include:
+* Delta Lake MERGE/DELETE/UPDATE (experimental feature, can be enabled with a config flag)
+* Function `from_json`
+* Hive text table write
+* Databricks 11.3 ML LTS support
+* Support batched full join to improve full join's performance
+* Qualification and Profiling tool:
+ * EMR user tools support for qualification
+ * EMR user tools support for bootstrap
+ * Updated estimated speedup factors for on-prem, Dataproc, and EMR environments for qualification
+
+
+For a detailed list of changes, please refer to the
+[CHANGELOG](https://github.com/NVIDIA/spark-rapids/blob/main/CHANGELOG.md).
+
## Release v22.12.0
Hardware Requirements:
diff --git a/docs/compatibility.md b/docs/compatibility.md
index 31d084a9a46..638d1c6be91 100644
--- a/docs/compatibility.md
+++ b/docs/compatibility.md
@@ -296,38 +296,21 @@ The JSON format read is a very experimental feature which is expected to have so
it by default. If you would like to test it, you need to enable `spark.rapids.sql.format.json.enabled` and
`spark.rapids.sql.format.json.read.enabled`.
-Currently, the GPU accelerated JSON reader doesn't support column pruning, which will likely make
-this difficult to use or even test. The user must specify the full schema or just let Spark infer
-the schema from the JSON file. eg,
-
-We have a `people.json` file with below content
-
+Reading input containing invalid JSON format (in any row) will throw a runtime exception.
+An example of valid input is as follows:
``` console
-{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}
```
-Both below ways will work
-
-- Inferring the schema
-
- ``` scala
- val df = spark.read.json("people.json")
- ```
-
-- Specifying the full schema
-
- ``` scala
- val schema = StructType(Seq(StructField("name", StringType), StructField("age", IntegerType)))
- val df = spark.read.schema(schema).json("people.json")
- ```
-
-While the below code will not work in the current version,
+The following inputs are invalid and will cause errors:
+```console
+{"name":"Andy", "age":30} ,,,,
+{"name":"Justin", "age":19}
+```
-``` scala
-val schema = StructType(Seq(StructField("name", StringType)))
-val df = spark.read.schema(schema).json("people.json")
+```console
+{"name": Justin", "age":19}
```
### JSON supporting types
@@ -344,7 +327,6 @@ Due to such limitations, the input JSON schema must be `MAP` and
```
scala> val df = Seq("{}", "BAD", "{\"A\": 100}").toDF
df: org.apache.spark.sql.DataFrame = [value: string]
-
scala> df.selectExpr("from_json(value, 'MAP')").show()
+----------+
| entries|
diff --git a/docs/configs.md b/docs/configs.md
index 9105b31d8a4..95484b96cb9 100644
--- a/docs/configs.md
+++ b/docs/configs.md
@@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports.
On startup use: `--conf [conf key]=[conf value]`. For example:
```
-${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-23.02.0-cuda11.jar \
+${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-23.04.0-cuda11.jar \
--conf spark.plugins=com.nvidia.spark.SQLPlugin \
--conf spark.rapids.sql.concurrentGpuTasks=2
```
@@ -31,9 +31,12 @@ scala> spark.conf.set("spark.rapids.sql.concurrentGpuTasks", 2)
Name | Description | Default Value | Applicable at
-----|-------------|--------------|--------------
-spark.rapids.alluxio.automount.enabled|Enable the feature of auto mounting the cloud storage to Alluxio. It requires the Alluxio master is the same node of Spark driver node. When it's true, it requires an environment variable ALLUXIO_HOME be set properly. The default value of ALLUXIO_HOME is "/opt/alluxio-2.8.0". You can set it as an environment variable when running a spark-submit or you can use spark.yarn.appMasterEnv.ALLUXIO_HOME to set it on Yarn. The Alluxio master's host and port will be read from alluxio.master.hostname and alluxio.master.rpc.port(default: 19998) from ALLUXIO_HOME/conf/alluxio-site.properties, then replace a cloud path which matches spark.rapids.alluxio.bucket.regex like "s3://bar/b.csv" to "alluxio://0.1.2.3:19998/bar/b.csv", and the bucket "s3://bar" will be mounted to "/bar" in Alluxio automatically.|false|Runtime
+spark.rapids.alluxio.automount.enabled|Enable the feature of auto mounting the cloud storage to Alluxio. It requires the Alluxio master is the same node of Spark driver node. The Alluxio master's host and port will be read from alluxio.master.hostname and alluxio.master.rpc.port(default: 19998) from ALLUXIO_HOME/conf/alluxio-site.properties, then replace a cloud path which matches spark.rapids.alluxio.bucket.regex like "s3://bar/b.csv" to "alluxio://0.1.2.3:19998/bar/b.csv", and the bucket "s3://bar" will be mounted to "/bar" in Alluxio automatically.|false|Runtime
spark.rapids.alluxio.bucket.regex|A regex to decide which bucket should be auto-mounted to Alluxio. E.g. when setting as "^s3://bucket.*", the bucket which starts with "s3://bucket" will be mounted to Alluxio and the path "s3://bucket-foo/a.csv" will be replaced to "alluxio://0.1.2.3:19998/bucket-foo/a.csv". It's only valid when setting spark.rapids.alluxio.automount.enabled=true. The default value matches all the buckets in "s3://" or "s3a://" scheme.|^s3a{0,1}://.*|Runtime
+spark.rapids.alluxio.home|The Alluxio installation home path or link to the installation home path. |/opt/alluxio|Startup
spark.rapids.alluxio.large.file.threshold|The threshold is used to identify whether average size of files is large when reading from S3. If reading large files from S3 and the disks used by Alluxio are slow, directly reading from S3 is better than reading caches from Alluxio, because S3 network bandwidth is faster than local disk. This improvement takes effect when spark.rapids.alluxio.slow.disk is enabled.|67108864|Runtime
+spark.rapids.alluxio.master|The Alluxio master hostname. If not set, read Alluxio master URL from spark.rapids.alluxio.home locally. This config is useful when Alluxio master and Spark driver are not co-located.||Startup
+spark.rapids.alluxio.master.port|The Alluxio master port. If not set, read Alluxio master port from spark.rapids.alluxio.home locally. This config is useful when Alluxio master and Spark driver are not co-located.|19998|Startup
spark.rapids.alluxio.pathsToReplace|List of paths to be replaced with corresponding Alluxio scheme. E.g. when configure is set to "s3://foo->alluxio://0.1.2.3:19998/foo,gs://bar->alluxio://0.1.2.3:19998/bar", it means: "s3://foo/a.csv" will be replaced to "alluxio://0.1.2.3:19998/foo/a.csv" and "gs://bar/b.csv" will be replaced to "alluxio://0.1.2.3:19998/bar/b.csv". To use this config, you have to mount the buckets to Alluxio by yourself. If you set this config, spark.rapids.alluxio.automount.enabled won't be valid.|None|Startup
spark.rapids.alluxio.replacement.algo|The algorithm used when replacing the UFS path with the Alluxio path. CONVERT_TIME and TASK_TIME are the valid options. CONVERT_TIME indicates that we do it when we convert it to a GPU file read, this has extra overhead of creating an entirely new file index, which requires listing the files and getting all new file info from Alluxio. TASK_TIME replaces the path as late as possible inside of the task. By waiting and replacing it at task time, it just replaces the path without fetching the file information again, this is faster but doesn't update locality information if that has a bit impact on performance.|TASK_TIME|Runtime
spark.rapids.alluxio.slow.disk|Indicates whether the disks used by Alluxio are slow. If it's true and reading S3 large files, Rapids Accelerator reads from S3 directly instead of reading from Alluxio caches. Refer to spark.rapids.alluxio.large.file.threshold which defines a threshold that identifying whether files are large. Typically, it's slow disks if speed is less than 300M/second. If using convert time spark.rapids.alluxio.replacement.algo, this may not apply to all file types like Delta files|true|Runtime
@@ -50,6 +53,7 @@ Name | Description | Default Value | Applicable at
spark.rapids.memory.gpu.pool|Select the RMM pooling allocator to use. Valid values are "DEFAULT", "ARENA", "ASYNC", and "NONE". With "DEFAULT", the RMM pool allocator is used; with "ARENA", the RMM arena allocator is used; with "ASYNC", the new CUDA stream-ordered memory allocator in CUDA 11.2+ is used. If set to "NONE", pooling is disabled and RMM just passes through to CUDA memory allocation directly.|ASYNC|Startup
spark.rapids.memory.gpu.pooling.enabled|Should RMM act as a pooling allocator for GPU memory, or should it just pass through to CUDA memory allocation directly. DEPRECATED: please use spark.rapids.memory.gpu.pool instead.|true|Startup
spark.rapids.memory.gpu.reserve|The amount of GPU memory that should remain unallocated by RMM and left for system use such as memory needed for kernels and kernel launches.|671088640|Startup
+spark.rapids.memory.gpu.state.debug|To better recover from out of memory errors, RMM will track several states for the threads that interact with the GPU. This provides a log of those state transitions to aid in debugging it. STDOUT or STDERR will have the logging go there; an empty string will disable logging; and anything else will be treated as a file to write the logs to.||Startup
spark.rapids.memory.gpu.unspill.enabled|When a spilled GPU buffer is needed again, should it be unspilled, or only copied back into GPU memory temporarily. Unspilling may be useful for GPU buffers that are needed frequently, for example, broadcast variables; however, it may also increase GPU memory usage|false|Startup
spark.rapids.memory.host.pageablePool.size|The size of the pageable memory pool in bytes unless otherwise specified. Use 0 to disable the pool.|1073741824|Startup
spark.rapids.memory.host.spillStorageSize|Amount of off-heap host memory to use for buffering spilled GPU data before spilling to local disk. Use -1 to set the amount to the combined size of pinned and pageable memory pools.|-1|Startup
@@ -146,6 +150,7 @@ Name | Description | Default Value | Applicable at
spark.rapids.sql.metrics.level|GPU plans can produce a lot more metrics than CPU plans do. In very large queries this can sometimes result in going over the max result size limit for the driver. Supported values include DEBUG which will enable all metrics supported and typically only needs to be enabled when debugging the plugin. MODERATE which should output enough metrics to understand how long each part of the query is taking and how much data is going to each part of the query. ESSENTIAL which disables most metrics except those Apache Spark CPU plans will also report or their equivalents.|MODERATE|Runtime
spark.rapids.sql.mode|Set the mode for the Rapids Accelerator. The supported modes are explainOnly and executeOnGPU. This config can not be changed at runtime, you must restart the application for it to take affect. The default mode is executeOnGPU, which means the RAPIDS Accelerator plugin convert the Spark operations and execute them on the GPU when possible. The explainOnly mode allows running queries on the CPU and the RAPIDS Accelerator will evaluate the queries as if it was going to run on the GPU. The explanations of what would have run on the GPU and why are output in log messages. When using explainOnly mode, the default explain output is ALL, this can be changed by setting spark.rapids.sql.explain. See that config for more details.|executeongpu|Startup
spark.rapids.sql.multiThreadedRead.numThreads|The maximum number of threads on each executor to use for reading small files in parallel. This can not be changed at runtime after the executor has started. Used with COALESCING and MULTITHREADED readers, see spark.rapids.sql.format.parquet.reader.type, spark.rapids.sql.format.orc.reader.type, or spark.rapids.sql.format.avro.reader.type for a discussion of reader types. If it is not set explicitly and spark.executor.cores is set, it will be tried to assign value of `max(MULTITHREAD_READ_NUM_THREADS_DEFAULT, spark.executor.cores)`, where MULTITHREAD_READ_NUM_THREADS_DEFAULT = 20.|20|Startup
+spark.rapids.sql.optimizer.joinReorder.enabled|When enabled, joins may be reordered for improved query performance|true|Runtime
spark.rapids.sql.python.gpu.enabled|This is an experimental feature and is likely to change in the future. Enable (true) or disable (false) support for scheduling Python Pandas UDFs with GPU resources. When enabled, pandas UDFs are assumed to share the same GPU that the RAPIDs accelerator uses and will honor the python GPU configs|false|Runtime
spark.rapids.sql.reader.batchSizeBytes|Soft limit on the maximum number of bytes the reader reads per batch. The readers will read chunks of data until this limit is met or exceeded. Note that the reader may estimate the number of bytes that will be used on the GPU in some cases based on the schema and number of rows in each batch.|2147483647|Runtime
spark.rapids.sql.reader.batchSizeRows|Soft limit on the maximum number of rows the reader will read per batch. The orc and parquet readers will read row groups until this limit is met or exceeded. The limit is respected by the csv reader.|2147483647|Runtime
diff --git a/docs/demo/Databricks/generate-init-script.ipynb b/docs/demo/Databricks/generate-init-script.ipynb
index bbb6809cc98..4c8ba857469 100644
--- a/docs/demo/Databricks/generate-init-script.ipynb
+++ b/docs/demo/Databricks/generate-init-script.ipynb
@@ -3,7 +3,7 @@
{
"cell_type":"code",
"source":[
- "dbutils.fs.mkdirs(\"dbfs:/databricks/init_scripts/\")\n \ndbutils.fs.put(\"/databricks/init_scripts/init.sh\",\"\"\"\n#!/bin/bash\nsudo wget -O /databricks/jars/rapids-4-spark_2.12-23.02.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar\n\"\"\", True)"
+ "dbutils.fs.mkdirs(\"dbfs:/databricks/init_scripts/\")\n \ndbutils.fs.put(\"/databricks/init_scripts/init.sh\",\"\"\"\n#!/bin/bash\nsudo wget -O /databricks/jars/rapids-4-spark_2.12-23.04.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.04.0/rapids-4-spark_2.12-23.04.0.jar\n\"\"\", True)"
],
"metadata":{
diff --git a/docs/dev/shimplify.md b/docs/dev/shimplify.md
new file mode 100644
index 00000000000..9f1ca589c84
--- /dev/null
+++ b/docs/dev/shimplify.md
@@ -0,0 +1,266 @@
+---
+layout: page
+title: Shim Source Code Layout Simplification with Shimplify
+nav_order: 8
+parent: Developer Overview
+---
+
+# Shim Source Code Layout Simplification with Shimplify
+
+This document describes the next iteration of shim source code maintenance. It addresses a
+drawback introduced with the [shim layer rework][1], which guaranteed ABI-compatible bytecode
+for each of the 14 currently supported Spark builds but at the expense of maintaining 50+
+directories. Many shims are spread over an overlapping set of directories, making it hard to
+determine where to make additions while keeping code duplication in check.
+
+[shimplify.py][2] is the new goal in the Maven build, bound to the `generate-sources` phase.
+
+* It defines a new, simpler shim directory structure with only a single directory per shim, and a
+special comment injected into each source file records the metadata of all the shim builds the
+file participates in.
+* It can convert all or a subset of existing shims to the new structure. The build can support
+partially converted shims if a longer transition is desired.
+
+## Simplified Shim Source Directory Structure
+
+In our build each supported Apache Spark build and its corresponding shim is identified by its
+[`buildver`][3] property. Every Maven submodule requiring shimming (`sql-plugin`, `tests` as of the
+time of this writing) has a new set of special sibling directories
+`src/(main|test)/spark${buildver}`.
+
+Previous `src/(main|test)/${buildver}` and
+version-range-with-exceptions directories such as `src/main/311until340-non330db` are deprecated and
+will be removed soon as a result of the conversion to the new structure.
+
+`shimplify` changes the way the source code is shared among shims by using an explicit
+lexicographically sorted list of `buildver` property values
+in a source-code level comment instead of the shared directories.
+
+```scala
+/*** spark-rapids-shim-json-lines
+{"spark": "312"}
+{"spark": "323"}
+spark-rapids-shim-json-lines ***/
+```
+
+The content between the `spark-rapids-shim-json-lines` tags is in the [JSON Lines][4] format, where
+each line is an extensible object whose shim metadata currently consists of just the Spark build
+dependency version. The top object in the comment, i.e. the minimum version, intuitively
+represents the first version of Spark requiring shimming in the plugin, although it might not be
+the original one because support for older Spark releases is eventually dropped. This `buildver`
+is called the *owner shim*.
+
+On the default, read-only invocation path of the Maven build, shimplify does not make any changes
+to shim source code files or their locations.
+
+* It analyzes the pre-shimplify directory structure and identifies shims that, through code
+evolution, ended up using more dedicated directories than necessary, adding avoidable complexity
+on top of an already inherently complex directory structure. Here is an example of such a warning:
+
+```text
+shimplify - WARNING - Consider consolidating 312db, it spans multiple dedicated directories ['/home/user/gits/NVIDIA/spark-rapids/sql-plugin/src/main/312db/scala', '/home/user/gits/NVIDIA/spark-rapids/sql-plugin/src/main/31xdb/scala']
+```
+
+* For the shimplify directory structure, all files under the `src/(main|test)/spark*` directories
+are read to parse the `spark-rapids-shim-json-lines` comments. The following validations are
+performed:
+
+ * It makes sure that the comment is present and can be parsed.
+ * The list of shims is non-empty (i.e., it has not been orphaned by dropping shims) and sorted.
+ * The file is stored under the *owner shim* directory.
+
+* All files whose comment lists the `buildver` of the current Maven build session are symlinked to
+`target/${buildver}/generated/src/(main|test)/(scala|java)`. Thus, instead of hardcoding a distinct
+list of directories for the `build-helper` Maven plugin to add for each shim, after the full
+transition to shimplify the pom will have only four add-source statements, independent of the
+number of supported shims.
+
+With the shimplify format in place it is easy to review all the files for a single shim without
+relying on Maven:
+
+```bash
+git grep '{"spark": "323"}' '*.scala' '*.java'
+```
+
+## Conversion to the Shimplify-based Directory Structure
+
+Shimplify can automatically convert the prior version-range-with-exceptions directory structure to
+the simplified version. This allows the transition to be made atomically, without having to resolve
+the almost unavoidable merge conflicts that such a sweeping change would otherwise cause while shim
+development is ongoing. Converting the shim source code and changing the regular build should not
+be done simultaneously, so that potential bugs in the conversion code can be isolated and corrected
+more quickly.
+
+Prior to invoking the conversion standalone, you first run
+
+```bash
+mvn clean install -DskipTests
+```
+
+on the current state of the `spark-rapids` repo.
+
+After that you can execute the conversion in one or more iterations, depending on the specified `-D` parameters:
+
+```bash
+mvn generate-sources -Dshimplify=true [-D...]
+```
+
+With `-Dshimplify=true`, shimplify is put on the write call path to generate and inject
+spark-rapids-shim-json-lines comments into all shim source files. The files are not yet moved to
+their owner shim directory, so it is easy to verify the injected comments with `git diff`. If you
+see any issue you can fix it and re-execute the command with `-Dshimplify.overwrite=true` added.
+However, it is usually easier to just have git restore the previous state:
+
+```bash
+git restore sql-plugin tests
+```
+
+Once the shim comments look good (as expected, since the injection has been tested), you can repeat
+the command and actually move the files to their designated locations by invoking
+
+```bash
+mvn generate-sources -Dshimplify=true -Dshimplify.move=true
+```
+
+Now you can run a package build with the simplified directory structure and run a few integration
+tests, preferably in the standalone test mode with the RAPIDS Shuffle Manager enabled for increased
+coverage:
+
+```bash
+mvn clean package -DskipTests -Dbuildver=331
+SPARK_HOME=~/dist/spark-3.3.1-bin-hadoop3 \
+ NUM_LOCAL_EXECS=2 \
+ PYSP_TEST_spark_rapids_shuffle_mode=MULTITHREADED \
+ PYSP_TEST_spark_rapids_shuffle_multiThreaded_writer_threads=2 \
+ PYSP_TEST_spark_rapids_shuffle_multiThreaded_reader_threads=2 \
+ PYSP_TEST_spark_shuffle_manager=com.nvidia.spark.rapids.spark331.RapidsShuffleManager \
+ PYSP_TEST_spark_rapids_memory_gpu_minAllocFraction=0 \
+ PYSP_TEST_spark_rapids_memory_gpu_maxAllocFraction=0.1 \
+ PYSP_TEST_spark_rapids_memory_gpu_allocFraction=0.1 \
+ ./integration_tests/run_pyspark_from_build.sh -k test_hash_grpby_sum
+```
+
+If smoke testing does not reveal any issues, proceed to committing the change. If there are issues,
+you can undo the changes with
+
+```bash
+git restore --staged sql-plugin tests
+git restore sql-plugin tests
+```
+
+and by reviewing and removing the new directories with
+
+```bash
+git clean -f -d --dry-run
+```
+
+### Partial Conversion
+
+Although it is not expected to be necessary very often, it is possible to convert only a subset of
+the shims:
+
+* either by adding `-Dshimplify.shims=buildver1,buildver2,...` to the commands above,
+* or by specifying a list of directories you would like to eliminate in favor of the simpler
+structure, e.g. `-Dshimplify.dirs=311until340-non330db,320until330-noncdh`.
+
+The latter is just a minor twist on the former. Instead of taking an explicit list of shims, it
+first computes the list of all `buildver` values covered by the provided directories. After this,
+*all* files for those shims are converted, not just the files under the specified directories.
+
+In both cases the conversion may also touch shims outside the list when they share common files
+with a specified shim. However, it is guaranteed to leave the previous dedicated files under
+`src/(main|test)/${buildver}` in place for shims outside the list. This is useful when developers
+of a certain shim would like to continue working on it without adopting the new method. However,
+for the simplicity of future refactoring the full transition is preferred.
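+
+As an illustration only (the `buildver` values here are placeholders), a partial conversion could
+be invoked as:
+
+```bash
+# Sketch: convert only the shims listed in -Dshimplify.shims.
+mvn generate-sources -Dshimplify=true -Dshimplify.move=true \
+    -Dshimplify.shims=320,321
+```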
+
+### Evolving shims without automatic conversion
+
+Suppose a bulk conversion of the existing shims is not an option, but the next shimming issue would
+require a difficult refactoring of version ranges and the addition of more exception directories.
+This can now be resolved easily by placing just the affected files into their owner shim
+directories and adding the shim JSON lines comments by hand.
+
+## Adding a new shim
+
+Shimplify can clone an existing shim as the basis for a new shim. For example, when adding
+support for a new [maintenance][5] version of Spark, say 3.2.4, it is expected to be similar to 3.2.3.
+
+If just 3.2.3, or all shims after the full transition, have already been converted, you can execute
+
+```bash
+mvn generate-sources -Dshimplify=true \
+ -Dshimplify.move=true -Dshimplify.overwrite=true \
+ -Dshimplify.add.shim=324 -Dshimplify.add.base=323
+```
+
+to clone 323 as 324. This will add `{"spark": "324"}` to every shared file constituting the 323
+shim. Moreover, for each file dedicated to 323 it will
+
+* create a copy of the file under the spark324 shim directory,
+* substitute spark324 for spark323 in the package name and path,
+* and change the comment from `{"spark": "323"}` to `{"spark": "324"}`.
+
+Review the new repo state, e.g., using `git grep '{"spark": "324"}'`.
+Besides having to add the `release324` profile to various pom.xml files as before, this alone is
+unlikely to be sufficient to complete the work on 324. You should expect to resolve any remaining
+compilation failures manually.
+
+## Deleting a Shim
+
+Every Spark build is eventually de-supported. To drop a build, say 311, you can run
+
+```bash
+mvn generate-sources -Dshimplify=true -Dshimplify.move=true \
+ -Dshimplify.remove.shim=311
+```
+
+This command will remove the comment line `{"spark": "311"}` from all source files contributing to
+the 311 shim. If a file belongs exclusively to 311 it will be removed.
+
+After adding or deleting shims you should sanity-check the diff in the local git repo and
+run the integration tests above.
+
+## Symlinks & IDE
+
+IDEs may or may not reveal whether a file is accessed via a symlink. IntelliJ IDEA treats the
+original file path and a path via a symlink to the same file as two independent files by default.
+
+In the context of shimplify, only the generated symlink path is part of the project, because the
+owner shim path is not `add-source`d during the build and therefore not during IDEA Project
+Import. You can install the [Resolve Symlinks][6] plugin to prevent IDEA from opening multiple
+windows for the same physical source file. As of the time of this writing it works seamlessly,
+except when the file is opened via the debugger, either on a breakpoint hit or by subsequently
+clicking on the affected stack frame, in which case you will see an extra editor tab being added.
+
+Whether or not you use the [Resolve Symlinks][6] plugin, IDEA is able to handle a breakpoint set
+directly via either the original physical file or a symlink path.
+
+## Reducing Code Duplication
+
+You can help reduce code complexity by consolidating copy-and-pasted shim code that accumulated
+because it was hard to fit into the less flexible shim inheritance hierarchy based on versions
+with exceptions.
+
+You can use the CPD tool that is integrated into our Maven build to find duplicate code in the
+shim code and in the regular code base. It is not ready for automation and has to be invoked
+manually, separately for Java and Scala, e.g.:
+
+```bash
+mvn antrun:run@duplicate-code-detector \
+ -Dcpd.argLine='--minimum-tokens 50 --language scala --skip-blocks-pattern /*|*/' \
+ -Dcpd.sourceType='main' \
+ > target/cpd.scala.txt
+```
+
+Delete duplicate methods and move a single copy into an object such as `SomethingShim` and annotate
+its file with the list of buildvers.
+
+See [CPD user doc][7] for more details about the options you can pass inside `cpd.argLine`.
+
+[1]: https://github.com/NVIDIA/spark-rapids/issues/3223
+[2]: https://github.com/NVIDIA/spark-rapids/blob/b7b1a5d544b6a3ac35ed064b5c32ee0d63c78845/build/shimplify.py#L15-L79
+[3]: https://github.com/NVIDIA/spark-rapids/blob/74ce729ca1306db01359e68f7f0b7cc31cd3d850/pom.xml#L494-L500
+[4]: https://jsonlines.org/
+[5]: https://spark.apache.org/versioning-policy.html
+[6]: https://plugins.jetbrains.com/plugin/16429-idea-resolve-symlinks
+[7]: https://docs.pmd-code.org/latest/pmd_userdocs_cpd.html
diff --git a/docs/dev/shims.md b/docs/dev/shims.md
index 00125f7a9c3..2d4dcdc25ac 100644
--- a/docs/dev/shims.md
+++ b/docs/dev/shims.md
@@ -68,17 +68,17 @@ Using JarURLConnection URLs we create a Parallel World of the current version wi
Spark 3.0.2's URLs:
```text
-jar:file:/home/spark/rapids-4-spark_2.12-23.02.0.jar!/
-jar:file:/home/spark/rapids-4-spark_2.12-23.02.0.jar!/spark3xx-common/
-jar:file:/home/spark/rapids-4-spark_2.12-23.02.0.jar!/spark302/
+jar:file:/home/spark/rapids-4-spark_2.12-23.04.0.jar!/
+jar:file:/home/spark/rapids-4-spark_2.12-23.04.0.jar!/spark3xx-common/
+jar:file:/home/spark/rapids-4-spark_2.12-23.04.0.jar!/spark302/
```
Spark 3.2.0's URLs :
```text
-jar:file:/home/spark/rapids-4-spark_2.12-23.02.0.jar!/
-jar:file:/home/spark/rapids-4-spark_2.12-23.02.0.jar!/spark3xx-common/
-jar:file:/home/spark/rapids-4-spark_2.12-23.02.0.jar!/spark320/
+jar:file:/home/spark/rapids-4-spark_2.12-23.04.0.jar!/
+jar:file:/home/spark/rapids-4-spark_2.12-23.04.0.jar!/spark3xx-common/
+jar:file:/home/spark/rapids-4-spark_2.12-23.04.0.jar!/spark320/
```
### Late Inheritance in Public Classes
diff --git a/docs/dev/testing.md b/docs/dev/testing.md
index 2d2c51a961c..5ba180a836f 100644
--- a/docs/dev/testing.md
+++ b/docs/dev/testing.md
@@ -5,5 +5,5 @@ nav_order: 2
parent: Developer Overview
---
An overview of testing can be found within the repository at:
-* [Unit tests](https://github.com/NVIDIA/spark-rapids/tree/branch-23.02/tests#readme)
-* [Integration testing](https://github.com/NVIDIA/spark-rapids/tree/branch-23.02/integration_tests#readme)
+* [Unit tests](https://github.com/NVIDIA/spark-rapids/tree/branch-23.04/tests#readme)
+* [Integration testing](https://github.com/NVIDIA/spark-rapids/tree/branch-23.04/integration_tests#readme)
diff --git a/docs/download.md b/docs/download.md
index b1141037138..4eb66a9f037 100644
--- a/docs/download.md
+++ b/docs/download.md
@@ -18,7 +18,7 @@ cuDF jar, that is either preinstalled in the Spark classpath on all nodes or sub
that uses the RAPIDS Accelerator For Apache Spark. See the [getting-started
guide](https://nvidia.github.io/spark-rapids/Getting-Started/) for more details.
-## Release v23.02.0
+## Release v23.04.0
Hardware Requirements:
The plugin is tested on the following architectures:
@@ -27,11 +27,11 @@ The plugin is tested on the following architectures:
Software Requirements:
- OS: Ubuntu 18.04, Ubuntu 20.04 or CentOS 7, Rocky Linux 8
+ OS: Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8
CUDA & NVIDIA Drivers*: 11.x & v450.80.02+
- Apache Spark 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.3.0, 3.3.1, Databricks 9.1 ML LTS, 10.4 ML LTS or 11.3 ML LTS Runtime and GCP Dataproc 2.0
+ Apache Spark 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.3.0, 3.3.1, 3.3.2, Databricks 10.4 ML LTS or 11.3 ML LTS Runtime and GCP Dataproc 2.0, Dataproc 2.1
Python 3.6+, Scala 2.12, Java 8
@@ -41,22 +41,22 @@ for your hardware's minimum driver version.
*For Cloudera and EMR support, please refer to the
[Distributions](./FAQ.md#which-distributions-are-supported) section of the FAQ.
-### Download v23.02.0
+### Download v23.04.0
* Download the [RAPIDS
- Accelerator for Apache Spark 23.02.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar)
+ Accelerator for Apache Spark 23.04.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.04.0/rapids-4-spark_2.12-23.04.0.jar)
-This package is built against CUDA 11.5 and all CUDA 11.x versions are supported through [CUDA forward
+This package is built against CUDA 11.8 and all CUDA 11.x versions are supported through [CUDA forward
compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/index.html). It is tested
on V100, T4, A2, A10, A30 and A100 GPUs with CUDA 11.0-11.5. For those using other types of GPUs which
do not have CUDA forward compatibility (for example, GeForce), CUDA 11.5 or later is required. Users will
need to ensure the minimum driver (450.80.02) and CUDA toolkit are installed on each Spark node.
### Verify signature
-* Download the [RAPIDS Accelerator for Apache Spark 23.02.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar)
- and [RAPIDS Accelerator for Apache Spark 23.02.0 jars.asc](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar.asc)
+* Download the [RAPIDS Accelerator for Apache Spark 23.04.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.04.0/rapids-4-spark_2.12-23.04.0.jar)
+ and [RAPIDS Accelerator for Apache Spark 23.04.0 jars.asc](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.04.0/rapids-4-spark_2.12-23.04.0.jar.asc)
* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com).
* Import the public key: `gpg --import PUB_KEY`
-* Verify the signature: `gpg --verify rapids-4-spark_2.12-23.02.0.jar.asc rapids-4-spark_2.12-23.02.0.jar`
+* Verify the signature: `gpg --verify rapids-4-spark_2.12-23.04.0.jar.asc rapids-4-spark_2.12-23.04.0.jar`
The output if signature verify:
@@ -64,15 +64,15 @@ The output if signature verify:
### Release Notes
New functionality and performance improvements for this release include:
-* Delta Lake MERGE/DELETE/UPDATE (experimental feature, can be enabled with a config flag)
-* Function `from_json`
-* Hive text table write
-* Databricks 11.3 ML LTS support
-* Support batched full join to improve full join's performance
+* Introduces an OOM retry framework for automatic OOM handling in memory-intensive operators such as joins, aggregates and windows, coalescing, projections and filters
+* Support dynamic repartitioning in large/skewed hash joins
+* Optimize the transpilation in `regexp_extract` function
+* Support Delta Lake write with auto-optimization and auto-compaction on Databricks platforms
* Qualification and Profiling tool:
- * EMR user tools support for qualification
- * EMR user tools support for bootstrap
- * Updated estimated speedup factors for on-prem, Dataproc, and EMR environments for qualification
+ * Add support to recommend cluster shape options on Dataproc and EMR
+ * Add support for Databricks local mode with cost savings based on cluster metadata
+ * Add TCO calculator to estimate annualized cost savings, including estimated frequency for applications
+ * Add support in the qualification tool to generate estimated speed-up for ML functionality in Spark applications
For a detailed list of changes, please refer to the
diff --git a/docs/get-started/getting-started-alluxio.md b/docs/get-started/getting-started-alluxio.md
index d93316a0797..5a1b7e4fe13 100644
--- a/docs/get-started/getting-started-alluxio.md
+++ b/docs/get-started/getting-started-alluxio.md
@@ -198,8 +198,12 @@ NM_hostname_2
so local data access speed may vary depending on the local storage media. To learn
more about this topic, please refer to the
[tiered storage document](https://docs.alluxio.io/os/user/stable/en/core-services/Caching.html#multiple-tier-storage).
-
-3. Start Alluxio cluster
+3. Create a link to ALLUXIO_HOME
+ Execute the following command to create a link `/opt/alluxio` to the actual Alluxio home path:
+ ```bash
+ ln -s ${ALLUXIO_HOME} /opt/alluxio
+ ```
+4. Start Alluxio cluster
- Format Alluxio
@@ -225,7 +229,7 @@ NM_hostname_2
To verify that Alluxio is running, visit `http://RM_hostname:19999`
to see the status page of the Alluxio master.
-4. Mount an existing data storage to Alluxio
+5. Mount an existing data storage to Alluxio
- Mount S3 bucket
@@ -337,7 +341,6 @@ without setting `spark.rapids.alluxio.pathsToReplace`, which takes precedence ov
``` shell
--conf spark.rapids.alluxio.automount.enabled=true
```
-If Alluxio is not installed in /opt/alluxio-2.8.0, you should set the environment variable `ALLUXIO_HOME`.
Additional configs:
``` shell
@@ -347,14 +350,6 @@ The regex is used to match the s3 URI, to decide which bucket we should auto mou
The default value is to match all the URIs which start with `s3://` or `s3a://`.
For example, `^s3a{1,1}://foo.*` will match the buckets which start with `foo`.
-```shell
---conf spark.rapids.alluxio.cmd="su,ubuntu,-c,/opt/alluxio-2.8.0/bin/alluxio"
-```
-This cmd config defines a sequence to be used run the Alluxio command by a specific user,
-mostly the user with Alluxio permission. We run the command by user `ubuntu` as default.
-If you have a different user and command path, you can redefine it.
-The default value is suitable for the case of running Alluxio with RAPIDS on Databricks.
-
## Configure whether the disks used by Alluxio are fast
The default value of config `spark.rapids.alluxio.slow.disk` is true, indicating the disks used by Alluxio are slow.
The true value enables an improvement which reads from S3 directly to get better performance when the files being read are large.
diff --git a/docs/get-started/getting-started-aws-emr.md b/docs/get-started/getting-started-aws-emr.md
index 8a2d1755dff..3b085768b12 100644
--- a/docs/get-started/getting-started-aws-emr.md
+++ b/docs/get-started/getting-started-aws-emr.md
@@ -7,13 +7,13 @@ parent: Getting-Started
# Get Started with RAPIDS on AWS EMR
This is a getting started guide for the RAPIDS Accelerator for Apache Spark on AWS EMR. At the end
-of this guide, the user will be able to run a sample Apache Spark application that runs on NVIDIA
-GPUs on AWS EMR.
+of this guide, the user will be able to run a sample Apache Spark application on NVIDIA GPUs on AWS EMR.
Different versions of EMR ship with different versions of Spark, RAPIDS Accelerator, cuDF and xgboost4j-spark:
| EMR | Spark | RAPIDS Accelerator jar | cuDF jar | xgboost4j-spark jar
| --- | --- | --- | ---| --- |
+| 6.10 | 3.3.1 | rapids-4-spark_2.12-22.12.0.jar | Bundled with rapids-4-spark | xgboost4j-spark_3.0-1.4.2-0.3.0.jar |
| 6.9 | 3.3.0 | rapids-4-spark_2.12-22.08.0.jar | Bundled with rapids-4-spark | xgboost4j-spark_3.0-1.4.2-0.3.0.jar |
| 6.8 | 3.3.0 | rapids-4-spark_2.12-22.06.0.jar | Bundled with rapids-4-spark | xgboost4j-spark_3.0-1.4.2-0.3.0.jar |
| 6.7 | 3.2.1 | rapids-4-spark_2.12-22.02.0.jar | cudf-22.02.0-cuda11.jar | xgboost4j-spark_3.0-1.2.0-0.1.0.jar |
@@ -23,46 +23,109 @@ Different versions of EMR ship with different versions of Spark, RAPIDS Accelera
| 6.3 | 3.1.1 | rapids-4-spark_2.12-0.4.1.jar | cudf-0.18.1-cuda10-1.jar | xgboost4j-spark_3.0-1.2.0-0.1.0.jar |
| 6.2 | 3.0.1 | rapids-4-spark_2.12-0.2.0.jar | cudf-0.15-cuda10-1.jar | xgboost4j-spark_3.0-1.0.0-0.2.0.jar |
-For more details of supported applications, please see the [EMR release
+For more details about each EMR release, please see the [EMR release
notes](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-release-6x.html).
For more information on AWS EMR, please see the [AWS
documentation](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-what-is-emr.html).
-## Configure and Launch AWS EMR with GPU Nodes
+## Leveraging Spark RAPIDS User Tools for Qualification and Bootstrap
-The following steps are based on the AWS EMR document ["Using the NVIDIA Spark-RAPIDS Accelerator
-for Spark"](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-rapids.html)
+To use the qualification and bootstrap tools for EMR, you will want to install the Spark RAPIDS user tools package.
+Instructions for installing and setting up the Spark RAPIDS user tools package for EMR can be found here:
+[link](https://github.com/NVIDIA/spark-rapids-tools/blob/main/user_tools/docs/user-tools-aws-emr.md).
-### Launch an EMR Cluster using AWS CLI
+## Qualify CPU Workloads for GPU Acceleration
+
+The [qualification tool](https://nvidia.github.io/spark-rapids/docs/spark-qualification-tool.html) is launched to analyze CPU applications
+that have already run. The tool will output the applications recommended for acceleration along with estimated speed-up
+and cost saving metrics. Additionally, it will provide information on how to launch a GPU-accelerated cluster to take
+advantage of the speed-up and cost savings.
-You can use the AWS CLI to launch a cluster with one Master node (m5.xlarge) and two
-g4dn.2xlarge nodes:
+Usage: `spark_rapids_user_tools emr qualification --eventlogs <eventlog-paths> --cpu_cluster <cluster-name>`
+Help (to see all options available): `spark_rapids_user_tools emr qualification --help`
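+
+A hypothetical invocation against event logs stored in S3 might look like the following; the bucket path and cluster name are placeholders only:
+
+```bash
+# Illustrative only: substitute your own event log location and CPU cluster name
+spark_rapids_user_tools emr qualification \
+  --eventlogs s3://my-bucket/eventlogs/ \
+  --cpu_cluster my-cpu-cluster
+```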
+
+Example output:
```
-aws emr create-cluster \
---release-label emr-6.9.0 \
---applications Name=Hadoop Name=Spark Name=Livy Name=JupyterEnterpriseGateway \
---service-role EMR_DefaultRole \
---ec2-attributes KeyName=my-key-pair,InstanceProfile=EMR_EC2_DefaultRole \
---instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m4.4xlarge \
- InstanceGroupType=CORE,InstanceCount=1,InstanceType=g4dn.2xlarge \
- InstanceGroupType=TASK,InstanceCount=1,InstanceType=g4dn.2xlarge \
---configurations file:///my-configurations.json \
---bootstrap-actions Name='My Spark Rapids Bootstrap action',Path=s3://my-bucket/my-bootstrap-action.sh
++----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
+| | App Name | App ID | Recommendation | Estimated GPU | Estimated GPU | App | Estimated GPU |
+| | | | | Speedup | Duration(s) | Duration(s) | Savings(%) |
+|----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------|
+| 0 | query24 | application_1664888311321_0011 | Strongly Recommended | 3.49 | 257.18 | 897.68 | 59.70 |
+| 1 | query78 | application_1664888311321_0009 | Strongly Recommended | 3.35 | 113.89 | 382.35 | 58.10 |
+| 2 | query23 | application_1664888311321_0010 | Strongly Recommended | 3.08 | 325.77 | 1004.28 | 54.37 |
+| 3 | query64 | application_1664888311321_0008 | Strongly Recommended | 2.91 | 150.81 | 440.30 | 51.82 |
+| 4 | query50 | application_1664888311321_0003 | Recommended | 2.47 | 101.54 | 250.95 | 43.08 |
+| 5 | query16 | application_1664888311321_0005 | Recommended | 2.36 | 106.33 | 251.95 | 40.63 |
+| 6 | query38 | application_1664888311321_0004 | Recommended | 2.29 | 67.37 | 154.33 | 38.59 |
+| 7 | query87 | application_1664888311321_0006 | Recommended | 2.25 | 75.67 | 170.69 | 37.64 |
+| 8 | query51 | application_1664888311321_0002 | Recommended | 1.53 | 53.94 | 82.63 | 8.18 |
++----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
+
+Instance types conversions:
+----------- -- ------------
+m5d.8xlarge to g4dn.8xlarge
+----------- -- ------------
+To support acceleration with T4 GPUs, switch the worker node instance types
```
-Please fill with actual value for `KeyName` and file paths. You can further customize SubnetId,
-EmrManagedSlaveSecurityGroup, EmrManagedMasterSecurityGroup, name and region etc.
+## Configure and Launch AWS EMR with GPU Nodes
+
+Please follow the AWS EMR document ["Using the NVIDIA Spark-RAPIDS Accelerator
+for Spark"](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-rapids.html).
+Below is an example.
+
+### Launch an EMR Cluster using AWS Console (GUI)
+
+Go to the AWS Management Console and select the `EMR` service from the "Analytics" section. Choose
+the region you want to launch your cluster in, e.g. US West (Oregon), using the dropdown menu in the
+top right corner. Click `Create cluster`, which will bring up a detailed cluster configuration page.
+
+#### Step 1: EMR Release and Application Bundle Selection
-The `my-configurations.json` installs the spark-rapids plugin on your cluster, configures YARN to use
+Enter a custom "Cluster name" for your cluster.
+
+Select **emr-6.10.0** for the release and pick "Custom" for the "Application bundle". Uncheck all the
+software options, and then check **Hadoop 3.3.3**, **Spark 3.3.1**, **Livy 0.7.1** and
+**JupyterEnterpriseGateway 2.6.0**.
+
+Optionally pick Amazon Linux Release or configure a "Custom AMI".
+
+![Step 1: Software, Configuration and Steps](../img/AWS-EMR/name-and-applications.png)
+
+#### Step 2: Hardware
-GPUs, configures Spark to use RAPIDS, and configures the YARN capacity scheduler. An example JSON
+Keep the default "Primary" node instance type of **m5.xlarge**.
-configuration can be found in the section on launching in the GUI below.
+Change the "Core" node "Instance type" to **g4dn.xlarge**, **g4dn.2xlarge**, or
+**p3.2xlarge**.
-The `my-boostrap-action.sh` script referenced in the above script opens cgroup permissions to YARN
-on your cluster. This is required for YARN to use GPUs. An example script is as follows:
+An optional step is to have "Task" nodes. These nodes can run a Spark executor but they do not run
+the HDFS Data Node service. You can click on "Remove instance group" if you would like to only run
+"Core" nodes with the Data Node and Spark executors. If you want to add extra "Task" nodes, make sure
+that the instance type matches what you selected for "Core".
+
+Under "Cluster scaling and provisioning potion", verify that the instance count for the "Core" instance group
+is at least 1.
+
+![Step 2: Cluster Configuration](../img/AWS-EMR/cluster-configuration.png)
+
+Under "Networking", select the desired VPC and subnet. You can also create a new VPC and subnet for the cluster.
+
+*Optionally* set custom security groups in the "EC2 security groups" tab.
+
+In the "EC2 security groups" section, confirm that the security group chosen for the "Primary" node
+allows for SSH access. Follow these instructions to [allow inbound SSH
+traffic](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/authorizing-access-to-an-instance.html)
+if the security group does not allow it yet.
+
+![Step 2: Cluster Configuration](../img/AWS-EMR/networking.png)
+
+#### Step 3: General Cluster Settings
+
+Add a custom bootstrap action under "Bootstrap Actions" to allow cgroup permissions to YARN on your cluster.
+An example bootstrap script is as follows:
```bash
#!/bin/bash
@@ -72,24 +135,17 @@ sudo chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct
sudo chmod a+rwx -R /sys/fs/cgroup/devices
```
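+
+Because EMR reads bootstrap actions from S3, the script is typically uploaded to a bucket first; a sketch of that step is shown below (the bucket name is illustrative):
+
+```bash
+# Upload the cgroup bootstrap script so EMR can reference it at cluster creation time
+aws s3 cp cgroup-bootstrap-action.sh s3://demo-bucket/cgroup-bootstrap-action.sh
+```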
-### Launch an EMR Cluster using AWS Console (GUI)
+![Step 3: General Cluster Settings](../img/AWS-EMR/bootstrap-action.png)
-Go to the AWS Management Console and select the `EMR` service from the "Analytics" section. Choose
-the region you want to launch your cluster in, e.g. US West (Oregon), using the dropdown menu in the
-top right corner. Click `Create cluster` and select `Go to advanced options`, which will bring up a
-detailed cluster configuration page.
-
-#### Step 1: Software Configuration and Steps
+#### Step 4: Edit Software Configuration
-Select **emr-6.9.0** for the release, uncheck all the software options, and then check **Hadoop
-3.3.3**, **Spark 3.3.0**, **Livy 0.7.1** and **JupyterEnterpriseGateway 2.6.0**.
-
-In the "Edit software settings" field, copy and paste the configuration from the [EMR
-document](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-rapids.html). You can also
-create a JSON file on you own S3 bucket.
+In the "Software settings" field, copy and paste the configuration from the [EMR
+document](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-rapids.html) in the textbox provided
+under "Enter configuration". You can also create a JSON file on you own S3 bucket when selecting
+"Load JSON from Amazon S3".
For clusters with 2x g4dn.2xlarge GPU instances as worker nodes, we recommend the following
-default settings:
+default settings:
```json
[
{
@@ -162,84 +218,107 @@ default settings:
```
Adjust the settings as appropriate for your cluster. For example, setting the appropriate
number of cores based on the node type. The `spark.task.resource.gpu.amount` should be set to
-1/(number of cores per executor) which will allow multiple tasks to run in parallel on the GPU.
+1/(number of cores per executor) which will allow multiple tasks to run in parallel on the GPU.
-For example, for clusters with 2x g4dn.12xlarge as core nodes, use the following:
+For example, for clusters with 2x g4dn.12xlarge as core nodes, use the following:
```json
"spark.executor.cores":"12",
"spark.task.resource.gpu.amount":"0.0833",
```
-More configuration details can be found in the [configuration](../configs.md) documentation.
-
-![Step 1: Step 1: Software, Configuration and Steps](../img/AWS-EMR/RAPIDS_EMR_GUI_1.png)
+More configuration details can be found in the [configuration](../configs.md) documentation.
-#### Step 2: Hardware
+#### Step 5: Security
-Select the desired VPC and availability zone in the "Network" and "EC2 Subnet" fields
-respectively. (Default network and subnet are ok)
+Select an existing "EC2 key pair" that will be used to authenticate SSH access to the cluster's
+nodes. If you do not have access to an EC2 key pair, follow these instructions to [create an EC2 key
+pair](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html#having-ec2-create-your-key-pair).
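+
+A key pair can also be created from the AWS CLI; a minimal sketch, assuming the illustrative key name `demo-key-pair`, is:
+
+```bash
+# Create a new EC2 key pair and store the private key locally with restricted permissions
+aws ec2 create-key-pair --key-name demo-key-pair \
+  --query 'KeyMaterial' --output text > demo-key-pair.pem
+chmod 400 demo-key-pair.pem
+```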
-In the "Core" node row, change the "Instance type" to **g4dn.xlarge**, **g4dn.2xlarge**, or
-**p3.2xlarge** and ensure "Instance count" is set to **1** or any higher number. Keep the default
-"Master" node instance type of **m5.xlarge**.
+![Step 5: SSH Key Pair](../img/AWS-EMR/ssh-key-pair.png)
-![Step 2: Hardware](../img/AWS-EMR/RAPIDS_EMR_GUI_2.png)
+#### Finish Cluster Configuration
-#### Step 3: General Cluster Settings
+The EMR cluster management page displays the status of multiple clusters or detailed information
+about a chosen cluster. In the detailed cluster view, the "Instances" and "Monitoring" tabs can be used
+to monitor the status of the various cluster nodes.
-Enter a custom "Cluster name" and make a note of the s3 folder that cluster logs will be written to.
+When the cluster is ready, a green-dot will appear next to the cluster name and the "Status" column
+will display **Waiting, cluster ready**.
-Add a custom "Bootstrap Actions" to allow cgroup permissions to YARN on your cluster. An example
-bootstrap script is as follows:
-```bash
-#!/bin/bash
-
-set -ex
-
-sudo chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct
-sudo chmod a+rwx -R /sys/fs/cgroup/devices
-```
+In the cluster's "Summary" tab, find the "Primary node public DNS" field and click on
+"Connect to the Primary Node using SSH". Follow the instructions to SSH to the new cluster's primary node.
-*Optionally* add key-value "Tags", configure a "Custom AMI" for the EMR cluster on this page.
+### Launch an EMR Cluster using AWS CLI
-![Step 3: General Cluster Settings](../img/AWS-EMR/RAPIDS_EMR_GUI_3.png)
+In this example, we will use the AWS CLI to launch a cluster with one Primary node (m5.xlarge) and two
+g4dn.2xlarge nodes.
-#### Step 4: Security
+You will need:
+- an SSH key-pair already registered in the AWS console
+- a subnet and VPC configuration (default or a custom configuration)
-Select an existing "EC2 key pair" that will be used to authenticate SSH access to the cluster's
-nodes. If you do not have access to an EC2 key pair, follow these instructions to [create an EC2 key
-pair](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html#having-ec2-create-your-key-pair).
+```bash
+aws emr create-cluster \
+--release-label emr-6.10.0 \
+--applications Name=Hadoop Name=Spark Name=Livy Name=JupyterEnterpriseGateway \
+--service-role DemoServiceRole \
+--ec2-attributes KeyName=demo-key-pair,SubnetId=demo-subnet,InstanceProfile=DemoInstanceProfile \
+--instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m4.4xlarge \
+ InstanceGroupType=CORE,InstanceCount=1,InstanceType=g4dn.2xlarge
+--configurations file://config.json \
+--bootstrap-actions Name='Setup cgroups bootstrap',Path=s3://demo-bucket/cgroup-bootstrap-action.sh
+```
-*Optionally* set custom security groups in the "EC2 security groups" tab.
+Please fill in actual values for `KeyName`, `SubnetId`, `service-role`, and `InstanceProfile`.
+The service role and instance profile are AWS IAM roles associated with your cluster, which allow
+the EMR cluster to access services provided by AWS.
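+
+If you have never created EMR roles in this account, the default service role and instance profile can usually be generated with the AWS CLI as sketched below; your organization may require custom IAM roles instead:
+
+```bash
+# Creates EMR_DefaultRole and EMR_EC2_DefaultRole if they do not already exist
+aws emr create-default-roles
+```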
-In the "EC2 security groups" tab, confirm that the security group chosen for the "Master" node
-allows for SSH access. Follow these instructions to [allow inbound SSH
-traffic](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/authorizing-access-to-an-instance.html)
-if the security group does not allow it yet.
+The `config.json` installs the spark-rapids plugin on your cluster, configures YARN to use
+GPUs, configures Spark to use RAPIDS, and configures the YARN capacity scheduler. An [example JSON
+configuration](#step-4--edit-software-configuration) can be found in the section on
+launching in the GUI above.
-![Step 4: Security](../img/AWS-EMR/RAPIDS_EMR_GUI_4.png)
+The `cgroup-bootstrap-action.sh` script referenced in the above command opens cgroup permissions to YARN
+on your cluster. You can find an example of
+the [cgroup bootstrap action](#step-3--general-cluster-settings) above.
-#### Finish Cluster Configuration
+### Running the Spark RAPIDS User Tools Bootstrap for Optimal Cluster Spark Settings
-The EMR cluster management page displays the status of multiple clusters or detailed information
-about a chosen cluster. In the detailed cluster view, the "Summary" and "Hardware" tabs can be used
-to monitor the status of master and core nodes as they provision and initialize.
+The bootstrap tool will generate optimized settings for the RAPIDS Accelerator on Apache Spark on a
+GPU cluster for EMR. The tool will fetch the characteristics of the cluster -- including
+number of workers, worker cores, worker memory, and GPU accelerator type and count. It will use
+the cluster properties to then determine the optimal settings for running GPU-accelerated Spark
+applications.
-When the cluster is ready, a green-dot will appear next to the cluster name and the "Status" column
-will display **Waiting, cluster ready**.
+Usage: `spark_rapids_user_tools emr bootstrap --cluster <cluster-name>`
-In the cluster's "Summary" tab, find the "Master public DNS" field and click the `SSH`
-button. Follow the instructions to SSH to the new cluster's master node.
+Help (to see all options available): `spark_rapids_user_tools emr bootstrap --help`
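+
+A hypothetical invocation against a running GPU cluster might be:
+
+```bash
+# Illustrative only: substitute the name of your EMR GPU cluster
+spark_rapids_user_tools emr bootstrap --cluster my-gpu-cluster
+```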
-![Finish Cluster Configuration](../img/AWS-EMR/RAPIDS_EMR_GUI_5.png)
+Example output:
+```
+##### BEGIN : RAPIDS bootstrap settings for gpu-cluster
+spark.executor.cores=16
+spark.executor.memory=32768m
+spark.executor.memoryOverhead=7372m
+spark.rapids.sql.concurrentGpuTasks=2
+spark.rapids.memory.pinnedPool.size=4096m
+spark.sql.files.maxPartitionBytes=512m
+spark.task.resource.gpu.amount=0.0625
+##### END : RAPIDS bootstrap settings for gpu-cluster
+```
+A detailed description for bootstrap settings with usage information is available in the [RAPIDS Accelerator for Apache Spark Configuration](https://nvidia.github.io/spark-rapids/docs/configs.html) and [Spark Configuration](https://spark.apache.org/docs/latest/configuration.html) page.
-### Running an example joint operation using Spark Shell
+### Running an Example Join Operation Using Spark Shell
-SSH to the EMR cluster's master node, get into sparks shell and run the sql join example to verify
+Please follow the EMR doc [Connect to the primary node using
+SSH](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-connect-master-node-ssh.html) to SSH
+to the EMR cluster's primary node. Then launch the Spark shell and run the SQL join example to verify
GPU operation.
+Note: Use the `hadoop` user for SSH and for the command below.
+
```bash
spark-shell
```
@@ -259,7 +338,7 @@ out.explain()
Similar to spark-submit for on-prem clusters, AWS EMR supports a Spark application job to be
submitted. The mortgage examples we use are also available as a spark application. You can also use
-**spark shell** to run the scala code or **pyspark** to run the python code on master node through
+**spark shell** to run the scala code or **pyspark** to run the python code on the primary node through
CLI.
### Running GPU Accelerated Mortgage ETL Example using EMR Notebook
@@ -277,11 +356,17 @@ ETL](https://github.com/NVIDIA/spark-rapids/tree/main/docs/demo)
#### Create EMR Notebook and Connect to EMR GPU Cluster
-Go to the AWS Management Console and select Notebooks on the left column. Click the Create notebook
-button. You can then click "Choose an existing cluster" and pick the right cluster after click
-Choose button. Once the instance is ready, launch the Jupyter from EMR Notebook instance.
+Go to the Amazon EMR page and select "Studios" under "EMR Studios". You can create a Studio if
+you haven't already.
+
+Create a notebook by clicking on "Workspaces (Notebooks)" on the left column and then clicking
+on the "Create Workspace" button. Select the studio you selected in the prior step.
+
+Enter a Workspace name, description, and a location (which should be set by default to the studio
+S3 path). Under "Advanced configuration", you can pick an EMR cluster that you have already
+launched.
-![Create EMR Notebook](../img/AWS-EMR/EMR_notebook_1.png)
+![Create EMR Notebook](../img/AWS-EMR/notebook-workspace-creation.png)
#### Run Mortgage ETL PySpark Notebook on EMR GPU Cluster
@@ -292,4 +377,4 @@ cluster. You can adjust settings in the notebook for full mortgage dataset ETL.
When executing the ETL code, you can also see the Spark Job Progress within the notebook and the
code will also display how long it takes to run the query
-![Create EMR Notebook](../img/AWS-EMR/EMR_notebook_3.png)
\ No newline at end of file
+![Create EMR Notebook](../img/AWS-EMR/EMR_notebook_3.png)
diff --git a/docs/get-started/getting-started-databricks.md b/docs/get-started/getting-started-databricks.md
index db03abdb3a6..0e1aee6ddfe 100644
--- a/docs/get-started/getting-started-databricks.md
+++ b/docs/get-started/getting-started-databricks.md
@@ -11,9 +11,9 @@ At the end of this guide, the reader will be able to run a sample Apache Spark a
on NVIDIA GPUs on Databricks.
## Prerequisites
- * Apache Spark 3.x running in Databricks Runtime 9.1 ML, 10.4 ML or 11.3 ML with GPU
- * AWS: 9.1 LTS ML (GPU, Scala 2.12, Spark 3.1.2), 10.4 LTS ML (GPU, Scala 2.12, Spark 3.2.1) or 11.3 LTS ML (GPU, Scala 2.12, Spark 3.3.0)
- * Azure: 9.1 LTS ML (GPU, Scala 2.12, Spark 3.1.2) or 10.4 LTS ML (GPU, Scala 2.12, Spark 3.2.1) or 11.3 LTS ML (GPU, Scala 2.12, Spark 3.3.0)
+ * Apache Spark 3.x running in Databricks Runtime 10.4 ML or 11.3 ML with GPU
+ * AWS: 10.4 LTS ML (GPU, Scala 2.12, Spark 3.2.1) or 11.3 LTS ML (GPU, Scala 2.12, Spark 3.3.0)
+ * Azure: 10.4 LTS ML (GPU, Scala 2.12, Spark 3.2.1) or 11.3 LTS ML (GPU, Scala 2.12, Spark 3.3.0)
Databricks may do [maintenance
releases](https://docs.databricks.com/release-notes/runtime/maintenance-updates.html) for their
@@ -58,10 +58,6 @@ The number of GPUs per node dictates the number of Spark executors that can run
of DecimalTypes with precision greater than 38. There is a bug filed in Apache Spark for it
[here](https://issues.apache.org/jira/browse/SPARK-41793), whereas when using the plugin the
correct result will be returned.
-
-6. A query may fail when Dynamic File Pruning is enabled. As a workaround, please
- disable the feature by setting `spark.databricks.optimizer.dynamicFilePruning false`. More details
- are in [issue-7648](https://github.com/NVIDIA/spark-rapids/issues/7648).
## Start a Databricks Cluster
Create a Databricks cluster by going to "Compute", then clicking `+ Create compute`. Ensure the
@@ -70,9 +66,7 @@ cluster meets the prerequisites above by configuring it as follows:
Prerequisites section.
2. Choose the number of workers that matches the number of GPUs you want to use.
3. Select a worker type. On AWS, use nodes with 1 GPU each such as `p3.2xlarge` or `g4dn.xlarge`.
- p2 nodes do not meet the architecture requirements (Pascal or higher) for the Spark worker
- (although they can be used for the driver node). For Azure, choose GPU nodes such as
- Standard_NC6s_v3. For GCP, choose N1 or A2 instance types with GPUs.
+ For Azure, choose GPU nodes such as Standard_NC6s_v3. For GCP, choose N1 or A2 instance types with GPUs.
4. Select the driver type. Generally this can be set to be the same as the worker.
5. Start the cluster.
@@ -87,9 +81,6 @@ cluster.
how to import a notebook.
Select the version of the RAPIDS Accelerator for Apache Spark based on the Databricks runtime
version:
- - [Databricks 9.1 LTS
- ML](https://docs.databricks.com/release-notes/runtime/9.1ml.html#system-environment) has CUDA 11
- installed. Users will need to use 21.12.0 or later on Databricks 9.1 LTS ML.
- [Databricks 10.4 LTS
ML](https://docs.databricks.com/release-notes/runtime/10.4ml.html#system-environment) has CUDA 11
installed. Users will need to use 22.04.0 or later on Databricks 10.4 LTS ML.
@@ -131,7 +122,6 @@ cluster.
spark.task.resource.gpu.amount 0.1
spark.rapids.memory.pinnedPool.size 2G
spark.rapids.sql.concurrentGpuTasks 2
- spark.databricks.optimizer.dynamicFilePruning false
```
![Spark Config](../img/Databricks/sparkconfig.png)
@@ -144,13 +134,16 @@ cluster.
[`spark.rapids.sql.python.gpu.enabled`](../configs.md#sql.python.gpu.enabled) to `true` to
enable GPU support for python. Add the path of the plugin jar (supposing it is placed under
`/databricks/jars/`) to the `spark.executorEnv.PYTHONPATH` option. For more details please go to
- [GPU Scheduling For Pandas UDF](../additional-functionality/rapids-udfs.md#gpu-scheduling-for-pandas-udf)
+ [GPU Scheduling For Pandas UDF](../additional-functionality/rapids-udfs.md#gpu-support-for-pandas-udf)
```bash
spark.rapids.sql.python.gpu.enabled true
spark.python.daemon.module rapids.daemon_databricks
- spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.02.0.jar:/databricks/spark/python
+ spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.04.0.jar:/databricks/spark/python
```
+   Note that the Python memory pool requires the cudf library, so you need to either install the cudf library on
+   each worker node (`pip install cudf-cu11 --extra-index-url=https://pypi.nvidia.com`) or disable the Python
+   memory pool (`spark.rapids.python.memory.gpu.pooling.enabled=false`).
7. Once you’ve added the Spark config, click “Confirm and Restart”.
8. Once the cluster comes back up, it is now enabled for GPU-accelerated Spark.
diff --git a/docs/get-started/getting-started-gcp.md b/docs/get-started/getting-started-gcp.md
index d70cbeca497..a1aeeaba494 100644
--- a/docs/get-started/getting-started-gcp.md
+++ b/docs/get-started/getting-started-gcp.md
@@ -7,222 +7,38 @@ parent: Getting-Started
# Getting started with RAPIDS Accelerator on GCP Dataproc
[Google Cloud Dataproc](https://cloud.google.com/dataproc) is Google Cloud's fully managed Apache
- Spark and Hadoop service. The quick start guide will go through:
-
-* [Quick Start Prerequisites](#quick-start-prerequisites)
-* [Qualify CPU workloads for GPU acceleration](#qualify-cpu-workloads-for-gpu-acceleration)
-* [Bootstrap GPU cluster with optimized settings](#bootstrap-gpu-cluster-with-optimized-settings)
-* [Tune applications on GPU cluster](#tune-applications-on-gpu-cluster)
-* [Diagnose GPU Cluster](#diagnose-gpu-cluster)
-
-The advanced guide will walk through the steps to:
-
+ Spark and Hadoop service. The quick start guide will go through:
+
* [Create a Dataproc Cluster Accelerated by GPUs](#create-a-dataproc-cluster-accelerated-by-gpus)
+ * [Create a Dataproc Cluster using T4's](#create-a-dataproc-cluster-using-t4s)
+ * [Build custom Dataproc image to accelerate cluster initialization time](#build-custom-dataproc-image-to-accelerate-cluster-init-time)
+ * [Create a Dataproc Cluster using MIG with A100's](#create-a-dataproc-cluster-using-mig-with-a100s)
+ * [Cluster creation troubleshooting](#cluster-creation-troubleshooting)
* [Run Pyspark or Scala ETL and XGBoost training Notebook on a Dataproc Cluster Accelerated by
GPUs](#run-pyspark-or-scala-notebook-on-a-dataproc-cluster-accelerated-by-gpus)
* [Submit the same sample ETL application as a Spark job to a Dataproc Cluster Accelerated by
GPUs](#submit-spark-jobs-to-a-dataproc-cluster-accelerated-by-gpus)
-* [Build custom Dataproc image to accelerate cluster initialization time](#build-custom-dataproc-image-to-accelerate-cluster-init-time)
-## Quick Start Prerequisites
+We provide RAPIDS tools to analyze clusters and the applications running on [Google Cloud Dataproc](https://cloud.google.com/dataproc), including:
+* [Diagnose GPU Cluster](#diagnose-gpu-cluster)
+* [Bootstrap GPU cluster with optimized settings](#bootstrap-gpu-cluster-with-optimized-settings)
+* [Qualify CPU workloads for GPU acceleration](#qualify-cpu-workloads-for-gpu-acceleration)
+* [Tune applications on GPU cluster](#tune-applications-on-gpu-cluster)
+The prerequisites for the RAPIDS tools include:
* gcloud CLI is installed: https://cloud.google.com/sdk/docs/install
* python 3.8+
* `pip install spark-rapids-user-tools`
-## Qualify CPU Workloads for GPU Acceleration
-
-The [qualification tool](https://nvidia.github.io/spark-rapids/docs/spark-qualification-tool.html) is launched on a Dataproc cluster that has applications that have already run.
-The tool will output the applications recommended for acceleration along with estimated speed-up
-and cost saving metrics. Additionally, it will provide information on how to launch a GPU-
-accelerated cluster to take advantage of the speed-up and cost savings.
-
-Usage: `spark_rapids_dataproc qualification --cluster --region `
-
-Help (to see all options available): `spark_rapids_dataproc qualification --help`
-
-Example output:
-```
-+----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
-| | App Name | App ID | Recommendation | Estimated GPU | Estimated GPU | App | Estimated GPU |
-| | | | | Speedup | Duration(s) | Duration(s) | Savings(%) |
-|----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------|
-| 0 | query24 | application_1664888311321_0011 | Strongly Recommended | 3.49 | 257.18 | 897.68 | 59.70 |
-| 1 | query78 | application_1664888311321_0009 | Strongly Recommended | 3.35 | 113.89 | 382.35 | 58.10 |
-| 2 | query23 | application_1664888311321_0010 | Strongly Recommended | 3.08 | 325.77 | 1004.28 | 54.37 |
-| 3 | query64 | application_1664888311321_0008 | Strongly Recommended | 2.91 | 150.81 | 440.30 | 51.82 |
-| 4 | query50 | application_1664888311321_0003 | Recommended | 2.47 | 101.54 | 250.95 | 43.08 |
-| 5 | query16 | application_1664888311321_0005 | Recommended | 2.36 | 106.33 | 251.95 | 40.63 |
-| 6 | query38 | application_1664888311321_0004 | Recommended | 2.29 | 67.37 | 154.33 | 38.59 |
-| 7 | query87 | application_1664888311321_0006 | Recommended | 2.25 | 75.67 | 170.69 | 37.64 |
-| 8 | query51 | application_1664888311321_0002 | Recommended | 1.53 | 53.94 | 82.63 | 8.18 |
-+----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
-To launch a GPU-accelerated cluster with Spark RAPIDS, add the following to your cluster creation script:
- --initialization-actions=gs://goog-dataproc-initialization-actions-us-central1/gpu/install_gpu_driver.sh,gs://goog-dataproc-initialization-actions-us-central1/rapids/rapids.sh \
- --worker-accelerator type=nvidia-tesla-t4,count=2 \
- --metadata gpu-driver-provider="NVIDIA" \
- --metadata rapids-runtime=SPARK \
- --cuda-version=11.5
-```
-
-## Bootstrap GPU Cluster with Optimized Settings
-
-The bootstrap tool will apply optimized settings for the RAPIDS Accelerator on Apache Spark on a
-GPU cluster for Dataproc. The tool will fetch the characteristics of the cluster -- including
-number of workers, worker cores, worker memory, and GPU accelerator type and count. It will use
-the cluster properties to then determine the optimal settings for running GPU-accelerated Spark
-applications.
-
-Usage: `spark_rapids_dataproc bootstrap --cluster --region `
-
-Help (to see all options available): `spark_rapids_dataproc bootstrap --help`
-
-Example output:
-```
-##### BEGIN : RAPIDS bootstrap settings for gpu-cluster
-spark.executor.cores=16
-spark.executor.memory=32768m
-spark.executor.memoryOverhead=7372m
-spark.rapids.sql.concurrentGpuTasks=2
-spark.rapids.memory.pinnedPool.size=4096m
-spark.sql.files.maxPartitionBytes=512m
-spark.task.resource.gpu.amount=0.0625
-##### END : RAPIDS bootstrap settings for gpu-cluster
-```
-
-A detailed description for bootstrap settings with usage information is available in the [RAPIDS Accelerator for Apache Spark Configuration](https://nvidia.github.io/spark-rapids/docs/configs.html) and [Spark Configuration](https://spark.apache.org/docs/latest/configuration.html) page.
-
-## Tune Applications on GPU Cluster
-
-Once Spark applications have been run on the GPU cluster, the [profiling tool](https://nvidia.github.io/spark-rapids/docs/spark-profiling-tool.html) can be run to
-analyze the event logs of the applications to determine if more optimal settings should be
-configured. The tool will output a per-application set of config settings to be adjusted for
-enhanced performance.
-
-Usage: `spark_rapids_dataproc profiling --cluster --region `
-
-Help (to see all options available): `spark_rapids_dataproc profiling --help`
-
-Example output:
-```
-+--------------------------------+--------------------------------------------------+--------------------------------------------------------------------------------------------------+
-| App ID | Recommendations | Comments |
-+================================+==================================================+==================================================================================================+
-| application_1664894105643_0011 | --conf spark.executor.cores=16 | - 'spark.task.resource.gpu.amount' was not set. |
-| | --conf spark.executor.memory=32768m | - 'spark.rapids.sql.concurrentGpuTasks' was not set. |
-| | --conf spark.executor.memoryOverhead=7372m | - 'spark.rapids.memory.pinnedPool.size' was not set. |
-| | --conf spark.rapids.memory.pinnedPool.size=4096m | - 'spark.executor.memoryOverhead' was not set. |
-| | --conf spark.rapids.sql.concurrentGpuTasks=2 | - 'spark.sql.files.maxPartitionBytes' was not set. |
-| | --conf spark.sql.files.maxPartitionBytes=1571m | - 'spark.sql.shuffle.partitions' was not set. |
-| | --conf spark.sql.shuffle.partitions=200 | |
-| | --conf spark.task.resource.gpu.amount=0.0625 | |
-+--------------------------------+--------------------------------------------------+--------------------------------------------------------------------------------------------------+
-| application_1664894105643_0002 | --conf spark.executor.cores=16 | - 'spark.task.resource.gpu.amount' was not set. |
-| | --conf spark.executor.memory=32768m | - 'spark.rapids.sql.concurrentGpuTasks' was not set. |
-| | --conf spark.executor.memoryOverhead=7372m | - 'spark.rapids.memory.pinnedPool.size' was not set. |
-| | --conf spark.rapids.memory.pinnedPool.size=4096m | - 'spark.executor.memoryOverhead' was not set. |
-| | --conf spark.rapids.sql.concurrentGpuTasks=2 | - 'spark.sql.files.maxPartitionBytes' was not set. |
-| | --conf spark.sql.files.maxPartitionBytes=3844m | - 'spark.sql.shuffle.partitions' was not set. |
-| | --conf spark.sql.shuffle.partitions=200 | |
-| | --conf spark.task.resource.gpu.amount=0.0625 | |
-+--------------------------------+--------------------------------------------------+--------------------------------------------------------------------------------------------------+
-```
-
-## Diagnose GPU Cluster
-
-The diagnostic tool can be run to check a GPU cluster with RAPIDS Accelerator for Apache Spark
-is healthy and ready for Spark jobs, such as checking the version of installed NVIDIA driver,
-cuda-toolkit, RAPIDS Accelerator and running Spark test jobs etc. This tool also can
-be used by the frontline support team for basic diagnostic and troubleshooting before escalating
-to NVIDIA RAPIDS Accelerator for Apache Spark engineering team.
-
-Usage: `spark_rapids_dataproc diagnostic --cluster --region `
-
-Help (to see all options available): `spark_rapids_dataproc diagnostic --help`
-
-Example output:
-
-```text
-*** Running diagnostic function "nv_driver" ***
-Warning: Permanently added 'compute.9009746126288801979' (ECDSA) to the list of known hosts.
-Fri Oct 14 05:17:55 2022
-+-----------------------------------------------------------------------------+
-| NVIDIA-SMI 460.106.00 Driver Version: 460.106.00 CUDA Version: 11.2 |
-|-------------------------------+----------------------+----------------------+
-| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
-| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
-| | | MIG M. |
-|===============================+======================+======================|
-| 0 Tesla T4 On | 00000000:00:04.0 Off | 0 |
-| N/A 48C P8 10W / 70W | 0MiB / 15109MiB | 0% Default |
-| | | N/A |
-+-------------------------------+----------------------+----------------------+
-
-+-----------------------------------------------------------------------------+
-| Processes: |
-| GPU GI CI PID Type Process name GPU Memory |
-| ID ID Usage |
-|=============================================================================|
-| No running processes found |
-+-----------------------------------------------------------------------------+
-NVRM version: NVIDIA UNIX x86_64 Kernel Module 460.106.00 Tue Sep 28 12:05:58 UTC 2021
-GCC version: gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04)
-Connection to 34.68.242.247 closed.
-*** Check "nv_driver": PASS ***
-*** Running diagnostic function "nv_driver" ***
-Warning: Permanently added 'compute.6788823627063447738' (ECDSA) to the list of known hosts.
-Fri Oct 14 05:18:02 2022
-+-----------------------------------------------------------------------------+
-| NVIDIA-SMI 460.106.00 Driver Version: 460.106.00 CUDA Version: 11.2 |
-|-------------------------------+----------------------+----------------------+
-| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
-| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
-| | | MIG M. |
-|===============================+======================+======================|
-| 0 Tesla T4 On | 00000000:00:04.0 Off | 0 |
-| N/A 35C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |
-| | | N/A |
-+-------------------------------+----------------------+----------------------+
-
-+-----------------------------------------------------------------------------+
-| Processes: |
-| GPU GI CI PID Type Process name GPU Memory |
-| ID ID Usage |
-|=============================================================================|
-| No running processes found |
-+-----------------------------------------------------------------------------+
-NVRM version: NVIDIA UNIX x86_64 Kernel Module 460.106.00 Tue Sep 28 12:05:58 UTC 2021
-GCC version: gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04)
-Connection to 34.123.223.104 closed.
-*** Check "nv_driver": PASS ***
-*** Running diagnostic function "cuda_version" ***
-Connection to 34.68.242.247 closed.
-found cuda major version: 11
-*** Check "cuda_version": PASS ***
-*** Running diagnostic function "cuda_version" ***
-Connection to 34.123.223.104 closed.
-found cuda major version: 11
-*** Check "cuda_version": PASS ***
-...
-********************************************************************************
-Overall check result: PASS
-```
-
-Please note that the diagnostic tool supports the following:
-
-* Dataproc 2.0 with image of Debian 10 or Ubuntu 18.04 (Rocky8 support is coming soon)
-* GPU cluster that must have 1 worker node at least. Single node cluster (1 master, 0 workers) is
- not supported
-
## Create a Dataproc Cluster Accelerated by GPUs
-
- You can use [Cloud Shell](https://cloud.google.com/shell) to execute shell commands that will
- create a Dataproc cluster. Cloud Shell contains command line tools for interacting with Google
- Cloud Platform, including gcloud and gsutil. Alternatively, you can install [GCloud
- SDK](https://cloud.google.com/sdk/install) on your machine. From the Cloud Shell, users will need
- to enable services within your project. Enable the Compute and Dataproc APIs in order to access
- Dataproc, and enable the Storage API as you’ll need a Google Cloud Storage bucket to house your
- data. This may take several minutes.
+
+You can use [Cloud Shell](https://cloud.google.com/shell) to execute shell commands that will
+create a Dataproc cluster. Cloud Shell contains command line tools for interacting with Google
+Cloud Platform, including gcloud and gsutil. Alternatively, you can install [GCloud
+SDK](https://cloud.google.com/sdk/install) on your machine. From the Cloud Shell, users will need
+to enable services within your project. Enable the Compute and Dataproc APIs in order to access
+Dataproc, and enable the Storage API as you’ll need a Google Cloud Storage bucket to house your
+data. This may take several minutes.
```bash
gcloud services enable compute.googleapis.com
@@ -235,7 +51,7 @@ Dataproc cluster. Dataproc supports multiple different GPU types depending on yo
Generally, T4 is a good option for use with the RAPIDS Accelerator for Spark. We also support
MIG on the Ampere architecture GPUs like the A100. Using
[MIG](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/) you can request an A100 and split
-it up into multiple different compute instances and it runs like you have multiple separate GPUs.
+it up into multiple different compute instances, and it runs like you have multiple separate GPUs.
The example configurations below will allow users to run any of the [notebook
demos](https://github.com/NVIDIA/spark-rapids/tree/main/docs/demo/GCP) on GCP. Adjust the sizes and
@@ -257,22 +73,22 @@ The script below will initialize with the following:
### Create a Dataproc Cluster using T4's
* One 16-core master node and 5 32-core worker nodes
-* Four NVIDIA T4 for each worker node
+* Two NVIDIA T4 for each worker node
```bash
export REGION=[Your Preferred GCP Region]
export GCS_BUCKET=[Your GCS Bucket]
export CLUSTER_NAME=[Your Cluster Name]
export NUM_GPUS=2
- export NUM_WORKERS=4
+ export NUM_WORKERS=5
gcloud dataproc clusters create $CLUSTER_NAME \
--region=$REGION \
--image-version=2.0-ubuntu18 \
- --master-machine-type=n1-standard-16 \
+ --master-machine-type=n2-standard-16 \
--num-workers=$NUM_WORKERS \
--worker-accelerator=type=nvidia-tesla-t4,count=$NUM_GPUS \
- --worker-machine-type=n1-highmem-32\
+ --worker-machine-type=n2-highmem-32\
--num-worker-local-ssds=4 \
--initialization-actions=gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \
--optional-components=JUPYTER,ZEPPELIN \
@@ -294,6 +110,88 @@ Google Cloud Console to see the progress.
If you'd like to further accelerate init time to 4-5 minutes, create a custom Dataproc image using
[this](#build-custom-dataproc-image-to-accelerate-cluster-init-time) guide.
+### Build custom dataproc image to accelerate cluster init time
+In order to accelerate cluster init time to 3-4 minutes, we need to build a custom Dataproc image
+that already has NVIDIA drivers and CUDA toolkit installed, with RAPIDS deployed. The custom image
+could also be used in an air gap environment. In this section, we will be using [these instructions
+from GCP](https://cloud.google.com/dataproc/docs/guides/dataproc-images) to create a custom image.
+
+Currently, we can directly download the [spark-rapids.sh](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/spark-rapids)
+script to create the Dataproc image:
+
+Google provides a `generate_custom_image.py` script that:
+- Launches a temporary Compute Engine VM instance with the specified Dataproc base image.
+- Then runs the customization script inside the VM instance to install custom packages and/or
+update configurations.
+- After the customization script finishes, it shuts down the VM instance and creates a Dataproc
+ custom image from the disk of the VM instance.
+- The temporary VM is deleted after the custom image is created.
+- The custom image is saved and can be used to create Dataproc clusters.
+
+Download `spark-rapids.sh` in this repo. The script uses
+Google's `generate_custom_image.py` script. This step may take 20-25 minutes to complete.
+
+```bash
+git clone https://github.com/GoogleCloudDataproc/custom-images
+cd custom-images
+
+export CUSTOMIZATION_SCRIPT=/path/to/spark-rapids.sh
+export ZONE=[Your Preferred GCP Zone]
+export GCS_BUCKET=[Your GCS Bucket]
+export IMAGE_NAME=sample-20-ubuntu18-gpu-t4
+export DATAPROC_VERSION=2.0-ubuntu18
+export GPU_NAME=nvidia-tesla-t4
+export GPU_COUNT=1
+
+python generate_custom_image.py \
+ --image-name $IMAGE_NAME \
+ --dataproc-version $DATAPROC_VERSION \
+ --customization-script $CUSTOMIZATION_SCRIPT \
+ --no-smoke-test \
+ --zone $ZONE \
+ --gcs-bucket $GCS_BUCKET \
+ --machine-type n2-standard-4 \
+ --accelerator type=$GPU_NAME,count=$GPU_COUNT \
+ --disk-size 200 \
+ --subnet default
+```
+
+See [here](https://cloud.google.com/dataproc/docs/guides/dataproc-images#running_the_code) for more
+details on `generate_custom_image.py` script arguments and
+[here](https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-versions) for dataproc
+version description.
+
+The image `sample-20-ubuntu18-gpu-t4` is now ready and can be viewed in the GCP console under
+`Compute Engine > Storage > Images`. The next step is to launch the cluster using this new image
+and new initialization actions (that do not install NVIDIA drivers since we are already past that
+step).
+
+Move this to your own bucket. Let's launch the cluster:
+
+```bash
+export REGION=[Your Preferred GCP Region]
+export GCS_BUCKET=[Your GCS Bucket]
+export CLUSTER_NAME=[Your Cluster Name]
+export NUM_GPUS=1
+export NUM_WORKERS=2
+
+gcloud dataproc clusters create $CLUSTER_NAME \
+ --region=$REGION \
+ --image=sample-20-ubuntu18-gpu-t4 \
+ --master-machine-type=n2-standard-4 \
+ --num-workers=$NUM_WORKERS \
+ --worker-accelerator=type=nvidia-tesla-t4,count=$NUM_GPUS \
+ --worker-machine-type=n2-standard-4 \
+ --num-worker-local-ssds=1 \
+ --optional-components=JUPYTER,ZEPPELIN \
+ --metadata=rapids-runtime=SPARK \
+ --bucket=$GCS_BUCKET \
+ --enable-component-gateway \
+ --subnet=default
+```
+
+The new cluster should be up and running within 3-4 minutes!
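+
+To confirm the cluster reached the `RUNNING` state, you can describe it with gcloud; this simple check reuses the variables exported above:
+
+```bash
+# Check the provisioning status of the new cluster
+gcloud dataproc clusters describe $CLUSTER_NAME --region=$REGION \
+  --format="value(status.state)"
+```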
+
### Create a Dataproc Cluster using MIG with A100's
* One 16-core master node and 5 12-core worker nodes
* 1 NVIDIA A100 for each worker node, split into 2 MIG instances using
@@ -311,13 +209,13 @@ gcloud dataproc clusters create $CLUSTER_NAME \
--region=$REGION \
--zone=$ZONE \
--image-version=2.0-ubuntu18 \
- --master-machine-type=n1-standard-16 \
+ --master-machine-type=n2-standard-16 \
--num-workers=$NUM_WORKERS \
--worker-accelerator=type=nvidia-tesla-a100,count=$NUM_GPUS \
--worker-machine-type=a2-highgpu-1g \
--num-worker-local-ssds=4 \
--initialization-actions=gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \
- --metadata=startup-script-url=gs://goog-dataproc-initialization-actions-${REGION}/gpu/mig.sh \
+ --metadata=startup-script-url=gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/mig.sh \
--optional-components=JUPYTER,ZEPPELIN \
--metadata=rapids-runtime=SPARK \
--bucket=$GCS_BUCKET \
@@ -337,18 +235,18 @@ metadata parameter `MIG_CGI`. Below is an example of using a profile name and a
```
This may take around 10-15 minutes to complete. You can navigate to the Dataproc clusters tab in
-the Google Cloud Console to see the progress.
+the Google Cloud Console to see the progress.
![Dataproc Cluster](../img/GCP/dataproc-cluster.png)
If you'd like to further accelerate init time to 4-5 minutes, create a custom Dataproc image using
-[this](#build-custom-dataproc-image-to-accelerate-cluster-init-time) guide.
+[this](#build-custom-dataproc-image-to-accelerate-cluster-init-time) guide.
### Cluster creation troubleshooting
-If you encounter an error related to GPUs not being available because of your account quotas, please
+If you encounter an error related to GPUs not being available because of your account quotas, please
go to this page for updating your quotas: [Quotas and limits](https://cloud.google.com/compute/quotas).
-If you encounter an error related to GPUs not available in the specific region or zone, you will
+If you encounter an error related to GPUs not available in the specific region or zone, you will
need to update the REGION or ZONE parameter in the cluster creation command.
## Run PySpark or Scala Notebook on a Dataproc Cluster Accelerated by GPUs
@@ -377,7 +275,7 @@ Once the data is prepared, we use the [Mortgage XGBoost4j Scala
Notebook](../demo/GCP/mortgage-xgboost4j-gpu-scala.ipynb) in Dataproc's jupyter notebook to execute
the training job on GPUs. Scala based XGBoost examples use [DLMC
XGBoost](https://github.com/dmlc/xgboost). For a PySpark based XGBoost example, please refer to
-[Spark-RAPIDS-examples](https://github.com/NVIDIA/spark-rapids-examples/blob/main/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md) that
+[Spark-RAPIDS-examples](https://github.com/NVIDIA/spark-rapids-examples/blob/main/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md) that
make sure the required libraries are installed.
The training time should be around 680 seconds (1/7 of CPU execution time with same config). This
@@ -434,92 +332,190 @@ gcloud dataproc jobs submit spark \
-maxDepth=8
```
-## Dataproc Hub in AI Platform Notebook to Dataproc cluster
-With the integration between AI Platform Notebooks and Dataproc, users can create a [Dataproc Hub
-notebook](https://cloud.google.com/blog/products/data-analytics/administering-jupyter-notebooks-for-spark-workloads-on-dataproc).
-The AI platform will connect to a Dataproc cluster through a yaml configuration.
+## Diagnose GPU Cluster
-In the future, users will be able to provision a Dataproc cluster through DataprocHub notebook. You
-can use example [pyspark notebooks](../demo/GCP/Mortgage-ETL.ipynb) to experiment.
+The diagnostic tool can be run to check a GPU cluster with RAPIDS Accelerator for Apache Spark
+is healthy and ready for Spark jobs, such as checking the version of installed NVIDIA driver,
+cuda-toolkit, RAPIDS Accelerator and running Spark test jobs etc. This tool also can
+be used by the front line support team for basic diagnostic and troubleshooting before escalating
+to NVIDIA RAPIDS Accelerator for Apache Spark engineering team.
-## Build custom dataproc image to accelerate cluster init time
-In order to accelerate cluster init time to 3-4 minutes, we need to build a custom Dataproc image
-that already has NVIDIA drivers and CUDA toolkit installed, with RAPIDS deployed. The custom image
-could also be used in an air gap environment. In this section, we will be using [these instructions
-from GCP](https://cloud.google.com/dataproc/docs/guides/dataproc-images) to create a custom image.
+Usage: `spark_rapids_dataproc diagnostic --cluster <cluster-name> --region <region>`
-Currently, we can directly download the [spark-rapids.sh](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/spark-rapids)
-script to create the Dataproc image:
+Help (to see all options available): `spark_rapids_dataproc diagnostic --help`
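+
+For example, with placeholder cluster and region values:
+
+```bash
+# Illustrative only: substitute your Dataproc cluster name and region
+spark_rapids_dataproc diagnostic --cluster my-gpu-cluster --region us-central1
+```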
-Google provides a `generate_custom_image.py` script that:
-- Launches a temporary Compute Engine VM instance with the specified Dataproc base image.
-- Then runs the customization script inside the VM instance to install custom packages and/or
-update configurations.
-- After the customization script finishes, it shuts down the VM instance and creates a Dataproc
- custom image from the disk of the VM instance.
-- The temporary VM is deleted after the custom image is created.
-- The custom image is saved and can be used to create Dataproc clusters.
+Example output:
-Download `spark-rapids.sh` in this repo. The script uses
-Google's `generate_custom_image.py` script. This step may take 20-25 minutes to complete.
+```text
+*** Running diagnostic function "nv_driver" ***
+Warning: Permanently added 'compute.9009746126288801979' (ECDSA) to the list of known hosts.
+Fri Oct 14 05:17:55 2022
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 460.106.00 Driver Version: 460.106.00 CUDA Version: 11.2 |
+|-------------------------------+----------------------+----------------------+
+| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|===============================+======================+======================|
+| 0 Tesla T4 On | 00000000:00:04.0 Off | 0 |
+| N/A 48C P8 10W / 70W | 0MiB / 15109MiB | 0% Default |
+| | | N/A |
++-------------------------------+----------------------+----------------------+
-```bash
-git clone https://github.com/GoogleCloudDataproc/custom-images
-cd custom-images
++-----------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=============================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------+
+NVRM version: NVIDIA UNIX x86_64 Kernel Module 460.106.00 Tue Sep 28 12:05:58 UTC 2021
+GCC version: gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04)
+Connection to 34.68.242.247 closed.
+*** Check "nv_driver": PASS ***
+*** Running diagnostic function "nv_driver" ***
+Warning: Permanently added 'compute.6788823627063447738' (ECDSA) to the list of known hosts.
+Fri Oct 14 05:18:02 2022
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 460.106.00 Driver Version: 460.106.00 CUDA Version: 11.2 |
+|-------------------------------+----------------------+----------------------+
+| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|===============================+======================+======================|
+| 0 Tesla T4 On | 00000000:00:04.0 Off | 0 |
+| N/A 35C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |
+| | | N/A |
++-------------------------------+----------------------+----------------------+
-export CUSTOMIZATION_SCRIPT=/path/to/spark-rapids.sh
-export ZONE=[Your Preferred GCP Zone]
-export GCS_BUCKET=[Your GCS Bucket]
-export IMAGE_NAME=sample-20-ubuntu18-gpu-t4
-export DATAPROC_VERSION=2.0-ubuntu18
-export GPU_NAME=nvidia-tesla-t4
-export GPU_COUNT=1
++-----------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=============================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------+
+NVRM version: NVIDIA UNIX x86_64 Kernel Module 460.106.00 Tue Sep 28 12:05:58 UTC 2021
+GCC version: gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04)
+Connection to 34.123.223.104 closed.
+*** Check "nv_driver": PASS ***
+*** Running diagnostic function "cuda_version" ***
+Connection to 34.68.242.247 closed.
+found cuda major version: 11
+*** Check "cuda_version": PASS ***
+*** Running diagnostic function "cuda_version" ***
+Connection to 34.123.223.104 closed.
+found cuda major version: 11
+*** Check "cuda_version": PASS ***
+...
+********************************************************************************
+Overall check result: PASS
+```
-python generate_custom_image.py \
- --image-name $IMAGE_NAME \
- --dataproc-version $DATAPROC_VERSION \
- --customization-script $CUSTOMIZATION_SCRIPT \
- --no-smoke-test \
- --zone $ZONE \
- --gcs-bucket $GCS_BUCKET \
- --machine-type n1-standard-4 \
- --accelerator type=$GPU_NAME,count=$GPU_COUNT \
- --disk-size 200 \
- --subnet default
+Please note that the diagnostic tool supports the following:
+
+* Dataproc 2.0 with image of Debian 10 or Ubuntu 18.04 (Rocky8 support is coming soon)
+* GPU clusters must have at least 1 worker node. Single node clusters (1 master, 0 workers) are
+  not supported
+
+## Bootstrap GPU Cluster with Optimized Settings
+
+The bootstrap tool will apply optimized settings for the RAPIDS Accelerator on Apache Spark on a
+GPU cluster for Dataproc. The tool will fetch the characteristics of the cluster -- including
+number of workers, worker cores, worker memory, and GPU accelerator type and count. It will use
+the cluster properties to then determine the optimal settings for running GPU-accelerated Spark
+applications.
+
+Usage: `spark_rapids_dataproc bootstrap --cluster <cluster-name> --region <region>`
+
+Help (to see all options available): `spark_rapids_dataproc bootstrap --help`
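+
+A concrete call might look like the following sketch, again with placeholder values:
+
+```bash
+# Illustrative only: substitute your Dataproc cluster name and region
+spark_rapids_dataproc bootstrap --cluster my-gpu-cluster --region us-central1
+```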
+
+Example output:
+```
+##### BEGIN : RAPIDS bootstrap settings for gpu-cluster
+spark.executor.cores=16
+spark.executor.memory=32768m
+spark.executor.memoryOverhead=7372m
+spark.rapids.sql.concurrentGpuTasks=2
+spark.rapids.memory.pinnedPool.size=4096m
+spark.sql.files.maxPartitionBytes=512m
+spark.task.resource.gpu.amount=0.0625
+##### END : RAPIDS bootstrap settings for gpu-cluster
```
-See [here](https://cloud.google.com/dataproc/docs/guides/dataproc-images#running_the_code) for more
-details on `generate_custom_image.py` script arguments and
-[here](https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-versions) for dataproc
-version description.
+A detailed description for bootstrap settings with usage information is available in the
+[RAPIDS Accelerator for Apache Spark Configuration](https://nvidia.github.io/spark-rapids/docs/configs.html)
+and [Spark Configuration](https://spark.apache.org/docs/latest/configuration.html) page.
-The image `sample-20-ubuntu18-gpu-t4` is now ready and can be viewed in the GCP console under
-`Compute Engine > Storage > Images`. The next step is to launch the cluster using this new image
-and new initialization actions (that do not install NVIDIA drivers since we are already past that
-step).
+## Qualify CPU Workloads for GPU Acceleration
-Move this to your own bucket. Let's launch the cluster:
+The [qualification tool](https://pypi.org/project/spark-rapids-user-tools/) is launched on a Dataproc cluster on which applications have already run.
+The tool will output the applications recommended for acceleration along with estimated speed-up
+and cost saving metrics. Additionally, it will provide information on how to launch a GPU-
+accelerated cluster to take advantage of the speed-up and cost savings.
-```bash
-export REGION=[Your Preferred GCP Region]
-export GCS_BUCKET=[Your GCS Bucket]
-export CLUSTER_NAME=[Your Cluster Name]
-export NUM_GPUS=1
-export NUM_WORKERS=2
+Usage: `spark_rapids_dataproc qualification --cluster <cluster-name> --region <region>`
-gcloud dataproc clusters create $CLUSTER_NAME \
- --region=$REGION \
- --image=sample-20-ubuntu18-gpu-t4 \
- --master-machine-type=n1-standard-4 \
- --num-workers=$NUM_WORKERS \
- --worker-accelerator=type=nvidia-tesla-t4,count=$NUM_GPUS \
- --worker-machine-type=n1-standard-4 \
- --num-worker-local-ssds=1 \
- --optional-components=JUPYTER,ZEPPELIN \
- --metadata=rapids-runtime=SPARK \
- --bucket=$GCS_BUCKET \
- --enable-component-gateway \
- --subnet=default
+Help (to see all options available): `spark_rapids_dataproc qualification --help`
+
+Example output:
+```
++----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
+| | App Name | App ID | Recommendation | Estimated GPU | Estimated GPU | App | Estimated GPU |
+| | | | | Speedup | Duration(s) | Duration(s) | Savings(%) |
+|----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------|
+| 0 | query24 | application_1664888311321_0011 | Strongly Recommended | 3.49 | 257.18 | 897.68 | 59.70 |
+| 1 | query78 | application_1664888311321_0009 | Strongly Recommended | 3.35 | 113.89 | 382.35 | 58.10 |
+| 2 | query23 | application_1664888311321_0010 | Strongly Recommended | 3.08 | 325.77 | 1004.28 | 54.37 |
+| 3 | query64 | application_1664888311321_0008 | Strongly Recommended | 2.91 | 150.81 | 440.30 | 51.82 |
+| 4 | query50 | application_1664888311321_0003 | Recommended | 2.47 | 101.54 | 250.95 | 43.08 |
+| 5 | query16 | application_1664888311321_0005 | Recommended | 2.36 | 106.33 | 251.95 | 40.63 |
+| 6 | query38 | application_1664888311321_0004 | Recommended | 2.29 | 67.37 | 154.33 | 38.59 |
+| 7 | query87 | application_1664888311321_0006 | Recommended | 2.25 | 75.67 | 170.69 | 37.64 |
+| 8 | query51 | application_1664888311321_0002 | Recommended | 1.53 | 53.94 | 82.63 | 8.18 |
++----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
+To launch a GPU-accelerated cluster with Spark RAPIDS, add the following to your cluster creation script:
+ --initialization-actions=gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \
+ --worker-accelerator type=nvidia-tesla-t4,count=2 \
+ --metadata gpu-driver-provider="NVIDIA" \
+ --metadata rapids-runtime=SPARK \
+ --cuda-version=11.5
```
-The new cluster should be up and running within 3-4 minutes!
+Please refer to the [Qualification Tool](https://nvidia.github.io/spark-rapids/docs/spark-qualification-tool.html) guide for running the qualification tool in other environments.
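+
+For example, a minimal invocation against an existing cluster might look like the following
+(the cluster name and region below are illustrative placeholders only):
+
+```bash
+# Analyze the Spark applications that have already run on this Dataproc cluster.
+spark_rapids_dataproc qualification \
+    --cluster my-dataproc-cluster \
+    --region us-central1
+```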
+
+## Tune Applications on GPU Cluster
+
+Once Spark applications have been run on the GPU cluster, the [profiling tool](https://nvidia.github.io/spark-rapids/docs/spark-profiling-tool.html) can be run to
+analyze the event logs of the applications to determine if more optimal settings should be
+configured. The tool will output a per-application set of config settings to be adjusted for
+enhanced performance.
+
+Usage: `spark_rapids_dataproc profiling --cluster <CLUSTER_NAME> --region <REGION>`
+
+Help (to see all options available): `spark_rapids_dataproc profiling --help`
+
+Example output:
+```
++--------------------------------+--------------------------------------------------+--------------------------------------------------------------------------------------------------+
+| App ID | Recommendations | Comments |
++================================+==================================================+==================================================================================================+
+| application_1664894105643_0011 | --conf spark.executor.cores=16 | - 'spark.task.resource.gpu.amount' was not set. |
+| | --conf spark.executor.memory=32768m | - 'spark.rapids.sql.concurrentGpuTasks' was not set. |
+| | --conf spark.executor.memoryOverhead=7372m | - 'spark.rapids.memory.pinnedPool.size' was not set. |
+| | --conf spark.rapids.memory.pinnedPool.size=4096m | - 'spark.executor.memoryOverhead' was not set. |
+| | --conf spark.rapids.sql.concurrentGpuTasks=2 | - 'spark.sql.files.maxPartitionBytes' was not set. |
+| | --conf spark.sql.files.maxPartitionBytes=1571m | - 'spark.sql.shuffle.partitions' was not set. |
+| | --conf spark.sql.shuffle.partitions=200 | |
+| | --conf spark.task.resource.gpu.amount=0.0625 | |
++--------------------------------+--------------------------------------------------+--------------------------------------------------------------------------------------------------+
+| application_1664894105643_0002 | --conf spark.executor.cores=16 | - 'spark.task.resource.gpu.amount' was not set. |
+| | --conf spark.executor.memory=32768m | - 'spark.rapids.sql.concurrentGpuTasks' was not set. |
+| | --conf spark.executor.memoryOverhead=7372m | - 'spark.rapids.memory.pinnedPool.size' was not set. |
+| | --conf spark.rapids.memory.pinnedPool.size=4096m | - 'spark.executor.memoryOverhead' was not set. |
+| | --conf spark.rapids.sql.concurrentGpuTasks=2 | - 'spark.sql.files.maxPartitionBytes' was not set. |
+| | --conf spark.sql.files.maxPartitionBytes=3844m | - 'spark.sql.shuffle.partitions' was not set. |
+| | --conf spark.sql.shuffle.partitions=200 | |
+| | --conf spark.task.resource.gpu.amount=0.0625 | |
++--------------------------------+--------------------------------------------------+--------------------------------------------------------------------------------------------------+
+```
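+
+As a minimal sketch, the recommended settings from the sample output above could be applied when
+re-submitting the application (the application class and jar below are hypothetical):
+
+```bash
+spark-submit \
+  --conf spark.executor.cores=16 \
+  --conf spark.executor.memory=32768m \
+  --conf spark.executor.memoryOverhead=7372m \
+  --conf spark.rapids.memory.pinnedPool.size=4096m \
+  --conf spark.rapids.sql.concurrentGpuTasks=2 \
+  --conf spark.sql.files.maxPartitionBytes=1571m \
+  --conf spark.sql.shuffle.partitions=200 \
+  --conf spark.task.resource.gpu.amount=0.0625 \
+  --class com.example.MyEtlJob my-application.jar
+```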
diff --git a/docs/get-started/getting-started-on-prem.md b/docs/get-started/getting-started-on-prem.md
index 5f6f0ce0616..31e8e242d20 100644
--- a/docs/get-started/getting-started-on-prem.md
+++ b/docs/get-started/getting-started-on-prem.md
@@ -53,13 +53,13 @@ CUDA and will not run on other versions. The jars use a classifier to keep them
- CUDA 11.x => classifier cuda11
For example, here is a sample version of the jar with CUDA 11.x support:
-- rapids-4-spark_2.12-23.02.0-cuda11.jar
+- rapids-4-spark_2.12-23.04.0-cuda11.jar
For simplicity export the location to this jar. This example assumes the sample jar above has
been placed in the `/opt/sparkRapidsPlugin` directory:
```shell
export SPARK_RAPIDS_DIR=/opt/sparkRapidsPlugin
-export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_DIR}/rapids-4-spark_2.12-23.02.0-cuda11.jar
+export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_DIR}/rapids-4-spark_2.12-23.04.0-cuda11.jar
```
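+
+For example, the exported variable can later be used to put the plugin on the classpath and enable
+it when launching Spark (a minimal sketch; adjust the master and other settings for your cluster):
+
+```shell
+$SPARK_HOME/bin/spark-shell \
+  --jars ${SPARK_RAPIDS_PLUGIN_JAR} \
+  --conf spark.plugins=com.nvidia.spark.SQLPlugin
+```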
## Install the GPU Discovery Script
diff --git a/docs/get-started/getting-started-workload-qualification.md b/docs/get-started/getting-started-workload-qualification.md
index 2a5125303ca..624ca29349d 100644
--- a/docs/get-started/getting-started-workload-qualification.md
+++ b/docs/get-started/getting-started-workload-qualification.md
@@ -39,8 +39,8 @@ you focus on the Spark applications which are best suited for the GPU.
The profiling tool outputs SQL plan metrics and also prints out actual query plans to provide more
insights. In the following example the profiling tool output for a specific Spark application shows
-that it has a query with a large `HashAggregate` and `SortMergeJoin`. Those are indicators for a
-good candidate application for the RAPIDS Accelerator.
+that it has a query with a large (processing millions of rows) `HashAggregate` and `SortMergeJoin`.
+Those are indicators for a good candidate application for the RAPIDS Accelerator.
```
+--------+-----+------+----------------------------------------------------+-------------+------------------------------------+-------------+----------+
diff --git a/docs/img/AWS-EMR/EMR_notebook_1.png b/docs/img/AWS-EMR/EMR_notebook_1.png
deleted file mode 100644
index 18dc7a95921..00000000000
Binary files a/docs/img/AWS-EMR/EMR_notebook_1.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_1.png b/docs/img/AWS-EMR/RAPIDS_EMR_GUI_1.png
deleted file mode 100644
index ec6e3eab036..00000000000
Binary files a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_1.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_2.png b/docs/img/AWS-EMR/RAPIDS_EMR_GUI_2.png
deleted file mode 100644
index 83d0b577af0..00000000000
Binary files a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_2.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_2b.png b/docs/img/AWS-EMR/RAPIDS_EMR_GUI_2b.png
deleted file mode 100644
index ffd1253b974..00000000000
Binary files a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_2b.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_3.png b/docs/img/AWS-EMR/RAPIDS_EMR_GUI_3.png
deleted file mode 100644
index 5ac22ee1583..00000000000
Binary files a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_3.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_4.png b/docs/img/AWS-EMR/RAPIDS_EMR_GUI_4.png
deleted file mode 100644
index 1953bf68b30..00000000000
Binary files a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_4.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_5.png b/docs/img/AWS-EMR/RAPIDS_EMR_GUI_5.png
deleted file mode 100644
index 8e0e04671c1..00000000000
Binary files a/docs/img/AWS-EMR/RAPIDS_EMR_GUI_5.png and /dev/null differ
diff --git a/docs/img/AWS-EMR/bootstrap-action.png b/docs/img/AWS-EMR/bootstrap-action.png
new file mode 100644
index 00000000000..b4eaf85f882
Binary files /dev/null and b/docs/img/AWS-EMR/bootstrap-action.png differ
diff --git a/docs/img/AWS-EMR/cluster-configuration.png b/docs/img/AWS-EMR/cluster-configuration.png
new file mode 100644
index 00000000000..136ffc191bf
Binary files /dev/null and b/docs/img/AWS-EMR/cluster-configuration.png differ
diff --git a/docs/img/AWS-EMR/name-and-applications.png b/docs/img/AWS-EMR/name-and-applications.png
new file mode 100644
index 00000000000..1003b7697af
Binary files /dev/null and b/docs/img/AWS-EMR/name-and-applications.png differ
diff --git a/docs/img/AWS-EMR/networking.png b/docs/img/AWS-EMR/networking.png
new file mode 100644
index 00000000000..36acf522fab
Binary files /dev/null and b/docs/img/AWS-EMR/networking.png differ
diff --git a/docs/img/AWS-EMR/notebook-workspace-creation.png b/docs/img/AWS-EMR/notebook-workspace-creation.png
new file mode 100644
index 00000000000..edef7276911
Binary files /dev/null and b/docs/img/AWS-EMR/notebook-workspace-creation.png differ
diff --git a/docs/img/AWS-EMR/ssh-key-pair.png b/docs/img/AWS-EMR/ssh-key-pair.png
new file mode 100644
index 00000000000..fa75588e3ff
Binary files /dev/null and b/docs/img/AWS-EMR/ssh-key-pair.png differ
diff --git a/docs/img/Databricks/sparkconfig.png b/docs/img/Databricks/sparkconfig.png
index d5c1070c4d0..f05b7d632fb 100644
Binary files a/docs/img/Databricks/sparkconfig.png and b/docs/img/Databricks/sparkconfig.png differ
diff --git a/docs/index.md b/docs/index.md
index b2bf634617b..0e099a609a8 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -9,7 +9,10 @@ description: This site serves as a collection of documentation about the RAPIDS
The RAPIDS Accelerator for Apache Spark leverages GPUs to accelerate processing via the
[RAPIDS libraries](http://rapids.ai).
-As data scientists shift from using traditional analytics to leveraging AI applications that better model complex market demands, traditional CPU-based processing can no longer keep up without compromising either speed or cost. The growing adoption of AI in analytics has created the need for a new framework to process data quickly and cost efficiently with GPUs.
+As data scientists shift from using traditional analytics to leveraging AI (DL/ML) applications that
+better model complex market demands, traditional CPU-based processing can no longer keep up without
+compromising either speed or cost. The growing adoption of AI in analytics has created the need for
+a new framework to process data quickly and cost-efficiently with GPUs.
The RAPIDS Accelerator for Apache Spark combines the power of the RAPIDS cuDF library and the scale of the Spark distributed computing framework. The RAPIDS Accelerator library also has a built-in accelerated shuffle based on UCX that can be configured to leverage GPU-to-GPU communication and RDMA capabilities.
@@ -20,6 +23,8 @@ Rapids Accelerator for Apache Spark reaps the benefit of GPU performance while s
[demo](https://databricks.com/session_na20/deep-dive-into-gpu-support-in-apache-spark-3-x). Costs
based on Cloud T4 GPU instance market price.
+Please refer to the [spark-rapids-examples repo](https://github.com/NVIDIA/spark-rapids-examples/tree/main/examples/XGBoost-Examples)
+for details of this example job.
## Ease of Use
Run your existing Apache Spark applications with no code change. Launch Spark with the RAPIDS Accelerator for Apache Spark plugin jar and enable a configuration setting:
diff --git a/docs/spark-profiling-tool.md b/docs/spark-profiling-tool.md
index 5f5b6a28b90..943b90c7323 100644
--- a/docs/spark-profiling-tool.md
+++ b/docs/spark-profiling-tool.md
@@ -33,7 +33,7 @@ more information.
The Profiling tool requires the Spark 3.x jars to be able to run but does not need an Apache Spark runtime.
If you do not already have Spark 3.x installed,
you can download the Spark distribution to any machine and include the jars in the classpath.
-- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.02.0/)
+- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.04.0/)
- [Download Apache Spark 3.x](http://spark.apache.org/downloads.html) - Spark 3.1.1 for Apache Hadoop is recommended
If you want to compile the jars, please refer to the instructions [here](./spark-qualification-tool.md#How-to-compile-the-tools-jar).
@@ -54,7 +54,7 @@ There are 3 modes of operation for the Profiling tool:
on each application individually and outputs a file per application
```bash
- Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
+ Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.profiling.ProfileMain [options]
```
@@ -66,7 +66,7 @@ There are 3 modes of operation for the Profiling tool:
together and you get one file for all applications.
```bash
- Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
+ Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.profiling.ProfileMain --combined
```
@@ -76,7 +76,7 @@ There are 3 modes of operation for the Profiling tool:
The Compare mode will use more memory if comparing lots of applications.
```bash
- Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
+ Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.profiling.ProfileMain --compare
```
@@ -583,7 +583,7 @@ The _Auto-Tuner_ output has 2 main sections:
```
Profiling tool for the RAPIDS Accelerator and Apache Spark
-Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
+Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.profiling.ProfileMain [options]
diff --git a/docs/spark-qualification-tool.md b/docs/spark-qualification-tool.md
index 330d6f8331e..54c346a15c5 100644
--- a/docs/spark-qualification-tool.md
+++ b/docs/spark-qualification-tool.md
@@ -18,11 +18,24 @@ This tool is intended to give the users a starting point and does not guarantee
queries or applications with the highest _recommendation_ will actually be accelerated the most. Currently,
it reports by looking at the amount of time spent in tasks of SQL Dataframe operations.
+The estimates for GPU duration are available for different environments and are based on benchmarks run in the
+applicable environments. Here is the cluster information for the ETL benchmarks used for the estimates:
+
+| Environment | CPU Cluster | GPU Cluster |
+|------------------|-------------------|--------------------------------|
+| On-prem | 8x 128-core | 8x 128-core + 8x A100 40 GB |
+| Dataproc | 4x n1-standard-32 | 4x n1-standard-32 + 8x T4 16GB |
+| EMR | 8x m5d.8xlarge | 4x g4dn.12xlarge |
+| Databricks AWS   | 8x m6gd.8xlarge   | 8x g5.8xlarge                  |
+| Databricks Azure | 8x E8ds_v4 | 8x NC8as_T4_v3 |
+
+Note that all benchmarks were run using the [NDS benchmark](https://github.com/NVIDIA/spark-rapids-benchmarks/tree/dev/nds) at SF3K (3 TB).
+
> **Disclaimer!**
> Estimates provided by the Qualification tool are based on the currently supported "_SparkPlan_" or "_Executor Nodes_"
> used in the application. It currently does not handle all the expressions or datatypes used.
> Please refer to "[Understanding Execs report](#execs-report)" section and the
-> "[Supported Operators](./supported_ops.md)" guide to check the types and expressions you are using are supported.
+> "[Supported Operators](https://github.com/NVIDIA/spark-rapids/blob/main/docs/supported_ops.md)" guide to check the types and expressions you are using are supported.
This document covers below topics:
@@ -36,6 +49,49 @@ Spark event logs after the application(s) have run, the second is to be integrat
application using explicit API calls, and the third is to install a Spark listener which can output
results on a per SQL query basis.
+When running the qualification tool standalone on Spark event logs, it can be run as a user tool command
+via a [pip package](https://pypi.org/project/spark-rapids-user-tools/) for CSP environments (Google Dataproc,
+AWS EMR, Databricks AWS) or as a Java application for other environments.
+
+## Running the Qualification tool standalone for CSP environments on Spark event logs
+### User Tools Prerequisites and Setup for CSP environments
+
+* [Dataproc](https://github.com/NVIDIA/spark-rapids-tools/blob/main/user_tools/docs/user-tools-dataproc.md)
+* [EMR](https://github.com/NVIDIA/spark-rapids-tools/blob/main/user_tools/docs/user-tools-aws-emr.md)
+* [Databricks AWS](https://github.com/NVIDIA/spark-rapids-tools/blob/main/user_tools/docs/user-tools-databricks-aws.md)
+
+### Qualify CPU Workloads for Potential Cost Savings and Acceleration with GPUs
+
+The qualification tool will run against logs from your CSP environment and output the applications
+recommended for acceleration along with estimated speed-up and cost-saving metrics.
+
+Usage: `spark_rapids_user_tools <CSP> qualification --cpu_cluster <CLUSTER> --eventlogs <EVENTLOGS-PATH>`
+
+The supported CSPs are *dataproc*, *emr*, and *databricks-aws*. The EVENTLOGS-PATH should be the storage location
+for your eventlogs. For Dataproc, it should be set to the GCS path. For EMR and Databricks-AWS, it should be set to
+the S3 path. The CLUSTER can be a live cluster or a configuration file representing the cluster instances and size.
+More details are in the documentation links above for each CSP environment. The user tools only show recommended applications in the output.
+
+Help (to see all options available): `spark_rapids_user_tools qualification --help`
+
+Example output:
+```
++----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
+| | App Name | App ID | Recommendation | Estimated GPU | Estimated GPU | App | Estimated GPU |
+| | | | | Speedup | Duration(s) | Duration(s) | Savings(%) |
+|----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------|
+| 0 | query24 | application_1664888311321_0011 | Strongly Recommended | 3.49 | 257.18 | 897.68 | 59.70 |
+| 1 | query78 | application_1664888311321_0009 | Strongly Recommended | 3.35 | 113.89 | 382.35 | 58.10 |
+| 2 | query23 | application_1664888311321_0010 | Strongly Recommended | 3.08 | 325.77 | 1004.28 | 54.37 |
+| 3 | query64 | application_1664888311321_0008 | Strongly Recommended | 2.91 | 150.81 | 440.30 | 51.82 |
+| 4 | query50 | application_1664888311321_0003 | Recommended | 2.47 | 101.54 | 250.95 | 43.08 |
+| 5 | query16 | application_1664888311321_0005 | Recommended | 2.36 | 106.33 | 251.95 | 40.63 |
+| 6 | query38 | application_1664888311321_0004 | Recommended | 2.29 | 67.37 | 154.33 | 38.59 |
+| 7 | query87 | application_1664888311321_0006 | Recommended | 2.25 | 75.67 | 170.69 | 37.64 |
+| 8 | query51 | application_1664888311321_0002 | Recommended | 1.53 | 53.94 | 82.63 | 8.18 |
++----+------------+--------------------------------+----------------------+-----------------+-----------------+---------------+-----------------+
+```
+
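+For example, a hypothetical Dataproc invocation (the cluster name and GCS path below are
+illustrative placeholders) could look like:
+
+```bash
+spark_rapids_user_tools dataproc qualification \
+    --cpu_cluster my-cpu-cluster \
+    --eventlogs gs://my-bucket/eventlogs/
+```
+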
## Running the Qualification tool standalone on Spark event logs
### Prerequisites
@@ -55,7 +111,7 @@ more information.
The Qualification tool requires the Spark 3.x jars to be able to run but does not need an Apache Spark runtime.
If you do not already have Spark 3.x installed, you can download the Spark distribution to
any machine and include the jars in the classpath.
-- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.02.0/)
+- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.04.0/)
- [Download Apache Spark 3.x](http://spark.apache.org/downloads.html) - Spark 3.1.1 for Apache Hadoop is recommended
### Step 2 Run the Qualification tool
@@ -84,7 +140,7 @@ any machine and include the jars in the classpath.
```bash
Sample: java ${QUALIFICATION_HEAP} \
- -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
+ -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.qualification.QualificationMain /usr/logs/app-name1
```
@@ -110,7 +166,7 @@ java -cp ~/rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/*:$HADOOP_CO
RAPIDS Accelerator Qualification tool for Apache Spark
-Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
+Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.qualification.QualificationMain [options]
@@ -165,6 +221,8 @@ Usage: java -cp rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/*
--max-sql-desc-length Maximum length of the SQL description
string output with the per sql output.
Default is 100.
+ --ml-functions Report if there are any SparkML or Spark XGBoost
+ functions in the eventlog.
-n, --num-output-rows Number of output rows in the summary report.
Default is 1000.
--num-threads Number of thread to use for parallel
@@ -183,6 +241,10 @@ Usage: java -cp rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/*
It will overwrite any existing directory with
the same name.
-p, --per-sql Report at the individual SQL query level.
+ --platform Cluster platform where Spark CPU workloads were
+                                executed. Options include onprem, dataproc, emr,
+ databricks-aws, and databricks-azure.
+ Default is onprem.
-r, --report-read-schema Whether to output the read formats and
datatypes to the CSV file. This can be very
long. Default is false.
@@ -245,6 +307,14 @@ java ${QUALIFICATION_HEAP} \
com.nvidia.spark.rapids.tool.qualification.QualificationMain -f 1-newest-per-app-name /eventlogDir
```
+- Parse ML functions from the eventlog:
+
+```bash
+java ${QUALIFICATION_HEAP} \
+ -cp ~/rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*:$HADOOP_CONF_DIR/ \
+ com.nvidia.spark.rapids.tool.qualification.QualificationMain --ml-functions /eventlogDir
+```
+
Note: the “regular expression” used by `-a` option is based on
[java.util.regex.Pattern](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html).
@@ -274,6 +344,8 @@ The tree structure of the output directory `${OUTPUT_FOLDER}/rapids_4_spark_qual
├── rapids_4_spark_qualification_output_persql.csv
├── rapids_4_spark_qualification_output_execs.csv
├── rapids_4_spark_qualification_output_stages.csv
+ ├── rapids_4_spark_qualification_output_mlfunctions.csv
+ ├── rapids_4_spark_qualification_output_mlfunctions_totalduration.csv
└── ui
├── assets
│ ├── bootstrap/
@@ -308,7 +380,7 @@ to [Understanding the Qualification tool output](#understanding-the-qualificatio
- Java 8 or above, Spark 3.0.1+
### Download the tools jar
-- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.02.0/)
+- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.04.0/)
### Modify your application code to call the api's
@@ -395,7 +467,7 @@ with the Rapids Accelerator for Spark.
- Java 8 or above, Spark 3.0.1+
### Download the tools jar
-- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.02.0/)
+- Download the jar file from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.04.0/)
### Configuration
@@ -451,13 +523,14 @@ section on the file contents details.
For each processed Spark application, the Qualification tool generates two main fields to help quantify the expected
acceleration of migrating a Spark application or query to GPU.
-1. `Estimated GPU Duration`: predicted runtime of the app if it was run on GPU. It is the sum add of the accelerated
- operator durations along with durations that could not run on GPU because they are unsupported operators or not SQL/Dataframe.
-2. `Estimated Speed-up factor`: the estimated speed-up factor is simply the original CPU duration of the app divided by the
+1. `Estimated GPU Duration`: predicted runtime of the app if it was run on GPU. It is the sum of the accelerated
+ operator durations and ML function durations (if applicable), along with durations that could not run on GPU because
+ they are unsupported operators or not SQL/Dataframe.
+2. `Estimated Speed-up`: the estimated speed-up is simply the original CPU duration of the app divided by the
estimated GPU duration. That will estimate how much faster the application would run on GPU.
The lower the estimated GPU duration, the higher the "_Estimated Speed-up_".
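+
+For example, using the first row of the sample output above, an app duration of 897.68 seconds
+and an estimated GPU duration of 257.18 seconds yield an estimated speed-up of
+897.68 / 257.18 ≈ 3.49.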
-The processed applications or queries are ranked by the "_Estimated Speed-up_". Based on how high the speed-up factor,
+The processed applications or queries are ranked by the "_Estimated Speed-up_". Based on how high the estimated speed-up is,
the tool classifies the applications into the following different categories:
- `Strongly Recommended`
@@ -466,7 +539,7 @@ the tool classifies the applications into the following different categories:
- `Not Applicable`: indicates that the app has job or stage failures.
As mentioned before, the tool does not guarantee the applications or queries with the highest _recommendation_ will actually be
-accelerated the most. Please refer to [Supported Operators](./supported_ops.md) section.
+accelerated the most. Please refer to [Supported Operators](https://github.com/NVIDIA/spark-rapids/blob/main/docs/supported_ops.md) section.
In addition to the _recommendation_, the Qualification tool reports a set of metrics in tasks of SQL Dataframe operations
within the scope of: "_Entire App_"; "_Stages_"; and "_Execs_". The report is divided into three main levels. The fields
@@ -488,10 +561,10 @@ The report represents the entire app execution, including unsupported operators
4. _App Duration_: wall-Clock time measured since the application starts till it is completed.
If an app is not completed an estimated completion time would be computed.
5. _SQL DF duration_: wall-Clock time duration that includes only SQL-Dataframe queries.
-6. _GPU Opportunity_: wall-Clock time that shows how much of the SQL duration can be accelerated on the GPU.
+6. _GPU Opportunity_: wall-Clock time that shows how much of the SQL duration and ML functions (if applicable) can be accelerated on the GPU.
7. _Estimated GPU Duration_: predicted runtime of the app if it was run on GPU. It is the sum of the accelerated
- operator durations along with durations that could not run on GPU because they are unsupported operators or not SQL/Dataframe.
-8. _Estimated GPU Speed-up_: the speed-up factor is simply the original CPU duration of the app divided by the
+ operator durations and ML function durations (if applicable) along with durations that could not run on GPU because they are unsupported operators or not SQL/Dataframe.
+8. _Estimated GPU Speed-up_: the speed-up is simply the original CPU duration of the app divided by the
estimated GPU duration. That will estimate how much faster the application would run on GPU.
9. _Estimated GPU Time Saved_: estimated wall-Clock time saved if it was run on the GPU.
10. _SQL Dataframe Task Duration_: amount of time spent in tasks of SQL Dataframe operations.
@@ -528,7 +601,7 @@ The report represents the entire app execution, including unsupported operators
is passed to the CLI.
**Note:** the Qualification tool won't catch all UDFs, and some of the UDFs can be handled with additional steps.
-Please refer to [Supported Operators](./supported_ops.md) for more details on UDF.
+Please refer to [Supported Operators](https://github.com/NVIDIA/spark-rapids/blob/main/docs/supported_ops.md) for more details on UDF.
By default, the applications and queries are sorted in descending order by the following fields:
- _Recommendation_;
@@ -545,13 +618,13 @@ For each stage used in SQL operations, the Qualification tool generates the foll
3. _Average Speedup Factor_: the average estimated speed-up of all the operators in the given stage.
4. _Stage Task Duration_: amount of time spent in tasks of SQL Dataframe operations for the given stage.
5. _Unsupported Task Duration_: sum of task durations for the unsupported operators. For more details,
- see [Supported Operators](./supported_ops.md).
+ see [Supported Operators](https://github.com/NVIDIA/spark-rapids/blob/main/docs/supported_ops.md).
6. _Stage Estimated_: True or False indicates if we had to estimate the stage duration.
### Execs report
The Qualification tool generates a report of the "Exec" in the "_SparkPlan_" or "_Executor Nodes_" along with the estimated
-acceleration on the GPU. Please refer to the [Supported Operators](./supported_ops.md) guide for more
+acceleration on the GPU. Please refer to the [Supported Operators](https://github.com/NVIDIA/spark-rapids/blob/main/docs/supported_ops.md) guide for more
details on limitations on UDFs and unsupported operators.
1. _App ID_
@@ -564,7 +637,7 @@ details on limitations on UDFs and unsupported operators.
6. _Exec Duration_: wall-Clock time measured since the operator starts till it is completed.
7. _SQL Node Id_
8. _Exec Is Supported_: whether the Exec is supported by RAPIDS or not. Please refer to the
- [Supported Operators](./supported_ops.md) section.
+ [Supported Operators](https://github.com/NVIDIA/spark-rapids/blob/main/docs/supported_ops.md) section.
9. _Exec Stages_: an array of stage IDs
10. _Exec Children_
11. _Exec Children Node Ids_
@@ -625,6 +698,24 @@ The following table lists the exec's name and the status of parsing their expres
| WindowExec | - | x | - |
| WindowInPandasExec | - | - | x |
+### MLFunctions report
+The Qualification tool generates a report if there are SparkML or Spark XGBoost functions used in the eventlog.
+The functions in "*spark.ml.*" or "*spark.XGBoost.*" packages are displayed in the report.
+
+1. _App ID_
+2. _Stage ID_
+3. _ML Functions_: List of ML functions used in the corresponding stage.
+4. _Stage Task Duration_: amount of time spent in tasks containing ML functions for the given stage.
+
+### MLFunctions total duration report
+The Qualification tool generates a report of total duration across all stages for ML functions which
+are supported on GPU.
+
+1. _App ID_
+2. _Stage Ids_: stage IDs corresponding to the given ML function.
+3. _ML Function Name_: ML function name supported on GPU.
+4. _Total Duration_: total duration across all stages for the corresponding ML function.
+
## Output Formats
The Qualification tool generates the output as CSV/log files. Starting from "_22.06_", the default
@@ -711,7 +802,7 @@ It contains the following main components:
There are three searchPanes:
1. "_Is Stage Estimated_": it splits the stages into two groups based on whether the stage duration time was estimated
or not.
- 2. "_Speed-up_": groups the stages by their "average speed-up factor". Each stage can belong to one of the following
+ 2. "_Speed-up_": groups the stages by their "average speed-up". Each stage can belong to one of the following
predefined speed-up ranges: `1.0 (No Speed-up)`; `]1.0, 1.3[`; `[1.3, 2.5[`; `[2.5, 5[`; and `[5, _]`. The
search-pane does not show a range bucket if its count is 0.
3. "_Tasks GPU Support_": this filter can be used to find stages having all their execs supported by the GPU.
@@ -724,7 +815,7 @@ It contains the following main components:
There are three _searchPanes_:
1. "_Exec_": filters the rows by exec name. This filter also allows text searching by typing into the filter-title as
a text input.
- 2. "_Speed-up_": groups the stages by their "average speed-up factor". Each stage can belong to one of the following
+ 2. "_Speed-up_": groups the stages by their "average speed-up". Each stage can belong to one of the following
predefined speed-up ranges: `1.0 (No Speed-up)`; `]1.0, 1.3[`; `[1.3, 2.5[`; `[2.5, 5[`; and `[5, _]`. The
search-pane does not show a range bucket if its count is 0.
3. "_GPU Support_": filters the execs whether an exec is supported by GPU or not.
diff --git a/docs/tuning-guide.md b/docs/tuning-guide.md
index 69b450331de..5657ee90f81 100644
--- a/docs/tuning-guide.md
+++ b/docs/tuning-guide.md
@@ -152,6 +152,13 @@ performance. Running multiple tasks concurrently on the GPU will reduce the memo
to each task as they will be sharing the GPU's total memory. As a result, some queries that fail
to run with a higher concurrent task setting may run successfully with a lower setting.
+As of the 23.04 release of the RAPIDS Accelerator for Apache Spark,
+many out-of-memory errors result in parts of the query being rolled back and retried instead
+of a task failure. When this happens it will show up in the task metrics.
+These metrics include `gpuRetryCount`, which is the number of times that a retry was attempted.
+As part of this change the normal `OutOfMemoryError` is thrown much less often; instead a `RetryOOM`
+or `SplitAndRetryOOM` exception is thrown.
+
To mitigate the out of memory errors you can often reduce the batch size, which will keep less
data active in a batch at a time, but can increase the overall runtime as less data is being
processed per batch.
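+
+For example, one hypothetical way to experiment with this from the command line (the values below
+are illustrative starting points, not recommendations) is:
+
+```shell
+spark-shell \
+  --conf spark.rapids.sql.concurrentGpuTasks=1 \
+  --conf spark.rapids.sql.batchSizeBytes=536870912
+```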
@@ -297,6 +304,8 @@ partition sizes to avoid GPU out of memory errors.
## Metrics
+### SQL
+
Custom Spark SQL Metrics are available which can help identify performance bottlenecks in a query.
| Key | Name | Description |
@@ -322,11 +331,7 @@ Custom Spark SQL Metrics are available which can help identify performance bottl
| opTime | op time | Time that an operator takes, exclusive of the time for executing or fetching results from child operators, and typically outside of the time it takes to acquire the GPU semaphore. Note: Sometimes contains CPU times, e.g.: concatTime |
| partitionSize | partition data size | Total size in bytes of output partitions. |
| peakDevMemory | peak device memory | Peak GPU memory used during execution of an operator. |
-| semaphoreWaitTime | GPU semaphore wait time | Time spent waiting for the GPU semaphore. |
-| sortTime | sort time | Time spent in sort operations in GpuSortExec and GpuTopN. |
-| spillData | bytes spilled from GPU | Total bytes spilled from GPU. |
-| spillDisk | bytes spilled to disk | Total bytes spilled from GPU to disk. |
-| spillHost | bytes spilled to host | Total bytes spilled from GPU to host memory. |
+| sortTime | sort time | Time spent in sort operations in GpuSortExec and GpuTopN. |
| streamTime | stream time | Time spent reading data from a child. This generally happens for the stream side of a hash join or for columnar to row and row to columnar operations. |
Not all metrics are enabled by default. The configuration setting `spark.rapids.sql.metrics.level` can be set
@@ -344,6 +349,32 @@ Many of the questions people really want to answer with the metrics are around h
operators take. Where is the bottleneck in my query? How much of my query is executing on the GPU?
How long does operator X take on the GPU vs the CPU?
+### Task
+
+Custom Task level accumulators are also included. These metrics are not for individual
+operators in the SQL plan, but are per task and roll up to stages in the plan. Timing metrics
+are reported in the format of HH:MM:SS.sss. Note that the spill metrics,
+including the spill-to-host-memory and spill-to-disk sizes, are not isolated to a single
+task, or even a single stage in the plan. The amount of data spilled is the amount of
+data that this particular task needed to spill in order to make room for the task to
+allocate new memory. The spill time metric is how long it took that task to spill
+that memory. It could have spilled memory associated with a different task,
+or even a different stage or job in the plan. The spill read time metric is how
+long it took to read back in the data it needed to complete the task. This does not
+correspond to the data that was spilled by this task.
+
+| Name | Description |
+|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| gpuSemaphoreWait | The time the task spent waiting on the GPU semaphore. |
+| gpuSpillBlockTime | The time that this task was blocked spilling data from the GPU. |
+| gpuSpillReadTime | The time that this task was blocked reading data to the GPU that was spilled previously. |
+| gpuRetryCount | The number of times that a retry exception was thrown in an attempt to roll back processing to free memory. |
+| gpuSplitAndRetryCount | The number of times that a split and retry exception was thrown in an attempt to roll back processing to free memory, and split the input to make more room. |
+| gpuRetryBlockTime | The amount of time that this task was blocked either hoping that other tasks will free up more memory or after a retry exception was thrown to wait until the task can go on. |
+
+The spill data sizes going to host/CPU memory and disk are the same as used by Spark task level
+metrics.
+
### Time taken on the GPU
`opTime` mainly convey the GPU time.
@@ -365,10 +396,9 @@ Some operators provide out of core algorithms, or algorithms that can process da
than can fit in GPU memory. This is often done by breaking the problem up into smaller pieces and
letting some of those pieces be moved out of GPU memory when not being worked on. Apache Spark does
similar things when processing data on the CPU. When these types of algorithms are used
-`bytes spilled from GPU` will show up as a metric to indicate how much data was transferred off of
-the GPU to either host memory or disk to make room for more data to be processed. Generally this
-spilling happens while the GPU semaphore is held, and can really slow down processing. Details
-about how much data was spilled to host memory vs spilled to disk show up in `DEBUG` mode for the
+the task level spill metrics will indicate that spilling happened. Be aware that
+the same metrics are used for both the GPU code and the original Spark CPU code. The
+GPU spills will always be timed and show up as `gpuSpillBlockTime` in the task level
metrics.
### Time taken on the CPU
@@ -461,4 +491,4 @@ column/value, `lead` or `lag`. These allow us to compute the result in approxima
For all other cases large windows, including skewed values in partition by and order by data, can
result in slow performance. If you do run into one of these situations please file an
[issue](https://github.com/NVIDIA/spark-rapids/issues/new/choose) so we can properly prioritize
-our work to support more optimizations.
\ No newline at end of file
+our work to support more optimizations.
diff --git a/integration_tests/README.md b/integration_tests/README.md
index 95f36ffc256..00932389040 100644
--- a/integration_tests/README.md
+++ b/integration_tests/README.md
@@ -105,12 +105,7 @@ For manual installation, you need to setup your environment:
You can install all the dependencies using `pip` by running the following command:
```shell script
- pip install pytest \
- sre_yield \
- pandas \
- pyarrow \
- pytest-xdist \
- findspark
+ pip install -r requirements.txt
```
### Installing Spark
@@ -255,7 +250,7 @@ individually, so you don't risk running unit tests along with the integration te
http://www.scalatest.org/user_guide/using_the_scalatest_shell
```shell
-spark-shell --jars rapids-4-spark-tests_2.12-23.02.0-tests.jar,rapids-4-spark-integration-tests_2.12-23.02.0-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
+spark-shell --jars rapids-4-spark-tests_2.12-23.04.0-tests.jar,rapids-4-spark-integration-tests_2.12-23.04.0-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
```
First you import the `scalatest_shell` and tell the tests where they can find the test files you
@@ -278,7 +273,7 @@ If you just want to verify the SQL replacement is working you will need to add t
assumes CUDA 11.0 is being used.
```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.02.0-cuda11.jar" ./runtests.py
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.04.0-cuda11.jar" ./runtests.py
```
You don't have to enable the plugin for this to work, the test framework will do that for you.
@@ -377,7 +372,7 @@ To run cudf_udf tests, need following configuration changes:
As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 11.0:
```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.02.0-cuda11.jar,rapids-4-spark-tests_2.12-23.02.0.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-23.02.0-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-23.02.0-cuda11.jar" ./runtests.py --cudf_udf
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.04.0-cuda11.jar,rapids-4-spark-tests_2.12-23.04.0.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-23.04.0-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-23.04.0-cuda11.jar" ./runtests.py --cudf_udf
```
### Enabling fuzz tests
diff --git a/integration_tests/conftest.py b/integration_tests/conftest.py
index cc9805ad0a5..13292e6bda5 100644
--- a/integration_tests/conftest.py
+++ b/integration_tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -48,3 +48,7 @@ def pytest_addoption(parser):
parser.addoption(
"--delta_lake", action="store_true", default=False, help="if true enable Delta Lake tests"
)
+ parser.addoption(
+ "--test_oom_injection_mode", action='store', default="random",
+ help="in what way, if any, should the tests inject OOMs at test time. Valid options are: random, always, or never"
+ )
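+    # Example (hypothetical usage): like the other pytest options defined in this file, this one
+    # can be passed through the integration test launcher, e.g.:
+    #   ./integration_tests/run_pyspark_from_build.sh --test_oom_injection_mode=always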
diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml
index 938f4bee4fa..812d7eeca2e 100644
--- a/integration_tests/pom.xml
+++ b/integration_tests/pom.xml
@@ -1,6 +1,6 @@
/dbfs/path/foo.sh,/dbfs/path/bar.sh
-String getInitScripts(String rootDir, String files) {
- return rootDir + '/' + files.replace(',', ',' + rootDir + '/')
-}
-
void databricksBuild() {
def CLUSTER_ID = ''
def SPARK_MAJOR = BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS.replace('.', '')
+ def dbfs_path = "$INIT_SCRIPTS_DIR-$DB_TYPE"
try {
stage("Create $SPARK_MAJOR DB") {
script {
@@ -488,7 +390,18 @@ void databricksBuild() {
sh "tar -zcf spark-rapids-ci.tgz *"
def CREATE_PARAMS = " -r $DATABRICKS_RUNTIME -w $DATABRICKS_HOST -t $DATABRICKS_TOKEN" +
" -s $DB_TYPE -n CI-${BUILD_TAG}-${BASE_SPARK_VERSION} -k \"$DATABRICKS_PUBKEY\" -i $IDLE_TIMEOUT" +
- " -d $DATABRICKS_DRIVER -o $DATABRICKS_WORKER -e $NUM_WORKERS -f $INIT_SCRIPTS"
+ " -d $DATABRICKS_DRIVER -o $DATABRICKS_WORKER -e $NUM_WORKERS"
+
+ // handle init scripts if exist
+ if (env.INIT_SCRIPTS) {
+ sh "bash -c 'dbfs mkdirs $dbfs_path'"
+ env.INIT_SCRIPTS.split(',').each {
+ sh "bash -c 'dbfs cp --overwrite jenkins/databricks/${it} $dbfs_path'"
+ }
+ // foo.sh,bar.sh --> dbfs:/path/foo.sh,dbfs:/path/bar.sh
+ CREATE_PARAMS += " -f $dbfs_path/" + env.INIT_SCRIPTS.replace(',', ",$dbfs_path/")
+ }
+
CLUSTER_ID = sh(script: "python3 ./jenkins/databricks/create.py $CREATE_PARAMS",
returnStdout: true).trim()
echo CLUSTER_ID
@@ -532,6 +445,9 @@ void databricksBuild() {
if (CLUSTER_ID) {
container('cpu') {
retry(3) {
+ if (env.INIT_SCRIPTS) {
+ sh "bash -c 'dbfs rm -r $dbfs_path'"
+ }
sh "python3 ./jenkins/databricks/shutdown.py -s $DATABRICKS_HOST -t $DATABRICKS_TOKEN -c $CLUSTER_ID -d"
}
}
diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh
index 5be253e198b..87c78ddcf3e 100755
--- a/jenkins/databricks/build.sh
+++ b/jenkins/databricks/build.sh
@@ -18,7 +18,7 @@
# This script installs dependencies required to build RAPIDS Accelerator for Apache Spark on DB.
# All the environments can be overwritten by shell variables:
# SPARKSRCTGZ: Archive file location of the plugin repository. Default is empty.
-# BASE_SPARK_VERSION: Spark version [3.1.2, 3.2.1, 3.3.0]. Default is pulled from current instance.
+# BASE_SPARK_VERSION: Spark version [3.2.1, 3.3.0]. Default is pulled from current instance.
# BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS: The version of Spark used when we install the
# Databricks jars in .m2. Default is {BASE_SPARK_VERSION}.
# MVN_OPT: Options to be passed to the MVN commands. Note that "-DskipTests" is hardcoded in the
@@ -114,7 +114,7 @@ initialize()
}
# Sets the JAR files prefixes based on the build version.
-# DB9.1 and 10.4 uses ----workspace as a prefix.
+# DB 10.4 uses ----workspace as a prefix.
# DB 11.3 uses more abbreviations (i.e., workspace becomes ws).
set_jars_prefixes()
{
@@ -124,7 +124,7 @@ set_jars_prefixes()
# get the hive prefix. something like hive-2.3
HIVE_VER_STRING=hive-$(echo ${sw_versions[HIVE_FULL]} | cut -d. -f 1,2)
- # defaults are for 3.1.2, and 3.2.1
+ # defaults are for 3.2.1
PREFIX_WS=----workspace
SPARK_MAJOR_VERSION_STRING=spark_${SPARK_MAJOR_VERSION_NUM_STRING}
PREFIX_SPARK=${PREFIX_WS}_${SPARK_MAJOR_VERSION_STRING}
@@ -186,24 +186,6 @@ set_sw_versions()
sw_versions[PARQUET]="1.12.0"
sw_versions[PROTOBUF]="2.6.1"
;;
- "3.1.2")
- sw_versions[COMMONS_LANG3]="3.10"
- sw_versions[COMMONS_IO]="2.4"
- sw_versions[DB]="9"
- sw_versions[FASTERXML_JACKSON]="2.10.0"
- sw_versions[HADOOP]="2.7"
- sw_versions[HIVE_FULL]="2.3.7"
- sw_versions[JSON4S_AST]="3.7.0-M5"
- sw_versions[JSON4S_CORE]="3.7.0-M5"
- sw_versions[ORC]="1.5.12"
- sw_versions[PARQUET]="1.10.1"
- sw_versions[HIVESTORAGE_API]="2.7.2"
- sw_versions[PROTOBUF]="2.6.1"
- sw_versions[KRYO]="4.0.2"
- sw_versions[ARROW]="2.0.0"
- sw_versions[JAVAASSIST]="3.25.0-GA"
- sw_versions[AVRO]="1.8.2"
- ;;
*) echo "Unexpected Spark version: $BASE_SPARK_VERSION"; exit 1;;
esac
}
@@ -290,17 +272,6 @@ set_dep_jars()
artifacts[LOG4JCORE]="-DgroupId=org.apache.logging.log4j -DartifactId=log4j-core"
dep_jars[LOG4JCORE]=${PREFIX_WS_SP_MVN_HADOOP}--org.apache.logging.log4j--log4j-core--org.apache.logging.log4j__log4j-core__${sw_versions[LOG4JCORE]}.jar
fi
-
- # spark-3.1.2 overrides some jar naming conventions
- if [[ $BASE_SPARK_VERSION == "3.1.2" ]]
- then
- dep_jars[HIVE]=${PREFIX_SPARK}--sql--hive--hive_${SCALA_VERSION}_deploy_shaded.jar
- dep_jars[HIVEMETASTORECLIENTPATCHED]=${PREFIX_SPARK}--patched-hive-with-glue--hive-12679-patch_deploy.jar
- dep_jars[PARQUETFORMAT]=${PREFIX_WS_SP_MVN_HADOOP}--org.apache.parquet--parquet-format--org.apache.parquet__parquet-format__2.4.0.jar
- dep_jars[AVROSPARK]=${PREFIX_SPARK}--vendor--avro--avro_${SCALA_VERSION}_deploy_shaded.jar
- dep_jars[AVROMAPRED]=${PREFIX_WS_SP_MVN_HADOOP}--org.apache.avro--avro-mapred-hadoop2--org.apache.avro__avro-mapred-hadoop2__${sw_versions[AVRO]}.jar
- dep_jars[AVRO]=${PREFIX_WS_SP_MVN_HADOOP}--org.apache.avro--avro--org.apache.avro__avro__${sw_versions[AVRO]}.jar
- fi
}
# Install dependency jars to MVN repository.
@@ -337,7 +308,7 @@ else
fi
if [[ "$WITH_BLOOP" == "1" ]]; then
- MVN_OPT="ch.epfl.scala:maven-bloop_2.13:bloopInstall $MVN_OPT"
+ MVN_OPT="ch.epfl.scala:bloop-maven-plugin:bloopInstall $MVN_OPT"
fi
# Build the RAPIDS plugin by running package command for databricks
diff --git a/jenkins/databricks/create.py b/jenkins/databricks/create.py
index 4e6dfee840e..8f6e66c8877 100644
--- a/jenkins/databricks/create.py
+++ b/jenkins/databricks/create.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@ def main():
workspace = 'https://dbc-9ff9942e-a9c4.cloud.databricks.com'
token = ''
sshkey = ''
- cluster_name = 'CI-GPU-databricks-23.02.0'
+ cluster_name = 'CI-GPU-databricks-23.04.0'
idletime = 240
runtime = '7.0.x-gpu-ml-scala2.12'
num_workers = 1
diff --git a/jenkins/databricks/cudf_udf_test.sh b/jenkins/databricks/cudf_udf_test.sh
new file mode 100644
index 00000000000..87439c358b3
--- /dev/null
+++ b/jenkins/databricks/cudf_udf_test.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+#
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script sets the environment to run cudf_udf tests of RAPIDS Accelerator for Apache Spark on DB.
+# cudf conda packages need to be installed in advance, please refer to
+# './jenkins/databricks/init_cudf_udf.sh' to install.
+# All the environments can be overwritten by shell variables:
+# LOCAL_JAR_PATH: Location of the RAPIDS jars
+# SPARK_CONF: Spark configuration parameters
+
+# Usage:
+# - Running tests on Databricks:
+#     `./jenkins/databricks/cudf_udf_test.sh`
+# To add support of a new runtime:
+# 1. Check if any more dependencies need to be added to the apt/conda install commands.
+# 2. If you had to go beyond the above steps to support the new runtime, then update the
+# instructions accordingly.
+set -ex
+
+# Map of software versions for each dependency.
+
+LOCAL_JAR_PATH=${LOCAL_JAR_PATH:-''}
+SPARK_CONF=${SPARK_CONF:-''}
+
+# Try to use "cudf-udf" conda environment for the python cudf-udf tests.
+CONDA_HOME=${CONDA_HOME:-"/databricks/conda"}
+if [ ! -d "${CONDA_HOME}/envs/cudf-udf" ]; then
+    echo "Error: cudf conda packages not found! Please refer to './jenkins/databricks/init_cudf_udf.sh' to install them."
+ exit -1
+fi
+export PATH=${CONDA_HOME}/envs/cudf-udf/bin:$PATH
+export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python
+# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
+PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
+
+# Install required packages
+sudo apt -y install zip unzip
+
+export SPARK_HOME=/databricks/spark
+# Change to not point at Databricks confs so we don't conflict with their settings.
+export SPARK_CONF_DIR=$PWD
+
+# Get the correct py4j file.
+PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
+# Set the path of python site-packages.
+PYTHON_SITE_PACKAGES="${CONDA_HOME}/envs/cudf-udf/lib/${PYTHON_VERSION}/site-packages"
+# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
+# Note that Koala is deprecated for DB10.4+ and it is recommended to use Pandas API on Spark instead.
+export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
+sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
+sudo chmod 777 /databricks/data/logs/
+sudo chmod 777 /databricks/data/logs/*
+echo { \"port\":\"15002\" } > ~/.databricks-connect
+
+CUDF_UDF_TEST_ARGS="--conf spark.python.daemon.module=rapids.daemon_databricks \
+ --conf spark.rapids.memory.gpu.minAllocFraction=0 \
+ --conf spark.rapids.memory.gpu.allocFraction=0.1 \
+ --conf spark.rapids.python.memory.gpu.allocFraction=0.1 \
+ --conf spark.rapids.python.concurrentPythonWorkers=2"
+
+## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
+if [ -n "$SPARK_CONF" ]; then
+ CONF_LIST=${SPARK_CONF//','/' '}
+ for CONF in ${CONF_LIST}; do
+ KEY=${CONF%%=*}
+ VALUE=${CONF#*=}
+ ## run_pyspark_from_build.sh requires 'export PYSP_TEST_spark_foo=1' as the spark configs
+ export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
+ done
+
+ ## 'spark.foo=1,spark.bar=2,...' to '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
+    SPARK_CONF="--conf ${SPARK_CONF//','/' --conf '}"
+fi
+
+TEST_TYPE="nightly"
+PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"
+
+# Enable event log for qualification & profiling tools testing
+export PYSP_TEST_spark_eventLog_enabled=true
+mkdir -p /tmp/spark-events
+
+if [ -d "$LOCAL_JAR_PATH" ]; then
+ ## Run cudf-udf tests.
+ CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
+ LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
+ bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
+else
+ ## Run cudf-udf tests.
+ CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
+ SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=0 \
+ bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
+fi
diff --git a/jenkins/databricks/deploy.sh b/jenkins/databricks/deploy.sh
index 437a57631cb..064bfd71bd0 100755
--- a/jenkins/databricks/deploy.sh
+++ b/jenkins/databricks/deploy.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@ echo "Maven mirror is $MVN_URM_MIRROR"
SERVER_ID='snapshots'
SERVER_URL="$URM_URL-local"
SCALA_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=scala.binary.version -DforceStdout`
-# remove the periods so change something like 3.1.1 to 311
+# remove the periods so change something like 3.2.1 to 321
VERSION_NUM=${BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS//.}
SPARK_VERSION_STR=spark$VERSION_NUM
SPARK_PLUGIN_JAR_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=project.version -DforceStdout`
diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh
index 254cf0540b6..191b9a9c33d 100755
--- a/jenkins/databricks/init_cudf_udf.sh
+++ b/jenkins/databricks/init_cudf_udf.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -18,25 +18,30 @@
# The initscript to set up environment for the cudf_udf tests on Databricks
# Will be automatically pushed into the dbfs:/databricks/init_scripts once it is updated.
-set -x
+set -ex
-CUDF_VER=${CUDF_VER:-23.02}
+CUDF_VER=${CUDF_VER:-23.04}
CUDA_VER=${CUDA_VER:-11.0}
# Need to explicitly add conda into PATH environment, to activate conda environment.
export PATH=/databricks/conda/bin:$PATH
# Set Python for the running instance
+export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("{}.{}".format(sys.version_info.major, sys.version_info.minor))')
+# cudf 23.02+ does not support python 3.9. ref: https://docs.rapids.ai/notices/rsn0022/
+[[ "$PYTHON_VERSION" == '3.9' ]] && PYTHON_VERSION='3.8'
base=$(conda info --base)
# Create and activate 'cudf-udf' conda env for cudf-udf tests
+sudo chmod a+w ${base}/envs && conda config --add envs_dirs ${base}/envs
conda create -y -n cudf-udf -c conda-forge python=$PYTHON_VERSION mamba && \
source activate && \
conda activate cudf-udf
# Use mamba to install cudf-udf packages to speed up conda resolve time
conda install -y -c conda-forge mamba python=$PYTHON_VERSION
-${base}/envs/cudf-udf/bin/mamba remove -y c-ares zstd libprotobuf pandas
+# Do not error out "This operation will remove conda without replacing it with another version of conda." for now
+${base}/envs/cudf-udf/bin/mamba remove -y c-ares zstd libprotobuf pandas || true
REQUIRED_PACKAGES=(
cudatoolkit=$CUDA_VER
@@ -55,4 +60,4 @@ ${base}/envs/cudf-udf/bin/mamba install -y \
-c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults \
"${REQUIRED_PACKAGES[@]}"
-source deactivate && conda deactivate
\ No newline at end of file
+source deactivate && conda deactivate
diff --git a/jenkins/databricks/params.py b/jenkins/databricks/params.py
index c97fe9ede57..22a36fdf7c8 100644
--- a/jenkins/databricks/params.py
+++ b/jenkins/databricks/params.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -23,7 +23,7 @@
script_dest = '/home/ubuntu/build.sh'
source_tgz = 'spark-rapids-ci.tgz'
tgz_dest = '/home/ubuntu/spark-rapids-ci.tgz'
-base_spark_pom_version = '3.1.1'
+base_spark_pom_version = '3.2.1'
base_spark_version_to_install_databricks_jars = base_spark_pom_version
clusterid = ''
# can take comma seperated maven options, e.g., -Pfoo=1,-Dbar=2,...
diff --git a/jenkins/databricks/run_it.sh b/jenkins/databricks/run_it.sh
index 620c2ecdf57..c3f34e7dff7 100755
--- a/jenkins/databricks/run_it.sh
+++ b/jenkins/databricks/run_it.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -33,29 +33,19 @@ if [[ -z "$SPARK_HOME" ]]; then
fi
SCALA_BINARY_VER=${SCALA_BINARY_VER:-'2.12'}
-CONDA_HOME=${CONDA_HOME:-"/databricks/conda"}
-
-# Try to use "cudf-udf" conda environment for the python cudf-udf tests.
-if [ -d "${CONDA_HOME}/envs/cudf-udf" ]; then
- export PATH=${CONDA_HOME}/envs/cudf-udf/bin:${CONDA_HOME}/bin:$PATH
- export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python
-fi
+# Set PYSPARK_PYTHON to keep the driver and worker python versions consistent.
+export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
-python_version=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
-
-# override incompatible versions between databricks and cudf
-if [ -d "${CONDA_HOME}/envs/cudf-udf" ]; then
- PATCH_PACKAGES_PATH="$PWD/package-overrides/${python_version}"
-fi
+PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
+# Set the path of the python site-packages; packages are installed there by 'jenkins/databricks/setup.sh'.
+PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"
# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
-# Set the path of python site-packages
-PYTHON_SITE_PACKAGES=/databricks/python3/lib/${python_version}/site-packages
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koalas is deprecated for DB10.4+ and it is recommended to use Pandas API on Spark instead.
-export PYTHONPATH=$PATCH_PACKAGES_PATH:$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
+export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
# Disable parallel test as multiple tests would be executed by leveraging external parallelism, e.g. Jenkins parallelism
export TEST_PARALLEL=${TEST_PARALLEL:-0}
@@ -73,7 +63,7 @@ if [[ "$TEST_TAGS" == "iceberg" ]]; then
"3.3.0")
ICEBERG_VERSION=${ICEBERG_VERSION:-0.14.1}
;;
- "3.2.1" | "3.1.2")
+ "3.2.1")
ICEBERG_VERSION=${ICEBERG_VERSION:-0.13.2}
;;
*) echo "Unexpected Spark version: $SPARK_VER"; exit 1;;
@@ -94,5 +84,14 @@ if [[ -n "$LOCAL_JAR_PATH" ]]; then
export LOCAL_JAR_PATH=$LOCAL_JAR_PATH
fi
+set +e
# Run integration testing
./integration_tests/run_pyspark_from_build.sh --runtime_env='databricks' --test_type=$TEST_TYPE
+ret=$?
+set -e
+if [ "$ret" = 5 ]; then
+ # avoid exiting the script with code 5 when all cases in a specific test are skipped
+ echo "Suppress Exit code 5: No tests were collected"
+ exit 0
+fi
+exit "$ret"
diff --git a/jenkins/databricks/setup.sh b/jenkins/databricks/setup.sh
index 274181055b3..a1a9d03c900 100755
--- a/jenkins/databricks/setup.sh
+++ b/jenkins/databricks/setup.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -36,32 +36,17 @@ if [ -f $SPARK_HOME/conf/spark-env.sh ]; then
sudo chmod 777 `echo $local_dir | xargs`
fi
-CONDA_HOME=${CONDA_HOME:-"/databricks/conda"}
-
-# Try to use "cudf-udf" conda environment for the python cudf-udf tests.
-if [ -d "${CONDA_HOME}/envs/cudf-udf" ]; then
- export PATH=${CONDA_HOME}/envs/cudf-udf/bin:${CONDA_HOME}/bin:$PATH
- export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python
+# Set PYSPARK_PYTHON to keep the driver and worker python versions consistent.
+export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
+# Install pip if this python does not already have it.
+if [ -z "$($PYSPARK_PYTHON -m pip --version || true)" ]; then
+ curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+ $PYSPARK_PYTHON get-pip.py && rm get-pip.py
fi
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
-python_version=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
-
-# override incompatible versions between databricks and cudf
-if [ -d "${CONDA_HOME}/envs/cudf-udf" ]; then
- CONDA_SITE_PATH="${CONDA_HOME}/envs/cudf-udf/lib/${python_version}/site-packages"
- PATCH_PACKAGES_PATH="$PWD/package-overrides/${python_version}"
- mkdir -p ${PATCH_PACKAGES_PATH}
- TO_PATCH=(
- google
- llvmlite
- numba
- numpy
- pyarrow
- )
-
- echo creating symlinks to override conflicting packages
- for p in "${TO_PATCH[@]}"; do
- ln -f -s ${CONDA_SITE_PATH}/${p} ${PATCH_PACKAGES_PATH}
- done
-fi
+PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
+# Set the path of python site-packages, and install packages here.
+PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"
+# Use "python -m pip install" to make sure pip matches with python.
+$PYSPARK_PYTHON -m pip install --target $PYTHON_SITE_PACKAGES pytest sre_yield requests pandas pyarrow findspark pytest-xdist pytest-order
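
The new setup flow drops the Databricks conda environment entirely: it bootstraps pip for whatever interpreter PYSPARK_PYTHON points at and installs the test dependencies into a user-writable directory that run_it.sh later puts on PYTHONPATH. A condensed sketch of that pattern follows; the package list is trimmed for brevity and the PYTHONPATH export stands in for the fuller path assembled by run_it.sh.

```bash
#!/bin/bash
set -ex

# Use the same interpreter for the driver and the workers
export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}

# Bootstrap pip only when the interpreter does not already ship it
if [ -z "$($PYSPARK_PYTHON -m pip --version || true)" ]; then
    curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
    $PYSPARK_PYTHON get-pip.py && rm get-pip.py
fi

# Install test dependencies into a user-writable target directory...
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"
$PYSPARK_PYTHON -m pip install --target "$PYTHON_SITE_PACKAGES" pytest pandas pyarrow

# ...and expose it to the interpreter at run time
export PYTHONPATH="$PYTHON_SITE_PACKAGES:$PYTHONPATH"
```
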
diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh
index b5dc2bbaee7..c8a8cff7633 100755
--- a/jenkins/databricks/test.sh
+++ b/jenkins/databricks/test.sh
@@ -19,7 +19,7 @@
# All the environments can be overwritten by shell variables:
# LOCAL_JAR_PATH: Location of the RAPIDS jars
# SPARK_CONF: Spark configuration parameters
-# BASE_SPARK_VERSION: Spark version [3.1.2, 3.2.1, 3.3.0]. Default is pulled from current instance.
+# BASE_SPARK_VERSION: Spark version [3.2.1, 3.3.0]. Default is pulled from current instance.
# SHUFFLE_SPARK_SHIM: Set the default value for the shuffle shim. For databricks versions, append
# db. Example: spark330 => spark330db
# ICEBERG_VERSION: The iceberg version. To find the list of supported ICEBERG versions,
@@ -27,7 +27,6 @@
# SCALA_BINARY_VER: Scala version of the provided binaries. Default is 2.12.
# TEST_MODE: Can be one of the following (`DEFAULT` is the default value):
# - DEFAULT: all tests except cudf_udf tests
-# - CUDF_UDF_ONLY: cudf_udf tests only, requires extra conda cudf-py lib
# - ICEBERG_ONLY: iceberg tests only
# - DELTA_LAKE_ONLY: delta_lake tests only
# - MULTITHREADED_SHUFFLE: shuffle tests only
@@ -59,33 +58,20 @@ SCALA_BINARY_VER=${SCALA_BINARY_VER:-'2.12'}
# install required packages
sudo apt -y install zip unzip
-# Try to use "cudf-udf" conda environment for the python cudf-udf tests.
-if [ -d "${CONDA_HOME}/envs/cudf-udf" ]; then
- export PATH=${CONDA_HOME}/envs/cudf-udf/bin:${CONDA_HOME}/bin:$PATH
- export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python
+# Set PYSPARK_PYTHON to keep the driver and worker python versions consistent.
+export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
+# Install pip if this python does not already have it.
+if [ -z "$($PYSPARK_PYTHON -m pip --version || true)" ]; then
+ curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+ $PYSPARK_PYTHON get-pip.py && rm get-pip.py
fi
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
-sw_versions[PYTHON]=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
-
-# override incompatible versions between databricks and cudf
-if [ -d "${CONDA_HOME}/envs/cudf-udf" ]; then
- CONDA_SITE_PATH="${CONDA_HOME}/envs/cudf-udf/lib/${sw_versions[PYTHON]}/site-packages"
- PATCH_PACKAGES_PATH="$PWD/package-overrides/${sw_versions[PYTHON]}"
- mkdir -p ${PATCH_PACKAGES_PATH}
- TO_PATCH=(
- google
- llvmlite
- numba
- numpy
- pyarrow
- )
-
- echo creating symlinks to override conflicting packages
- for p in "${TO_PATCH[@]}"; do
- ln -f -s ${CONDA_SITE_PATH}/${p} ${PATCH_PACKAGES_PATH}
- done
-fi
+PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
+# Set the path of python site-packages, and install packages here.
+PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"
+# Use "python -m pip install" to make sure pip matches with python.
+$PYSPARK_PYTHON -m pip install --target $PYTHON_SITE_PACKAGES pytest sre_yield requests pandas pyarrow findspark pytest-xdist pytest-order
export SPARK_HOME=/databricks/spark
# change to not point at databricks confs so we don't conflict with their settings
@@ -100,32 +86,20 @@ case "$BASE_SPARK_VERSION" in
# Available versions https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/
sw_versions[ICEBERG]=${ICEBERG_VERSION:-'0.13.2'}
;;
- "3.1.2")
- # Available versions https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/
- sw_versions[ICEBERG]=${ICEBERG_VERSION:-'0.13.2'}
- ;;
*) echo "Unexpected Spark version: $BASE_SPARK_VERSION"; exit 1;;
esac
# Set the iceberg_spark to something like 3.3 for DB11.3, 3.2 for DB10.4
sw_versions[ICEBERG_SPARK]=$(echo $BASE_SPARK_VERSION | cut -d. -f1,2)
# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
-# Set the path of python site-packages
-PYTHON_SITE_PACKAGES=/databricks/python3/lib/${sw_versions[PYTHON]}/site-packages
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koalas is deprecated for DB10.4+ and it is recommended to use Pandas API on Spark instead.
-export PYTHONPATH=$PATCH_PACKAGES_PATH:$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
+export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
sudo chmod 777 /databricks/data/logs/
sudo chmod 777 /databricks/data/logs/*
echo { \"port\":\"15002\" } > ~/.databricks-connect
-CUDF_UDF_TEST_ARGS="--conf spark.python.daemon.module=rapids.daemon_databricks \
- --conf spark.rapids.memory.gpu.minAllocFraction=0 \
- --conf spark.rapids.memory.gpu.allocFraction=0.1 \
- --conf spark.rapids.python.memory.gpu.allocFraction=0.1 \
- --conf spark.rapids.python.concurrentPythonWorkers=2"
-
## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
if [ -n "$SPARK_CONF" ]; then
CONF_LIST=${SPARK_CONF//','/' '}
@@ -140,13 +114,12 @@ if [ -n "$SPARK_CONF" ]; then
SPARK_CONF="--conf ${SPARK_CONF/','/' --conf '}"
fi
-IS_SPARK_311_OR_LATER=0
-[[ "$(printf '%s\n' "3.1.1" "$BASE_SPARK_VERSION" | sort -V | head -n1)" = "3.1.1" ]] && IS_SPARK_311_OR_LATER=1
+IS_SPARK_321_OR_LATER=0
+[[ "$(printf '%s\n' "3.2.1" "$BASE_SPARK_VERSION" | sort -V | head -n1)" = "3.2.1" ]] && IS_SPARK_321_OR_LATER=1
# TEST_MODE
# - DEFAULT: all tests except cudf_udf tests
-# - CUDF_UDF_ONLY: cudf_udf tests only, requires extra conda cudf-py lib
# - ICEBERG_ONLY: iceberg tests only
# - DELTA_LAKE_ONLY: delta_lake tests only
# - MULTITHREADED_SHUFFLE: shuffle tests only
@@ -190,19 +163,12 @@ if [ -d "$LOCAL_JAR_PATH" ]; then
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE
## Run cache tests
- if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
+ if [[ "$IS_SPARK_321_OR_LATER" -eq "1" ]]; then
PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
fi
fi
- if [[ "$TEST_MODE" == "CUDF_UDF_ONLY" ]]; then
- ## Run cudf-udf tests
- CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
- LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
- bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
- fi
-
if [[ "$TEST_MODE" == "DEFAULT" || "$TEST_MODE" == "ICEBERG_ONLY" ]]; then
## Run Iceberg tests
LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS="$SPARK_CONF $ICEBERG_CONFS" TEST_PARALLEL=1 \
@@ -214,19 +180,12 @@ else
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE
## Run cache tests
- if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
+ if [[ "$IS_SPARK_321_OR_LATER" -eq "1" ]]; then
PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
fi
fi
- if [[ "$TEST_MODE" == "CUDF_UDF_ONLY" ]]; then
- ## Run cudf-udf tests
- CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
- SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=0 \
- bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
- fi
-
if [[ "$TEST_MODE" == "DEFAULT" || "$TEST_MODE" == "ICEBERG_ONLY" ]]; then
## Run Iceberg tests
SPARK_SUBMIT_FLAGS="$SPARK_CONF $ICEBERG_CONFS" TEST_PARALLEL=1 \
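
The IS_SPARK_321_OR_LATER switch introduced above relies on `sort -V`, which orders version strings numerically: if the minimum version sorts first, the version under test is at least that minimum. A small self-contained sketch of the same check, with an illustrative default for BASE_SPARK_VERSION:

```bash
#!/bin/bash
# Returns success when $2 is greater than or equal to $1 in version order
version_at_least() {
    local min="$1" actual="$2"
    [[ "$(printf '%s\n' "$min" "$actual" | sort -V | head -n1)" == "$min" ]]
}

BASE_SPARK_VERSION=${BASE_SPARK_VERSION:-"3.3.0"}   # illustrative default
IS_SPARK_321_OR_LATER=0
version_at_least "3.2.1" "$BASE_SPARK_VERSION" && IS_SPARK_321_OR_LATER=1
echo "IS_SPARK_321_OR_LATER=$IS_SPARK_321_OR_LATER"
```
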
diff --git a/jenkins/deploy.sh b/jenkins/deploy.sh
index 10ce355718c..d6511eec9db 100755
--- a/jenkins/deploy.sh
+++ b/jenkins/deploy.sh
@@ -36,17 +36,20 @@ set -ex
SIGN_FILE=${1:-"false"}
DIST_PL=${DIST_PL:-"dist"}
-SQL_PL=${SQL_PL:-"sql-plugin"}
-POM_FILE=${POM_FILE:-`find "$DIST_PL/target/extra-resources/" -name pom.xml`}
-OUT_PATH=${OUT_PATH:-"$DIST_PL/target"}
-SIGN_TOOL=${SIGN_TOOL:-"gpg"}
-MVN_SETTINGS=${MVN_SETTINGS:-"jenkins/settings.xml"}
-MVN="mvn -B -Dmaven.wagon.http.retryHandler.count=3 -DretryFailedDeploymentCount=3 -s $MVN_SETTINGS"
###### Build the path of jar(s) to be deployed ######
+MVN_SETTINGS=${MVN_SETTINGS:-"jenkins/settings.xml"}
+MVN="mvn -B -Dmaven.wagon.http.retryHandler.count=3 -DretryFailedDeploymentCount=3 -s $MVN_SETTINGS"
ART_ID=`$MVN help:evaluate -q -pl $DIST_PL -Dexpression=project.artifactId -DforceStdout`
+ART_GROUP_ID=`$MVN help:evaluate -q -pl $DIST_PL -Dexpression=project.groupId -DforceStdout`
ART_VER=`$MVN help:evaluate -q -f $DIST_PL -Dexpression=project.version -DforceStdout`
CUDA_CLASSIFIER=`mvn help:evaluate -q -pl $DIST_PL -Dexpression=cuda.version -DforceStdout`
+
+SQL_PL=${SQL_PL:-"sql-plugin"}
+POM_FILE=${POM_FILE:-"$DIST_PL/target/parallel-world/META-INF/maven/${ART_GROUP_ID}/${ART_ID}/pom.xml"}
+OUT_PATH=${OUT_PATH:-"$DIST_PL/target"}
+SIGN_TOOL=${SIGN_TOOL:-"gpg"}
+
FPATH="$OUT_PATH/$ART_ID-$ART_VER"
cp $FPATH-$CUDA_CLASSIFIER.jar $FPATH.jar
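
deploy.sh now derives every coordinate it needs, including the group id, from the build itself via `mvn help:evaluate`, and points POM_FILE at the reduced pom that the dist build places under target/parallel-world. The sketch below shows that lookup pattern; it assumes it is run from the repository root with Maven configured, and omits the settings-file and retry flags used by the real script.

```bash
#!/bin/bash
set -e

DIST_PL=${DIST_PL:-"dist"}
MVN="mvn -B"

# -q -DforceStdout makes help:evaluate print only the evaluated expression
ART_ID=$($MVN help:evaluate -q -pl $DIST_PL -Dexpression=project.artifactId -DforceStdout)
ART_GROUP_ID=$($MVN help:evaluate -q -pl $DIST_PL -Dexpression=project.groupId -DforceStdout)
ART_VER=$($MVN help:evaluate -q -pl $DIST_PL -Dexpression=project.version -DforceStdout)
CUDA_CLASSIFIER=$($MVN help:evaluate -q -pl $DIST_PL -Dexpression=cuda.version -DforceStdout)

# The pom deployed with the dist jar now lives in the parallel-world layout
POM_FILE="$DIST_PL/target/parallel-world/META-INF/maven/${ART_GROUP_ID}/${ART_ID}/pom.xml"
echo "Deploying $ART_GROUP_ID:$ART_ID:$ART_VER ($CUDA_CLASSIFIER) using $POM_FILE"
```
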
diff --git a/jenkins/spark-nightly-build.sh b/jenkins/spark-nightly-build.sh
index c507088194d..b058c9b9746 100755
--- a/jenkins/spark-nightly-build.sh
+++ b/jenkins/spark-nightly-build.sh
@@ -35,7 +35,7 @@ ART_GROUP_ID=$(mvnEval project.groupId)
ART_VER=$(mvnEval project.version)
DIST_FPATH="$DIST_PL/target/$ART_ID-$ART_VER-$CUDA_CLASSIFIER"
-DIST_POM_FPATH="$DIST_PL/target/extra-resources/META-INF/maven/$ART_GROUP_ID/$ART_ID/pom.xml"
+DIST_POM_FPATH="$DIST_PL/target/parallel-world/META-INF/maven/$ART_GROUP_ID/$ART_ID/pom.xml"
DIST_PROFILE_OPT=-Dincluded_buildvers=$(IFS=,; echo "${SPARK_SHIM_VERSIONS[*]}")
DIST_INCLUDES_DATABRICKS=${DIST_INCLUDES_DATABRICKS:-"true"}
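
The DIST_PROFILE_OPT line above uses a compact bash idiom: setting IFS inside the command substitution's subshell makes `"${array[*]}"` expand comma-separated, so the whole shim list becomes one `-Dincluded_buildvers` property. A sketch with an illustrative version list; the commented mvn invocation is hypothetical, not the exact nightly command.

```bash
#!/bin/bash
# Illustrative shim list; the real one comes from jenkins/version-def.sh
SPARK_SHIM_VERSIONS=(311 320 321 330 331)

# Join the array with commas without touching the caller's IFS
DIST_PROFILE_OPT=-Dincluded_buildvers=$(IFS=,; echo "${SPARK_SHIM_VERSIONS[*]}")
echo "$DIST_PROFILE_OPT"   # -Dincluded_buildvers=311,320,321,330,331

# Hypothetical use: build the dist module with exactly those shims included
# mvn -B install -pl dist -am "$DIST_PROFILE_OPT" -DskipTests
```
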
diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh
index 6d9e8548ce0..083ede715a5 100755
--- a/jenkins/spark-premerge-build.sh
+++ b/jenkins/spark-premerge-build.sh
@@ -99,6 +99,13 @@ rapids_shuffle_smoke_test() {
$SPARK_HOME/sbin/spark-daemon.sh start org.apache.spark.deploy.worker.Worker 1 $SPARK_MASTER
invoke_shuffle_integration_test() {
+ # check out what else is on the GPU
+ nvidia-smi
+
+ # Because the RapidsShuffleManager smoke tests run against a standalone cluster,
+ # we do not want the integration tests to launch N different applications;
+ # a single application is what is expected.
+ TEST_PARALLEL=0 \
PYSP_TEST_spark_master=$SPARK_MASTER \
PYSP_TEST_spark_cores_max=2 \
PYSP_TEST_spark_executor_cores=1 \
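
The TEST_PARALLEL=0 addition keeps pytest-xdist from spawning one Spark application per worker, so the RapidsShuffleManager smoke test runs as a single application against the standalone cluster started just above. A condensed sketch of that invocation follows; the master URL and the `-k` filter are illustrative assumptions, not the exact values used by the premerge script.

```bash
#!/bin/bash
set -e

# Assume a standalone master/worker pair is already running
SPARK_MASTER=${SPARK_MASTER:-"spark://$(hostname):7077"}

# Check what else is currently using the GPU before starting
nvidia-smi

# One application only: TEST_PARALLEL=0 disables pytest-xdist parallelism
TEST_PARALLEL=0 \
PYSP_TEST_spark_master=$SPARK_MASTER \
PYSP_TEST_spark_cores_max=2 \
PYSP_TEST_spark_executor_cores=1 \
    ./integration_tests/run_pyspark_from_build.sh -k 'shuffle'
```
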
diff --git a/jenkins/version-def.sh b/jenkins/version-def.sh
index 0ac39ca96e0..9cd5524bff1 100755
--- a/jenkins/version-def.sh
+++ b/jenkins/version-def.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -26,10 +26,10 @@ for VAR in $OVERWRITE_PARAMS; do
done
IFS=$PRE_IFS
-CUDF_VER=${CUDF_VER:-"23.02.0"}
+CUDF_VER=${CUDF_VER:-"23.04.0"}
CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda11"}
-PROJECT_VER=${PROJECT_VER:-"23.02.0"}
-PROJECT_TEST_VER=${PROJECT_TEST_VER:-"23.02.0"}
+PROJECT_VER=${PROJECT_VER:-"23.04.0"}
+PROJECT_TEST_VER=${PROJECT_TEST_VER:-"23.04.0"}
SPARK_VER=${SPARK_VER:-"3.1.1"}
# Make a best attempt to set the default value for the shuffle shim.
# Note that SPARK_VER for non-Apache Spark flavors (i.e. databricks,
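
version-def.sh centralizes the CI version defaults with the `${VAR:-default}` idiom so any of them can be overridden from the job environment, and the loop over OVERWRITE_PARAMS shown above lets a single delimited string override several at once. Below is a minimal sketch of that pattern; the comma delimiter and the eval-based export are assumptions made for illustration, not a copy of the script's exact mechanics.

```bash
#!/bin/bash
# e.g. OVERWRITE_PARAMS="CUDF_VER=23.04.0,CUDA_CLASSIFIER=cuda11"
OVERWRITE_PARAMS=${OVERWRITE_PARAMS:-""}

PRE_IFS=$IFS
IFS=","
for VAR in $OVERWRITE_PARAMS; do
    eval "export $VAR"      # assumption: each entry is KEY=VALUE
done
IFS=$PRE_IFS

# Anything not overridden falls back to the release defaults
CUDF_VER=${CUDF_VER:-"23.04.0"}
CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda11"}
PROJECT_VER=${PROJECT_VER:-"23.04.0"}
SPARK_VER=${SPARK_VER:-"3.1.1"}
echo "CUDF_VER=$CUDF_VER CUDA_CLASSIFIER=$CUDA_CLASSIFIER PROJECT_VER=$PROJECT_VER SPARK_VER=$SPARK_VER"
```
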
diff --git a/pom.xml b/pom.xml
index 9688c7acbf9..296692d64f3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -23,7 +23,7 @@
    <artifactId>rapids-4-spark-parent</artifactId>
    <name>RAPIDS Accelerator for Apache Spark Root Project</name>
    <description>The root project of the RAPIDS Accelerator for Apache Spark</description>
-   <version>23.02.0</version>
+   <version>23.04.0</version>
    <packaging>pom</packaging>
    <url>https://nvidia.github.io/spark-rapids/</url>
@@ -99,46 +99,6 @@
aggregator
-
-
- release312db
-
-
- buildver
- 312db
-
-
-
- 312db
-
- 3.4.4
- spark312db
-
- ${spark312db.version}
- ${spark312db.version}
- 2.7.4
- true
- 1.10.1
- ${spark312db.sources}
- ${spark312db.test.sources}
-
-
-
- delta-lake/delta-stub
- dist
- integration_tests
- shuffle-plugin
- sql-plugin
- tests
- udf-compiler
- aggregator
-
- release312
@@ -195,34 +155,6 @@
api_validation
-
- release314
-
-
- buildver
- 314
-
-
-
- 314
- ${spark314.version}
- ${spark314.version}
- 1.10.1
- ${spark314.sources}
- ${spark314.test.sources}
-
-
- delta-lake/delta-stub
- dist
- integration_tests
- shuffle-plugin
- sql-plugin
- tests
- udf-compiler
- aggregator
- api_validation
-
- release320
@@ -654,7 +586,8 @@
1.10.1spark${buildver}cuda11
- 23.02.0
+ 23.04.0
+ 23.04.02.122.8.0incremental
@@ -686,9 +619,7 @@
please update the snapshot-shims profile as well so it is accurate -->
3.1.13.1.2
- 3.1.2-databricks3.1.3
- 3.1.4-SNAPSHOT3.2.03.2.13.2.1.3.2.7171000.0-3
@@ -697,7 +628,7 @@
3.2.33.3.03.3.1
- 3.3.2-SNAPSHOT
+ 3.3.23.4.0-SNAPSHOT3.3.0.3.3.7180.0-2743.3.0-databricks
@@ -725,6 +656,8 @@
with the ones deployed to a remote Maven repo
-->
false
+
+ ${project.basedir}/target/${spark.version.classifier}/generated/src
311,
312,
@@ -736,13 +669,12 @@
323,
330,
331,
+ 332,
330cdh
- 332
- 312db,
321db,
330db
@@ -772,6 +704,8 @@
${databricks.buildvers},
340
+ ${all.buildvers}
+ main
@@ -907,6 +841,8 @@
true
+
+
-
-
-
-
-
-
-
-
-
-
-
@@ -1177,11 +1096,6 @@
-
-
-
-
-
@@ -1202,6 +1116,19 @@
+
+ shimplify-shim-sources
+ run
+ generate-sources
+
+
+
+
+
+
+
+
+ generate-build-infogenerate-resources
@@ -1233,6 +1160,21 @@
run
+
+ duplicate-code-detector
+
+ run
+
+ none
+
+
+
+
+
+
+
+
+
@@ -1245,6 +1187,16 @@
          <artifactId>ant-contrib</artifactId>
          <version>1.0b3</version>
        </dependency>
+       <dependency>
+         <groupId>org.python</groupId>
+         <artifactId>jython-standalone</artifactId>
+         <version>2.7.2</version>
+       </dependency>
+       <dependency>
+         <groupId>net.sourceforge.pmd</groupId>
+         <artifactId>pmd-dist</artifactId>
+         <version>6.55.0</version>
+       </dependency>
@@ -1331,6 +1283,7 @@
-Xmx1024m${scala.javac.args}
+ ${spark.rapids.source.basedir}/target/${spark.version.classifier}/.sbt/1.0/zinc/org.scala-sbt
@@ -1460,7 +1413,6 @@
-
org.apache.maven.pluginsmaven-antrun-plugin
@@ -1538,6 +1490,8 @@
org.codehaus.mojobuild-helper-maven-plugin
+
+
add-shim-sourcesgenerate-sources
@@ -1554,6 +1508,28 @@
${spark.shim.test.sources}
+         <execution>
+           <id>add-shimple-sources</id>
+           <phase>generate-sources</phase>
+           <goals><goal>add-source</goal></goals>
+           <configuration>
+             <sources>
+               <source>${spark.shim.dest}/main/scala</source>
+               <source>${spark.shim.dest}/main/java</source>
+             </sources>
+           </configuration>
+         </execution>
+         <execution>
+           <id>add-shimple-test-sources</id>
+           <phase>generate-test-sources</phase>
+           <goals><goal>add-test-source</goal></goals>
+           <configuration>
+             <sources>
+               <source>${spark.shim.dest}/test/scala</source>
+               <source>${spark.shim.dest}/test/java</source>
+             </sources>
+           </configuration>
+         </execution>
diff --git a/scripts/generate-changelog b/scripts/generate-changelog
index 8e48dc68e4e..c6f2d5c65ef 100755
--- a/scripts/generate-changelog
+++ b/scripts/generate-changelog
@@ -44,13 +44,13 @@ Github personal access token: https://github.com/settings/tokens, and make you h
Usage:
cd spark-rapids/
- # generate changelog for releases 23.02 to 23.02
+ # generate changelog for releases 23.02 to 23.04
scripts/generate-changelog --token= \
- --releases=23.02
+ --releases=23.02,23.04
# To a separate file like /tmp/CHANGELOG.md
GITHUB_TOKEN= scripts/generate-changelog \
- --releases=23.02 \
+ --releases=23.02,23.04 \
--path=/tmp/CHANGELOG.md
"""
import os
diff --git a/shuffle-plugin/pom.xml b/shuffle-plugin/pom.xml
index 91a3cdf6e98..c5dbebaf80c 100644
--- a/shuffle-plugin/pom.xml
+++ b/shuffle-plugin/pom.xml
@@ -1,6 +1,6 @@