diff --git a/README.md b/README.md
index b627f170f6d..7171af73305 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json)
 
-|[Contributor Guide](https://github.com/NVIDIA/cccl/blob/main/CONTRIBUTING.md)|[Dev Containers](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md)|[Discord](https://discord.gg/nvidiadeveloper)|[Godbolt](https://godbolt.org/z/x4G73af9a)|[GitHub Project](https://github.com/orgs/NVIDIA/projects/6)|[libcudacxx Docs](https://nvidia.github.io/libcudacxx/)|[Thrust Docs](https://nvidia.github.io/thrust/)|[CUB Docs](https://nvlabs.github.io/cub/)| 
+|[Contributor Guide](https://github.com/NVIDIA/cccl/blob/main/CONTRIBUTING.md)|[Dev Containers](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md)|[Discord](https://discord.gg/nvidiadeveloper)|[Godbolt](https://godbolt.org/z/x4G73af9a)|[GitHub Project](https://github.com/orgs/NVIDIA/projects/6)|[libcudacxx Docs](https://nvidia.github.io/libcudacxx/)|[Thrust Docs](https://nvidia.github.io/thrust/)|[CUB Docs](https://nvlabs.github.io/cub/)|
 |-|-|-|-|-|-|-|-|
 
 # CUDA C++ Core Libraries (CCCL)
@@ -284,20 +284,22 @@ Anything not part of the public API may change at any time without warning.
 
 #### API Versioning
 
-The entirety of CCCL's public API across all components shares a common semantic version of `MAJOR.MINOR.PATCH`.
+The public APIs of all CCCL components share a unified semantic version of `MAJOR.MINOR.PATCH`.
 
 Only the most recently released version is supported.
 As a rule, features and bug fixes are not backported to previously released version or branches.
 
-For historical reasons, the library versions are encoded separately in each of Thrust/CUB/libcudacxx as follows:
+The preferred method for querying the version is to use `CCCL_[MAJOR/MINOR/PATCH_]VERSION` as described below.
+For backwards compatibility, the Thrust/CUB/libcudacxx version definitions are available and will always be consistent with `CCCL_VERSION`.
+Note that Thrust/CUB use a `MMMmmmpp` scheme whereas CCCL and libcudacxx use `MMMmmmppp`.
 
-|                        | libcudacxx                                | Thrust                        | CUB                        | Incremented when?                                                  |
-|------------------------|-------------------------------------------|-------------------------------|----------------------------|--------------------------------------------------------------------|
-| Header                 | `<cuda/std/version>`                      | `<thrust/version.h>`          | `<cub/version.h>`          | -                                                                  |
-| Major Version          | `_LIBCUDACXX_CUDA_API_VERSION_MAJOR`      | `THRUST_MAJOR_VERSION`        | `CUB_MAJOR_VERSION`        | Public API breaking changes (only at new CTK major release)        |
-| Minor Version          | `_LIBCUDACXX_CUDA_API_VERSION_MINOR`      | `THRUST_MINOR_VERSION`        | `CUB_MINOR_VERSION`        | Non-breaking feature additions                                     |
-| Patch/Subminor Version | `_LIBCUDACXX_CUDA_API_VERSION_PATCH`      | `THRUST_SUBMINOR_VERSION`     | `CUB_SUBMINOR_VERSION`     | Minor changes not covered by major/minor versions                  |
-| Concatenated Version   | `_LIBCUDACXX_CUDA_API_VERSION (MMMmmmppp)`| `THRUST_VERSION (MMMmmmpp)`   | `CUB_VERSION (MMMmmmpp)`   | -                                                                  |
+|                        | CCCL                                   | libcudacxx                                | Thrust                       | CUB                       |
+|------------------------|----------------------------------------|-------------------------------------------|------------------------------|---------------------------|
+| Header                 | `<cuda/version>`                       | `<cuda/std/version>`                      | `<thrust/version.h>`         | `<cub/version.h>`         |
+| Major Version          | `CCCL_MAJOR_VERSION`                   | `_LIBCUDACXX_CUDA_API_VERSION_MAJOR`      | `THRUST_MAJOR_VERSION`       | `CUB_MAJOR_VERSION`       |
+| Minor Version          | `CCCL_MINOR_VERSION`                   | `_LIBCUDACXX_CUDA_API_VERSION_MINOR`      | `THRUST_MINOR_VERSION`       | `CUB_MINOR_VERSION`       |
+| Patch/Subminor Version | `CCCL_PATCH_VERSION`                   | `_LIBCUDACXX_CUDA_API_VERSION_PATCH`      | `THRUST_SUBMINOR_VERSION`    | `CUB_SUBMINOR_VERSION`    |
+| Concatenated Version   | `CCCL_VERSION (MMMmmmppp)`             | `_LIBCUDACXX_CUDA_API_VERSION (MMMmmmppp)`| `THRUST_VERSION (MMMmmmpp)`  | `CUB_VERSION (MMMmmmpp)`  |
 
 ### Application Binary Interface (ABI)
 
@@ -376,7 +378,7 @@ For a detailed overview of the CI pipeline, see [ci-overview.md](ci-overview.md)
 Projects that are related to CCCL's mission to make CUDA C++ more delightful:
 - [cuCollections](https://github.com/NVIDIA/cuCollections) - GPU accelerated data structures like hash tables
 - [NVBench](https://github.com/NVIDIA/nvbench) - Benchmarking library tailored for CUDA applications
-- [stdexec](https://github.com/nvidia/stdexec) - Reference implementation for Senders asynchronous programming model 
+- [stdexec](https://github.com/nvidia/stdexec) - Reference implementation for Senders asynchronous programming model
 
 ## Projects Using CCCL
 
diff --git a/benchmarks/scripts/cccl/bench/bench.py b/benchmarks/scripts/cccl/bench/bench.py
index adaa1f99557..eb5b05baa35 100644
--- a/benchmarks/scripts/cccl/bench/bench.py
+++ b/benchmarks/scripts/cccl/bench/bench.py
@@ -635,6 +635,14 @@ def do_run(self, ct_point, rt_values, timeout, is_search=True):
                 cmd.append("--min-samples")
                 cmd.append("70")
 
+            # Unlike noise, minimal benchmarking time is not directly related to variance.
+            # The default minimal time is 0.5 seconds. For CI we want to reduce it to 0.1 seconds,
+            # because we have a limited time budget. Having a smaller minimal time doesn't affect the
+            # stability of the sample distribution median in a deterministic way. For small problem sizes,
+            # 0.1s leads to smaller variation than 0.5s. For other workloads, 0.5s leads to smaller variance.
+            cmd.append("--min-time")
+            cmd.append("0.1")
+
             # NVBench is currently broken for multiple GPUs, use `CUDA_VISIBLE_DEVICES`
             cmd.append("-d")
             cmd.append("0")
diff --git a/benchmarks/scripts/cccl/bench/search.py b/benchmarks/scripts/cccl/bench/search.py
index 264861f8832..13ff05ca01c 100644
--- a/benchmarks/scripts/cccl/bench/search.py
+++ b/benchmarks/scripts/cccl/bench/search.py
@@ -47,19 +47,38 @@ def parse_arguments():
                         type=str, help="Parameter in the format `Param=Value`.")
     parser.add_argument(
         '--list-benches', action=argparse.BooleanOptionalAction, help="Show available benchmarks.")
+    parser.add_argument('--num-shards', type=int, default=1, help='Split benchmarks into M pieces and only run one')
+    parser.add_argument('--run-shard', type=int, default=0, help='Run shard N / M of benchmarks')
+    parser.add_argument('-P0', action=argparse.BooleanOptionalAction, help="Run P0 benchmarks (overwrites -R)")
     return parser.parse_args()
 
 
-def run_benches(benchmarks, sub_space, regex, seeker):
-    pattern = re.compile(regex)
+def run_benches(algnames, sub_space, seeker):
+    for algname in algnames:
+        bench = BaseBench(algname)
+        ct_space = bench.ct_workload_space(sub_space)
+        rt_values = bench.rt_axes_values(sub_space)
+        seeker(algname, ct_space, rt_values)
 
-    for algname in benchmarks:
-        if pattern.match(algname):
-            bench = BaseBench(algname)
-            ct_space = bench.ct_workload_space(sub_space)
-            rt_values = bench.rt_axes_values(sub_space)
-            seeker(algname, ct_space, rt_values)
 
+def filter_benchmarks(benchmarks, args):
+    """Return the sorted names in `benchmarks` matching -R (or the P0 set), limited to one shard."""
+    if not 0 <= args.run_shard < args.num_shards:
+        raise ValueError('run-shard must be in the range [0, num-shards)')
+
+    # -P0 replaces the user regex: scan/reduce/select/sort benchmarks, excluding segmented ones.
+    R = args.R
+    if args.P0:
+        R = '^(?!.*segmented).*(scan|reduce|select|sort).*'
+
+    pattern = re.compile(R)
+    algnames = sorted(name for name in benchmarks.keys() if pattern.match(name))
+
+    # Names are sorted before splitting, so a shard index always maps to the same slice.
+    if args.num_shards > 1:
+        algnames = np.array_split(algnames, args.num_shards)[args.run_shard].tolist()
+    return algnames
+    
 
 def search(seeker):
     args = parse_arguments()
@@ -79,8 +98,8 @@ def search(seeker):
     if args.list_benches:
         list_benches()
         return
-
-    run_benches(config.benchmarks, workload_sub_space, args.R, seeker)
+    
+    run_benches(filter_benchmarks(config.benchmarks, args), workload_sub_space, seeker)
 
 
 class MedianCenterEstimator:
diff --git a/benchmarks/scripts/run.py b/benchmarks/scripts/run.py
index 23f0d9e0823..5b5d86e71e4 100755
--- a/benchmarks/scripts/run.py
+++ b/benchmarks/scripts/run.py
@@ -6,18 +6,36 @@
 import cccl.bench
 
 
-def elapsed_time_look_good(x):
+def elapsed_time_looks_good(x):
   if isinstance(x, float):
     if math.isfinite(x):
       return True
   return False
 
 
+def problem_size_looks_large_enough(elements):
+  # Small problem sizes do not utilize the entire GPU.
+  # Benchmarking them in environments where we do not control the comparison
+  # of distributions, e.g. CI, is not useful because of stability issues.
+  return elements.isdigit() and int(elements) > 20
+
+
+def filter_runtime_workloads_for_ci(rt_values):
+  for subbench in rt_values:
+    for axis in rt_values[subbench]:
+      if axis.startswith('Elements') and axis.endswith('[pow2]'):
+        rt_values[subbench][axis] = list(filter(problem_size_looks_large_enough, rt_values[subbench][axis]))
+
+  return rt_values
+
+
 class BaseRunner:
   def __init__(self):
     self.estimator = cccl.bench.MedianCenterEstimator()
 
   def __call__(self, algname, ct_workload_space, rt_values):
+    rt_values = filter_runtime_workloads_for_ci(rt_values)
+
     for ct_workload in ct_workload_space:
       bench = cccl.bench.BaseBench(algname)
       if bench.build():
@@ -28,7 +46,7 @@ def __call__(self, algname, ct_workload_space, rt_values):
             bench_name = bench_name.replace(' ', '___')
             bench_name = "".join(c if c.isalnum() else "_" for c in bench_name)
             elapsed_time = results[subbench][point]
-            if elapsed_time_look_good(elapsed_time):
+            if elapsed_time_looks_good(elapsed_time):
               print("&&&& PERF {} {} -sec".format(bench_name, elapsed_time))
       else:
         print("&&&& FAILED bench")
diff --git a/ci/update_version.sh b/ci/update_version.sh
new file mode 100755
index 00000000000..c31e0ef1148
--- /dev/null
+++ b/ci/update_version.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+# Usage: ./update_version.sh [--dry-run] <major> <minor> <patch>
+# Example: ./update_version.sh --dry-run 2 2 1
+
+# Run in root cccl/
+cd "$(dirname "${BASH_SOURCE[0]}")/.." || exit
+
+DRY_RUN=false
+
+while [[ "$#" -gt 0 ]]; do
+    case "$1" in
+        --dry-run) DRY_RUN=true; ;;
+        *) break ;;
+    esac
+    shift
+done
+
+major="$1"
+minor="$2"
+patch="$3"
+
+if [ -z "$major" ] || [ -z "$minor" ] || [ -z "$patch" ]; then
+    echo "Usage: $0 [--dry-run] <major> <minor> <patch>"
+    exit 1
+fi
+
+# Version file paths
+CCCL_VERSION_FILE="libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/version.h"
+THRUST_VERSION_FILE="thrust/thrust/version.h"
+CUB_VERSION_FILE="cub/cub/version.cuh"
+CCCL_CMAKE_VERSION_FILE="lib/cmake/cccl/cccl-config-version.cmake"
+CUB_CMAKE_VERSION_FILE="cub/cub/cmake/cub-config-version.cmake"
+LIBCUDACXX_CMAKE_VERSION_FILE="libcudacxx/lib/cmake/libcudacxx/libcudacxx-config-version.cmake"
+THRUST_CMAKE_VERSION_FILE="thrust/thrust/cmake/thrust-config-version.cmake"
+
+# Calculated version codes
+new_cccl_version=$((major * 1000000 + minor * 1000 + patch))     # MMMmmmppp
+new_thrust_cub_version=$((major * 100000 + minor * 100 + patch)) # MMMmmmpp
+
+# Fetch current version from file
+current_cccl_version=$(grep -oP "define CCCL_VERSION \K[0-9]+" "$CCCL_VERSION_FILE")
+
+# Fetch the latest tag from git and strip the 'v' prefix if present
+latest_tag=$(git tag --sort=-v:refname | head -n 1 | sed 's/^v//')
+
+# Since the tags and versions are numerically comparable, we cast them to integers
+latest_tag_version=$(echo "$latest_tag" | awk -F. '{ printf("%d%03d%03d", $1,$2,$3) }')
+
+echo "Running in $(pwd)"
+echo "New MMMmmmppp version: $new_cccl_version"
+echo "New MMMmmmpp  version: $new_thrust_cub_version"
+echo "Current CCCL version:  $current_cccl_version"
+echo "Latest git tag:        $latest_tag"
+
+# Check if new version is less than the current version or the latest tag
+if (( new_cccl_version < current_cccl_version )) || (( new_cccl_version < latest_tag_version )); then
+    echo "Error: New version $new_cccl_version is less than current version $current_cccl_version or latest git tag version $latest_tag_version."
+    exit 1
+fi
+
+update_file () {
+    # Replace the first match of sed pattern $2 on each line of file $1 with $3.
+    # With --dry-run, print a diff of the would-be change instead of editing.
+    local file=$1 pattern=$2 new_value=$3 temp_file
+    if [ "$DRY_RUN" = true ]; then
+        temp_file=$(mktemp) || exit 1
+        sed "s/$pattern/$new_value/" "$file" > "$temp_file"
+        diff --color=auto -U 0 "$file" "$temp_file" || true
+        rm -f "$temp_file"
+    else
+        sed -i "s/$pattern/$new_value/" "$file"
+    fi
+}
+
+# Update version information in files
+update_file "$CCCL_VERSION_FILE" "^#define CCCL_VERSION \([0-9]\+\)" "#define CCCL_VERSION $new_cccl_version"
+update_file "$THRUST_VERSION_FILE" "^#define THRUST_VERSION \([0-9]\+\)" "#define THRUST_VERSION $new_thrust_cub_version"
+update_file "$CUB_VERSION_FILE" "^#define CUB_VERSION \([0-9]\+\)" "#define CUB_VERSION $new_thrust_cub_version"
+
+update_file "$CUB_CMAKE_VERSION_FILE" "set(CUB_VERSION_MAJOR \([0-9]\+\))" "set(CUB_VERSION_MAJOR $major)"
+update_file "$CUB_CMAKE_VERSION_FILE" "set(CUB_VERSION_MINOR \([0-9]\+\))" "set(CUB_VERSION_MINOR $minor)"
+update_file "$CUB_CMAKE_VERSION_FILE" "set(CUB_VERSION_PATCH \([0-9]\+\))" "set(CUB_VERSION_PATCH $patch)"
+
+update_file "$LIBCUDACXX_CMAKE_VERSION_FILE" "set(libcudacxx_VERSION_MAJOR \([0-9]\+\))" "set(libcudacxx_VERSION_MAJOR $major)"
+update_file "$LIBCUDACXX_CMAKE_VERSION_FILE" "set(libcudacxx_VERSION_MINOR \([0-9]\+\))" "set(libcudacxx_VERSION_MINOR $minor)"
+update_file "$LIBCUDACXX_CMAKE_VERSION_FILE" "set(libcudacxx_VERSION_PATCH \([0-9]\+\))" "set(libcudacxx_VERSION_PATCH $patch)"
+
+update_file "$THRUST_CMAKE_VERSION_FILE" "set(THRUST_VERSION_MAJOR \([0-9]\+\))" "set(THRUST_VERSION_MAJOR $major)"
+update_file "$THRUST_CMAKE_VERSION_FILE" "set(THRUST_VERSION_MINOR \([0-9]\+\))" "set(THRUST_VERSION_MINOR $minor)"
+update_file "$THRUST_CMAKE_VERSION_FILE" "set(THRUST_VERSION_PATCH \([0-9]\+\))" "set(THRUST_VERSION_PATCH $patch)"
+
+update_file "$CCCL_CMAKE_VERSION_FILE" "set(CCCL_VERSION_MAJOR \([0-9]\+\))" "set(CCCL_VERSION_MAJOR $major)"
+update_file "$CCCL_CMAKE_VERSION_FILE" "set(CCCL_VERSION_MINOR \([0-9]\+\))" "set(CCCL_VERSION_MINOR $minor)"
+update_file "$CCCL_CMAKE_VERSION_FILE" "set(CCCL_VERSION_PATCH \([0-9]\+\))" "set(CCCL_VERSION_PATCH $patch)"
+
+if [ "$DRY_RUN" = true ]; then
+    echo "Dry run completed. No changes made."
+else
+    echo "Version updated to $major.$minor.$patch"
+fi
diff --git a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu
index d47efd83abb..0539509b09c 100644
--- a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu
+++ b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu
@@ -100,7 +100,7 @@ void left(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
   std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(d_temp_storage,
                          temp_storage_bytes,
                          d_in,
diff --git a/cub/benchmarks/bench/copy/memcpy.cu b/cub/benchmarks/bench/copy/memcpy.cu
index 467e0c2e0ec..794619bc832 100644
--- a/cub/benchmarks/bench/copy/memcpy.cu
+++ b/cub/benchmarks/bench/copy/memcpy.cu
@@ -249,7 +249,7 @@ void copy(nvbench::state &state,
   thrust::device_vector<nvbench::uint8_t> temp_storage(temp_storage_bytes);
   d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(d_temp_storage,
                          temp_storage_bytes,
                          d_input_buffers,
@@ -275,8 +275,8 @@ void uniform(nvbench::state &state, nvbench::type_list<T, OffsetT> tl)
        elements,
        min_buffer_size,
        max_buffer_size,
-       state.get_int64("RandomizeInput"),
-       state.get_int64("RandomizeOutput"));
+       state.get_int64("Randomize"),
+       state.get_int64("Randomize"));
 }
 
 template <class T, class OffsetT>
@@ -309,8 +309,7 @@ NVBENCH_BENCH_TYPES(uniform, NVBENCH_TYPE_AXES(types, u_offset_types))
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(25, 29, 2))
   .add_int64_axis("MinBufferSizeRatio", {1, 99})
   .add_int64_axis("MaxBufferSize", {8, 64, 256, 1024, 64 * 1024})
-  .add_int64_axis("RandomizeInput", {0, 1})
-  .add_int64_axis("RandomizeOutput", {0, 1});
+  .add_int64_axis("Randomize", {0, 1});
 
 NVBENCH_BENCH_TYPES(large, NVBENCH_TYPE_AXES(types, u_offset_types))
   .set_name("large")
diff --git a/cub/benchmarks/bench/histogram/even.cu b/cub/benchmarks/bench/histogram/even.cu
index ce34cab5f4c..7ef201dbbad 100644
--- a/cub/benchmarks/bench/histogram/even.cu
+++ b/cub/benchmarks/bench/histogram/even.cu
@@ -110,7 +110,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
   thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
   d_temp_storage = thrust::raw_pointer_cast(tmp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::DispatchEven(d_temp_storage,
                              temp_storage_bytes,
                              d_input,
@@ -139,5 +139,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
   .set_name("base")
   .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
-  .add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
-  .add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
+  .add_int64_axis("Bins", {32, 128, 2048, 2097152})
+  .add_string_axis("Entropy", {"0.201", "1.000"});
diff --git a/cub/benchmarks/bench/histogram/multi/even.cu b/cub/benchmarks/bench/histogram/multi/even.cu
index 1a501aeec13..83ebfcea5b9 100644
--- a/cub/benchmarks/bench/histogram/multi/even.cu
+++ b/cub/benchmarks/bench/histogram/multi/even.cu
@@ -121,7 +121,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
   thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
   d_temp_storage = thrust::raw_pointer_cast(tmp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::DispatchEven(d_temp_storage,
                              temp_storage_bytes,
                              d_input,
@@ -150,5 +150,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
   .set_name("base")
   .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
-  .add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
-  .add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
+  .add_int64_axis("Bins", {32, 128, 2048, 2097152})
+  .add_string_axis("Entropy", {"0.201", "1.000"});
diff --git a/cub/benchmarks/bench/histogram/multi/range.cu b/cub/benchmarks/bench/histogram/multi/range.cu
index 9d8431635be..939cb79fe28 100644
--- a/cub/benchmarks/bench/histogram/multi/range.cu
+++ b/cub/benchmarks/bench/histogram/multi/range.cu
@@ -129,7 +129,7 @@ static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, O
   thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
   d_temp_storage = thrust::raw_pointer_cast(tmp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::DispatchRange(d_temp_storage,
                               temp_storage_bytes,
                               d_input,
@@ -157,5 +157,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse
   .set_name("base")
   .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
-  .add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
-  .add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
+  .add_int64_axis("Bins", {32, 128, 2048, 2097152})
+  .add_string_axis("Entropy", {"0.201", "1.000"});
diff --git a/cub/benchmarks/bench/histogram/range.cu b/cub/benchmarks/bench/histogram/range.cu
index 9e118064904..266c483a19b 100644
--- a/cub/benchmarks/bench/histogram/range.cu
+++ b/cub/benchmarks/bench/histogram/range.cu
@@ -116,7 +116,7 @@ static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, O
   thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
   d_temp_storage = thrust::raw_pointer_cast(tmp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::DispatchRange(d_temp_storage,
                               temp_storage_bytes,
                               d_input,
@@ -144,5 +144,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse
   .set_name("base")
   .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
-  .add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
-  .add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
+  .add_int64_axis("Bins", {32, 128, 2048, 2097152})
+  .add_string_axis("Entropy", {"0.201", "1.000"});
diff --git a/cub/benchmarks/bench/merge_sort/keys.cu b/cub/benchmarks/bench/merge_sort/keys.cu
index ba43e86cec3..831c021515d 100644
--- a/cub/benchmarks/bench/merge_sort/keys.cu
+++ b/cub/benchmarks/bench/merge_sort/keys.cu
@@ -131,7 +131,7 @@ void keys(nvbench::state &state, nvbench::type_list<T, OffsetT>)
   thrust::device_vector<nvbench::uint8_t> temp(temp_size);
   auto *temp_storage = thrust::raw_pointer_cast(temp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(temp_storage,
                          temp_size,
                          d_buffer_1,
diff --git a/cub/benchmarks/bench/merge_sort/pairs.cu b/cub/benchmarks/bench/merge_sort/pairs.cu
index cf4b23b69b7..6a1f9d8d566 100644
--- a/cub/benchmarks/bench/merge_sort/pairs.cu
+++ b/cub/benchmarks/bench/merge_sort/pairs.cu
@@ -134,7 +134,7 @@ void pairs(nvbench::state &state, nvbench::type_list<KeyT, ValueT, OffsetT>)
   thrust::device_vector<nvbench::uint8_t> temp(temp_size);
   auto *temp_storage = thrust::raw_pointer_cast(temp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(temp_storage,
                          temp_size,
                          d_keys_buffer_1,
diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu
index f26182e2609..0045fc44757 100644
--- a/cub/benchmarks/bench/partition/flagged.cu
+++ b/cub/benchmarks/bench/partition/flagged.cu
@@ -148,7 +148,7 @@ void flagged(nvbench::state &state, nvbench::type_list<T, OffsetT>)
   thrust::device_vector<nvbench::uint8_t> temp(temp_size);
   auto *temp_storage = thrust::raw_pointer_cast(temp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(temp_storage,
                          temp_size,
                          d_in,
diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu
index 1ab89a02c54..fcbc97446f5 100644
--- a/cub/benchmarks/bench/partition/if.cu
+++ b/cub/benchmarks/bench/partition/if.cu
@@ -170,7 +170,7 @@ void partition(nvbench::state &state, nvbench::type_list<T, OffsetT>)
   thrust::device_vector<nvbench::uint8_t> temp(temp_size);
   auto *temp_storage = thrust::raw_pointer_cast(temp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(temp_storage,
                          temp_size,
                          d_in,
diff --git a/cub/benchmarks/bench/partition/three_way.cu b/cub/benchmarks/bench/partition/three_way.cu
index 9f2b5eb8b1b..577d01797fe 100644
--- a/cub/benchmarks/bench/partition/three_way.cu
+++ b/cub/benchmarks/bench/partition/three_way.cu
@@ -146,7 +146,7 @@ void partition(nvbench::state &state, nvbench::type_list<T, OffsetT>)
   thrust::device_vector<nvbench::uint8_t> temp(temp_size);
   auto *temp_storage = thrust::raw_pointer_cast(temp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(temp_storage,
                          temp_size,
                          d_in,
diff --git a/cub/benchmarks/bench/radix_sort/keys.cu b/cub/benchmarks/bench/radix_sort/keys.cu
index 6586e40b91c..c2fc6a7d1f5 100644
--- a/cub/benchmarks/bench/radix_sort/keys.cu
+++ b/cub/benchmarks/bench/radix_sort/keys.cu
@@ -182,7 +182,7 @@ void radix_sort_keys(std::integral_constant<bool, true>,
   thrust::device_vector<nvbench::uint8_t> temp(temp_size);
   auto *temp_storage = thrust::raw_pointer_cast(temp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     cub::DoubleBuffer<key_t> keys     = d_keys;
     cub::DoubleBuffer<value_t> values = d_values;
 
@@ -222,4 +222,4 @@ NVBENCH_BENCH_TYPES(radix_sort_keys, NVBENCH_TYPE_AXES(fundamental_types, offset
   .set_name("base")
   .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
-  .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"});
+  .add_string_axis("Entropy", {"1.000", "0.544", "0.201"});
diff --git a/cub/benchmarks/bench/radix_sort/pairs.cu b/cub/benchmarks/bench/radix_sort/pairs.cu
index 006e33b98ce..503a25bc4b8 100644
--- a/cub/benchmarks/bench/radix_sort/pairs.cu
+++ b/cub/benchmarks/bench/radix_sort/pairs.cu
@@ -186,7 +186,7 @@ void radix_sort_values(std::integral_constant<bool, true>,
   thrust::device_vector<nvbench::uint8_t> temp(temp_size);
   auto *temp_storage = thrust::raw_pointer_cast(temp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     cub::DoubleBuffer<key_t> keys     = d_keys;
     cub::DoubleBuffer<value_t> values = d_values;
 
@@ -224,7 +224,7 @@ void radix_sort_values(nvbench::state &state, nvbench::type_list<KeyT, ValueT, O
 #ifdef TUNE_KeyT
 using key_types = nvbench::type_list<TUNE_KeyT>;
 #else // !defined(TUNE_KeyT) 
-using key_types = fundamental_types;
+using key_types = integral_types;
 #endif // TUNE_KeyT
 
 #ifdef TUNE_ValueT
@@ -245,4 +245,4 @@ NVBENCH_BENCH_TYPES(radix_sort_values, NVBENCH_TYPE_AXES(key_types, value_types,
   .set_name("base")
   .set_type_axes_names({"KeyT{ct}", "ValueT{ct}", "OffsetT{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
-  .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"});
+  .add_string_axis("Entropy", {"1.000", "0.201"});
diff --git a/cub/benchmarks/bench/reduce/base.cuh b/cub/benchmarks/bench/reduce/base.cuh
index 253e5533b44..d874d69ae67 100644
--- a/cub/benchmarks/bench/reduce/base.cuh
+++ b/cub/benchmarks/bench/reduce/base.cuh
@@ -103,7 +103,7 @@ void reduce(nvbench::state &state, nvbench::type_list<T, OffsetT>)
   thrust::device_vector<nvbench::uint8_t> temp(temp_size);
   auto *temp_storage = thrust::raw_pointer_cast(temp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(temp_storage,
                          temp_size,
                          d_in,
diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu
index d11667b7a5b..4ccca911472 100644
--- a/cub/benchmarks/bench/reduce/by_key.cu
+++ b/cub/benchmarks/bench/reduce/by_key.cu
@@ -158,7 +158,7 @@ static void reduce(nvbench::state &state, nvbench::type_list<KeyT, ValueT, Offse
   state.add_global_memory_writes<KeyT>(num_runs);
   state.add_global_memory_writes<OffsetT>(1);
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(d_temp_storage,
                          temp_storage_bytes,
                          d_in_keys,
diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu
index 1e02f01055c..16a07206c93 100644
--- a/cub/benchmarks/bench/run_length_encode/encode.cu
+++ b/cub/benchmarks/bench/run_length_encode/encode.cu
@@ -158,7 +158,7 @@ static void rle(nvbench::state &state, nvbench::type_list<T, OffsetT>)
   state.add_global_memory_writes<OffsetT>(num_runs);
   state.add_global_memory_writes<OffsetT>(1);
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(d_temp_storage,
                          temp_storage_bytes,
                          d_in_keys,
diff --git a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu
index b679aa40849..83e2a8eff31 100644
--- a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu
+++ b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu
@@ -145,7 +145,7 @@ static void rle(nvbench::state &state, nvbench::type_list<T, OffsetT>)
   state.add_global_memory_writes<OffsetT>(num_runs);
   state.add_global_memory_writes<OffsetT>(1);
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(d_temp_storage,
                          temp_storage_bytes,
                          d_in_keys,
diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh
index 3c3ec561c63..1e28450e26b 100644
--- a/cub/benchmarks/bench/scan/exclusive/base.cuh
+++ b/cub/benchmarks/bench/scan/exclusive/base.cuh
@@ -121,7 +121,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T, OffsetT>)
   thrust::device_vector<nvbench::uint8_t> tmp(tmp_size);
   nvbench::uint8_t *d_tmp = thrust::raw_pointer_cast(tmp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(thrust::raw_pointer_cast(tmp.data()),
                          tmp_size,
                          d_input,
diff --git a/cub/benchmarks/bench/scan/exclusive/by_key.cu b/cub/benchmarks/bench/scan/exclusive/by_key.cu
index 93e515e02cb..26b36322ee2 100644
--- a/cub/benchmarks/bench/scan/exclusive/by_key.cu
+++ b/cub/benchmarks/bench/scan/exclusive/by_key.cu
@@ -134,7 +134,7 @@ static void scan(nvbench::state &state, nvbench::type_list<KeyT, ValueT, OffsetT
   thrust::device_vector<nvbench::uint8_t> tmp(tmp_size);
   nvbench::uint8_t *d_tmp = thrust::raw_pointer_cast(tmp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(d_tmp,
                          tmp_size,
                          d_keys,
diff --git a/cub/benchmarks/bench/segmented_sort/keys.cu b/cub/benchmarks/bench/segmented_sort/keys.cu
index 6f47c66d663..d0d15789940 100644
--- a/cub/benchmarks/bench/segmented_sort/keys.cu
+++ b/cub/benchmarks/bench/segmented_sort/keys.cu
@@ -220,7 +220,7 @@ void seg_sort(nvbench::state &state,
   thrust::device_vector<nvbench::uint8_t> temp_storage(temp_storage_bytes);
   d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
     cub::DoubleBuffer<key_t> keys     = d_keys;
     cub::DoubleBuffer<value_t> values = d_values;
 
@@ -255,7 +255,7 @@ NVBENCH_BENCH_TYPES(power_law, NVBENCH_TYPE_AXES(fundamental_types, some_offset_
   .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(22, 30, 4))
   .add_int64_power_of_two_axis("Segments{io}", nvbench::range(12, 20, 4))
-  .add_string_axis("Entropy", {"1.000", "0.544", "0.201"});
+  .add_string_axis("Entropy", {"1.000", "0.201"});
 
 
 template <class T, typename OffsetT>
diff --git a/cub/benchmarks/bench/select/flagged.cu b/cub/benchmarks/bench/select/flagged.cu
index fcbf57aba1a..dcf0598bd3a 100644
--- a/cub/benchmarks/bench/select/flagged.cu
+++ b/cub/benchmarks/bench/select/flagged.cu
@@ -152,7 +152,7 @@ void select(nvbench::state &state, nvbench::type_list<T, OffsetT>)
   thrust::device_vector<nvbench::uint8_t> temp(temp_size);
   auto *temp_storage = thrust::raw_pointer_cast(temp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(temp_storage,
                          temp_size,
                          d_in,
diff --git a/cub/benchmarks/bench/select/if.cu b/cub/benchmarks/bench/select/if.cu
index 089ffa0f4a7..981ed7b7013 100644
--- a/cub/benchmarks/bench/select/if.cu
+++ b/cub/benchmarks/bench/select/if.cu
@@ -174,7 +174,7 @@ void select(nvbench::state &state, nvbench::type_list<T, OffsetT>)
   thrust::device_vector<nvbench::uint8_t> temp(temp_size);
   auto *temp_storage = thrust::raw_pointer_cast(temp.data());
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(temp_storage,
                          temp_size,
                          d_in,
diff --git a/cub/benchmarks/bench/select/unique_by_key.cu b/cub/benchmarks/bench/select/unique_by_key.cu
index 1d610e2b823..e048d81b635 100644
--- a/cub/benchmarks/bench/select/unique_by_key.cu
+++ b/cub/benchmarks/bench/select/unique_by_key.cu
@@ -150,7 +150,7 @@ static void select(nvbench::state &state, nvbench::type_list<KeyT, ValueT, Offse
   state.add_global_memory_writes<KeyT>(num_runs);
   state.add_global_memory_writes<OffsetT>(1);
 
-  state.exec([&](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
     dispatch_t::Dispatch(d_temp_storage,
                          temp_storage_bytes,
                          d_in_keys,
diff --git a/cub/benchmarks/nvbench_helper/CMakeLists.txt b/cub/benchmarks/nvbench_helper/CMakeLists.txt
index 38112b1779e..e9ef302d2ff 100644
--- a/cub/benchmarks/nvbench_helper/CMakeLists.txt
+++ b/cub/benchmarks/nvbench_helper/CMakeLists.txt
@@ -1,5 +1,5 @@
 # Fetch nvbench
-CPMAddPackage("gh:NVIDIA/nvbench#39b2770b62ce1f4e0ebeb9af60d7c6de624633a5")
+CPMAddPackage("gh:NVIDIA/nvbench#main") # NOTE(review): follows the moving `main` branch instead of the fixed commit hash used before; consider pinning a commit or tag so benchmark builds stay reproducible
 
 add_library(nvbench_helper OBJECT nvbench_helper/nvbench_helper.cuh
                                   nvbench_helper/nvbench_helper.cu)
diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
index c5ae5f6c508..3bedf5841ae 100644
--- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
+++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
@@ -53,29 +53,34 @@ using offset_types = nvbench::type_list<int32_t, int64_t>;
 #endif
 
 #ifdef TUNE_T
+using integral_types    = nvbench::type_list<TUNE_T>;
 using fundamental_types = nvbench::type_list<TUNE_T>;
-using all_types = nvbench::type_list<TUNE_T>;
+using all_types         = nvbench::type_list<TUNE_T>;
 #else
-using fundamental_types = nvbench::type_list<int8_t,
-                                             int16_t,
-                                             int32_t,
-                                             int64_t,
-#if NVBENCH_HELPER_HAS_I128
-                                             int128_t,
-#endif
-                                             float,
-                                             double>;
-                                             
-using all_types = nvbench::type_list<int8_t,
-                                     int16_t,
-                                     int32_t,
-                                     int64_t,
-#if NVBENCH_HELPER_HAS_I128
-                                     int128_t,
-#endif
-                                     float,
-                                     double,
-                                     complex>;
+using integral_types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t>;
+
+using fundamental_types =
+  nvbench::type_list<int8_t,
+                     int16_t,
+                     int32_t,
+                     int64_t,
+#  if NVBENCH_HELPER_HAS_I128
+                     int128_t,
+#  endif
+                     float,
+                     double>;
+
+using all_types =
+  nvbench::type_list<int8_t,
+                     int16_t,
+                     int32_t,
+                     int64_t,
+#  if NVBENCH_HELPER_HAS_I128
+                     int128_t,
+#  endif
+                     float,
+                     double,
+                     complex>;
 #endif
 
 template <class T>
diff --git a/cub/cub/cmake/cub-config-version.cmake b/cub/cub/cmake/cub-config-version.cmake
index 5d9dd5b922d..87e667e9dc3 100644
--- a/cub/cub/cmake/cub-config-version.cmake
+++ b/cub/cub/cmake/cub-config-version.cmake
@@ -1,17 +1,10 @@
 # Parse version information from version.cuh:
 include("${CMAKE_CURRENT_LIST_DIR}/cub-header-search.cmake")
 
-file(READ "${_CUB_VERSION_INCLUDE_DIR}/cub/version.cuh" CUB_VERSION_HEADER)
-string(REGEX MATCH "#define[ \t]+CUB_VERSION[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}")
-set(CUB_VERSION_FLAT ${CMAKE_MATCH_1})
-# Note that CUB calls this the PATCH number, CMake calls it the TWEAK number:
-string(REGEX MATCH "#define[ \t]+CUB_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}")
-set(CUB_VERSION_TWEAK ${CMAKE_MATCH_1})
-
-math(EXPR CUB_VERSION_MAJOR "${CUB_VERSION_FLAT} / 100000")
-math(EXPR CUB_VERSION_MINOR "(${CUB_VERSION_FLAT} / 100) % 1000")
-math(EXPR CUB_VERSION_PATCH "${CUB_VERSION_FLAT} % 100") # CUB: "subminor" CMake: "patch"
-
+set(CUB_VERSION_MAJOR 2)
+set(CUB_VERSION_MINOR 3)
+set(CUB_VERSION_PATCH 0)
+set(CUB_VERSION_TWEAK 0)
 set(CUB_VERSION "${CUB_VERSION_MAJOR}.${CUB_VERSION_MINOR}.${CUB_VERSION_PATCH}.${CUB_VERSION_TWEAK}")
 
 set(PACKAGE_VERSION ${CUB_VERSION})
diff --git a/cub/cub/version.cuh b/cub/cub/version.cuh
index 9934b92eff9..fdbfb641486 100644
--- a/cub/cub/version.cuh
+++ b/cub/cub/version.cuh
@@ -44,15 +44,19 @@
 _CCCL_IMPLICIT_SYSTEM_HEADER
 #endif // !_CCCL_COMPILER_NVHPC
 
+#include <cuda/version>
+
 /*! \def CUB_VERSION
  *  \brief The preprocessor macro \p CUB_VERSION encodes the version
- *         number of the CUB library.
+ *         number of the CUB library as MMMmmmpp.
+ *
+ *  \note CUB_VERSION is formatted as `MMMmmmpp`, which differs from `CCCL_VERSION` that uses `MMMmmmppp`.
  *
  *         <tt>CUB_VERSION % 100</tt> is the sub-minor version.
  *         <tt>CUB_VERSION / 100 % 1000</tt> is the minor version.
  *         <tt>CUB_VERSION / 100000</tt> is the major version.
  */
-#define CUB_VERSION 200200
+#define CUB_VERSION 200300 // macro expansion with ## requires this to be a single value
 
 /*! \def CUB_MAJOR_VERSION
  *  \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the
@@ -77,3 +81,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER
  *         patch number of the CUB library.
  */
 #define CUB_PATCH_NUMBER 0
+
+static_assert(CUB_MAJOR_VERSION == CCCL_MAJOR_VERSION, "CUB_MAJOR_VERSION does not match CCCL_MAJOR_VERSION; keep CUB_VERSION in sync with CCCL_VERSION");
+static_assert(CUB_MINOR_VERSION == CCCL_MINOR_VERSION, "CUB_MINOR_VERSION does not match CCCL_MINOR_VERSION; keep CUB_VERSION in sync with CCCL_VERSION");
+static_assert(CUB_SUBMINOR_VERSION == CCCL_PATCH_VERSION, "CUB_SUBMINOR_VERSION does not match CCCL_PATCH_VERSION; keep CUB_VERSION in sync with CCCL_VERSION");
diff --git a/cub/docs/tuning.rst b/cub/docs/tuning.rst
index ae6cdaabe3c..9ac2b9752b2 100644
--- a/cub/docs/tuning.rst
+++ b/cub/docs/tuning.rst
@@ -121,7 +121,7 @@ Finally, we can run the algorithm:
 
 .. code:: c++
 
-    state.exec([&](nvbench::launch &launch) {
+    state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
       dispatch_t::Dispatch(temp_storage,
                            temp_size,
                            d_in,
diff --git a/cub/test/bfloat16.h b/cub/test/bfloat16.h
index dbd735db83f..328fb644a5e 100644
--- a/cub/test/bfloat16.h
+++ b/cub/test/bfloat16.h
@@ -32,11 +32,14 @@
  * Utilities for interacting with the opaque CUDA __nv_bfloat16 type
  */
 
-#include <stdint.h>
+#include <cub/util_type.cuh>
+
 #include <cuda_bf16.h>
-#include <iosfwd>
 
-#include <cub/util_type.cuh>
+#include <cuda/std/type_traits>
+
+#include <cstdint>
+#include <iosfwd>
 
 #ifdef __GNUC__
 // There's a ton of type-punning going on in this file.
@@ -77,6 +80,16 @@ struct bfloat16_t
         *this = bfloat16_t(float(a));
     }
 
+    /// Constructor from unsigned long long int (the value is converted through float)
+    template < typename T,
+               typename = typename ::cuda::std::enable_if<
+                 ::cuda::std::is_same<T, unsigned long long int>::value // selected only when the argument type is exactly unsigned long long int
+                 && (!::cuda::std::is_same<std::size_t, unsigned long long int>::value)>::type> // disabled on platforms where std::size_t is that same type
+    __host__ __device__ __forceinline__ bfloat16_t(T a)
+    {
+      *this = bfloat16_t(float(a));
+    }
+
     /// Default constructor
     bfloat16_t() = default;
 
diff --git a/cub/test/c2h/generators.cu b/cub/test/c2h/generators.cu
index 9e0f3188113..67bf81e558c 100644
--- a/cub/test/c2h/generators.cu
+++ b/cub/test/c2h/generators.cu
@@ -259,16 +259,17 @@ void generator_t::operator()(seed_t seed, thrust::device_vector<T> &data, T min,
 template <typename T>
 struct count_to_item_t
 {
-  std::size_t n;
+  unsigned long long int n;
 
-  count_to_item_t(std::size_t n)
+  count_to_item_t(unsigned long long int n)
       : n(n)
   {}
 
   template <typename CounterT>
   __device__ T operator()(CounterT id)
   {
-    return static_cast<T>(static_cast<std::size_t>(id) % n);
+    // This has to be a type for which extended floating point types like __nv_fp8_e5m2 provide an overload
+    return static_cast<T>(static_cast<unsigned long long int>(id) % n);
   }
 };
 
diff --git a/cub/test/half.h b/cub/test/half.h
index a009049cc71..74e507a57cd 100644
--- a/cub/test/half.h
+++ b/cub/test/half.h
@@ -37,6 +37,8 @@
 
 #include <cuda_fp16.h>
 
+#include <cuda/std/type_traits>
+
 #include <cstdint>
 #include <cstring>
 #include <iosfwd>
@@ -80,6 +82,16 @@ struct half_t
         *this = half_t(float(a));
     }
 
+    /// Constructor from unsigned long long int (the value is converted through float)
+    template < typename T,
+               typename = typename ::cuda::std::enable_if<
+                 ::cuda::std::is_same<T, unsigned long long int>::value // selected only when the argument type is exactly unsigned long long int
+                 && (!::cuda::std::is_same<std::size_t, unsigned long long int>::value)>::type> // disabled on platforms where std::size_t is that same type
+    __host__ __device__ __forceinline__ half_t(T a)
+    {
+      *this = half_t(float(a));
+    }
+
     /// Default constructor
     half_t() = default;
 
diff --git a/lib/cmake/cccl/cccl-config-version.cmake b/lib/cmake/cccl/cccl-config-version.cmake
index 1e026c98334..d01aa5b6194 100644
--- a/lib/cmake/cccl/cccl-config-version.cmake
+++ b/lib/cmake/cccl/cccl-config-version.cmake
@@ -1,5 +1,5 @@
 set(CCCL_VERSION_MAJOR 2)
-set(CCCL_VERSION_MINOR 2)
+set(CCCL_VERSION_MINOR 3)
 set(CCCL_VERSION_PATCH 0)
 set(CCCL_VERSION_TWEAK 0)
 
diff --git a/libcudacxx/.upstream-tests/test/cuda/version.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/version.pass.cpp
new file mode 100644
index 00000000000..343de4e9ec6
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/version.pass.cpp
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/version>
+
+static_assert(CCCL_MAJOR_VERSION == (CCCL_VERSION / 1000000), "CCCL_MAJOR_VERSION is inconsistent with CCCL_VERSION");
+static_assert(CCCL_MINOR_VERSION == (CCCL_VERSION / 1000 % 1000), "CCCL_MINOR_VERSION is inconsistent with CCCL_VERSION");
+static_assert(CCCL_PATCH_VERSION == (CCCL_VERSION % 1000), "CCCL_PATCH_VERSION is inconsistent with CCCL_VERSION");
+
+int main(int, char**) { // parameters left unnamed: they are unused
+    return 0; // nothing to execute: the checks above are compile-time only
+}
diff --git a/libcudacxx/include/cuda/std/detail/__config b/libcudacxx/include/cuda/std/detail/__config
index b6dee85435b..ed1bad96c43 100644
--- a/libcudacxx/include/cuda/std/detail/__config
+++ b/libcudacxx/include/cuda/std/detail/__config
@@ -11,16 +11,12 @@
 #ifndef __cuda_std__
 #define __cuda_std__
 
-#define _LIBCUDACXX_CUDA_API_VERSION 2002000
+#include <cuda/std/detail/libcxx/include/__cccl/version.h>
 
-#define _LIBCUDACXX_CUDA_API_VERSION_MAJOR \
-  (_LIBCUDACXX_CUDA_API_VERSION / 1000000)
-
-#define _LIBCUDACXX_CUDA_API_VERSION_MINOR \
-  (_LIBCUDACXX_CUDA_API_VERSION / 1000 % 1000)
-
-#define _LIBCUDACXX_CUDA_API_VERSION_PATCH \
-  (_LIBCUDACXX_CUDA_API_VERSION % 1000)
+#define _LIBCUDACXX_CUDA_API_VERSION CCCL_VERSION
+#define _LIBCUDACXX_CUDA_API_VERSION_MAJOR CCCL_MAJOR_VERSION
+#define _LIBCUDACXX_CUDA_API_VERSION_MINOR CCCL_MINOR_VERSION
+#define _LIBCUDACXX_CUDA_API_VERSION_PATCH CCCL_PATCH_VERSION
 
 #ifndef _LIBCUDACXX_CUDA_ABI_VERSION_LATEST
 #  define _LIBCUDACXX_CUDA_ABI_VERSION_LATEST 4
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/version.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/version.h
new file mode 100644
index 00000000000..bb0cf2e6ebe
--- /dev/null
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/version.h
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CCCL_VERSION_H
+#define __CCCL_VERSION_H
+
+#define CCCL_VERSION 2003000 // single integer encoded as MMMmmmppp (major, minor, patch)
+#define CCCL_MAJOR_VERSION (CCCL_VERSION / 1000000) // digits above the lowest six: MMM
+#define CCCL_MINOR_VERSION (((CCCL_VERSION / 1000) % 1000)) // middle three digits: mmm
+#define CCCL_PATCH_VERSION (CCCL_VERSION % 1000) // lowest three digits: ppp
+
+#if CCCL_PATCH_VERSION > 99 // a larger patch value would not fit the two-digit patch field named in the error below
+#error "CCCL patch version cannot be greater than 99 for compatibility with Thrust/CUB's MMMmmmpp format."
+#endif
+
+#endif // __CCCL_VERSION_H
diff --git a/libcudacxx/include/cuda/version b/libcudacxx/include/cuda/version
new file mode 100644
index 00000000000..cd536749780
--- /dev/null
+++ b/libcudacxx/include/cuda/version
@@ -0,0 +1,16 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_VERSION
+#define _CUDA_VERSION
+
+#include "std/version"
+
+#endif // _CUDA_VERSION
diff --git a/libcudacxx/lib/cmake/libcudacxx/libcudacxx-config-version.cmake b/libcudacxx/lib/cmake/libcudacxx/libcudacxx-config-version.cmake
index e528dedbfc9..17be3cddc98 100644
--- a/libcudacxx/lib/cmake/libcudacxx/libcudacxx-config-version.cmake
+++ b/libcudacxx/lib/cmake/libcudacxx/libcudacxx-config-version.cmake
@@ -1,19 +1,9 @@
 # Parse version information from version header:
 include("${CMAKE_CURRENT_LIST_DIR}/libcudacxx-header-search.cmake")
 
-file(READ "${_libcudacxx_VERSION_INCLUDE_DIR}/cuda/std/detail/__config"
-  libcudacxx_VERSION_HEADER
-)
-
-string(REGEX MATCH
-  "#define[ \t]+_LIBCUDACXX_CUDA_API_VERSION[ \t]+([0-9]+)" unused_var
-  "${libcudacxx_VERSION_HEADER}"
-)
-
-set(libcudacxx_VERSION_FLAT ${CMAKE_MATCH_1})
-math(EXPR libcudacxx_VERSION_MAJOR "${libcudacxx_VERSION_FLAT} / 1000000")
-math(EXPR libcudacxx_VERSION_MINOR "(${libcudacxx_VERSION_FLAT} / 1000) % 1000")
-math(EXPR libcudacxx_VERSION_PATCH "${libcudacxx_VERSION_FLAT} % 1000")
+set(libcudacxx_VERSION_MAJOR 2)
+set(libcudacxx_VERSION_MINOR 3)
+set(libcudacxx_VERSION_PATCH 0)
 set(libcudacxx_VERSION_TWEAK 0)
 
 set(libcudacxx_VERSION
diff --git a/thrust/benchmarks/bench/adjacent_difference/basic.cu b/thrust/benchmarks/bench/adjacent_difference/basic.cu
index d681aaa6aae..47f93f382b0 100644
--- a/thrust/benchmarks/bench/adjacent_difference/basic.cu
+++ b/thrust/benchmarks/bench/adjacent_difference/basic.cu
@@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::adjacent_difference(input.cbegin(), input.cend(), output.begin());
   });
 }
diff --git a/thrust/benchmarks/bench/adjacent_difference/custom.cu b/thrust/benchmarks/bench/adjacent_difference/custom.cu
index 63d5d69e12c..e8e892bf8d4 100644
--- a/thrust/benchmarks/bench/adjacent_difference/custom.cu
+++ b/thrust/benchmarks/bench/adjacent_difference/custom.cu
@@ -60,7 +60,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::adjacent_difference(input.cbegin(), input.cend(), output.begin(), custom_op<T>{42});
   });
 }
diff --git a/thrust/benchmarks/bench/adjacent_difference/in_place.cu b/thrust/benchmarks/bench/adjacent_difference/in_place.cu
index e6c39748df1..4e91e8471c0 100644
--- a/thrust/benchmarks/bench/adjacent_difference/in_place.cu
+++ b/thrust/benchmarks/bench/adjacent_difference/in_place.cu
@@ -42,7 +42,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::adjacent_difference(vec.begin(), vec.end(), vec.begin());
   });
 }
diff --git a/thrust/benchmarks/bench/copy/basic.cu b/thrust/benchmarks/bench/copy/basic.cu
index 8969a7b886d..1b2b96214df 100644
--- a/thrust/benchmarks/bench/copy/basic.cu
+++ b/thrust/benchmarks/bench/copy/basic.cu
@@ -45,7 +45,7 @@ static void basic(nvbench::state &state,
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::copy(input.cbegin(),
                  input.cend(),
                  output.begin());
diff --git a/thrust/benchmarks/bench/copy/if.cu b/thrust/benchmarks/bench/copy/if.cu
index d8c4fd22e68..8b89e08db6f 100644
--- a/thrust/benchmarks/bench/copy/if.cu
+++ b/thrust/benchmarks/bench/copy/if.cu
@@ -74,7 +74,7 @@ static void basic(nvbench::state &state,
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(selected_elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::copy_if(input.cbegin(), input.cend(), output.begin(), select_op);
   });
 }
diff --git a/thrust/benchmarks/bench/fill/basic.cu b/thrust/benchmarks/bench/fill/basic.cu
index 63a24d2bc2f..3c29f3c7043 100644
--- a/thrust/benchmarks/bench/fill/basic.cu
+++ b/thrust/benchmarks/bench/fill/basic.cu
@@ -41,7 +41,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_element_count(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::fill(output.begin(), output.end(), T{42});
   });
 }
diff --git a/thrust/benchmarks/bench/inner_product/basic.cu b/thrust/benchmarks/bench/inner_product/basic.cu
index 5a60ca1cfd2..aa3b5d467e9 100644
--- a/thrust/benchmarks/bench/inner_product/basic.cu
+++ b/thrust/benchmarks/bench/inner_product/basic.cu
@@ -44,7 +44,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements * 2);
   state.add_global_memory_writes<T>(1);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::inner_product(lhs.begin(), lhs.end(), rhs.begin(), T{0});
   });
 }
diff --git a/thrust/benchmarks/bench/merge/basic.cu b/thrust/benchmarks/bench/merge/basic.cu
index fb8e8f8822a..854baf8ec0e 100644
--- a/thrust/benchmarks/bench/merge/basic.cu
+++ b/thrust/benchmarks/bench/merge/basic.cu
@@ -50,7 +50,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::merge(in.cbegin(),
                   in.cbegin() + elements_in_lhs,
                   in.cbegin() + elements_in_lhs,
diff --git a/thrust/benchmarks/bench/partition/basic.cu b/thrust/benchmarks/bench/partition/basic.cu
index a04aae31283..aafdc892236 100644
--- a/thrust/benchmarks/bench/partition/basic.cu
+++ b/thrust/benchmarks/bench/partition/basic.cu
@@ -72,7 +72,7 @@ static void basic(nvbench::state &state,
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::partition_copy(input.cbegin(),
                            input.cend(),
                            output.begin(),
diff --git a/thrust/benchmarks/bench/reduce/basic.cu b/thrust/benchmarks/bench/reduce/basic.cu
index 97dbe5d02b3..e6e31c22a05 100644
--- a/thrust/benchmarks/bench/reduce/basic.cu
+++ b/thrust/benchmarks/bench/reduce/basic.cu
@@ -42,7 +42,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(1);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     do_not_optimize(thrust::reduce(in.begin(), in.end()));
   });
 }
diff --git a/thrust/benchmarks/bench/reduce/by_key.cu b/thrust/benchmarks/bench/reduce/by_key.cu
index 4eaaed194e5..282dff7d940 100644
--- a/thrust/benchmarks/bench/reduce/by_key.cu
+++ b/thrust/benchmarks/bench/reduce/by_key.cu
@@ -57,7 +57,7 @@ static void basic(nvbench::state &state, nvbench::type_list<KeyT, ValueT>)
   state.add_global_memory_writes<KeyT>(unique_keys);
   state.add_global_memory_writes<ValueT>(unique_keys);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::reduce_by_key(in_keys.begin(),
                           in_keys.end(),
                           in_vals.begin(),
diff --git a/thrust/benchmarks/bench/scan/exclusive/by_key.cu b/thrust/benchmarks/bench/scan/exclusive/by_key.cu
index 76a5a0f9921..df650554b39 100644
--- a/thrust/benchmarks/bench/scan/exclusive/by_key.cu
+++ b/thrust/benchmarks/bench/scan/exclusive/by_key.cu
@@ -45,7 +45,7 @@ static void scan(nvbench::state &state, nvbench::type_list<KeyT, ValueT>)
   state.add_global_memory_reads<ValueT>(elements);
   state.add_global_memory_writes<ValueT>(elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::exclusive_scan_by_key(keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin());
   });
 }
diff --git a/thrust/benchmarks/bench/scan/exclusive/max.cu b/thrust/benchmarks/bench/scan/exclusive/max.cu
index c434c537e5e..a18a3c96cb1 100644
--- a/thrust/benchmarks/bench/scan/exclusive/max.cu
+++ b/thrust/benchmarks/bench/scan/exclusive/max.cu
@@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::exclusive_scan(input.cbegin(), input.cend(), output.begin(), T{}, max_t{});
   });
 }
diff --git a/thrust/benchmarks/bench/scan/exclusive/sum.cu b/thrust/benchmarks/bench/scan/exclusive/sum.cu
index 75ae35894d0..29b82b68a81 100644
--- a/thrust/benchmarks/bench/scan/exclusive/sum.cu
+++ b/thrust/benchmarks/bench/scan/exclusive/sum.cu
@@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::exclusive_scan(input.cbegin(), input.cend(), output.begin());
   });
 }
diff --git a/thrust/benchmarks/bench/scan/inclusive/by_key.cu b/thrust/benchmarks/bench/scan/inclusive/by_key.cu
index bb468ff57dd..10e0cbc408b 100644
--- a/thrust/benchmarks/bench/scan/inclusive/by_key.cu
+++ b/thrust/benchmarks/bench/scan/inclusive/by_key.cu
@@ -45,7 +45,7 @@ static void scan(nvbench::state &state, nvbench::type_list<KeyT, ValueT>)
   state.add_global_memory_reads<ValueT>(elements);
   state.add_global_memory_writes<ValueT>(elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::inclusive_scan_by_key(keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin());
   });
 }
diff --git a/thrust/benchmarks/bench/scan/inclusive/max.cu b/thrust/benchmarks/bench/scan/inclusive/max.cu
index affecbdd9f0..40d84942ec6 100644
--- a/thrust/benchmarks/bench/scan/inclusive/max.cu
+++ b/thrust/benchmarks/bench/scan/inclusive/max.cu
@@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::inclusive_scan(input.cbegin(), input.cend(), output.begin(), max_t{});
   });
 }
diff --git a/thrust/benchmarks/bench/scan/inclusive/sum.cu b/thrust/benchmarks/bench/scan/inclusive/sum.cu
index 540001a2344..ea98b7bcf31 100644
--- a/thrust/benchmarks/bench/scan/inclusive/sum.cu
+++ b/thrust/benchmarks/bench/scan/inclusive/sum.cu
@@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::inclusive_scan(input.cbegin(), input.cend(), output.begin());
   });
 }
diff --git a/thrust/benchmarks/bench/set_operations/base.cuh b/thrust/benchmarks/bench/set_operations/base.cuh
index c660d222f05..9f5ab563ac0 100644
--- a/thrust/benchmarks/bench/set_operations/base.cuh
+++ b/thrust/benchmarks/bench/set_operations/base.cuh
@@ -61,7 +61,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>, OpT op)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements_in_AB);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     op(input.cbegin(),
        input.cbegin() + elements_in_A,
        input.cbegin() + elements_in_A,
diff --git a/thrust/benchmarks/bench/set_operations/by_key.cuh b/thrust/benchmarks/bench/set_operations/by_key.cuh
index ae19b2407e0..6e71601f85c 100644
--- a/thrust/benchmarks/bench/set_operations/by_key.cuh
+++ b/thrust/benchmarks/bench/set_operations/by_key.cuh
@@ -70,7 +70,7 @@ static void basic(nvbench::state &state, nvbench::type_list<KeyT, ValueT>, OpT o
   state.add_global_memory_reads<ValueT>(OpT::read_all_values ? elements : elements_in_A); 
   state.add_global_memory_writes<ValueT>(elements_in_AB);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     op(in_keys.cbegin(),
        in_keys.cbegin() + elements_in_A,
        in_keys.cbegin() + elements_in_A,
diff --git a/thrust/benchmarks/bench/shuffle/basic.cu b/thrust/benchmarks/bench/shuffle/basic.cu
index f70629f2a4c..cc24d267851 100644
--- a/thrust/benchmarks/bench/shuffle/basic.cu
+++ b/thrust/benchmarks/bench/shuffle/basic.cu
@@ -44,7 +44,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_writes<T>(elements);
 
   auto do_engine = [&](auto &&engine_constructor) {
-    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+    state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
       thrust::shuffle(data.begin(), data.end(), engine_constructor());
     });
   };
diff --git a/thrust/benchmarks/bench/sort/keys.cu b/thrust/benchmarks/bench/sort/keys.cu
index f086505fa2c..d52dd6e7d1e 100644
--- a/thrust/benchmarks/bench/sort/keys.cu
+++ b/thrust/benchmarks/bench/sort/keys.cu
@@ -58,4 +58,4 @@ NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
   .set_name("base")
   .set_type_axes_names({"T{ct}"})
   .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
-  .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"});
+  .add_string_axis("Entropy", {"1.000", "0.201"});
diff --git a/thrust/benchmarks/bench/sort/keys_custom.cu b/thrust/benchmarks/bench/sort/keys_custom.cu
index 3728006f687..f1eb8c2fdf1 100644
--- a/thrust/benchmarks/bench/sort/keys_custom.cu
+++ b/thrust/benchmarks/bench/sort/keys_custom.cu
@@ -58,4 +58,4 @@ NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
   .set_name("base")
   .set_type_axes_names({"T{ct}"})
   .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
-  .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"});
+  .add_string_axis("Entropy", {"1.000", "0.201"});
diff --git a/thrust/benchmarks/bench/sort/pairs.cu b/thrust/benchmarks/bench/sort/pairs.cu
index a6d45e33ed5..9d2f06b2f5c 100644
--- a/thrust/benchmarks/bench/sort/pairs.cu
+++ b/thrust/benchmarks/bench/sort/pairs.cu
@@ -59,19 +59,11 @@ static void basic(nvbench::state &state, nvbench::type_list<KeyT, ValueT>)
              });
 }
 
-using key_types   = fundamental_types;
-using value_types = nvbench::type_list<int8_t,
-                                       int16_t,
-                                       int32_t,
-                                       int64_t
-#if NVBENCH_HELPER_HAS_I128
-                                       ,
-                                       int128_t
-#endif
-                                       >;
+using key_types   = integral_types;
+using value_types = integral_types;
 
 NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(key_types, value_types))
   .set_name("base")
   .set_type_axes_names({"KeyT{ct}", "ValueT{ct}"})
   .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
-  .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"});
+  .add_string_axis("Entropy", {"1.000", "0.201"});
diff --git a/thrust/benchmarks/bench/sort/pairs_custom.cu b/thrust/benchmarks/bench/sort/pairs_custom.cu
index 4cd73139895..bb731e03c6f 100644
--- a/thrust/benchmarks/bench/sort/pairs_custom.cu
+++ b/thrust/benchmarks/bench/sort/pairs_custom.cu
@@ -59,19 +59,11 @@ static void basic(nvbench::state &state, nvbench::type_list<KeyT, ValueT>)
              });
 }
 
-using key_types   = fundamental_types;
-using value_types = nvbench::type_list<int8_t,
-                                       int16_t,
-                                       int32_t,
-                                       int64_t
-#if NVBENCH_HELPER_HAS_I128
-                                       ,
-                                       int128_t
-#endif
-                                       >;
+using key_types   = integral_types;
+using value_types = integral_types;
 
 NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(key_types, value_types))
   .set_name("base")
   .set_type_axes_names({"KeyT{ct}", "ValueT{ct}"})
   .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
-  .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"});
+  .add_string_axis("Entropy", {"1.000", "0.201"});
diff --git a/thrust/benchmarks/bench/unique/basic.cu b/thrust/benchmarks/bench/unique/basic.cu
index 7bef39ecc45..2f01fb30453 100644
--- a/thrust/benchmarks/bench/unique/basic.cu
+++ b/thrust/benchmarks/bench/unique/basic.cu
@@ -51,7 +51,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(unique_items);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::unique_copy(input.cbegin(), input.cend(), output.begin());
   });
 }
diff --git a/thrust/benchmarks/bench/unique/by_key.cu b/thrust/benchmarks/bench/unique/by_key.cu
index e6961bc4d95..ed43e64cb9b 100644
--- a/thrust/benchmarks/bench/unique/by_key.cu
+++ b/thrust/benchmarks/bench/unique/by_key.cu
@@ -56,7 +56,7 @@ static void basic(nvbench::state &state, nvbench::type_list<KeyT, ValueT>)
   state.add_global_memory_reads<ValueT>(elements);
   state.add_global_memory_writes<ValueT>(unique_elements);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::unique_by_key_copy(in_keys.cbegin(),
                                in_keys.cend(),
                                in_vals.cbegin(),
diff --git a/thrust/benchmarks/bench/vectorized_search/base.cu b/thrust/benchmarks/bench/vectorized_search/base.cu
index a733830fa1e..67e9ddedcc2 100644
--- a/thrust/benchmarks/bench/vectorized_search/base.cu
+++ b/thrust/benchmarks/bench/vectorized_search/base.cu
@@ -46,7 +46,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
 
   state.add_element_count(needles);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::binary_search(data.begin(),
                           data.begin() + elements,
                           data.begin() + elements,
diff --git a/thrust/benchmarks/bench/vectorized_search/lower_bound.cu b/thrust/benchmarks/bench/vectorized_search/lower_bound.cu
index 4ab9539215f..e3fbd6e6cbd 100644
--- a/thrust/benchmarks/bench/vectorized_search/lower_bound.cu
+++ b/thrust/benchmarks/bench/vectorized_search/lower_bound.cu
@@ -46,7 +46,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
 
   state.add_element_count(needles);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::lower_bound(data.begin(),
                         data.begin() + elements,
                         data.begin() + elements,
diff --git a/thrust/benchmarks/bench/vectorized_search/upper_bound.cu b/thrust/benchmarks/bench/vectorized_search/upper_bound.cu
index 5b57ebf92a5..6b412ca299c 100644
--- a/thrust/benchmarks/bench/vectorized_search/upper_bound.cu
+++ b/thrust/benchmarks/bench/vectorized_search/upper_bound.cu
@@ -46,7 +46,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
 
   state.add_element_count(needles);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
     thrust::upper_bound(data.begin(),
                         data.begin() + elements,
                         data.begin() + elements,
diff --git a/thrust/thrust/cmake/thrust-config-version.cmake b/thrust/thrust/cmake/thrust-config-version.cmake
index cf9407a4cec..5da3ad1339e 100644
--- a/thrust/thrust/cmake/thrust-config-version.cmake
+++ b/thrust/thrust/cmake/thrust-config-version.cmake
@@ -1,17 +1,10 @@
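-# Parse version information from version.h:
+# Version numbers are hardcoded below (no longer parsed from version.h); keep them in sync with THRUST_VERSION: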
 include("${CMAKE_CURRENT_LIST_DIR}/thrust-header-search.cmake")
 
-file(READ "${_THRUST_VERSION_INCLUDE_DIR}/thrust/version.h" THRUST_VERSION_HEADER)
-string(REGEX MATCH "#define[ \t]+THRUST_VERSION[ \t]+([0-9]+)" DUMMY "${THRUST_VERSION_HEADER}")
-set(THRUST_VERSION_FLAT ${CMAKE_MATCH_1})
-# Note that Thrust calls this the PATCH number, CMake calls it the TWEAK number:
-string(REGEX MATCH "#define[ \t]+THRUST_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${THRUST_VERSION_HEADER}")
-set(THRUST_VERSION_TWEAK ${CMAKE_MATCH_1})
-
-math(EXPR THRUST_VERSION_MAJOR "${THRUST_VERSION_FLAT} / 100000")
-math(EXPR THRUST_VERSION_MINOR "(${THRUST_VERSION_FLAT} / 100) % 1000")
-math(EXPR THRUST_VERSION_PATCH "${THRUST_VERSION_FLAT} % 100") # Thrust: "subminor" CMake: "patch"
-
+set(THRUST_VERSION_MAJOR 2)
+set(THRUST_VERSION_MINOR 3)
+set(THRUST_VERSION_PATCH 0) # Thrust: "subminor" CMake: "patch"
+set(THRUST_VERSION_TWEAK 0)
 set(THRUST_VERSION "${THRUST_VERSION_MAJOR}.${THRUST_VERSION_MINOR}.${THRUST_VERSION_PATCH}.${THRUST_VERSION_TWEAK}")
 
 set(PACKAGE_VERSION ${THRUST_VERSION})
diff --git a/thrust/thrust/version.h b/thrust/thrust/version.h
index 8c3fa09acaf..a1ef0896751 100644
--- a/thrust/thrust/version.h
+++ b/thrust/thrust/version.h
@@ -31,11 +31,13 @@
 #include <thrust/detail/config/config.h>
 
 #if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
-#pragma GCC system_header
+#  pragma GCC system_header
 #else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
 _CCCL_IMPLICIT_SYSTEM_HEADER
 #endif // !_CCCL_COMPILER_NVHPC
 
+#include <cuda/version>
+
 //  This is the only Thrust header that is guaranteed to
 //  change with every Thrust release.
 //
@@ -49,13 +51,15 @@ _CCCL_IMPLICIT_SYSTEM_HEADER
 
 /*! \def THRUST_VERSION
  *  \brief The preprocessor macro \p THRUST_VERSION encodes the version
- *         number of the Thrust library.
+ *         number of the Thrust library as MMMmmmpp.
+ *
+ *  \note THRUST_VERSION is formatted as `MMMmmmpp`, unlike `CCCL_VERSION`, which is formatted as `MMMmmmppp`.
  *
  *         <tt>THRUST_VERSION % 100</tt> is the sub-minor version.
  *         <tt>THRUST_VERSION / 100 % 1000</tt> is the minor version.
  *         <tt>THRUST_VERSION / 100000</tt> is the major version.
  */
-#define THRUST_VERSION 200200
+#define THRUST_VERSION 200300 // macro expansion with ## requires this to be a single value
 
 /*! \def THRUST_MAJOR_VERSION
  *  \brief The preprocessor macro \p THRUST_MAJOR_VERSION encodes the
@@ -81,3 +85,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER
  *         Legacy; will be 0 for all future releases.
  */
 #define THRUST_PATCH_NUMBER 0
+
+static_assert(THRUST_MAJOR_VERSION == CCCL_MAJOR_VERSION, "THRUST_MAJOR_VERSION must match CCCL_MAJOR_VERSION");
+static_assert(THRUST_MINOR_VERSION == CCCL_MINOR_VERSION, "THRUST_MINOR_VERSION must match CCCL_MINOR_VERSION");
+static_assert(THRUST_SUBMINOR_VERSION == CCCL_PATCH_VERSION, "THRUST_SUBMINOR_VERSION must match CCCL_PATCH_VERSION");