diff --git a/README.md b/README.md index b627f170f6d..7171af73305 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json) -|[Contributor Guide](https://github.com/NVIDIA/cccl/blob/main/CONTRIBUTING.md)|[Dev Containers](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md)|[Discord](https://discord.gg/nvidiadeveloper)|[Godbolt](https://godbolt.org/z/x4G73af9a)|[GitHub Project](https://github.com/orgs/NVIDIA/projects/6)|[libcudacxx Docs](https://nvidia.github.io/libcudacxx/)|[Thrust Docs](https://nvidia.github.io/thrust/)|[CUB Docs](https://nvlabs.github.io/cub/)| +|[Contributor Guide](https://github.com/NVIDIA/cccl/blob/main/CONTRIBUTING.md)|[Dev Containers](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md)|[Discord](https://discord.gg/nvidiadeveloper)|[Godbolt](https://godbolt.org/z/x4G73af9a)|[GitHub Project](https://github.com/orgs/NVIDIA/projects/6)|[libcudacxx Docs](https://nvidia.github.io/libcudacxx/)|[Thrust Docs](https://nvidia.github.io/thrust/)|[CUB Docs](https://nvlabs.github.io/cub/)| |-|-|-|-|-|-|-|-| # CUDA C++ Core Libraries (CCCL) @@ -284,20 +284,22 @@ Anything not part of the public API may change at any time without warning. #### API Versioning -The entirety of CCCL's public API across all components shares a common semantic version of `MAJOR.MINOR.PATCH`. +The public API of all CCCL's components share a unified semantic version of `MAJOR.MINOR.PATCH`. Only the most recently released version is supported. As a rule, features and bug fixes are not backported to previously released version or branches. -For historical reasons, the library versions are encoded separately in each of Thrust/CUB/libcudacxx as follows: +The preferred method for querying the version is to use `CCCL_[MAJOR/MINOR/PATCH_]VERSION` as described below. 
+For backwards compatibility, the Thrust/CUB/libcudacxx version definitions are available and will always be consistent with `CCCL_VERSION`. +Note that Thrust/CUB use a `MMMmmmpp` scheme whereas CCCL and libcudacxx use `MMMmmmppp`. -| | libcudacxx | Thrust | CUB | Incremented when? | -|------------------------|-------------------------------------------|-------------------------------|----------------------------|--------------------------------------------------------------------| -| Header | `<cuda/std/version>` | `<thrust/version.h>` | `<cub/version.cuh>` | - | -| Major Version | `_LIBCUDACXX_CUDA_API_VERSION_MAJOR` | `THRUST_MAJOR_VERSION` | `CUB_MAJOR_VERSION` | Public API breaking changes (only at new CTK major release) | -| Minor Version | `_LIBCUDACXX_CUDA_API_VERSION_MINOR` | `THRUST_MINOR_VERSION` | `CUB_MINOR_VERSION` | Non-breaking feature additions | -| Patch/Subminor Version | `_LIBCUDACXX_CUDA_API_VERSION_PATCH` | `THRUST_SUBMINOR_VERSION` | `CUB_SUBMINOR_VERSION` | Minor changes not covered by major/minor versions | -| Concatenated Version | `_LIBCUDACXX_CUDA_API_VERSION (MMMmmmppp)`| `THRUST_VERSION (MMMmmmpp)` | `CUB_VERSION (MMMmmmpp)` | - | +| | CCCL | libcudacxx | Thrust | CUB | +|------------------------|----------------------------------------|-------------------------------------------|------------------------------|---------------------------| +| Header | `<cuda/version>` | `<cuda/std/version>` | `<thrust/version.h>` | `<cub/version.cuh>` | +| Major Version | `CCCL_MAJOR_VERSION` | `_LIBCUDACXX_CUDA_API_VERSION_MAJOR` | `THRUST_MAJOR_VERSION` | `CUB_MAJOR_VERSION` | +| Minor Version | `CCCL_MINOR_VERSION` | `_LIBCUDACXX_CUDA_API_VERSION_MINOR` | `THRUST_MINOR_VERSION` | `CUB_MINOR_VERSION` | +| Patch/Subminor Version | `CCCL_PATCH_VERSION` | `_LIBCUDACXX_CUDA_API_VERSION_PATCH` | `THRUST_SUBMINOR_VERSION` | `CUB_SUBMINOR_VERSION` | +| Concatenated Version | `CCCL_VERSION (MMMmmmppp)` | `_LIBCUDACXX_CUDA_API_VERSION (MMMmmmppp)`| `THRUST_VERSION (MMMmmmpp)` | `CUB_VERSION (MMMmmmpp)` | ### Application Binary Interface (ABI) @@ -376,7 +378,7 @@ For a 
detailed overview of the CI pipeline, see [ci-overview.md](ci-overview.md) Projects that are related to CCCL's mission to make CUDA C++ more delightful: - [cuCollections](https://github.com/NVIDIA/cuCollections) - GPU accelerated data structures like hash tables - [NVBench](https://github.com/NVIDIA/nvbench) - Benchmarking library tailored for CUDA applications -- [stdexec](https://github.com/nvidia/stdexec) - Reference implementation for Senders asynchronous programming model +- [stdexec](https://github.com/nvidia/stdexec) - Reference implementation for Senders asynchronous programming model ## Projects Using CCCL diff --git a/benchmarks/scripts/cccl/bench/bench.py b/benchmarks/scripts/cccl/bench/bench.py index adaa1f99557..eb5b05baa35 100644 --- a/benchmarks/scripts/cccl/bench/bench.py +++ b/benchmarks/scripts/cccl/bench/bench.py @@ -635,6 +635,14 @@ def do_run(self, ct_point, rt_values, timeout, is_search=True): cmd.append("--min-samples") cmd.append("70") + # Unlike noise, minimal benchmarking time is not directly related to variance. + # Default minimal time is 0.5 seconds. For CI we want to reduce it to 0.1 seconds, + # because we have a limited time budget. Having a smaller minimal time doesn't affect + # stability of sample distribution median in a deterministic way. For small problem sizes, + # 0.1s leads to smaller variation than 0.5s. For other workloads, 0.5s leads to smaller variance. 
+ cmd.append("--min-time") + cmd.append("0.1") + # NVBench is currently broken for multiple GPUs, use `CUDA_VISIBLE_DEVICES` cmd.append("-d") cmd.append("0") diff --git a/benchmarks/scripts/cccl/bench/search.py b/benchmarks/scripts/cccl/bench/search.py index 264861f8832..13ff05ca01c 100644 --- a/benchmarks/scripts/cccl/bench/search.py +++ b/benchmarks/scripts/cccl/bench/search.py @@ -47,19 +47,38 @@ def parse_arguments(): type=str, help="Parameter in the format `Param=Value`.") parser.add_argument( '--list-benches', action=argparse.BooleanOptionalAction, help="Show available benchmarks.") + parser.add_argument('--num-shards', type=int, default=1, help='Split benchmarks into M pieces and only run one') + parser.add_argument('--run-shard', type=int, default=0, help='Run shard N / M of benchmarks') + parser.add_argument('-P0', action=argparse.BooleanOptionalAction, help="Run P0 benchmarks (overwrites -R)") return parser.parse_args() -def run_benches(benchmarks, sub_space, regex, seeker): - pattern = re.compile(regex) +def run_benches(algnames, sub_space, seeker): + for algname in algnames: + bench = BaseBench(algname) + ct_space = bench.ct_workload_space(sub_space) + rt_values = bench.rt_axes_values(sub_space) + seeker(algname, ct_space, rt_values) - for algname in benchmarks: - if pattern.match(algname): - bench = BaseBench(algname) - ct_space = bench.ct_workload_space(sub_space) - rt_values = bench.rt_axes_values(sub_space) - seeker(algname, ct_space, rt_values) +def filter_benchmarks(benchmarks, args): + if args.run_shard >= args.num_shards: + raise ValueError('run-shard must be less than num-shards') + + R = args.R + if args.P0: + R = '^(?!.*segmented).*(scan|reduce|select|sort).*' + + pattern = re.compile(R) + algnames = list(filter(lambda x: pattern.match(x), benchmarks.keys())) + algnames.sort() + + if args.num_shards > 1: + algnames = np.array_split(algnames, args.num_shards)[args.run_shard].tolist() + return algnames + + return algnames + def search(seeker): 
args = parse_arguments() @@ -79,8 +98,8 @@ def search(seeker): if args.list_benches: list_benches() return - - run_benches(config.benchmarks, workload_sub_space, args.R, seeker) + + run_benches(filter_benchmarks(config.benchmarks, args), workload_sub_space, seeker) class MedianCenterEstimator: diff --git a/benchmarks/scripts/run.py b/benchmarks/scripts/run.py index 23f0d9e0823..5b5d86e71e4 100755 --- a/benchmarks/scripts/run.py +++ b/benchmarks/scripts/run.py @@ -6,18 +6,36 @@ import cccl.bench -def elapsed_time_look_good(x): +def elapsed_time_looks_good(x): if isinstance(x, float): if math.isfinite(x): return True return False +def problem_size_looks_large_enough(elements): + # Small problem sizes do not utilize entire GPU. + # Benchmarking small problem sizes in environments where we do not control + # distributions comparison, e.g. CI, is not useful because of stability issues. + return elements.isdigit() and int(elements) > 20 + + +def filter_runtime_workloads_for_ci(rt_values): + for subbench in rt_values: + for axis in rt_values[subbench]: + if axis.startswith('Elements') and axis.endswith('[pow2]'): + rt_values[subbench][axis] = list(filter(problem_size_looks_large_enough, rt_values[subbench][axis])) + + return rt_values + + class BaseRunner: def __init__(self): self.estimator = cccl.bench.MedianCenterEstimator() def __call__(self, algname, ct_workload_space, rt_values): + rt_values = filter_runtime_workloads_for_ci(rt_values) + for ct_workload in ct_workload_space: bench = cccl.bench.BaseBench(algname) if bench.build(): @@ -28,7 +46,7 @@ def __call__(self, algname, ct_workload_space, rt_values): bench_name = bench_name.replace(' ', '___') bench_name = "".join(c if c.isalnum() else "_" for c in bench_name) elapsed_time = results[subbench][point] - if elapsed_time_look_good(elapsed_time): + if elapsed_time_looks_good(elapsed_time): print("&&&& PERF {} {} -sec".format(bench_name, elapsed_time)) else: print("&&&& FAILED bench") diff --git a/ci/update_version.sh 
b/ci/update_version.sh new file mode 100755 index 00000000000..c31e0ef1148 --- /dev/null +++ b/ci/update_version.sh @@ -0,0 +1,101 @@ +#!/bin/bash + +# Usage: ./update_version.sh [--dry-run] <major> <minor> <patch> +# Example: ./update_version.sh --dry-run 2 2 1 + +# Run in root cccl/ +cd "$(dirname "${BASH_SOURCE[0]}")/.." || exit + +DRY_RUN=false + +while [[ "$#" -gt 0 ]]; do + case "$1" in + --dry-run) DRY_RUN=true; ;; + *) break ;; + esac + shift +done + +major="$1" +minor="$2" +patch="$3" + +if [ -z "$major" ] || [ -z "$minor" ] || [ -z "$patch" ]; then + echo "Usage: $0 [--dry-run] " + exit 1 +fi + +# Version file paths +CCCL_VERSION_FILE="libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/version.h" +THRUST_VERSION_FILE="thrust/thrust/version.h" +CUB_VERSION_FILE="cub/cub/version.cuh" +CCCL_CMAKE_VERSION_FILE="lib/cmake/cccl/cccl-config-version.cmake" +CUB_CMAKE_VERSION_FILE="cub/cub/cmake/cub-config-version.cmake" +LIBCUDACXX_CMAKE_VERSION_FILE="libcudacxx/lib/cmake/libcudacxx/libcudacxx-config-version.cmake" +THRUST_CMAKE_VERSION_FILE="thrust/thrust/cmake/thrust-config-version.cmake" + +# Calculated version codes +new_cccl_version=$((major * 1000000 + minor * 1000 + patch)) # MMMmmmppp +new_thrust_cub_version=$((major * 100000 + minor * 100 + patch)) # MMMmmmpp + +# Fetch current version from file +current_cccl_version=$(grep -oP "define CCCL_VERSION \K[0-9]+" "$CCCL_VERSION_FILE") + +# Fetch the latest tag from git and strip the 'v' prefix if present +latest_tag=$(git tag --sort=-v:refname | head -n 1 | sed 's/^v//') + +# Since the tags and versions are numerically comparable, we cast them to integers +latest_tag_version=$(echo "$latest_tag" | awk -F. 
'{ printf("%d%03d%03d", $1,$2,$3) }') + +echo "Running in $(pwd)" +echo "New MMMmmmppp version: $new_cccl_version" +echo "New MMMmmmpp version: $new_thrust_cub_version" +echo "Current CCCL version: $current_cccl_version" +echo "Latest git tag: $latest_tag" + +# Check if new version is less than the current version or the latest tag +if (( new_cccl_version < current_cccl_version )) || (( new_cccl_version < latest_tag_version )); then + echo "Error: New version $new_cccl_version is less than current version $current_cccl_version or latest git tag version $latest_tag_version." + exit 1 +fi + +update_file () { + local file=$1 + local pattern=$2 + local new_value=$3 + if [ "$DRY_RUN" = true ]; then + local temp_file=$(mktemp) + sed "s/$pattern/$new_value/g" "$file" > "$temp_file" + diff --color=auto -U 0 "$file" "$temp_file" || true + rm "$temp_file" + else + sed -i "s/$pattern/$new_value/" "$file" + fi +} + +# Update version information in files +update_file "$CCCL_VERSION_FILE" "^#define CCCL_VERSION \([0-9]\+\)" "#define CCCL_VERSION $new_cccl_version" +update_file "$THRUST_VERSION_FILE" "^#define THRUST_VERSION \([0-9]\+\)" "#define THRUST_VERSION $new_thrust_cub_version" +update_file "$CUB_VERSION_FILE" "^#define CUB_VERSION \([0-9]\+\)" "#define CUB_VERSION $new_thrust_cub_version" + +update_file "$CUB_CMAKE_VERSION_FILE" "set(CUB_VERSION_MAJOR \([0-9]\+\))" "set(CUB_VERSION_MAJOR $major)" +update_file "$CUB_CMAKE_VERSION_FILE" "set(CUB_VERSION_MINOR \([0-9]\+\))" "set(CUB_VERSION_MINOR $minor)" +update_file "$CUB_CMAKE_VERSION_FILE" "set(CUB_VERSION_PATCH \([0-9]\+\))" "set(CUB_VERSION_PATCH $patch)" + +update_file "$LIBCUDACXX_CMAKE_VERSION_FILE" "set(libcudacxx_VERSION_MAJOR \([0-9]\+\))" "set(libcudacxx_VERSION_MAJOR $major)" +update_file "$LIBCUDACXX_CMAKE_VERSION_FILE" "set(libcudacxx_VERSION_MINOR \([0-9]\+\))" "set(libcudacxx_VERSION_MINOR $minor)" +update_file "$LIBCUDACXX_CMAKE_VERSION_FILE" "set(libcudacxx_VERSION_PATCH \([0-9]\+\))" 
"set(libcudacxx_VERSION_PATCH $patch)" + +update_file "$THRUST_CMAKE_VERSION_FILE" "set(THRUST_VERSION_MAJOR \([0-9]\+\))" "set(THRUST_VERSION_MAJOR $major)" +update_file "$THRUST_CMAKE_VERSION_FILE" "set(THRUST_VERSION_MINOR \([0-9]\+\))" "set(THRUST_VERSION_MINOR $minor)" +update_file "$THRUST_CMAKE_VERSION_FILE" "set(THRUST_VERSION_PATCH \([0-9]\+\))" "set(THRUST_VERSION_PATCH $patch)" + +update_file "$CCCL_CMAKE_VERSION_FILE" "set(CCCL_VERSION_MAJOR \([0-9]\+\))" "set(CCCL_VERSION_MAJOR $major)" +update_file "$CCCL_CMAKE_VERSION_FILE" "set(CCCL_VERSION_MINOR \([0-9]\+\))" "set(CCCL_VERSION_MINOR $minor)" +update_file "$CCCL_CMAKE_VERSION_FILE" "set(CCCL_VERSION_PATCH \([0-9]\+\))" "set(CCCL_VERSION_PATCH $patch)" + +if [ "$DRY_RUN" = true ]; then + echo "Dry run completed. No changes made." +else + echo "Version updated to $major.$minor.$patch" +fi diff --git a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu index d47efd83abb..0539509b09c 100644 --- a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu +++ b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu @@ -100,7 +100,7 @@ void left(nvbench::state& state, nvbench::type_list) thrust::device_vector temp_storage(temp_storage_bytes); std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_temp_storage, temp_storage_bytes, d_in, diff --git a/cub/benchmarks/bench/copy/memcpy.cu b/cub/benchmarks/bench/copy/memcpy.cu index 467e0c2e0ec..794619bc832 100644 --- a/cub/benchmarks/bench/copy/memcpy.cu +++ b/cub/benchmarks/bench/copy/memcpy.cu @@ -249,7 +249,7 @@ void copy(nvbench::state &state, thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch &launch) { 
+ state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_temp_storage, temp_storage_bytes, d_input_buffers, @@ -275,8 +275,8 @@ void uniform(nvbench::state &state, nvbench::type_list tl) elements, min_buffer_size, max_buffer_size, - state.get_int64("RandomizeInput"), - state.get_int64("RandomizeOutput")); + state.get_int64("Randomize"), + state.get_int64("Randomize")); } template @@ -309,8 +309,7 @@ NVBENCH_BENCH_TYPES(uniform, NVBENCH_TYPE_AXES(types, u_offset_types)) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(25, 29, 2)) .add_int64_axis("MinBufferSizeRatio", {1, 99}) .add_int64_axis("MaxBufferSize", {8, 64, 256, 1024, 64 * 1024}) - .add_int64_axis("RandomizeInput", {0, 1}) - .add_int64_axis("RandomizeOutput", {0, 1}); + .add_int64_axis("Randomize", {0, 1}); NVBENCH_BENCH_TYPES(large, NVBENCH_TYPE_AXES(types, u_offset_types)) .set_name("large") diff --git a/cub/benchmarks/bench/histogram/even.cu b/cub/benchmarks/bench/histogram/even.cu index ce34cab5f4c..7ef201dbbad 100644 --- a/cub/benchmarks/bench/histogram/even.cu +++ b/cub/benchmarks/bench/histogram/even.cu @@ -110,7 +110,7 @@ static void even(nvbench::state &state, nvbench::type_list tmp(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(tmp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::DispatchEven(d_temp_storage, temp_storage_bytes, d_input, @@ -139,5 +139,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset .set_name("base") .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) - .add_int64_axis("Bins", {32, 64, 128, 2048, 2097152}) - .add_string_axis("Entropy", {"0.201", "0.544", "1.000"}); + .add_int64_axis("Bins", {32, 128, 2048, 2097152}) + .add_string_axis("Entropy", {"0.201", "1.000"}); diff --git 
a/cub/benchmarks/bench/histogram/multi/even.cu b/cub/benchmarks/bench/histogram/multi/even.cu index 1a501aeec13..83ebfcea5b9 100644 --- a/cub/benchmarks/bench/histogram/multi/even.cu +++ b/cub/benchmarks/bench/histogram/multi/even.cu @@ -121,7 +121,7 @@ static void even(nvbench::state &state, nvbench::type_list tmp(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(tmp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::DispatchEven(d_temp_storage, temp_storage_bytes, d_input, @@ -150,5 +150,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset .set_name("base") .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) - .add_int64_axis("Bins", {32, 64, 128, 2048, 2097152}) - .add_string_axis("Entropy", {"0.201", "0.544", "1.000"}); + .add_int64_axis("Bins", {32, 128, 2048, 2097152}) + .add_string_axis("Entropy", {"0.201", "1.000"}); diff --git a/cub/benchmarks/bench/histogram/multi/range.cu b/cub/benchmarks/bench/histogram/multi/range.cu index 9d8431635be..939cb79fe28 100644 --- a/cub/benchmarks/bench/histogram/multi/range.cu +++ b/cub/benchmarks/bench/histogram/multi/range.cu @@ -129,7 +129,7 @@ static void range(nvbench::state &state, nvbench::type_list tmp(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(tmp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::DispatchRange(d_temp_storage, temp_storage_bytes, d_input, @@ -157,5 +157,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse .set_name("base") .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) - .add_int64_axis("Bins", {32, 64, 128, 2048, 2097152}) - .add_string_axis("Entropy", 
{"0.201", "0.544", "1.000"}); + .add_int64_axis("Bins", {32, 128, 2048, 2097152}) + .add_string_axis("Entropy", {"0.201", "1.000"}); diff --git a/cub/benchmarks/bench/histogram/range.cu b/cub/benchmarks/bench/histogram/range.cu index 9e118064904..266c483a19b 100644 --- a/cub/benchmarks/bench/histogram/range.cu +++ b/cub/benchmarks/bench/histogram/range.cu @@ -116,7 +116,7 @@ static void range(nvbench::state &state, nvbench::type_list tmp(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(tmp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::DispatchRange(d_temp_storage, temp_storage_bytes, d_input, @@ -144,5 +144,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse .set_name("base") .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) - .add_int64_axis("Bins", {32, 64, 128, 2048, 2097152}) - .add_string_axis("Entropy", {"0.201", "0.544", "1.000"}); + .add_int64_axis("Bins", {32, 128, 2048, 2097152}) + .add_string_axis("Entropy", {"0.201", "1.000"}); diff --git a/cub/benchmarks/bench/merge_sort/keys.cu b/cub/benchmarks/bench/merge_sort/keys.cu index ba43e86cec3..831c021515d 100644 --- a/cub/benchmarks/bench/merge_sort/keys.cu +++ b/cub/benchmarks/bench/merge_sort/keys.cu @@ -131,7 +131,7 @@ void keys(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_buffer_1, diff --git a/cub/benchmarks/bench/merge_sort/pairs.cu b/cub/benchmarks/bench/merge_sort/pairs.cu index cf4b23b69b7..6a1f9d8d566 100644 --- a/cub/benchmarks/bench/merge_sort/pairs.cu +++ b/cub/benchmarks/bench/merge_sort/pairs.cu @@ -134,7 +134,7 
@@ void pairs(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_keys_buffer_1, diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index f26182e2609..0045fc44757 100644 --- a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -148,7 +148,7 @@ void flagged(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu index 1ab89a02c54..fcbc97446f5 100644 --- a/cub/benchmarks/bench/partition/if.cu +++ b/cub/benchmarks/bench/partition/if.cu @@ -170,7 +170,7 @@ void partition(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git a/cub/benchmarks/bench/partition/three_way.cu b/cub/benchmarks/bench/partition/three_way.cu index 9f2b5eb8b1b..577d01797fe 100644 --- a/cub/benchmarks/bench/partition/three_way.cu +++ b/cub/benchmarks/bench/partition/three_way.cu @@ -146,7 +146,7 @@ void partition(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch 
&launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git a/cub/benchmarks/bench/radix_sort/keys.cu b/cub/benchmarks/bench/radix_sort/keys.cu index 6586e40b91c..c2fc6a7d1f5 100644 --- a/cub/benchmarks/bench/radix_sort/keys.cu +++ b/cub/benchmarks/bench/radix_sort/keys.cu @@ -182,7 +182,7 @@ void radix_sort_keys(std::integral_constant, thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { cub::DoubleBuffer keys = d_keys; cub::DoubleBuffer values = d_values; @@ -222,4 +222,4 @@ NVBENCH_BENCH_TYPES(radix_sort_keys, NVBENCH_TYPE_AXES(fundamental_types, offset .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) - .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.544", "0.201"}); diff --git a/cub/benchmarks/bench/radix_sort/pairs.cu b/cub/benchmarks/bench/radix_sort/pairs.cu index 006e33b98ce..503a25bc4b8 100644 --- a/cub/benchmarks/bench/radix_sort/pairs.cu +++ b/cub/benchmarks/bench/radix_sort/pairs.cu @@ -186,7 +186,7 @@ void radix_sort_values(std::integral_constant, thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { cub::DoubleBuffer keys = d_keys; cub::DoubleBuffer values = d_values; @@ -224,7 +224,7 @@ void radix_sort_values(nvbench::state &state, nvbench::type_list; #else // !defined(TUNE_KeyT) -using key_types = fundamental_types; +using key_types = integral_types; #endif // TUNE_KeyT #ifdef TUNE_ValueT @@ -245,4 +245,4 @@ NVBENCH_BENCH_TYPES(radix_sort_values, NVBENCH_TYPE_AXES(key_types, value_types, .set_name("base") .set_type_axes_names({"KeyT{ct}", "ValueT{ct}", 
"OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) - .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.201"}); diff --git a/cub/benchmarks/bench/reduce/base.cuh b/cub/benchmarks/bench/reduce/base.cuh index 253e5533b44..d874d69ae67 100644 --- a/cub/benchmarks/bench/reduce/base.cuh +++ b/cub/benchmarks/bench/reduce/base.cuh @@ -103,7 +103,7 @@ void reduce(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu index d11667b7a5b..4ccca911472 100644 --- a/cub/benchmarks/bench/reduce/by_key.cu +++ b/cub/benchmarks/bench/reduce/by_key.cu @@ -158,7 +158,7 @@ static void reduce(nvbench::state &state, nvbench::type_list(num_runs); state.add_global_memory_writes(1); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_temp_storage, temp_storage_bytes, d_in_keys, diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu index 1e02f01055c..16a07206c93 100644 --- a/cub/benchmarks/bench/run_length_encode/encode.cu +++ b/cub/benchmarks/bench/run_length_encode/encode.cu @@ -158,7 +158,7 @@ static void rle(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(num_runs); state.add_global_memory_writes(1); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_temp_storage, temp_storage_bytes, d_in_keys, diff --git a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu 
b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu index b679aa40849..83e2a8eff31 100644 --- a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu +++ b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu @@ -145,7 +145,7 @@ static void rle(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(num_runs); state.add_global_memory_writes(1); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_temp_storage, temp_storage_bytes, d_in_keys, diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 3c3ec561c63..1e28450e26b 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -121,7 +121,7 @@ static void basic(nvbench::state &state, nvbench::type_list) thrust::device_vector tmp(tmp_size); nvbench::uint8_t *d_tmp = thrust::raw_pointer_cast(tmp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(thrust::raw_pointer_cast(tmp.data()), tmp_size, d_input, diff --git a/cub/benchmarks/bench/scan/exclusive/by_key.cu b/cub/benchmarks/bench/scan/exclusive/by_key.cu index 93e515e02cb..26b36322ee2 100644 --- a/cub/benchmarks/bench/scan/exclusive/by_key.cu +++ b/cub/benchmarks/bench/scan/exclusive/by_key.cu @@ -134,7 +134,7 @@ static void scan(nvbench::state &state, nvbench::type_list tmp(tmp_size); nvbench::uint8_t *d_tmp = thrust::raw_pointer_cast(tmp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_tmp, tmp_size, d_keys, diff --git a/cub/benchmarks/bench/segmented_sort/keys.cu b/cub/benchmarks/bench/segmented_sort/keys.cu index 6f47c66d663..d0d15789940 100644 --- a/cub/benchmarks/bench/segmented_sort/keys.cu +++ 
b/cub/benchmarks/bench/segmented_sort/keys.cu @@ -220,7 +220,7 @@ void seg_sort(nvbench::state &state, thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch &launch) { cub::DoubleBuffer keys = d_keys; cub::DoubleBuffer values = d_values; @@ -255,7 +255,7 @@ NVBENCH_BENCH_TYPES(power_law, NVBENCH_TYPE_AXES(fundamental_types, some_offset_ .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(22, 30, 4)) .add_int64_power_of_two_axis("Segments{io}", nvbench::range(12, 20, 4)) - .add_string_axis("Entropy", {"1.000", "0.544", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.201"}); template diff --git a/cub/benchmarks/bench/select/flagged.cu b/cub/benchmarks/bench/select/flagged.cu index fcbf57aba1a..dcf0598bd3a 100644 --- a/cub/benchmarks/bench/select/flagged.cu +++ b/cub/benchmarks/bench/select/flagged.cu @@ -152,7 +152,7 @@ void select(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git a/cub/benchmarks/bench/select/if.cu b/cub/benchmarks/bench/select/if.cu index 089ffa0f4a7..981ed7b7013 100644 --- a/cub/benchmarks/bench/select/if.cu +++ b/cub/benchmarks/bench/select/if.cu @@ -174,7 +174,7 @@ void select(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git 
a/cub/benchmarks/bench/select/unique_by_key.cu b/cub/benchmarks/bench/select/unique_by_key.cu index 1d610e2b823..e048d81b635 100644 --- a/cub/benchmarks/bench/select/unique_by_key.cu +++ b/cub/benchmarks/bench/select/unique_by_key.cu @@ -150,7 +150,7 @@ static void select(nvbench::state &state, nvbench::type_list(num_runs); state.add_global_memory_writes(1); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_temp_storage, temp_storage_bytes, d_in_keys, diff --git a/cub/benchmarks/nvbench_helper/CMakeLists.txt b/cub/benchmarks/nvbench_helper/CMakeLists.txt index 38112b1779e..e9ef302d2ff 100644 --- a/cub/benchmarks/nvbench_helper/CMakeLists.txt +++ b/cub/benchmarks/nvbench_helper/CMakeLists.txt @@ -1,5 +1,5 @@ # Fetch nvbench -CPMAddPackage("gh:NVIDIA/nvbench#39b2770b62ce1f4e0ebeb9af60d7c6de624633a5") +CPMAddPackage("gh:NVIDIA/nvbench#main") add_library(nvbench_helper OBJECT nvbench_helper/nvbench_helper.cuh nvbench_helper/nvbench_helper.cu) diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh index c5ae5f6c508..3bedf5841ae 100644 --- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh +++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh @@ -53,29 +53,34 @@ using offset_types = nvbench::type_list; #endif #ifdef TUNE_T +using integral_types = nvbench::type_list; using fundamental_types = nvbench::type_list; -using all_types = nvbench::type_list; +using all_types = nvbench::type_list; #else -using fundamental_types = nvbench::type_list; - -using all_types = nvbench::type_list; +using integral_types = nvbench::type_list; + +using fundamental_types = + nvbench::type_list; + +using all_types = + nvbench::type_list; #endif template diff --git a/cub/cub/cmake/cub-config-version.cmake b/cub/cub/cmake/cub-config-version.cmake index 5d9dd5b922d..87e667e9dc3 100644 --- 
a/cub/cub/cmake/cub-config-version.cmake +++ b/cub/cub/cmake/cub-config-version.cmake @@ -1,17 +1,10 @@ # Parse version information from version.cuh: include("${CMAKE_CURRENT_LIST_DIR}/cub-header-search.cmake") -file(READ "${_CUB_VERSION_INCLUDE_DIR}/cub/version.cuh" CUB_VERSION_HEADER) -string(REGEX MATCH "#define[ \t]+CUB_VERSION[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}") -set(CUB_VERSION_FLAT ${CMAKE_MATCH_1}) -# Note that CUB calls this the PATCH number, CMake calls it the TWEAK number: -string(REGEX MATCH "#define[ \t]+CUB_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}") -set(CUB_VERSION_TWEAK ${CMAKE_MATCH_1}) - -math(EXPR CUB_VERSION_MAJOR "${CUB_VERSION_FLAT} / 100000") -math(EXPR CUB_VERSION_MINOR "(${CUB_VERSION_FLAT} / 100) % 1000") -math(EXPR CUB_VERSION_PATCH "${CUB_VERSION_FLAT} % 100") # CUB: "subminor" CMake: "patch" - +set(CUB_VERSION_MAJOR 2) +set(CUB_VERSION_MINOR 3) +set(CUB_VERSION_PATCH 0) +set(CUB_VERSION_TWEAK 0) set(CUB_VERSION "${CUB_VERSION_MAJOR}.${CUB_VERSION_MINOR}.${CUB_VERSION_PATCH}.${CUB_VERSION_TWEAK}") set(PACKAGE_VERSION ${CUB_VERSION}) diff --git a/cub/cub/version.cuh b/cub/cub/version.cuh index 9934b92eff9..fdbfb641486 100644 --- a/cub/cub/version.cuh +++ b/cub/cub/version.cuh @@ -44,15 +44,19 @@ _CCCL_IMPLICIT_SYSTEM_HEADER #endif // !_CCCL_COMPILER_NVHPC +#include + /*! \def CUB_VERSION * \brief The preprocessor macro \p CUB_VERSION encodes the version - * number of the CUB library. + * number of the CUB library as MMMmmmpp. + * + * \note CUB_VERSION is formatted as `MMMmmmpp`, which differs from `CCCL_VERSION` that uses `MMMmmmppp`. * * CUB_VERSION % 100 is the sub-minor version. * CUB_VERSION / 100 % 1000 is the minor version. * CUB_VERSION / 100000 is the major version. */ -#define CUB_VERSION 200200 +#define CUB_VERSION 200300 // macro expansion with ## requires this to be a single value /*! 
\def CUB_MAJOR_VERSION * \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the @@ -77,3 +81,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER * patch number of the CUB library. */ #define CUB_PATCH_NUMBER 0 + +static_assert(CUB_MAJOR_VERSION == CCCL_MAJOR_VERSION,""); +static_assert(CUB_MINOR_VERSION == CCCL_MINOR_VERSION,""); +static_assert(CUB_SUBMINOR_VERSION == CCCL_PATCH_VERSION,""); diff --git a/cub/docs/tuning.rst b/cub/docs/tuning.rst index ae6cdaabe3c..9ac2b9752b2 100644 --- a/cub/docs/tuning.rst +++ b/cub/docs/tuning.rst @@ -121,7 +121,7 @@ Finally, we can run the algorithm: .. code:: c++ - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git a/cub/test/bfloat16.h b/cub/test/bfloat16.h index dbd735db83f..328fb644a5e 100644 --- a/cub/test/bfloat16.h +++ b/cub/test/bfloat16.h @@ -32,11 +32,14 @@ * Utilities for interacting with the opaque CUDA __nv_bfloat16 type */ -#include +#include + #include -#include -#include +#include + +#include +#include #ifdef __GNUC__ // There's a ton of type-punning going on in this file. 
@@ -77,6 +80,16 @@ struct bfloat16_t *this = bfloat16_t(float(a)); } + /// Constructor from unsigned long long int + template < typename T, + typename = typename ::cuda::std::enable_if< + ::cuda::std::is_same::value + && (!::cuda::std::is_same::value)>::type> + __host__ __device__ __forceinline__ bfloat16_t(T a) + { + *this = bfloat16_t(float(a)); + } + /// Default constructor bfloat16_t() = default; diff --git a/cub/test/c2h/generators.cu b/cub/test/c2h/generators.cu index 9e0f3188113..67bf81e558c 100644 --- a/cub/test/c2h/generators.cu +++ b/cub/test/c2h/generators.cu @@ -259,16 +259,17 @@ void generator_t::operator()(seed_t seed, thrust::device_vector &data, T min, template struct count_to_item_t { - std::size_t n; + unsigned long long int n; - count_to_item_t(std::size_t n) + count_to_item_t(unsigned long long int n) : n(n) {} template __device__ T operator()(CounterT id) { - return static_cast(static_cast(id) % n); + // This has to be a type for which extended floating point types like __nv_fp8_e5m2 provide an overload + return static_cast(static_cast(id) % n); } }; diff --git a/cub/test/half.h b/cub/test/half.h index a009049cc71..74e507a57cd 100644 --- a/cub/test/half.h +++ b/cub/test/half.h @@ -37,6 +37,8 @@ #include +#include + #include #include #include @@ -80,6 +82,16 @@ struct half_t *this = half_t(float(a)); } + /// Constructor from unsigned long long int + template < typename T, + typename = typename ::cuda::std::enable_if< + ::cuda::std::is_same::value + && (!::cuda::std::is_same::value)>::type> + __host__ __device__ __forceinline__ half_t(T a) + { + *this = half_t(float(a)); + } + /// Default constructor half_t() = default; diff --git a/lib/cmake/cccl/cccl-config-version.cmake b/lib/cmake/cccl/cccl-config-version.cmake index 1e026c98334..d01aa5b6194 100644 --- a/lib/cmake/cccl/cccl-config-version.cmake +++ b/lib/cmake/cccl/cccl-config-version.cmake @@ -1,5 +1,5 @@ set(CCCL_VERSION_MAJOR 2) -set(CCCL_VERSION_MINOR 2) +set(CCCL_VERSION_MINOR 3) 
set(CCCL_VERSION_PATCH 0) set(CCCL_VERSION_TWEAK 0) diff --git a/libcudacxx/.upstream-tests/test/cuda/version.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/version.pass.cpp new file mode 100644 index 00000000000..343de4e9ec6 --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/version.pass.cpp @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +static_assert(CCCL_MAJOR_VERSION == (CCCL_VERSION/1000000),""); +static_assert(CCCL_MINOR_VERSION == (CCCL_VERSION/1000 % 1000),""); +static_assert(CCCL_PATCH_VERSION == (CCCL_VERSION % 1000),""); + +int main(int argc, char** argv){ + return 0; +} diff --git a/libcudacxx/include/cuda/std/detail/__config b/libcudacxx/include/cuda/std/detail/__config index b6dee85435b..ed1bad96c43 100644 --- a/libcudacxx/include/cuda/std/detail/__config +++ b/libcudacxx/include/cuda/std/detail/__config @@ -11,16 +11,12 @@ #ifndef __cuda_std__ #define __cuda_std__ -#define _LIBCUDACXX_CUDA_API_VERSION 2002000 +#include -#define _LIBCUDACXX_CUDA_API_VERSION_MAJOR \ - (_LIBCUDACXX_CUDA_API_VERSION / 1000000) - -#define _LIBCUDACXX_CUDA_API_VERSION_MINOR \ - (_LIBCUDACXX_CUDA_API_VERSION / 1000 % 1000) - -#define _LIBCUDACXX_CUDA_API_VERSION_PATCH \ - (_LIBCUDACXX_CUDA_API_VERSION % 1000) +#define _LIBCUDACXX_CUDA_API_VERSION CCCL_VERSION +#define _LIBCUDACXX_CUDA_API_VERSION_MAJOR CCCL_MAJOR_VERSION +#define _LIBCUDACXX_CUDA_API_VERSION_MINOR CCCL_MINOR_VERSION +#define _LIBCUDACXX_CUDA_API_VERSION_PATCH CCCL_PATCH_VERSION #ifndef _LIBCUDACXX_CUDA_ABI_VERSION_LATEST # 
define _LIBCUDACXX_CUDA_ABI_VERSION_LATEST 4 diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/version.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/version.h new file mode 100644 index 00000000000..bb0cf2e6ebe --- /dev/null +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/version.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CCCL_VERSION_H +#define __CCCL_VERSION_H + +#define CCCL_VERSION 2003000 +#define CCCL_MAJOR_VERSION (CCCL_VERSION / 1000000) +#define CCCL_MINOR_VERSION (((CCCL_VERSION / 1000) % 1000)) +#define CCCL_PATCH_VERSION (CCCL_VERSION % 1000) + +#if CCCL_PATCH_VERSION > 99 +#error "CCCL patch version cannot be greater than 99 for compatibility with Thrust/CUB's MMMmmmpp format." +#endif + +#endif // __CCCL_VERSION_H diff --git a/libcudacxx/include/cuda/version b/libcudacxx/include/cuda/version new file mode 100644 index 00000000000..cd536749780 --- /dev/null +++ b/libcudacxx/include/cuda/version @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_VERSION +#define _CUDA_VERSION + +#include "std/version" + +#endif // _CUDA_VERSION diff --git a/libcudacxx/lib/cmake/libcudacxx/libcudacxx-config-version.cmake b/libcudacxx/lib/cmake/libcudacxx/libcudacxx-config-version.cmake index e528dedbfc9..17be3cddc98 100644 --- a/libcudacxx/lib/cmake/libcudacxx/libcudacxx-config-version.cmake +++ b/libcudacxx/lib/cmake/libcudacxx/libcudacxx-config-version.cmake @@ -1,19 +1,9 @@ # Parse version information from version header: include("${CMAKE_CURRENT_LIST_DIR}/libcudacxx-header-search.cmake") -file(READ "${_libcudacxx_VERSION_INCLUDE_DIR}/cuda/std/detail/__config" - libcudacxx_VERSION_HEADER -) - -string(REGEX MATCH - "#define[ \t]+_LIBCUDACXX_CUDA_API_VERSION[ \t]+([0-9]+)" unused_var - "${libcudacxx_VERSION_HEADER}" -) - -set(libcudacxx_VERSION_FLAT ${CMAKE_MATCH_1}) -math(EXPR libcudacxx_VERSION_MAJOR "${libcudacxx_VERSION_FLAT} / 1000000") -math(EXPR libcudacxx_VERSION_MINOR "(${libcudacxx_VERSION_FLAT} / 1000) % 1000") -math(EXPR libcudacxx_VERSION_PATCH "${libcudacxx_VERSION_FLAT} % 1000") +set(libcudacxx_VERSION_MAJOR 2) +set(libcudacxx_VERSION_MINOR 3) +set(libcudacxx_VERSION_PATCH 0) set(libcudacxx_VERSION_TWEAK 0) set(libcudacxx_VERSION diff --git a/thrust/benchmarks/bench/adjacent_difference/basic.cu b/thrust/benchmarks/bench/adjacent_difference/basic.cu index d681aaa6aae..47f93f382b0 100644 --- a/thrust/benchmarks/bench/adjacent_difference/basic.cu +++ b/thrust/benchmarks/bench/adjacent_difference/basic.cu @@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::adjacent_difference(input.cbegin(), input.cend(), 
output.begin()); }); } diff --git a/thrust/benchmarks/bench/adjacent_difference/custom.cu b/thrust/benchmarks/bench/adjacent_difference/custom.cu index 63d5d69e12c..e8e892bf8d4 100644 --- a/thrust/benchmarks/bench/adjacent_difference/custom.cu +++ b/thrust/benchmarks/bench/adjacent_difference/custom.cu @@ -60,7 +60,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::adjacent_difference(input.cbegin(), input.cend(), output.begin(), custom_op{42}); }); } diff --git a/thrust/benchmarks/bench/adjacent_difference/in_place.cu b/thrust/benchmarks/bench/adjacent_difference/in_place.cu index e6c39748df1..4e91e8471c0 100644 --- a/thrust/benchmarks/bench/adjacent_difference/in_place.cu +++ b/thrust/benchmarks/bench/adjacent_difference/in_place.cu @@ -42,7 +42,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::adjacent_difference(vec.begin(), vec.end(), vec.begin()); }); } diff --git a/thrust/benchmarks/bench/copy/basic.cu b/thrust/benchmarks/bench/copy/basic.cu index 8969a7b886d..1b2b96214df 100644 --- a/thrust/benchmarks/bench/copy/basic.cu +++ b/thrust/benchmarks/bench/copy/basic.cu @@ -45,7 +45,7 @@ static void basic(nvbench::state &state, state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { 
thrust::copy(input.cbegin(), input.cend(), output.begin()); diff --git a/thrust/benchmarks/bench/copy/if.cu b/thrust/benchmarks/bench/copy/if.cu index d8c4fd22e68..8b89e08db6f 100644 --- a/thrust/benchmarks/bench/copy/if.cu +++ b/thrust/benchmarks/bench/copy/if.cu @@ -74,7 +74,7 @@ static void basic(nvbench::state &state, state.add_global_memory_reads(elements); state.add_global_memory_writes(selected_elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::copy_if(input.cbegin(), input.cend(), output.begin(), select_op); }); } diff --git a/thrust/benchmarks/bench/fill/basic.cu b/thrust/benchmarks/bench/fill/basic.cu index 63a24d2bc2f..3c29f3c7043 100644 --- a/thrust/benchmarks/bench/fill/basic.cu +++ b/thrust/benchmarks/bench/fill/basic.cu @@ -41,7 +41,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_element_count(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::fill(output.begin(), output.end(), T{42}); }); } diff --git a/thrust/benchmarks/bench/inner_product/basic.cu b/thrust/benchmarks/bench/inner_product/basic.cu index 5a60ca1cfd2..aa3b5d467e9 100644 --- a/thrust/benchmarks/bench/inner_product/basic.cu +++ b/thrust/benchmarks/bench/inner_product/basic.cu @@ -44,7 +44,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements * 2); state.add_global_memory_writes(1); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::inner_product(lhs.begin(), lhs.end(), rhs.begin(), T{0}); }); } diff --git 
a/thrust/benchmarks/bench/merge/basic.cu b/thrust/benchmarks/bench/merge/basic.cu index fb8e8f8822a..854baf8ec0e 100644 --- a/thrust/benchmarks/bench/merge/basic.cu +++ b/thrust/benchmarks/bench/merge/basic.cu @@ -50,7 +50,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::merge(in.cbegin(), in.cbegin() + elements_in_lhs, in.cbegin() + elements_in_lhs, diff --git a/thrust/benchmarks/bench/partition/basic.cu b/thrust/benchmarks/bench/partition/basic.cu index a04aae31283..aafdc892236 100644 --- a/thrust/benchmarks/bench/partition/basic.cu +++ b/thrust/benchmarks/bench/partition/basic.cu @@ -72,7 +72,7 @@ static void basic(nvbench::state &state, state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::partition_copy(input.cbegin(), input.cend(), output.begin(), diff --git a/thrust/benchmarks/bench/reduce/basic.cu b/thrust/benchmarks/bench/reduce/basic.cu index 97dbe5d02b3..e6e31c22a05 100644 --- a/thrust/benchmarks/bench/reduce/basic.cu +++ b/thrust/benchmarks/bench/reduce/basic.cu @@ -42,7 +42,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(1); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { do_not_optimize(thrust::reduce(in.begin(), in.end())); }); } diff --git a/thrust/benchmarks/bench/reduce/by_key.cu b/thrust/benchmarks/bench/reduce/by_key.cu index 
4eaaed194e5..282dff7d940 100644 --- a/thrust/benchmarks/bench/reduce/by_key.cu +++ b/thrust/benchmarks/bench/reduce/by_key.cu @@ -57,7 +57,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(unique_keys); state.add_global_memory_writes(unique_keys); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::reduce_by_key(in_keys.begin(), in_keys.end(), in_vals.begin(), diff --git a/thrust/benchmarks/bench/scan/exclusive/by_key.cu b/thrust/benchmarks/bench/scan/exclusive/by_key.cu index 76a5a0f9921..df650554b39 100644 --- a/thrust/benchmarks/bench/scan/exclusive/by_key.cu +++ b/thrust/benchmarks/bench/scan/exclusive/by_key.cu @@ -45,7 +45,7 @@ static void scan(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::exclusive_scan_by_key(keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin()); }); } diff --git a/thrust/benchmarks/bench/scan/exclusive/max.cu b/thrust/benchmarks/bench/scan/exclusive/max.cu index c434c537e5e..a18a3c96cb1 100644 --- a/thrust/benchmarks/bench/scan/exclusive/max.cu +++ b/thrust/benchmarks/bench/scan/exclusive/max.cu @@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::exclusive_scan(input.cbegin(), input.cend(), output.begin(), T{}, max_t{}); }); } diff --git a/thrust/benchmarks/bench/scan/exclusive/sum.cu 
b/thrust/benchmarks/bench/scan/exclusive/sum.cu index 75ae35894d0..29b82b68a81 100644 --- a/thrust/benchmarks/bench/scan/exclusive/sum.cu +++ b/thrust/benchmarks/bench/scan/exclusive/sum.cu @@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::exclusive_scan(input.cbegin(), input.cend(), output.begin()); }); } diff --git a/thrust/benchmarks/bench/scan/inclusive/by_key.cu b/thrust/benchmarks/bench/scan/inclusive/by_key.cu index bb468ff57dd..10e0cbc408b 100644 --- a/thrust/benchmarks/bench/scan/inclusive/by_key.cu +++ b/thrust/benchmarks/bench/scan/inclusive/by_key.cu @@ -45,7 +45,7 @@ static void scan(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::inclusive_scan_by_key(keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin()); }); } diff --git a/thrust/benchmarks/bench/scan/inclusive/max.cu b/thrust/benchmarks/bench/scan/inclusive/max.cu index affecbdd9f0..40d84942ec6 100644 --- a/thrust/benchmarks/bench/scan/inclusive/max.cu +++ b/thrust/benchmarks/bench/scan/inclusive/max.cu @@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::inclusive_scan(input.cbegin(), input.cend(), output.begin(), max_t{}); }); } diff 
--git a/thrust/benchmarks/bench/scan/inclusive/sum.cu b/thrust/benchmarks/bench/scan/inclusive/sum.cu index 540001a2344..ea98b7bcf31 100644 --- a/thrust/benchmarks/bench/scan/inclusive/sum.cu +++ b/thrust/benchmarks/bench/scan/inclusive/sum.cu @@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::inclusive_scan(input.cbegin(), input.cend(), output.begin()); }); } diff --git a/thrust/benchmarks/bench/set_operations/base.cuh b/thrust/benchmarks/bench/set_operations/base.cuh index c660d222f05..9f5ab563ac0 100644 --- a/thrust/benchmarks/bench/set_operations/base.cuh +++ b/thrust/benchmarks/bench/set_operations/base.cuh @@ -61,7 +61,7 @@ static void basic(nvbench::state &state, nvbench::type_list, OpT op) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements_in_AB); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { op(input.cbegin(), input.cbegin() + elements_in_A, input.cbegin() + elements_in_A, diff --git a/thrust/benchmarks/bench/set_operations/by_key.cuh b/thrust/benchmarks/bench/set_operations/by_key.cuh index ae19b2407e0..6e71601f85c 100644 --- a/thrust/benchmarks/bench/set_operations/by_key.cuh +++ b/thrust/benchmarks/bench/set_operations/by_key.cuh @@ -70,7 +70,7 @@ static void basic(nvbench::state &state, nvbench::type_list, OpT o state.add_global_memory_reads(OpT::read_all_values ? 
elements : elements_in_A); state.add_global_memory_writes(elements_in_AB); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { op(in_keys.cbegin(), in_keys.cbegin() + elements_in_A, in_keys.cbegin() + elements_in_A, diff --git a/thrust/benchmarks/bench/shuffle/basic.cu b/thrust/benchmarks/bench/shuffle/basic.cu index f70629f2a4c..cc24d267851 100644 --- a/thrust/benchmarks/bench/shuffle/basic.cu +++ b/thrust/benchmarks/bench/shuffle/basic.cu @@ -44,7 +44,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(elements); auto do_engine = [&](auto &&engine_constructor) { - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::shuffle(data.begin(), data.end(), engine_constructor()); }); }; diff --git a/thrust/benchmarks/bench/sort/keys.cu b/thrust/benchmarks/bench/sort/keys.cu index f086505fa2c..d52dd6e7d1e 100644 --- a/thrust/benchmarks/bench/sort/keys.cu +++ b/thrust/benchmarks/bench/sort/keys.cu @@ -58,4 +58,4 @@ NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) .set_name("base") .set_type_axes_names({"T{ct}"}) .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.201"}); diff --git a/thrust/benchmarks/bench/sort/keys_custom.cu b/thrust/benchmarks/bench/sort/keys_custom.cu index 3728006f687..f1eb8c2fdf1 100644 --- a/thrust/benchmarks/bench/sort/keys_custom.cu +++ b/thrust/benchmarks/bench/sort/keys_custom.cu @@ -58,4 +58,4 @@ NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) .set_name("base") .set_type_axes_names({"T{ct}"}) .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - 
.add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.201"}); diff --git a/thrust/benchmarks/bench/sort/pairs.cu b/thrust/benchmarks/bench/sort/pairs.cu index a6d45e33ed5..9d2f06b2f5c 100644 --- a/thrust/benchmarks/bench/sort/pairs.cu +++ b/thrust/benchmarks/bench/sort/pairs.cu @@ -59,19 +59,11 @@ static void basic(nvbench::state &state, nvbench::type_list) }); } -using key_types = fundamental_types; -using value_types = nvbench::type_list; +using key_types = integral_types; +using value_types = integral_types; NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(key_types, value_types)) .set_name("base") .set_type_axes_names({"KeyT{ct}", "ValueT{ct}"}) .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.201"}); diff --git a/thrust/benchmarks/bench/sort/pairs_custom.cu b/thrust/benchmarks/bench/sort/pairs_custom.cu index 4cd73139895..bb731e03c6f 100644 --- a/thrust/benchmarks/bench/sort/pairs_custom.cu +++ b/thrust/benchmarks/bench/sort/pairs_custom.cu @@ -59,19 +59,11 @@ static void basic(nvbench::state &state, nvbench::type_list) }); } -using key_types = fundamental_types; -using value_types = nvbench::type_list; +using key_types = integral_types; +using value_types = integral_types; NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(key_types, value_types)) .set_name("base") .set_type_axes_names({"KeyT{ct}", "ValueT{ct}"}) .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.201"}); diff --git a/thrust/benchmarks/bench/unique/basic.cu b/thrust/benchmarks/bench/unique/basic.cu index 7bef39ecc45..2f01fb30453 100644 --- a/thrust/benchmarks/bench/unique/basic.cu +++ b/thrust/benchmarks/bench/unique/basic.cu @@ -51,7 +51,7 @@ static void 
basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(unique_items); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::unique_copy(input.cbegin(), input.cend(), output.begin()); }); } diff --git a/thrust/benchmarks/bench/unique/by_key.cu b/thrust/benchmarks/bench/unique/by_key.cu index e6961bc4d95..ed43e64cb9b 100644 --- a/thrust/benchmarks/bench/unique/by_key.cu +++ b/thrust/benchmarks/bench/unique/by_key.cu @@ -56,7 +56,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(unique_elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::unique_by_key_copy(in_keys.cbegin(), in_keys.cend(), in_vals.cbegin(), diff --git a/thrust/benchmarks/bench/vectorized_search/base.cu b/thrust/benchmarks/bench/vectorized_search/base.cu index a733830fa1e..67e9ddedcc2 100644 --- a/thrust/benchmarks/bench/vectorized_search/base.cu +++ b/thrust/benchmarks/bench/vectorized_search/base.cu @@ -46,7 +46,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_element_count(needles); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::binary_search(data.begin(), data.begin() + elements, data.begin() + elements, diff --git a/thrust/benchmarks/bench/vectorized_search/lower_bound.cu b/thrust/benchmarks/bench/vectorized_search/lower_bound.cu index 4ab9539215f..e3fbd6e6cbd 100644 --- a/thrust/benchmarks/bench/vectorized_search/lower_bound.cu +++ b/thrust/benchmarks/bench/vectorized_search/lower_bound.cu @@ -46,7 
+46,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_element_count(needles); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::lower_bound(data.begin(), data.begin() + elements, data.begin() + elements, diff --git a/thrust/benchmarks/bench/vectorized_search/upper_bound.cu b/thrust/benchmarks/bench/vectorized_search/upper_bound.cu index 5b57ebf92a5..6b412ca299c 100644 --- a/thrust/benchmarks/bench/vectorized_search/upper_bound.cu +++ b/thrust/benchmarks/bench/vectorized_search/upper_bound.cu @@ -46,7 +46,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_element_count(needles); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::upper_bound(data.begin(), data.begin() + elements, data.begin() + elements, diff --git a/thrust/thrust/cmake/thrust-config-version.cmake b/thrust/thrust/cmake/thrust-config-version.cmake index cf9407a4cec..5da3ad1339e 100644 --- a/thrust/thrust/cmake/thrust-config-version.cmake +++ b/thrust/thrust/cmake/thrust-config-version.cmake @@ -1,17 +1,10 @@ # Parse version information from version.h: include("${CMAKE_CURRENT_LIST_DIR}/thrust-header-search.cmake") -file(READ "${_THRUST_VERSION_INCLUDE_DIR}/thrust/version.h" THRUST_VERSION_HEADER) -string(REGEX MATCH "#define[ \t]+THRUST_VERSION[ \t]+([0-9]+)" DUMMY "${THRUST_VERSION_HEADER}") -set(THRUST_VERSION_FLAT ${CMAKE_MATCH_1}) -# Note that Thrust calls this the PATCH number, CMake calls it the TWEAK number: -string(REGEX MATCH "#define[ \t]+THRUST_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${THRUST_VERSION_HEADER}") -set(THRUST_VERSION_TWEAK ${CMAKE_MATCH_1}) - -math(EXPR THRUST_VERSION_MAJOR "${THRUST_VERSION_FLAT} / 100000") -math(EXPR THRUST_VERSION_MINOR 
"(${THRUST_VERSION_FLAT} / 100) % 1000") -math(EXPR THRUST_VERSION_PATCH "${THRUST_VERSION_FLAT} % 100") # Thrust: "subminor" CMake: "patch" - +set(THRUST_VERSION_MAJOR 2) +set(THRUST_VERSION_MINOR 3) +set(THRUST_VERSION_PATCH 0) # Thrust: "subminor" CMake: "patch" +set(THRUST_VERSION_TWEAK 0) set(THRUST_VERSION "${THRUST_VERSION_MAJOR}.${THRUST_VERSION_MINOR}.${THRUST_VERSION_PATCH}.${THRUST_VERSION_TWEAK}") set(PACKAGE_VERSION ${THRUST_VERSION}) diff --git a/thrust/thrust/version.h b/thrust/thrust/version.h index 8c3fa09acaf..a1ef0896751 100644 --- a/thrust/thrust/version.h +++ b/thrust/thrust/version.h @@ -31,11 +31,13 @@ #include #if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) -#pragma GCC system_header +# pragma GCC system_header #else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv _CCCL_IMPLICIT_SYSTEM_HEADER #endif // !_CCCL_COMPILER_NVHPC +#include + // This is the only Thrust header that is guaranteed to // change with every Thrust release. // @@ -49,13 +51,15 @@ _CCCL_IMPLICIT_SYSTEM_HEADER /*! \def THRUST_VERSION * \brief The preprocessor macro \p THRUST_VERSION encodes the version - * number of the Thrust library. + * number of the Thrust library as MMMmmmpp. + * + * \note THRUST_VERSION is formatted as `MMMmmmpp`, which differs from `CCCL_VERSION` that uses `MMMmmmppp`. * * THRUST_VERSION % 100 is the sub-minor version. * THRUST_VERSION / 100 % 1000 is the minor version. * THRUST_VERSION / 100000 is the major version. */ -#define THRUST_VERSION 200200 +#define THRUST_VERSION 200300 // macro expansion with ## requires this to be a single value /*! \def THRUST_MAJOR_VERSION * \brief The preprocessor macro \p THRUST_MAJOR_VERSION encodes the @@ -81,3 +85,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER * Legacy; will be 0 for all future releases. 
*/ #define THRUST_PATCH_NUMBER 0 + +static_assert(THRUST_MAJOR_VERSION == CCCL_MAJOR_VERSION, ""); +static_assert(THRUST_MINOR_VERSION == CCCL_MINOR_VERSION, ""); +static_assert(THRUST_SUBMINOR_VERSION == CCCL_PATCH_VERSION, "");