diff --git a/.gitmodules b/.gitmodules index 25514bfc6a..373431afe7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "examples/raycast/Sponza"] path = examples/raycast/Sponza url = https://github.com/jimmiebergmann/Sponza +[submodule "thirdparty/thrust"] + path = thirdparty/thrust + url = https://github.com/bernhardmgruber/thrust.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 607691fa0e..90123133f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -167,6 +167,7 @@ if (LLAMA_BUILD_EXAMPLES) add_subdirectory("examples/stream") add_subdirectory("examples/falsesharing") add_subdirectory("examples/comptime") + add_subdirectory("examples/thrust") # alpaka examples find_package(alpaka 1.0) diff --git a/examples/thrust/CMakeLists.txt b/examples/thrust/CMakeLists.txt new file mode 100644 index 0000000000..6131bfe648 --- /dev/null +++ b/examples/thrust/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required (VERSION 3.15) +project(llama-thrust CXX CUDA) + +set (Thrust_DIR "../../thirdparty/thrust/thrust/cmake/") +find_package(Thrust REQUIRED CONFIG) +thrust_create_target(Thrust FROM_OPTIONS) +find_package(OpenMP REQUIRED) +if (NOT TARGET llama::llama) + find_package(llama REQUIRED) +endif() +add_executable(${PROJECT_NAME} thrust.cu ../common/Stopwatch.hpp ../common/hostname.hpp) +target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17) +target_compile_options(${PROJECT_NAME} PUBLIC + $<$: + -forward-unknown-to-host-compiler + --expt-extended-lambda + --expt-relaxed-constexpr + --use_fast_math + --ftemplate-backtrace-limit 500>) +target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama Thrust OpenMP::OpenMP_CXX) + +# OpenMP is not added to the compiler commandline by cmake when using nvcc .. +if (NOT MSVC) + find_package(TBB REQUIRED) + target_link_libraries(${PROJECT_NAME} PRIVATE TBB::tbb) + target_compile_options(${PROJECT_NAME} PUBLIC -march=native -ffast-math -fopenmp) +else() + target_compile_options(${PROJECT_NAME} PUBLIC /openmp) +endif() diff --git a/examples/thrust/thrust.cu b/examples/thrust/thrust.cu new file mode 100644 index 0000000000..b80f62a099 --- /dev/null +++ b/examples/thrust/thrust.cu @@ -0,0 +1,729 @@ +// #define THRUST_DEBUG_SYNC +#include "../common/Stopwatch.hpp" +#include "../common/hostname.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +// #include +#include +#include +#include +// #include +#include +#include +#include + +constexpr auto REPETITIONS = 5; +constexpr auto N = std::size_t{1024} * 1024; +constexpr auto shuffle_frac = 20; // 1/x fraction of the data to shuffle +constexpr auto usePSTL = false; +// const auto exec = std::execution::par_unseq; +const auto exec = std::execution::seq; + +constexpr auto thrustUsesCuda = THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA; +static_assert(!(thrustUsesCuda && usePSTL), "Cannot use thrust/CUDA and PSTL together"); + +// clang-format off +namespace tag +{ + struct Pos{}; + struct Vel{}; + struct X{}; + struct Y{}; + struct Z{}; + struct Mass{}; +} // namespace tag + +using Vec3 = llama::Record< + llama::Field, + llama::Field, + llama::Field +>; + +using Particle = llama::Record< + llama::Field, + llama::Field, + llama::Field +>; + +namespace tag +{ + struct rng_state{}; + struct index{}; + struct mother_index{}; + struct eventId{}; + + struct energy{}; + struct energy_loss{}; + struct numIALeft{}; + struct interaction_length{}; + struct total_length{}; + + struct pos{}; + struct 
dir{}; + struct current_state{}; + struct next_state{}; + + struct status{}; + + struct number_of_secondaries{}; + + struct num_step{}; + + struct current_process{}; + struct pdg{}; +} // namespace tag + +using Vec3D = llama::Record< + llama::Field, + llama::Field, + llama::Field +>; + +struct RngState +{ + std::byte state[240]; + + LLAMA_FN_HOST_ACC_INLINE auto operator=(double d) -> RngState& + { + for (int i = 0; i < 240; i++) + state[i] = static_cast(d); + return *this; + } +}; + +// inspired by: https://github.com/apt-sim/AdePT/blob/master/tracking/inc/track.h +// packed size: 621 B +// padded size: 632 B +using Track = llama::Record< + llama::Field, + llama::Field, + llama::Field, + llama::Field, + + llama::Field, + llama::Field, + llama::Field, + llama::Field, + llama::Field, + + llama::Field, + llama::Field, + + llama::Field, // matrix + llama::Field, // matrix + + llama::Field, + + llama::Field, + llama::Field, + + llama::Field, + llama::Field +>; +// clang-format on + +#define USE_TRACK + +#ifdef USE_TRACK +using RecordDim = Track; +#else +using RecordDim = Particle; +#endif + +// struct Struct +//{ +// float posx; +// float posy; +// float posz; +// float velx; +// float vely; +// float velz; +// float mass; +//}; +// +// LLAMA_FN_HOST_ACC_INLINE auto operator<(Struct a, Struct b) -> bool +//{ +// return a.mass < b.mass; +//} + +// auto less = [] LLAMA_FN_HOST_ACC_INLINE(auto&& a, auto&& b) +//{ +// // printf("a < b: %p %p\n", &a, &b); +// return a < b; +//}; +// auto less = [] LLAMA_FN_HOST_ACC_INLINE (auto&& a, auto&& b) +//{ +// auto apos = a(tag::Pos{}); +// auto bpos = b(tag::Pos{}); +// return apos(tag::X{}) + apos(tag::Y{}) + apos(tag::Z{}) < bpos(tag::X{}) + bpos(tag::Y{}) + bpos(tag::Z{}); +//}; + +// struct Less +//{ +// template +// LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, B&& b) -> bool +// { +// auto posA = a(tag::Pos{}); +// auto posB = b(tag::Pos{}); +// // return a(tag::Mass{}) < b(tag::Mass{}); +// return posA(tag::X{}) + posA(tag::Y{}) + posA(tag::Z{}) < posB(tag::X{}) + posB(tag::Y{}) + posB(tag::Z{}); +// } +//}; + +// due to abominal HACKs in LLAMA to get thrust working, all writing accesses on llama::One must be terminal, because +// the llama::One copies its storage on each non-terminal access +struct NormalizeVel +{ + template + LLAMA_FN_HOST_ACC_INLINE void operator()(VirtualRecord&& p) + { +#ifdef USE_TRACK + double& x = p(tag::dir{}, llama::RecordCoord<0>{}); + double& y = p(tag::dir{}, llama::RecordCoord<1>{}); + double& z = p(tag::dir{}, llama::RecordCoord<2>{}); + const auto invNorm = 1.0 / std::sqrt(x * x + y * y + z * z); + x *= invNorm; + y *= invNorm; + z *= invNorm; +#else + float& x = p(tag::Vel{}, tag::X{}); + float& y = p(tag::Vel{}, tag::Y{}); + float& z = p(tag::Vel{}, tag::Z{}); + const auto invNorm = 1.0f / std::sqrtf(x * x + y * y + z * z); + x *= invNorm; + y *= invNorm; + z *= invNorm; + return p; + // float x = p(tag::Vel{}, tag::X{}); + // float y = p(tag::Vel{}, tag::Y{}); + // float z = p(tag::Vel{}, tag::Z{}); + // p(tag::Vel{}, tag::X{}) = 0.36f * x + 0.48f * y + -0.80f * z; + // p(tag::Vel{}, tag::Y{}) = -0.80f * x + 0.60f * y + 0.00f * z; + // p(tag::Vel{}, tag::Z{}) = 0.48f * x + 0.64f * y + 0.60f * z; +#endif + } +}; + +struct Predicate +{ + // LLAMA_FN_HOST_ACC_INLINE int c = 0; + + //~Predicate() + //{ + // printf("total %d/%d\n", c, (int)N); + //} + + template + LLAMA_FN_HOST_ACC_INLINE auto operator()(VirtualPoint&& p) -> bool + { +#ifdef USE_TRACK + const auto r = (p(tag::dir{}, llama::RecordCoord<0>{}) + 
p(tag::dir{}, llama::RecordCoord<1>{}) + + p(tag::dir{}, llama::RecordCoord<2>{})) + < -0.6; + return r; +#else + // half the particles, 5 taken, 5 not + // const auto r = p(tag::Vel{}, tag::X{}) < 0; + + // quarter of the particles, 5 taken, 15 not + // const auto r = p(tag::Vel{}, tag::X{}) < 0 && p(tag::Vel{}, tag::Y{}) < 0; + + // half the particles, 3 taken, 3 not, 7 taken, t not + // const auto r = (p(tag::Vel{}, tag::X{}) + p(tag::Vel{}, tag::Y{})) < 0; + + // half the particles, 7 taken, 2 not, 4 taken, 6 not, 6 taken, 4 not, 2 taken, 8 not + // const auto r = (p(tag::Vel{}, tag::X{}) + p(tag::Vel{}, tag::Y{}) + p(tag::Vel{}, tag::Z{})) < 0; + + // 388/1024 of the particles + const auto r = (p(tag::Vel{}, tag::X{}) + p(tag::Vel{}, tag::Y{}) + p(tag::Vel{}, tag::Z{})) < -0.6; + + // c += r; + // printf("predicate %i\n", (int) r); + return r; +#endif + } +}; + +struct InitOne +{ + LLAMA_FN_HOST_ACC_INLINE auto operator()(int i) + { + const auto fx = static_cast(i % 10) / 9.0f - 0.5f; + const auto fy = static_cast(i % 20) / 19.0f - 0.5f; + const auto fz = static_cast(i % 40) / 39.0f - 0.5f; +#ifdef USE_TRACK + llama::One p; + p = static_cast(fx + fy + fz); + p(tag::dir{}, llama::RecordCoord<0>{}) = fx; + p(tag::dir{}, llama::RecordCoord<1>{}) = fy; + p(tag::dir{}, llama::RecordCoord<2>{}) = fz; + return p; +#else + llama::One p; + p(tag::Vel{}, tag::X{}) = fx; + p(tag::Vel{}, tag::Y{}) = fy; + p(tag::Vel{}, tag::Z{}) = fz; + p(tag::Pos{}, tag::X{}) = fx; + p(tag::Pos{}, tag::Y{}) = fy; + p(tag::Pos{}, tag::Z{}) = fz; + p(tag::Mass{}) = fx + fy + fz; + return p; +#endif + } +}; + +struct GetMass +{ + template + LLAMA_FN_HOST_ACC_INLINE auto operator()(VirtualPoint&& p) + { +#ifdef USE_TRACK + return p(tag::energy{}); +#else + return p(tag::Mass{}); +#endif + } +}; + +void checkError(cudaError_t code) +{ + if(code != cudaSuccess) + throw std::runtime_error(std::string{"CUDA Error: "} + cudaGetErrorString(code)); +} + +void syncWithCuda() +{ + if constexpr(thrustUsesCuda) + checkError(cudaDeviceSynchronize()); +} + +volatile std::uint32_t sink; +volatile void* sinkp; + +auto thrustDeviceAlloc = [](auto alignment, std::size_t size) +{ + auto* p = thrust::device_malloc(size).get(); + assert(reinterpret_cast(p) & (alignment - 1) == 0); // assert device_malloc sufficiently aligns + return p; +}; + +template +struct IndexToViewIterator +{ + View view; + LLAMA_FN_HOST_ACC_INLINE auto operator()(std::size_t i) + { + return *(view.begin() + i); + } +}; + +template +auto make_view_it(View view, std::size_t i) +{ + auto ci = thrust::counting_iterator{0}; + return thrust::transform_iterator< + IndexToViewIterator, + decltype(ci), + typename View::iterator::reference, + typename View::iterator::value_type>{ci, IndexToViewIterator{std::move(view)}}; +} + +template +void run(std::ostream& plotFile) +{ + auto mappingName = []() -> std::string + { + if(Mapping == 0) + return "AoS"; + if(Mapping == 1) + return "SoA"; + if(Mapping == 2) + return "SoA MB"; + if(Mapping == 3) + return "AoSoA8"; + if(Mapping == 4) + return "AoSoA16"; + if(Mapping == 5) + return "AoSoA32"; + if(Mapping == 6) + return "Split AoS"; + std::abort(); + }(); + auto mapping = [&] + { + auto extents = llama::ArrayExtents{N}; + if constexpr(Mapping == 0) + return llama::mapping::AoS{extents, RecordDim{}}; + if constexpr(Mapping == 1) + return llama::mapping::SoA{extents, RecordDim{}}; + if constexpr(Mapping == 2) + return llama::mapping::SoA{extents}; + if constexpr(Mapping == 3) + return llama::mapping::AoSoA{extents}; + if 
constexpr(Mapping == 4) + return llama::mapping::AoSoA{extents}; + if constexpr(Mapping == 5) + return llama::mapping::AoSoA{extents}; + if constexpr(Mapping == 6) + return llama::mapping::Split< + decltype(extents), + RecordDim, +#ifdef USE_TRACK + llama::RecordCoord<10>, // dir +#else + llama::RecordCoord<1>, // Vel +#endif + llama::mapping::BindAoS<>::fn, + llama::mapping::BindAoS<>::fn, + true>{extents}; + }(); + + std::cout << mappingName << '\n'; + + auto view = llama::allocView(mapping, thrustDeviceAlloc); + + auto b = make_view_it(view, 0); + auto e = make_view_it(view, N); + // auto b = view.begin(); + // auto e = view.end(); + + auto r = (*b); + r(tag::eventId{}) = 0; + + // touch memory once before running benchmarks + thrust::fill(thrust::device, b, e, 0); + syncWithCuda(); + + // #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + // thrust::counting_iterator ci(0); + // thrust::transform( + // thrust::device, + // ci, + // ci + N, + // view.begin(), + // [] LLAMA_FN_HOST_ACC_INLINE(std::size_t i) + // { + // thrust::default_random_engine e; + // e.discard(i); + // thrust::uniform_real_distribution d{-1.0f, 1.0f}; + // return d(e); + // }); + // #else + // { + // std::default_random_engine e; + // std::uniform_real_distribution d{-1.0f, 1.0f}; + // for(auto vr : view) + // vr = d(e); + // } + // #endif + + double tabulateTotal = 0; + // double shuffleTotal = 0; + double forEachTotal = 0; + double transformTotal = 0; + double transformScanTotal = 0; + double transformReduceTotal = 0; + double copyTotal = 0; + double copyIfTotal = 0; + double removeIfTotal = 0; + // double sortTotal = 0; + for(auto i = 0; i < REPETITIONS; i++) + { + { + Stopwatch stopwatch; + if constexpr(usePSTL) + { + std::transform( + exec, + thrust::make_counting_iterator(std::size_t{0}), + thrust::make_counting_iterator(N), + view.begin(), + InitOne{}); + } + else + { + thrust::tabulate(thrust::device, b, e, InitOne{}); + syncWithCuda(); + } + tabulateTotal += stopwatch.printAndReset("tabulate", '\t'); + } + + //{ + // // shuffle less data so runtime stays in the same realm as the other benchmarks + // Stopwatch stopwatch; + // if constexpr(usePSTL) + // std::shuffle(view.begin(), view.begin() + N / shuffle_frac, std::default_random_engine{}); + // else + // { + // thrust::shuffle( + // thrust::device, + // view.begin(), + // view.begin() + N / shuffle_frac, + // thrust::default_random_engine{}); + // syncWithCuda(); + // } + // shuffleTotal += stopwatch.printAndReset("shuffle", '\t'); + //} + + { + Stopwatch stopwatch; + if constexpr(usePSTL) + std::for_each(exec, b, e, NormalizeVel{}); + else + { + thrust::for_each( + thrust::device, + thrust::make_counting_iterator(std::size_t{0}), + thrust::make_counting_iterator(N), + [view] __host__ __device__(std::size_t i) mutable { NormalizeVel{}(view[i]); }); + syncWithCuda(); + } + forEachTotal += stopwatch.printAndReset("for_each", '\t'); + } + + using MassType = decltype(GetMass{}(view[0])); + { + thrust::device_vector dst(N); + Stopwatch stopwatch; + if constexpr(usePSTL) + std::transform(exec, b, e, dst.begin(), GetMass{}); + else + { + thrust::transform(thrust::device, b, e, dst.begin(), GetMass{}); + syncWithCuda(); + } + transformTotal += stopwatch.printAndReset("transform", '\t'); + if constexpr(!thrustUsesCuda) + // make sure the compiler does not optimize away the benchmark + sink = std::reduce(dst.begin(), dst.end(), MassType{0}); + } + + { + thrust::device_vector scan_result(N); + Stopwatch stopwatch; + if constexpr(usePSTL) + 
std::transform_exclusive_scan( + exec, + b, + e, + scan_result.begin(), + std::uint32_t{0}, + std::plus<>{}, + Predicate{}); + else + { + thrust::transform_exclusive_scan( + thrust::device, + b, + e, + scan_result.begin(), + Predicate{}, + std::uint32_t{0}, + thrust::plus<>{}); + syncWithCuda(); + } + transformScanTotal += stopwatch.printAndReset("transform_scan", '\t'); + if constexpr(!thrustUsesCuda) + // make sure the compiler does not optimize away the benchmark + sink = std::reduce(scan_result.begin(), scan_result.end(), std::uint32_t{0}); + } + + { + Stopwatch stopwatch; + if constexpr(usePSTL) + sink = std::transform_reduce(exec, b, e, MassType{0}, std::plus<>{}, GetMass{}); + else + { + sink = thrust::transform_reduce(thrust::device, b, e, GetMass{}, MassType{0}, thrust::plus<>{}); + syncWithCuda(); + } + transformReduceTotal += stopwatch.printAndReset("transform_reduce", '\t'); + } + + { + auto dstView = llama::allocView(mapping, thrustDeviceAlloc); + auto db = make_view_it(dstView, 0); + Stopwatch stopwatch; + if constexpr(usePSTL) + std::copy(exec, b, e, db); + else + { + thrust::copy(thrust::device, b, e, db); + syncWithCuda(); + } + copyTotal += stopwatch.printAndReset("copy", '\t'); + for(std::byte* blob : dstView.storageBlobs) + thrust::device_free(thrust::device_ptr{blob}); + } + + { + auto dstView = llama::allocView(mapping, thrustDeviceAlloc); + auto db = make_view_it(dstView, 0); + Stopwatch stopwatch; + if constexpr(usePSTL) + std::copy_if(exec, b, e, db, Predicate{}); + else + { + thrust::copy_if(thrust::device, b, e, db, Predicate{}); + syncWithCuda(); + } + copyIfTotal += stopwatch.printAndReset("copy_if", '\t'); + for(std::byte* blob : dstView.storageBlobs) + thrust::device_free(thrust::device_ptr{blob}); + } + + { + Stopwatch stopwatch; + if constexpr(usePSTL) + std::remove_if(exec, b, e, Predicate{}); + else + { + thrust::remove_if(thrust::device, b, e, Predicate{}); + syncWithCuda(); + } + removeIfTotal += stopwatch.printAndReset("remove_if", '\t'); + } + + //{ + // Stopwatch stopwatch; + // if constexpr(usePSTL) + // std::sort(std::execution::par, b, e, Less{}); + // else + // { + // thrust::sort(thrust::device, b, e, Less{}); + // syncWithCuda(); + // } + // sortTotal += stopwatch.printAndReset("sort", '\t'); + // if(!thrust::is_sorted(thrust::device, b, e, Less{})) + // std::cerr << "VALIDATION FAILED\n"; + //} + + std::cout << std::endl; + } + plotFile << std::quoted(mappingName) << '\t'; + plotFile << tabulateTotal / REPETITIONS << '\t'; + // plotFile << shuffleTotal / REPETITIONS << '\t'; + plotFile << forEachTotal / REPETITIONS << '\t'; + plotFile << transformTotal / REPETITIONS << '\t'; + plotFile << transformScanTotal / REPETITIONS << '\t'; + plotFile << transformReduceTotal / REPETITIONS << '\t'; + plotFile << copyTotal / REPETITIONS << '\t'; + plotFile << copyIfTotal / REPETITIONS << '\t'; + plotFile << removeIfTotal / REPETITIONS << '\n'; + // plotFile << sortTotal / REPETITIONS << '\n'; + + if constexpr(!thrustUsesCuda) + // make sure the compiler does not optimize away the benchmark + for(std::byte* blob : view.storageBlobs) + sinkp = reinterpret_cast(blob); + + for(std::byte* blob : view.storageBlobs) + thrust::device_free(thrust::device_ptr{blob}); +} + +auto main(int argc, char* argv[]) -> int +try +{ + const auto plotFileName = argc == 1 ? 
"thrust.sh" : argv[1]; + + const auto deviceName = [] + { + if constexpr(usePSTL) + { + if constexpr(std::is_same_v) + return "STL"; + else + return "PSTL"; + } + else + return "thrust/" BOOST_PP_STRINGIZE(__THRUST_DEVICE_SYSTEM_NAMESPACE); + }(); + + fmt::print( + "{}Mi particles ({}MiB) with {}\n", + N / 1024 / 1024, + N * llama::sizeOf / 1024 / 1024, + deviceName); + + // static_assert(llama::mapping::MinAlignedOne, Particle>{}.blobSize(0) == 16); + //[[maybe_unused]] const auto v = llama::allocViewStack<0, Particle>(); + // static_assert(sizeof(v.storageBlobs) == 16); + // static_assert(sizeof(v) == 16); + //[[maybe_unused]] const auto p = llama::One{}; + // static_assert(sizeof(p) == 16); + + // static_assert(sizeof(Struct) == 4); + // std::cout << "alignof One " << alignof(llama::One) << "\n"; + // static_assert(alignof(llama::One) == 4); + + //{ + // Stopwatch stopwatch; + // thrust::device_vector v(N); + // thrust::sort(thrust::device, v.begin(), v.end()); + // stopwatch.printAndReset("Struct sort"); + //} + //{ + // Stopwatch stopwatch; + // thrust::device_vector> v(N); + // thrust::sort(thrust::device, v.begin(), v.end(), Less{}); + // stopwatch.printAndReset("One sort"); + //} + + const auto numThreads = static_cast(omp_get_max_threads()); + const char* affinity = std::getenv("GOMP_CPU_AFFINITY"); // NOLINT(concurrency-mt-unsafe) + affinity = affinity == nullptr ? "NONE - PLEASE PIN YOUR THREADS!" : affinity; + + std::ofstream plotFile{plotFileName}; + plotFile.exceptions(std::ios::badbit | std::ios::failbit); + plotFile << fmt::format( + R"aa(#!/usr/bin/env -S gnuplot -p +# threads: {} affinity: {} +set title "thrust {}Mi particles ({}MiB) with {} on {}" +set style data histograms +set style fill solid +set xtics rotate by 45 right +set key out top center maxrows 3 +set yrange [0:*] +set ylabel "runtime [s]" +$data << EOD +"" "tabulate" "for\\\_each" "transform" "transform\\\_scan" "transform\\\_reduce" "copy" "copy\\\_if" "remove\\\_if" +)aa", + numThreads, + affinity, + N / 1024 / 1024, + N * llama::sizeOf / 1024 / 1024, + deviceName, + common::hostname(), + shuffle_frac); + + run<0>(plotFile); + run<1>(plotFile); + run<2>(plotFile); + run<3>(plotFile); + run<4>(plotFile); + run<5>(plotFile); + run<6>(plotFile); + + plotFile << R"(EOD +plot for [COL=2:9] $data using COL:xtic(1) ti col +)"; + fmt::print("Plot with: ./{}\n", plotFileName); +} +catch(const std::exception& e) +{ + std::cerr << "Exception: " << e.what() << '\n'; +} diff --git a/include/llama/RecordRef.hpp b/include/llama/RecordRef.hpp index 77bf346601..8f2212bf50 100644 --- a/include/llama/RecordRef.hpp +++ b/include/llama/RecordRef.hpp @@ -13,6 +13,15 @@ #include #include +namespace thrust +{ + template + class reference; + + template + class device_reference; +} // namespace thrust + namespace llama { LLAMA_EXPORT @@ -79,6 +88,23 @@ namespace llama return left; } + template + LLAMA_FN_HOST_ACC_INLINE auto virtualRecordArithOperator( + LeftRecord& left, + const thrust::reference& right) -> LeftRecord& + { + return virtualRecordArithOperator(left, *(&right).get()); + } + + template + LLAMA_FN_HOST_ACC_INLINE auto virtualRecordArithOperator( + LeftRecord& left, + const thrust::device_reference& right) -> LeftRecord& + { + // printf("thrust::device_reference\n"); + return virtualRecordArithOperator(left, *(&right).get()); + } + template LLAMA_FN_HOST_ACC_INLINE auto recordRefArithOperator(LeftRecord& left, const T& right) -> LeftRecord& { diff --git a/thirdparty/thrust b/thirdparty/thrust new file mode 160000 index 
0000000000..65ec0ac7b4
--- /dev/null
+++ b/thirdparty/thrust
@@ -0,0 +1 @@
+Subproject commit 65ec0ac7b4746e30bff383008456bac4abec691f
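
Note for reviewers: in the rendering above, everything that sat inside single-line angle brackets was lost (the #include targets, most template arguments, and the condition of the CMake generator expression in examples/thrust/CMakeLists.txt, which is presumably $<COMPILE_LANGUAGE:CUDA> given the nvcc-only flags it guards). The most load-bearing of these gaps is the record dimension. The sketch below restores the simple Particle record with types inferred from how the fields are accessed in InitOne and NormalizeVel (float components, float mass); the Track field types (ids, energies, the Vec3D components) are not recoverable from what is visible here and are therefore not reconstructed.

// Sketch, not verbatim from the patch: Particle record dimension with the
// template arguments restored from how the fields are used in thrust.cu.
#include <llama/llama.hpp>

namespace tag
{
    struct Pos{};
    struct Vel{};
    struct X{};
    struct Y{};
    struct Z{};
    struct Mass{};
} // namespace tag

using Vec3 = llama::Record<
    llama::Field<tag::X, float>,
    llama::Field<tag::Y, float>,
    llama::Field<tag::Z, float>>;

using Particle = llama::Record<
    llama::Field<tag::Pos, Vec3>,
    llama::Field<tag::Vel, Vec3>,
    llama::Field<tag::Mass, float>>;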
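
The mapping factory in run() suffered the same stripping: cases 3-5 render as identical AoSoA constructions although they are reported as "AoSoA8", "AoSoA16" and "AoSoA32", and case 2 ("SoA MB") presumably differs from case 1 only by a one-blob-per-field flag whose exact spelling depends on the LLAMA version. A sketch of the AoSoA cases with the lane counts restored, assuming the signature AoSoA<ArrayExtents, RecordDim, Lanes>:

// Sketch only: lane counts restored to match the printed mapping names.
#include <llama/llama.hpp>

template<int Mapping, typename RecordDim, typename ArrayExtents>
auto makeAoSoAMapping(ArrayExtents extents)
{
    if constexpr(Mapping == 3)
        return llama::mapping::AoSoA<ArrayExtents, RecordDim, 8>{extents};
    else if constexpr(Mapping == 4)
        return llama::mapping::AoSoA<ArrayExtents, RecordDim, 16>{extents};
    else
        return llama::mapping::AoSoA<ArrayExtents, RecordDim, 32>{extents};
}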
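
Two small functional nits that are independent of the rendering: (1) in thrustDeviceAlloc, the expression reinterpret_cast<...>(p) & (alignment - 1) == 0 parses as p & ((alignment - 1) == 0) because == binds tighter than &, so the assert does not check alignment as the comment claims (it only goes unnoticed where NDEBUG disables it); (2) the #else branch of NormalizeVel::operator(), dead code while USE_TRACK is defined, still contains a return p; although the operator is declared void. A sketch of the allocator with the check parenthesized as intended; the cast type and the device_malloc element type are assumptions:

// Sketch: alignment check with explicit parentheses; std::uintptr_t and
// std::byte are assumed, not taken from the patch.
#include <thrust/device_malloc.h>

#include <cassert>
#include <cstddef>
#include <cstdint>

auto thrustDeviceAlloc = [](auto alignment, std::size_t size)
{
    std::byte* p = thrust::device_malloc<std::byte>(size).get();
    // assert that device_malloc aligns sufficiently
    assert((reinterpret_cast<std::uintptr_t>(p) & (alignment - 1)) == 0);
    return p;
};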
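
The transform_exclusive_scan benchmark is easy to misread because the std:: and thrust:: overloads take their arguments in different orders; both branches in the patch are consistent with their respective signatures (std:: expects init, binary_op, unary_op; thrust:: expects unary_op, init, binary_op). A minimal host-side illustration:

// Both calls compute the same exclusive prefix sum of pred(x); only the
// argument order differs between the standard library and thrust.
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/transform_scan.h>

#include <cassert>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
    const std::vector<int> in{3, 1, 4, 1, 5};
    std::vector<int> a(in.size());
    std::vector<int> b(in.size());
    auto pred = [](int x) { return x % 2; };

    // std:: order: init, binary_op, unary_op
    std::transform_exclusive_scan(in.begin(), in.end(), a.begin(), 0, std::plus<int>{}, pred);

    // thrust:: order: unary_op, init, binary_op
    thrust::transform_exclusive_scan(thrust::host, in.begin(), in.end(), b.begin(), pred, 0, thrust::plus<int>{});

    assert(a == b);
}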
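
Finally, the RecordRef.hpp hunk: the forward declarations and the two new overloads also lost their template parameter lists. Their shape is presumably as in the sketch below, matching thrust's reference<Element, Pointer, Derived> and device_reference<T> class templates; the parameter names are chosen here, not taken from the patch, and the surrounding llama header is assumed to provide LLAMA_FN_HOST_ACC_INLINE and the generic virtualRecordArithOperator that these overloads forward to.

// Sketch of the new overloads with plausible template parameter lists.
namespace thrust
{
    template<typename Element, typename Pointer, typename Derived>
    class reference;

    template<typename T>
    class device_reference;
} // namespace thrust

namespace llama
{
    template<typename LeftRecord, typename Element, typename Pointer, typename Derived>
    LLAMA_FN_HOST_ACC_INLINE auto virtualRecordArithOperator(
        LeftRecord& left,
        const thrust::reference<Element, Pointer, Derived>& right) -> LeftRecord&
    {
        // unwrap the proxy reference via its wrapped pointer and recurse
        return virtualRecordArithOperator(left, *(&right).get());
    }

    template<typename LeftRecord, typename T>
    LLAMA_FN_HOST_ACC_INLINE auto virtualRecordArithOperator(
        LeftRecord& left,
        const thrust::device_reference<T>& right) -> LeftRecord&
    {
        return virtualRecordArithOperator(left, *(&right).get());
    }
} // namespace llama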