diff --git a/shared/subspace-proof-of-space-gpu/README.md b/shared/subspace-proof-of-space-gpu/README.md index 3f4a6af9aa..ed437f5e01 100644 --- a/shared/subspace-proof-of-space-gpu/README.md +++ b/shared/subspace-proof-of-space-gpu/README.md @@ -3,6 +3,8 @@ This crate exposes some low-level primitives to accelerate proof of space implementation on Nvidia (CUDA, Volta+) and AMD (ROCm) GPUs. +The implementation expects 64KB of shared memory and about 1.1GB of VRAM on GPU. + ## Build requirements ### CUDA diff --git a/shared/subspace-proof-of-space-gpu/build.rs b/shared/subspace-proof-of-space-gpu/build.rs index 2d1d619239..0e9d891da8 100644 --- a/shared/subspace-proof-of-space-gpu/build.rs +++ b/shared/subspace-proof-of-space-gpu/build.rs @@ -26,6 +26,9 @@ fn main() { hipcc.flag("--offload-arch=native,gfx1100,gfx1030,gfx942,gfx90a,gfx908"); // 6 corresponds to the number of offload-arch hipcc.flag("-parallel-jobs=6"); + // This controls how error strings get handled in the FFI. When defined error strings get + // returned from the FFI, and Rust must then free them. When not defined error strings are + // not returned. hipcc.define("TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE", None); if let Some(include) = env::var_os("DEP_SPPARK_ROOT") { hipcc.include(include); @@ -41,6 +44,9 @@ fn main() { if target_env != "msvc" { nvcc.flag("-Xcompiler").flag("-Wno-unused-function"); } + // This controls how error strings get handled in the FFI. When defined error strings get + // returned from the FFI, and Rust must then free them. When not defined error strings are + // not returned. nvcc.define("TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE", None); if let Some(include) = env::var_os("DEP_BLST_C_SRC") { nvcc.include(include); diff --git a/shared/subspace-proof-of-space-gpu/src/find_matches.cuh b/shared/subspace-proof-of-space-gpu/src/find_matches.cuh index 85769ca589..af79b36e67 100644 --- a/shared/subspace-proof-of-space-gpu/src/find_matches.cuh +++ b/shared/subspace-proof-of-space-gpu/src/find_matches.cuh @@ -39,6 +39,9 @@ __global__ __launch_bounds__(1024) void find_matches(uint2* out, const uint2* ys, const uint2* histogram, uint32_t* global_match_count) { + // This constant is based on the likely number of elements in each bucket. Generally it will not exceed 512, which + // means a single call will process the bucket. When it does exceed 512 the remaining elements are processed in the + // next pass. This is a heuristic and not expected to impact performance significantly either way. const uint32_t right_bucket_step = 512; const uint32_t warp_count = block_sz / WARP_SZ; const uint32_t warp_match_threshold = 4 * WARP_SZ;