Skip to content

Commit

Permalink
ROCm WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
nazar-pc committed Sep 16, 2024
1 parent 62a015d commit 470cc04
Show file tree
Hide file tree
Showing 6 changed files with 286 additions and 9 deletions.
3 changes: 1 addition & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions shared/subspace-proof-of-space-gpu/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ include = [
blst = { version = "0.3.13", optional = true }
rust-kzg-blst = { git = "https://github.com/grandinetech/rust-kzg", rev = "6c8fcc623df3d7e8c0f30951a49bfea764f90bf4", default-features = false, optional = true }
# TODO: This is `rocm` branch, it is needed for ROCm support
#sppark = { git = "https://github.com/dot-asm/sppark", rev = "8eeafe0f6cc0ca8211b1be93922df1b5a118bbd2", optional = true }
sppark = { version = "0.1.8", optional = true }
sppark = { git = "https://github.com/dot-asm/sppark", rev = "fe1237fe9eabb8aeb48a21af4d439fb4ac4f5d5d", optional = true }
#sppark = { version = "0.1.8", optional = true }
subspace-core-primitives = { version = "0.1.0", path = "../../crates/subspace-core-primitives", default-features = false, optional = true }

[dev-dependencies]
Expand All @@ -31,7 +31,7 @@ cc = "1.1.15"
[features]
# Only Volta+ architectures are supported (GeForce RTX 20xx consumer GPUs and newer)
cuda = ["_gpu"]
# TODO: ROCm can't be enabled at the same time as `cuda` feature at the moment and is not exposed on library level
# TODO: ROCm can't be enabled at the same time as `cuda` feature at the moment
rocm = ["_gpu"]
# Internal feature, shouldn't be used directly
_gpu = [
Expand Down
10 changes: 6 additions & 4 deletions shared/subspace-proof-of-space-gpu/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,10 @@ fn main() {
hipcc.compiler(env::var("HIPCC").unwrap_or("hipcc".to_string()));
hipcc.cpp(true);
if cfg!(debug_assertions) {
hipcc.opt_level(1);
hipcc.opt_level(2);
}
hipcc.flag("--offload-arch=native,gfx1100,gfx1030,gfx942,gfx90a,gfx908");
// 6 corresponds to the number of offload-arch
hipcc.flag("-parallel-jobs=6");
hipcc.flag("--offload-arch=gfx1100,gfx1030,gfx942,gfx90a,gfx908");
// hipcc.flag("--offload-device-only");
// This controls how error strings get handled in the FFI. When defined error strings get
// returned from the FFI, and Rust must then free them. When not defined error strings are
// not returned.
Expand All @@ -35,6 +34,9 @@ fn main() {
hipcc.flag("-include").flag("util/cuda2hip.hpp");
}
hipcc.file("src/subspace_api.cu").compile("subspace_rocm");

// Doesn't link otherwise
println!("cargo::rustc-link-lib=amdhip64");
}

if cfg!(feature = "cuda") {
Expand Down
2 changes: 2 additions & 0 deletions shared/subspace-proof-of-space-gpu/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
#[cfg(feature = "cuda")]
pub mod cuda;
#[cfg(feature = "rocm")]
pub mod rocm;
190 changes: 190 additions & 0 deletions shared/subspace-proof-of-space-gpu/src/rocm.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#[cfg(test)]
mod tests;

use rust_kzg_blst::types::fr::FsFr;
use std::ops::DerefMut;
use subspace_core_primitives::crypto::Scalar;
use subspace_core_primitives::{PosProof, PosSeed, Record};

extern "C" {
/// Foreign function reporting how many GPUs are available.
///
/// # Returns
/// * `usize` - The number of available GPUs.
fn gpu_count() -> usize;

/// Foreign entry point that generates proofs of space and encodes a record on a single GPU.
///
/// # Parameters
/// * `k`: The size parameter for the table.
/// * `seed`: A pointer to the seed data.
/// * `lg_record_size`: The logarithm of the record size.
/// * `challenge_index`: A mutable pointer to store the index of the challenge.
/// * `record`: A pointer to the record data.
/// * `chunks_scratch`: A mutable pointer to a scratch space for chunk data.
/// * `proof_count`: A mutable pointer to store the count of proofs.
/// * `parity_record_chunks`: A mutable pointer to the parity record chunks.
/// * `gpu_id`: The ID of the GPU to use.
///
/// NOTE(review): earlier revisions of this comment also listed a `source_record_chunks`
/// parameter; it is not part of this declaration. Confirm the declaration matches the native
/// signature.
///
/// # Returns
/// * `sppark::Error` - An error code indicating the result of the operation.
///
/// # Assumptions
/// * `seed` must be a valid pointer to a 32-byte seed.
/// * `record` must be a valid pointer to the record data, with a length of
///   `1 << lg_record_size` 32-byte chunks.
/// * `parity_record_chunks` must be a valid mutable pointer to `1 << lg_record_size` scalar
///   elements (`FsFr`).
/// * `chunks_scratch` must be a valid mutable pointer where the 32-byte chunks of GPU-calculated
///   data will be written, one per reported proof.
///   NOTE(review): the upper bound on the number of proofs is not visible in this declaration;
///   presumably it is the number of challenges (s-buckets) - confirm against the native side.
/// * `challenge_index` presumably needs room for the same number of `u32` entries as
///   `chunks_scratch` has chunks - TODO confirm.
/// * `gpu_id` must be a valid identifier of an available GPU. The available GPUs can be
///   determined by using the `gpu_count` function.
fn generate_and_encode_pospace_dispatch(
k: u32,
seed: *const [u8; 32],
lg_record_size: u32,
challenge_index: *mut u32,
record: *const [u8; 32],
chunks_scratch: *mut [u8; 32],
proof_count: *mut u32,
parity_record_chunks: *mut FsFr,
gpu_id: i32,
) -> sppark::Error;
}

/// Returns [`RocmDevice`] for each available device
///
/// Devices are numbered consecutively starting from `0`, up to the count reported by the native
/// library.
pub fn rocm_devices() -> Vec<RocmDevice> {
    // SAFETY: the foreign function takes no arguments and has no documented preconditions.
    let num_devices = unsafe { gpu_count() };

    (0i32..)
        .take(num_devices)
        .map(|gpu_id| RocmDevice { gpu_id })
        .collect()
}

/// Backward-compatible alias of [`rocm_devices`].
///
/// The function was originally published under this name even though it enumerates ROCm
/// devices; prefer [`rocm_devices`] in new code.
pub fn cuda_devices() -> Vec<RocmDevice> {
    rocm_devices()
}

/// Wrapper data structure encapsulating a single ROCm-capable device
#[derive(Debug)]
pub struct RocmDevice {
// Device index that is handed to the native library as `gpu_id`
gpu_id: i32,
}

impl RocmDevice {
    /// ROCm device ID
    pub fn id(&self) -> i32 {
        self.gpu_id
    }

    /// Generates proofs of space on this GPU and encodes `record` in place.
    ///
    /// The native library is called with `PosProof::K`, the seed and the record. It reports the
    /// number of proofs found and, for every proof, a challenge index together with a 32-byte
    /// GPU-calculated chunk, as well as the parity chunks of the record. The proof chunks are
    /// then sorted by challenge index and limited to `Record::NUM_CHUNKS` entries.
    ///
    /// # Parameters
    ///
    /// - `seed`: Seed passed to the native library for table generation.
    /// - `record`: On input the source record chunks. On success it is overwritten: first with
    ///   the GPU-calculated chunks in challenge-index order, then with the interleaved
    ///   source/parity chunks whose challenge index has no GPU-calculated chunk, until the record
    ///   is full.
    /// - `encoded_chunks_used_output`: Receives, in order, one flag per challenge index telling
    ///   whether a GPU-calculated chunk was used for it. Flags are written for as many items as
    ///   the iterator yields, up to `Record::NUM_S_BUCKETS`.
    ///
    /// # Errors
    ///
    /// Returns a textual error when:
    /// - the number of challenges does not fit into `u32`,
    /// - the native library reports a non-zero error code,
    /// - the native library reports more proofs than there is buffer space for, or
    /// - the native library returns a challenge index outside of the valid range.
    pub fn generate_and_encode_pospace(
        &self,
        seed: &PosSeed,
        record: &mut Record,
        encoded_chunks_used_output: impl ExactSizeIterator<Item = impl DerefMut<Target = bool>>,
    ) -> Result<(), String> {
        let record_len = Record::NUM_CHUNKS;
        let challenge_len = Record::NUM_S_BUCKETS;
        let lg_record_size = record_len.ilog2();

        if challenge_len > u32::MAX as usize {
            return Err(String::from("challenge_len is too large to fit in u32"));
        }

        let mut proof_count = 0u32;
        // Output buffers are only reserved here; their lengths are set after the native call
        // reports how much was actually written.
        let mut chunks_scratch_gpu = Vec::<[u8; Scalar::FULL_BYTES]>::with_capacity(challenge_len);
        let mut challenge_index_gpu = Vec::<u32>::with_capacity(challenge_len);
        let mut parity_record_chunks = Vec::<Scalar>::with_capacity(Record::NUM_CHUNKS);

        // SAFETY: all pointers originate from live allocations owned by this function (or by the
        // caller in case of `seed`/`record`) that outlive the call; output buffers have capacity
        // for `challenge_len` proofs and `Record::NUM_CHUNKS` parity chunks respectively, which is
        // what the contract documented on the extern declaration asks for.
        let error = unsafe {
            generate_and_encode_pospace_dispatch(
                u32::from(PosProof::K),
                &**seed,
                lg_record_size,
                challenge_index_gpu.as_mut_ptr(),
                record.as_ptr(),
                chunks_scratch_gpu.as_mut_ptr(),
                &mut proof_count,
                Scalar::slice_mut_to_repr(&mut parity_record_chunks).as_mut_ptr(),
                self.gpu_id,
            )
        };

        if error.code != 0 {
            return Err(error.to_string());
        }

        let proof_count = proof_count as usize;
        // Never trust the reported count beyond what was allocated: `set_len` past the capacity
        // would be undefined behavior.
        if proof_count > challenge_len {
            return Err(format!(
                "GPU reported {proof_count} proofs, which exceeds the maximum of {challenge_len}"
            ));
        }

        // SAFETY: `proof_count` was checked against the capacity of both proof buffers above and
        // `parity_record_chunks` was allocated with capacity `Record::NUM_CHUNKS`; the native
        // call succeeded, so it is expected to have initialized that many elements.
        unsafe {
            chunks_scratch_gpu.set_len(proof_count);
            challenge_index_gpu.set_len(proof_count);
            parity_record_chunks.set_len(Record::NUM_CHUNKS);
        }

        let mut encoded_chunks_used = vec![false; challenge_len];
        let source_record_chunks = record.to_vec();

        let mut chunks_scratch = challenge_index_gpu
            .into_iter()
            .zip(chunks_scratch_gpu)
            .collect::<Vec<_>>();

        chunks_scratch.sort_unstable_by_key(|(out_index, _chunk)| *out_index);

        // We don't need all the proofs
        chunks_scratch.truncate(proof_count.min(Record::NUM_CHUNKS));

        for (out_index, _chunk) in &chunks_scratch {
            // Indices come from the GPU, report a malformed one instead of panicking
            let used = encoded_chunks_used
                .get_mut(*out_index as usize)
                .ok_or_else(|| format!("GPU returned out-of-range challenge index {out_index}"))?;
            *used = true;
        }

        encoded_chunks_used_output
            .zip(&encoded_chunks_used)
            .for_each(|(mut output, input)| *output = *input);

        // Source and parity chunks interleaved, keeping only those whose challenge index did not
        // get a GPU-calculated chunk
        let unused_chunks = source_record_chunks
            .into_iter()
            .zip(parity_record_chunks)
            .flat_map(|(a, b)| [a, b.to_bytes()])
            .zip(encoded_chunks_used.iter())
            // Skip chunks that were used previously
            .filter_map(|(record_chunk, encoded_chunk_used)| {
                (!*encoded_chunk_used).then_some(record_chunk)
            });

        record
            .iter_mut()
            .zip(
                chunks_scratch
                    .into_iter()
                    .map(|(_out_index, chunk)| chunk)
                    .chain(unused_chunks),
            )
            .for_each(|(output_chunk, input_chunk)| {
                *output_chunk = input_chunk;
            });

        Ok(())
    }
}
84 changes: 84 additions & 0 deletions shared/subspace-proof-of-space-gpu/src/rocm/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
use crate::rocm::rocm_devices;
use std::num::NonZeroUsize;
use std::slice;
use subspace_core_primitives::crypto::{blake3_254_hash_to_scalar, blake3_hash};
use subspace_core_primitives::{HistorySize, PieceOffset, Record, SectorId};
use subspace_erasure_coding::ErasureCoding;
use subspace_farmer_components::plotting::{CpuRecordsEncoder, RecordsEncoder};
use subspace_farmer_components::sector::SectorContentsMap;
use subspace_proof_of_space::chia::ChiaTable;
use subspace_proof_of_space::Table;

type PosTable = ChiaTable;

// Encodes the same two records with the CPU encoder and with the first available ROCm device and
// checks that both the encoded records and the sector contents maps are equal.
#[test]
fn basic() {
// The test needs real hardware: it panics if no ROCm device is reported
let rocm_device = rocm_devices()
.into_iter()
.next()
.expect("Need ROCm device to run this test");

// CPU encoder setup, used as the reference implementation
let mut table_generator = PosTable::generator();
let erasure_coding = ErasureCoding::new(
NonZeroUsize::new(Record::NUM_S_BUCKETS.next_power_of_two().ilog2() as usize)
.expect("Not zero; qed"),
)
.unwrap();
let global_mutex = Default::default();
let mut cpu_records_encoder = CpuRecordsEncoder::<PosTable>::new(
slice::from_mut(&mut table_generator),
&erasure_coding,
&global_mutex,
);

let sector_id = SectorId::new(blake3_hash(b"hello"), 500);
let history_size = HistorySize::ONE;
// Deterministic record contents: every chunk is a scalar derived from hashing its own index
let mut record = Record::new_boxed();
record.iter_mut().enumerate().for_each(|(index, chunk)| {
*chunk = blake3_254_hash_to_scalar(&index.to_le_bytes()).to_bytes()
});

// Two copies of the record encoded on the CPU in one call
let mut cpu_encoded_records = Record::new_zero_vec(2);
for cpu_encoded_record in &mut cpu_encoded_records {
cpu_encoded_record.clone_from(&record);
}
let cpu_sector_contents_map = cpu_records_encoder
.encode_records(
&sector_id,
&mut cpu_encoded_records,
history_size,
&Default::default(),
)
.unwrap();

// The same two copies encoded on the GPU, one record per call
let mut gpu_encoded_records = Record::new_zero_vec(2);
for gpu_encoded_record in &mut gpu_encoded_records {
gpu_encoded_record.clone_from(&record);
}
let mut gpu_sector_contents_map = SectorContentsMap::new(2);
// First record: seed derived for piece offset zero, flags written into the first bitfield.
// NOTE(review): presumably this mirrors how the CPU encoder derives per-record seeds - confirm.
rocm_device
.generate_and_encode_pospace(
&sector_id.derive_evaluation_seed(PieceOffset::ZERO, history_size),
&mut gpu_encoded_records[0],
gpu_sector_contents_map
.iter_record_bitfields_mut()
.next()
.unwrap()
.iter_mut(),
)
.unwrap();
// Second record: seed derived for piece offset one, flags written into the second bitfield
rocm_device
.generate_and_encode_pospace(
&sector_id.derive_evaluation_seed(PieceOffset::ONE, history_size),
&mut gpu_encoded_records[1],
gpu_sector_contents_map
.iter_record_bitfields_mut()
.nth(1)
.unwrap()
.iter_mut(),
)
.unwrap();

// GPU results must match the CPU reference exactly
assert!(cpu_sector_contents_map == gpu_sector_contents_map);
assert!(cpu_encoded_records == gpu_encoded_records);
}

0 comments on commit 470cc04

Please sign in to comment.