Skip to content

Commit

Permalink
ROCm WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
nazar-pc committed Sep 16, 2024
1 parent 62a015d commit 100a02b
Show file tree
Hide file tree
Showing 12 changed files with 728 additions and 13 deletions.
3 changes: 1 addition & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

55 changes: 51 additions & 4 deletions Dockerfile-farmer
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,39 @@ RUN \
curl -OL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/$CUDA_ARCH/cuda-ubuntu2004.pin && \
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cuda-minimal-build-12-4
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cuda-minimal-build-12-4 && \
echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
ldconfig

# ROCm is only used on x86-64 since the ROCm apt repository doesn't provide packages for other architectures
ARG ROCM_VERSION=6.2
# On x86-64 only: register the repo.radeon.com apt repository (key, source list and pin) and
# install `rocm-hip-runtime-dev`, then add /opt/rocm/lib to the dynamic linker search path.
# `printf` is used for the pin file because whether `echo` expands `\n` depends on the shell.
RUN \
    export PATH=/usr/local/cuda/bin${PATH:+:${PATH}} && \
    export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} && \
    if [ "$(uname -p)" = "x86_64" ]; then \
      DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends gpg && \
      mkdir -p --mode=0755 /etc/apt/keyrings && \
      curl -L https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" > /etc/apt/sources.list.d/rocm.list && \
      printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600\n' > /etc/apt/preferences.d/rocm-pin-600 && \
      apt-get update && \
      DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-hip-runtime-dev && \
      echo "/opt/rocm/lib" >> /etc/ld.so.conf.d/rocm.conf && \
      ldconfig \
    ; fi

# TODO: Remove `NVCC=off` hack once `sppark` has proper features for CUDA and ROCm
# ROCm is only used on x86-64 since the ROCm apt repository doesn't provide packages for other architectures
RUN \
export PATH=/usr/local/cuda/bin:/opt/rocm-$ROCM_VERSION/bin${PATH:+:${PATH}} && \
if [ $(uname -p) = "x86_64" ]; then \
NVCC=off /root/.cargo/bin/cargo -Zgitoxide -Zgit build \
--locked \
-Z build-std \
--profile $PROFILE \
--bin subspace-farmer \
--features rocm \
--target $(uname -p)-unknown-linux-gnu && \
mv target/*/*/subspace-farmer subspace-farmer-rocm \
; fi && \
/root/.cargo/bin/cargo -Zgitoxide -Zgit build \
--locked \
-Z build-std \
Expand All @@ -64,7 +92,26 @@ RUN \

FROM ubuntu:20.04

COPY --from=0 /code/subspace-farmer /subspace-farmer
# Next block is for ROCm support
# ROCm is only used on x86-64 since the ROCm apt repository doesn't provide packages for other architectures
ARG ROCM_VERSION=6.2
# On x86-64 only: register the repo.radeon.com apt repository, install `hip-runtime-amd`, then
# purge the helper packages (curl, ca-certificates, gpg) and the apt lists again, and add
# /opt/rocm/lib to the dynamic linker search path.
# `printf` is used for the pin file because whether `echo` expands `\n` depends on the shell.
RUN \
    if [ "$(uname -p)" = "x86_64" ]; then \
      apt-get update && \
      DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends curl ca-certificates gpg && \
      mkdir -p --mode=0755 /etc/apt/keyrings && \
      curl -L https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" > /etc/apt/sources.list.d/rocm.list && \
      printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600\n' > /etc/apt/preferences.d/rocm-pin-600 && \
      apt-get update && \
      DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends hip-runtime-amd && \
      DEBIAN_FRONTEND=noninteractive apt-get remove -y --purge --autoremove curl ca-certificates gpg && \
      rm -rf /var/lib/apt/lists/* && \
      echo "/opt/rocm/lib" >> /etc/ld.so.conf.d/rocm.conf && \
      ldconfig \
    ; fi

COPY --from=0 /code/subspace-farmer* /

RUN mkdir /var/subspace && chown nobody:nogroup /var/subspace

Expand Down
1 change: 1 addition & 0 deletions crates/subspace-farmer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ cluster = ["dep:async-nats"]
numa = ["dep:hwlocality"]
# Only Volta+ architectures are supported (GeForce RTX 20xx consumer GPUs and newer)
cuda = ["_gpu", "subspace-proof-of-space-gpu/cuda"]
rocm = ["_gpu", "subspace-proof-of-space-gpu/rocm"]
# Internal feature, shouldn't be used directly
_gpu = []

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ use subspace_farmer::cluster::plotter::plotter_service;
use subspace_farmer::plotter::cpu::CpuPlotter;
#[cfg(feature = "cuda")]
use subspace_farmer::plotter::gpu::cuda::CudaRecordsEncoder;
#[cfg(feature = "rocm")]
use subspace_farmer::plotter::gpu::rocm::RocmRecordsEncoder;
#[cfg(feature = "_gpu")]
use subspace_farmer::plotter::gpu::GpuPlotter;
use subspace_farmer::plotter::pool::PoolPlotter;
Expand Down Expand Up @@ -101,6 +103,24 @@ struct CudaPlottingOptions {
cuda_gpus: Option<String>,
}

// CLI options that only affect the ROCm GPU plotter; compiled in only with the `rocm` feature.
// NOTE: the `///` comments on the fields below are rendered by clap as `--help` text.
#[cfg(feature = "rocm")]
#[derive(Debug, Parser)]
struct RocmPlottingOptions {
    /// Defines how many sectors farmer will download concurrently during plotting with ROCm GPU,
    /// allows to limit memory usage of the plotting process, defaults to number of ROCm GPUs found
    /// + 1 to download future sector ahead of time.
    ///
    /// Increase will result in higher memory usage.
    #[arg(long)]
    rocm_sector_downloading_concurrency: Option<NonZeroUsize>,
    /// Specify exact GPUs to be used for plotting instead of using all GPUs (default behavior).
    ///
    /// GPUs are comma-separated: `--rocm-gpus 0,1,3`. Empty string can be specified to disable
    /// ROCm GPU usage.
    #[arg(long)]
    rocm_gpus: Option<String>,
}

/// Arguments for plotter
#[derive(Debug, Parser)]
pub(super) struct PlotterArgs {
Expand All @@ -118,6 +138,10 @@ pub(super) struct PlotterArgs {
#[cfg(feature = "cuda")]
#[clap(flatten)]
cuda_plotting_options: CudaPlottingOptions,
/// Plotting options only used by ROCm GPU plotter
#[cfg(feature = "rocm")]
#[clap(flatten)]
rocm_plotting_options: RocmPlottingOptions,
/// Additional cluster components
#[clap(raw = true)]
pub(super) additional_components: Vec<String>,
Expand All @@ -137,6 +161,8 @@ where
cpu_plotting_options,
#[cfg(feature = "cuda")]
cuda_plotting_options,
#[cfg(feature = "rocm")]
rocm_plotting_options,
additional_components: _,
} = plotter_args;

Expand Down Expand Up @@ -168,6 +194,21 @@ where
modern_plotters.push(Box::new(cuda_plotter));
}
}
#[cfg(feature = "rocm")]
{
let maybe_rocm_plotter = init_rocm_plotter(
rocm_plotting_options,
piece_getter.clone(),
Arc::clone(&global_mutex),
kzg.clone(),
erasure_coding.clone(),
registry,
)?;

if let Some(rocm_plotter) = maybe_rocm_plotter {
modern_plotters.push(Box::new(rocm_plotter));
}
}
{
let cpu_sector_encoding_concurrency = cpu_plotting_options.cpu_sector_encoding_concurrency;
let maybe_cpu_plotters = init_cpu_plotters::<_, PosTableLegacy, PosTable>(
Expand Down Expand Up @@ -401,3 +442,85 @@ where
.map_err(|error| anyhow::anyhow!("Failed to initialize CUDA plotter: {error}"))?,
))
}

/// Builds the ROCm GPU plotter from the CLI options, if there is anything to build it from.
///
/// Device selection:
/// * without `--rocm-gpus`, every device returned by `rocm_devices()` is used;
/// * with an empty `--rocm-gpus`, ROCm plotting is disabled and `Ok(None)` is returned;
/// * otherwise the value is parsed as a comma-separated list of device indices (whitespace around
///   each index is ignored) and only the listed devices are kept. Indices that do not match any
///   device are reported with a warning.
///
/// `Ok(None)` is also returned when no devices remain after selection.
///
/// The downloading semaphore is sized from `rocm_sector_downloading_concurrency`, falling back to
/// the number of used devices + 1.
///
/// # Errors
///
/// Returns an error if an entry of `--rocm-gpus` is not a valid index, if a records encoder can't
/// be created for a device, or if the plotter itself fails to initialize.
#[cfg(feature = "rocm")]
fn init_rocm_plotter<PG>(
    rocm_plotting_options: RocmPlottingOptions,
    piece_getter: PG,
    global_mutex: Arc<AsyncMutex<()>>,
    kzg: Kzg,
    erasure_coding: ErasureCoding,
    registry: &mut Registry,
) -> anyhow::Result<Option<GpuPlotter<PG, RocmRecordsEncoder>>>
where
    PG: PieceGetter + Clone + Send + Sync + 'static,
{
    use std::collections::BTreeSet;
    use subspace_proof_of_space_gpu::rocm::rocm_devices;
    use tracing::{debug, warn};

    let RocmPlottingOptions {
        rocm_sector_downloading_concurrency,
        rocm_gpus,
    } = rocm_plotting_options;

    let mut rocm_devices = rocm_devices();
    // Indices are only tracked for logging which devices ended up being used
    let mut used_rocm_devices = (0..rocm_devices.len()).collect::<Vec<_>>();

    if let Some(rocm_gpus) = rocm_gpus {
        if rocm_gpus.is_empty() {
            info!("ROCm GPU plotting was explicitly disabled");
            return Ok(None);
        }

        // Tolerate whitespace around indices (`0, 1`) and report which entry was invalid instead
        // of surfacing a bare integer parsing error
        let mut rocm_gpus_to_use = rocm_gpus
            .split(',')
            .map(|gpu_index| {
                let gpu_index = gpu_index.trim();
                gpu_index.parse::<usize>().map_err(|error| {
                    anyhow::anyhow!(
                        "Invalid ROCm GPU index {gpu_index:?} in `--rocm-gpus`: {error}"
                    )
                })
            })
            .collect::<anyhow::Result<BTreeSet<usize>>>()?;

        // `remove` returns `true` only for requested indices, so after filtering the set contains
        // exactly the requested indices that didn't match any device
        (used_rocm_devices, rocm_devices) = rocm_devices
            .into_iter()
            .enumerate()
            .filter(|(index, _rocm_device)| rocm_gpus_to_use.remove(index))
            .unzip();

        if !rocm_gpus_to_use.is_empty() {
            warn!(
                ?rocm_gpus_to_use,
                "Some ROCm GPUs were not found on the system"
            );
        }
    }

    if rocm_devices.is_empty() {
        debug!("No ROCm GPU devices found");
        return Ok(None);
    }

    info!(?used_rocm_devices, "Using ROCm GPUs");

    let rocm_downloading_semaphore = Arc::new(Semaphore::new(
        rocm_sector_downloading_concurrency
            .map(|rocm_sector_downloading_concurrency| rocm_sector_downloading_concurrency.get())
            .unwrap_or(rocm_devices.len() + 1),
    ));

    Ok(Some(
        GpuPlotter::new(
            piece_getter,
            rocm_downloading_semaphore,
            // One records encoder per used device, each given a clone of the global mutex handle
            rocm_devices
                .into_iter()
                .map(|rocm_device| RocmRecordsEncoder::new(rocm_device, Arc::clone(&global_mutex)))
                .collect::<Result<_, _>>()
                .map_err(|error| {
                    anyhow::anyhow!("Failed to create ROCm records encoder: {error}")
                })?,
            global_mutex,
            kzg,
            erasure_coding,
            Some(registry),
        )
        .map_err(|error| anyhow::anyhow!("Failed to initialize ROCm plotter: {error}"))?,
    ))
}
Loading

0 comments on commit 100a02b

Please sign in to comment.