Skip to content

Commit

Permalink
Basic changes from Dustin's MVP work (kotekan#1107)
Browse files Browse the repository at this point in the history
This PR is trying to extract the changes to some function signatures and structures from Dustin's MVP development branch (dstn/frb-bf).  This was done by generating a giant "git diff" and then selectively applying changes with "git commit -p", trying to group commits in a sensible way.

Important changes are:
- Cuda: Create a pipeline state object (cudaPipelineState) that is passed to each stage's execute() function, as a way of passing information between stages
- debugging aid: GPU: pass a GPU frame counter that increments each time queue_commands() is called.
- graphviz: record GPU buffers used by each stage, and add them to the graphviz call graph
- Cuda stages: add the option to require a flag to be set in the cudaPipelineState object in order for a stage to run
- GPU memory buffers: add the ability to create GPU memory views, allowing one stage to see only a subset of a memory buffer created by a different stage.  Used in a few places in the MVP to handle rechunking sorts of operations.
- add some float16_t data type handling

And other changes that were hard to extricate from these diffs include:
- cudaBasebandBeamformer: make the 'info' buffer a local buffer (not known to kotekan)
  • Loading branch information
dstndstn authored Sep 12, 2023
1 parent 4ea896e commit 4c57d44
Show file tree
Hide file tree
Showing 46 changed files with 685 additions and 188 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
.kdev_include_paths
*.c~
*.h~
*~
*#
*.swp
*.pyc
/.idea/
Expand Down
4 changes: 4 additions & 0 deletions external/fmt/fmt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
#define FMT_HEADER_ONLY
#endif

#ifndef FMT_OVERRIDE
#define FMT_OVERRIDE override
#endif

// Enable the fmt() macro for compile time string format checking
#define FMT_STRING_ALIAS 1

Expand Down
68 changes: 45 additions & 23 deletions lib/cuda/cudaBasebandBeamformer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,13 @@ cudaBasebandBeamformer::cudaBasebandBeamformer(Config& config, const std::string
_gpu_mem_phase = config.get<std::string>(unique_name, "gpu_mem_phase");
_gpu_mem_output_scaling = config.get<std::string>(unique_name, "gpu_mem_output_scaling");
_gpu_mem_formed_beams = config.get<std::string>(unique_name, "gpu_mem_formed_beams");
_gpu_mem_info = config.get<std::string>(unique_name, "gpu_mem_info");
_gpu_mem_info = unique_name + "/info";

gpu_buffers_used.push_back(std::make_tuple(_gpu_mem_voltage, true, true, false));
gpu_buffers_used.push_back(std::make_tuple(_gpu_mem_phase, true, true, false));
gpu_buffers_used.push_back(std::make_tuple(_gpu_mem_output_scaling, true, true, false));
gpu_buffers_used.push_back(std::make_tuple(_gpu_mem_formed_beams, true, false, true));
gpu_buffers_used.push_back(std::make_tuple(get_name() + "_info", false, true, true));

if (_num_elements != cuda_nelements)
throw std::runtime_error("The num_elements config setting must be "
Expand All @@ -44,7 +50,7 @@ cudaBasebandBeamformer::cudaBasebandBeamformer(Config& config, const std::string
output_len = (size_t)_num_local_freq * _num_beams * _samples_per_data_set * 2;
info_len = (size_t)(threads_x * threads_y * blocks_x * sizeof(int32_t));

command_type = gpuCommandType::KERNEL;
set_command_type(gpuCommandType::KERNEL);

std::vector<std::string> opts = {
"--gpu-name=sm_86",
Expand All @@ -66,24 +72,31 @@ struct CuDeviceArray {
};
typedef CuDeviceArray<int32_t, 1> kernel_arg;

cudaEvent_t cudaBasebandBeamformer::execute(int gpu_frame_id,
cudaEvent_t cudaBasebandBeamformer::execute(cudaPipelineState& pipestate,
const std::vector<cudaEvent_t>& pre_events) {
(void)pre_events;
pre_execute(gpu_frame_id);
pre_execute(pipestate.gpu_frame_id);

void* voltage_memory = device.get_gpu_memory_array(_gpu_mem_voltage, gpu_frame_id, voltage_len);
void* voltage_memory =
device.get_gpu_memory_array(_gpu_mem_voltage, pipestate.gpu_frame_id, voltage_len);
int8_t* phase_memory =
(int8_t*)device.get_gpu_memory_array(_gpu_mem_phase, gpu_frame_id, phase_len);
int32_t* shift_memory =
(int32_t*)device.get_gpu_memory_array(_gpu_mem_output_scaling, gpu_frame_id, shift_len);
(int8_t*)device.get_gpu_memory_array(_gpu_mem_phase, pipestate.gpu_frame_id, phase_len);
int32_t* shift_memory = (int32_t*)device.get_gpu_memory_array(
_gpu_mem_output_scaling, pipestate.gpu_frame_id, shift_len);
void* output_memory =
device.get_gpu_memory_array(_gpu_mem_formed_beams, gpu_frame_id, output_len);
int32_t* info_memory =
(int32_t*)device.get_gpu_memory_array(_gpu_mem_info, gpu_frame_id, info_len);
device.get_gpu_memory_array(_gpu_mem_formed_beams, pipestate.gpu_frame_id, output_len);
int32_t* info_memory = (int32_t*)device.get_gpu_memory(_gpu_mem_info, info_len);

host_info.resize(_gpu_buffer_depth);
for (int i = 0; i < _gpu_buffer_depth; i++)
host_info[i].resize(info_len / sizeof(int32_t));

record_start_event(pipestate.gpu_frame_id);

record_start_event(gpu_frame_id);
// Initialize info_memory return codes
CHECK_CUDA_ERROR(
cudaMemsetAsync(info_memory, 0xff, info_len, device.getStream(cuda_stream_id)));

CUresult err;
// A, E, s, J
const char* exc = "exception";
kernel_arg arr[5];
Expand Down Expand Up @@ -123,16 +136,25 @@ cudaEvent_t cudaBasebandBeamformer::execute(int gpu_frame_id,
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
shared_mem_bytes));

DEBUG("Running CUDA Baseband Beamformer on GPU frame {:d}", gpu_frame_id);
err = cuLaunchKernel(runtime_kernels[kernel_name], blocks_x, blocks_y, 1, threads_x, threads_y,
1, shared_mem_bytes, device.getStream(cuda_stream_id), parameters, NULL);
DEBUG("Running CUDA Baseband Beamformer on GPU frame {:d}", pipestate.gpu_frame_id);
CHECK_CU_ERROR(cuLaunchKernel(runtime_kernels[kernel_name], blocks_x, blocks_y, 1, threads_x,
threads_y, 1, shared_mem_bytes, device.getStream(cuda_stream_id),
parameters, NULL));

if (err != CUDA_SUCCESS) {
const char* errStr;
cuGetErrorString(err, &errStr);
INFO("Error number: {}", err);
ERROR("ERROR IN cuLaunchKernel: {}", errStr);
}
// Copy "info" result code back to host memory
CHECK_CUDA_ERROR(cudaMemcpyAsync(host_info[pipestate.gpu_frame_id].data(), info_memory,
info_len, cudaMemcpyDeviceToHost,
device.getStream(cuda_stream_id)));

return record_end_event(pipestate.gpu_frame_id);
}

return record_end_event(gpu_frame_id);
// Per-frame cleanup: after the base class releases this frame's CUDA events,
// scan the 'info' result codes that execute() copied back from the GPU.
// Any non-zero entry indicates a kernel-reported error at that index.
void cudaBasebandBeamformer::finalize_frame(int gpu_frame_id) {
    cudaCommand::finalize_frame(gpu_frame_id);
    for (size_t i = 0; i < host_info[gpu_frame_id].size(); i++)
        if (host_info[gpu_frame_id][i] != 0)
            // Fixed message: the two adjacent literals previously read "…indicates no" "error)",
            // which concatenated to "noerror)".
            ERROR(
                "cudaBasebandBeamformer returned 'info' value {:d} at index {:d} (zero indicates "
                "no error)",
                host_info[gpu_frame_id][i], i);
}
33 changes: 32 additions & 1 deletion lib/cuda/cudaBasebandBeamformer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,41 @@
*
* Kernel by Kendrick Smith and Erik Schnetter.
* https://github.com/eschnett/GPUIndexSpaces.jl/blob/main/output/bb1.ptx
*
* @par GPU Memory
* @gpu_mem gpu_mem_voltage Input complex voltages of size samples_per_data_set * num_elements *
* num_local_freq
* @gpu_mem_type staging
* @gpu_mem_format Array of @c int4+4 complex
* @gpu_mem gpu_mem_phase Input complex phases of size num_elements * num_local_freq * num_beams *
* 2
* @gpu_mem_type staging
* @gpu_mem_format Array of @c int8
* @gpu_mem gpu_mem_output_scaling Input number of bits to shift result by; size num_local_freq *
* num_beams * 2
* @gpu_mem_type staging
* @gpu_mem_format Array of @c int32
* @gpu_mem gpu_mem_formed_beams Output beams; size num_local_freq * num_beams *
* samples_per_data_set * 2
* @gpu_mem_type staging
* @gpu_mem_format Array of @c int4+4 complex
* @gpu_mem gpu_mem_info Output status information; size threads_x * threads_y * blocks_x
* @gpu_mem_type staging
* @gpu_mem_format Array of @c int32
*
* @conf num_elements Int. Number of dishes x polarizations.
* @conf num_local_freq Int. Number of frequencies in each frame.
* @conf samples_per_data_set Int. Number of time samples per frame.
* @conf num_beams Int. Number of beams being formed.
*/
class cudaBasebandBeamformer : public cudaCommand {
public:
cudaBasebandBeamformer(kotekan::Config& config, const std::string& unique_name,
kotekan::bufferContainer& host_buffers, cudaDeviceInterface& device);
~cudaBasebandBeamformer();
cudaEvent_t execute(int gpu_frame_id, const std::vector<cudaEvent_t>& pre_events) override;
cudaEvent_t execute(cudaPipelineState& pipestate,
const std::vector<cudaEvent_t>& pre_events) override;
virtual void finalize_frame(int gpu_frame_id) override;

protected:
private:
Expand All @@ -47,6 +75,9 @@ class cudaBasebandBeamformer : public cudaCommand {
/// GPU side memory name for the status/info output
std::string _gpu_mem_info;

// Host-side buffer array for GPU kernel status/info output
std::vector<std::vector<int32_t>> host_info;

// derived gpu array sizes
size_t voltage_len;
size_t phase_len;
Expand Down
85 changes: 61 additions & 24 deletions lib/cuda/cudaCommand.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,43 @@ using kotekan::Config;
using std::string;
using std::to_string;

// Construct per-frame pipeline state; records which GPU frame this state belongs to.
cudaPipelineState::cudaPipelineState(int _gpu_frame_id) : gpu_frame_id(_gpu_frame_id) {}

// No owned resources beyond the member maps; defaulted behavior is sufficient.
cudaPipelineState::~cudaPipelineState() {}

// Set (or overwrite) the named boolean flag for this pipeline frame.
void cudaPipelineState::set_flag(const std::string& key, bool val) {
    flags.insert_or_assign(key, val);
}

// True iff 'key' has ever been set via set_flag(), regardless of its value.
// (With C++20 this would simply be flags.contains(key).)
bool cudaPipelineState::flag_exists(const std::string& key) const {
    return flags.count(key) != 0;
}

// Return the flag's value; a flag that was never set is treated as false.
bool cudaPipelineState::flag_is_set(const std::string& key) const {
    const auto it = flags.find(key);
    return (it != flags.end()) && it->second;
}

// Set (or overwrite) the named integer value for this pipeline frame.
void cudaPipelineState::set_int(const std::string& key, int64_t val) {
    intmap.insert_or_assign(key, val);
}

// Look up a previously-stored integer value.
// Throws std::out_of_range if 'key' was never set via set_int().
int64_t cudaPipelineState::get_int(const std::string& key) const {
    return intmap.at(key);
}

cudaCommand::cudaCommand(Config& config_, const std::string& unique_name_,
bufferContainer& host_buffers_, cudaDeviceInterface& device_,
const std::string& default_kernel_command,
const std::string& default_kernel_file_name) :
gpuCommand(config_, unique_name_, host_buffers_, device_, default_kernel_command,
default_kernel_file_name),
device(device_) {
_required_flag = config.get_default<std::string>(unique_name, "required_flag", "");
start_events = (cudaEvent_t*)malloc(_gpu_buffer_depth * sizeof(cudaEvent_t));
end_events = (cudaEvent_t*)malloc(_gpu_buffer_depth * sizeof(cudaEvent_t));
for (int j = 0; j < _gpu_buffer_depth; ++j) {
Expand Down Expand Up @@ -66,26 +96,34 @@ cudaCommand::~cudaCommand() {
DEBUG("post_events Freed: {:s}", unique_name.c_str());
}

void cudaCommand::finalize_frame(int gpu_frame_id) {
if (start_events[gpu_frame_id] != nullptr) {
if (profiling) {
float exec_time;
CHECK_CUDA_ERROR(cudaEventElapsedTime(&exec_time, start_events[gpu_frame_id],
start_events[gpu_frame_id]));
double active_time = exec_time * 1e-3; // convert ms to s
excute_time->add_sample(active_time);
utilization->add_sample(active_time / frame_arrival_period);
}
if (start_events[gpu_frame_id])
CHECK_CUDA_ERROR(cudaEventDestroy(start_events[gpu_frame_id]));
start_events[gpu_frame_id] = nullptr;
// Gate execution on the (optional) config-specified pipeline flag: if a
// required_flag was configured and is not set in this frame's pipeline state,
// the stage is skipped for this frame and no event is returned.
cudaEvent_t cudaCommand::execute_base(cudaPipelineState& pipestate,
                                      const std::vector<cudaEvent_t>& pre_events) {
    const bool must_skip = !_required_flag.empty() && !pipestate.flag_is_set(_required_flag);
    if (must_skip) {
        DEBUG("Required flag \"{:s}\" is not set; skipping stage", _required_flag);
        return nullptr;
    }
    return execute(pipestate, pre_events);
}

// Per-frame cleanup: record profiling metrics (if enabled) from the start/end
// events that execute() recorded, then destroy this frame's events so fresh
// ones can be recorded next time the frame slot is reused.
//
// Bug fix: the previous version tested `if (profiling && start && end)` and put
// FATAL_ERROR in the plain `else`, so simply running with profiling disabled
// would hit the "Null end event" fatal error on every frame. The null-event
// error path must only be reachable when profiling is actually enabled.
void cudaCommand::finalize_frame(int gpu_frame_id) {
    if (profiling) {
        if ((start_events[gpu_frame_id] != nullptr) && (end_events[gpu_frame_id] != nullptr)) {
            float exec_time;
            CHECK_CUDA_ERROR(cudaEventElapsedTime(&exec_time, start_events[gpu_frame_id],
                                                  end_events[gpu_frame_id]));
            double active_time = exec_time * 1e-3; // convert ms to s
            excute_time->add_sample(active_time);
            utilization->add_sample(active_time / frame_arrival_period);
        } else {
            // NOTE(review): a stage skipped by execute_base() may legitimately have
            // recorded no events for this frame -- confirm this should be fatal.
            FATAL_ERROR("Null end event in cudaCommand {:s}, this should never happen!",
                        unique_name);
            excute_time->add_sample(0.);
            utilization->add_sample(0.);
        }
    }
    if (start_events[gpu_frame_id])
        CHECK_CUDA_ERROR(cudaEventDestroy(start_events[gpu_frame_id]));
    start_events[gpu_frame_id] = nullptr;
    if (end_events[gpu_frame_id] != nullptr)
        CHECK_CUDA_ERROR(cudaEventDestroy(end_events[gpu_frame_id]));
    end_events[gpu_frame_id] = nullptr;
}

int32_t cudaCommand::get_cuda_stream_id() {
Expand Down Expand Up @@ -217,13 +255,6 @@ void cudaCommand::build_ptx(const std::vector<std::string>& kernel_names,
FATAL_ERROR("Error reading the file: {:s}", kernel_file_name);
fclose(fp);

// Convert compiler options to a c-style array.
std::vector<char*> cstring_opts;
cstring_opts.reserve(opts.size());

for (auto& str : opts)
cstring_opts.push_back(&str[0]);

// Create the compiler
nv_res = nvPTXCompilerCreate(&compiler, program_size, program_buffer);
if (nv_res != NVPTXCOMPILE_SUCCESS) {
Expand All @@ -232,6 +263,12 @@ void cudaCommand::build_ptx(const std::vector<std::string>& kernel_names,
return;
}

// Convert compiler options to a c-style array.
std::vector<char*> cstring_opts;
cstring_opts.reserve(opts.size());
for (auto& str : opts)
cstring_opts.push_back(&str[0]);

// Compile the code
nv_res = nvPTXCompilerCompile(compiler, cstring_opts.size(), cstring_opts.data());
// TODO Abstract error checking
Expand Down
35 changes: 33 additions & 2 deletions lib/cuda/cudaCommand.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,23 @@
#include <string>
#include <vector>

/**
 * @class cudaPipelineState
 * @brief Per-frame state object passed to each CUDA stage's execute() call,
 * letting stages communicate via named boolean flags and named int64 values.
 */
class cudaPipelineState : public kotekan::kotekanLogging {
public:
    // _gpu_frame_id: the GPU frame counter value this state describes.
    cudaPipelineState(int _gpu_frame_id);
    virtual ~cudaPipelineState();
    // Set (or overwrite) a named boolean flag.
    void set_flag(const std::string&, bool val);
    // True iff the flag has been set (to any value).
    bool flag_exists(const std::string&) const;
    // The flag's value; false if it was never set.
    bool flag_is_set(const std::string&) const;
    // Set (or overwrite) a named integer value.
    void set_int(const std::string&, int64_t val);
    // Look up a named integer value; throws std::out_of_range if absent.
    int64_t get_int(const std::string&) const;

    // GPU frame this state belongs to (public: read directly by stages).
    int gpu_frame_id;

protected:
    std::map<std::string, bool> flags;
    std::map<std::string, int64_t> intmap;
};

/**
* @class cudaCommand
* @brief Base class for defining CUDA commands to execute on GPUs
Expand Down Expand Up @@ -69,13 +86,24 @@ class cudaCommand : public gpuCommand {
virtual void build_ptx(const std::vector<std::string>& kernel_names,
std::vector<std::string>& opts);

/**
* @brief Execute a kernel, with more control over the *cudaPipelineState* object.
* Most subclassers should implement *execute*.
* @param pipestate The pipeline state object.
* @param pre_events Array of the last events from each cuda stream, indexed by stream
* number.
*/
virtual cudaEvent_t execute_base(cudaPipelineState& pipestate,
const std::vector<cudaEvent_t>& pre_events);

/**
* @brief Execute a kernel, copy, etc.
* @param gpu_frame_id The bufferID associated with the GPU commands.
* @param pipestate Pipeline state for this GPU frame.
* @param pre_events Array of the last events from each cuda stream, indexed by stream
* number.
**/
virtual cudaEvent_t execute(int gpu_frame_id, const std::vector<cudaEvent_t>& pre_events) = 0;
virtual cudaEvent_t execute(cudaPipelineState& pipestate,
const std::vector<cudaEvent_t>& pre_events) = 0;

/** Releases the memory of the event chain arrays per buffer_id
* @param gpu_frame_id The bufferID to release all the memory references for.
Expand Down Expand Up @@ -104,6 +132,9 @@ class cudaCommand : public gpuCommand {
/// The ID of the cuda stream to run operations on
int32_t cuda_stream_id;

// cudaPipelineState flag required for this command to run, set from config "required_flag"
std::string _required_flag;

// Map containing the runtime kernels built with nvrtc from the kernel file (if needed)
std::map<std::string, CUfunction> runtime_kernels;
};
Expand Down
Loading

0 comments on commit 4c57d44

Please sign in to comment.