Skip to content

Commit

Permalink
Basic changes from Dustin's MVP work (kotekan#1107)
Browse files Browse the repository at this point in the history
This PR is trying to extract the changes to some function signatures and structures from Dustin's MVP development branch (dstn/frb-bf).  This was done by generating a giant "git diff" and then selectively applying changes with "git commit -p", trying to group commits in a sensible way.

Important changes are:
- Cuda: Create a pipeline state object (cudaPipelineState) that is passed to each stage's execute() function, as a way of passing information between stages
- debugging aid: GPU: pass a GPU frame counter that increments each time queue_commands() is called.
- graphviz: record GPU buffers used by each stage, and add them to the graphviz call graph
- Cuda stages: add the option to require a flag to be set in the cudaPipelineState object in order for a stage to run
- GPU memory buffers: add the ability to create GPU memory views, allowing one stage to see only a subset of a memory buffer created by a different stage.  Used in a few places in the MVP to handle rechunking sorts of operations.
- add some float16_t data type handling

And other changes that were hard to extricate from these diffs include:
- cudaBasebandBeamformer: make the 'info' buffer a local buffer (not known to kotekan)
  • Loading branch information
dstndstn authored Sep 12, 2023
1 parent 4ea896e commit 4c57d44
Show file tree
Hide file tree
Showing 46 changed files with 685 additions and 188 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
.kdev_include_paths
*.c~
*.h~
*~
*#
*.swp
*.pyc
/.idea/
Expand Down
4 changes: 4 additions & 0 deletions external/fmt/fmt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
#define FMT_HEADER_ONLY
#endif

#ifndef FMT_OVERRIDE
#define FMT_OVERRIDE override
#endif

// Enable the fmt() macro for compile time string format checking
#define FMT_STRING_ALIAS 1

Expand Down
68 changes: 45 additions & 23 deletions lib/cuda/cudaBasebandBeamformer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,13 @@ cudaBasebandBeamformer::cudaBasebandBeamformer(Config& config, const std::string
_gpu_mem_phase = config.get<std::string>(unique_name, "gpu_mem_phase");
_gpu_mem_output_scaling = config.get<std::string>(unique_name, "gpu_mem_output_scaling");
_gpu_mem_formed_beams = config.get<std::string>(unique_name, "gpu_mem_formed_beams");
_gpu_mem_info = config.get<std::string>(unique_name, "gpu_mem_info");
_gpu_mem_info = unique_name + "/info";

gpu_buffers_used.push_back(std::make_tuple(_gpu_mem_voltage, true, true, false));
gpu_buffers_used.push_back(std::make_tuple(_gpu_mem_phase, true, true, false));
gpu_buffers_used.push_back(std::make_tuple(_gpu_mem_output_scaling, true, true, false));
gpu_buffers_used.push_back(std::make_tuple(_gpu_mem_formed_beams, true, false, true));
gpu_buffers_used.push_back(std::make_tuple(get_name() + "_info", false, true, true));

if (_num_elements != cuda_nelements)
throw std::runtime_error("The num_elements config setting must be "
Expand All @@ -44,7 +50,7 @@ cudaBasebandBeamformer::cudaBasebandBeamformer(Config& config, const std::string
output_len = (size_t)_num_local_freq * _num_beams * _samples_per_data_set * 2;
info_len = (size_t)(threads_x * threads_y * blocks_x * sizeof(int32_t));

command_type = gpuCommandType::KERNEL;
set_command_type(gpuCommandType::KERNEL);

std::vector<std::string> opts = {
"--gpu-name=sm_86",
Expand All @@ -66,24 +72,31 @@ struct CuDeviceArray {
};
typedef CuDeviceArray<int32_t, 1> kernel_arg;

cudaEvent_t cudaBasebandBeamformer::execute(int gpu_frame_id,
cudaEvent_t cudaBasebandBeamformer::execute(cudaPipelineState& pipestate,
const std::vector<cudaEvent_t>& pre_events) {
(void)pre_events;
pre_execute(gpu_frame_id);
pre_execute(pipestate.gpu_frame_id);

void* voltage_memory = device.get_gpu_memory_array(_gpu_mem_voltage, gpu_frame_id, voltage_len);
void* voltage_memory =
device.get_gpu_memory_array(_gpu_mem_voltage, pipestate.gpu_frame_id, voltage_len);
int8_t* phase_memory =
(int8_t*)device.get_gpu_memory_array(_gpu_mem_phase, gpu_frame_id, phase_len);
int32_t* shift_memory =
(int32_t*)device.get_gpu_memory_array(_gpu_mem_output_scaling, gpu_frame_id, shift_len);
(int8_t*)device.get_gpu_memory_array(_gpu_mem_phase, pipestate.gpu_frame_id, phase_len);
int32_t* shift_memory = (int32_t*)device.get_gpu_memory_array(
_gpu_mem_output_scaling, pipestate.gpu_frame_id, shift_len);
void* output_memory =
device.get_gpu_memory_array(_gpu_mem_formed_beams, gpu_frame_id, output_len);
int32_t* info_memory =
(int32_t*)device.get_gpu_memory_array(_gpu_mem_info, gpu_frame_id, info_len);
device.get_gpu_memory_array(_gpu_mem_formed_beams, pipestate.gpu_frame_id, output_len);
int32_t* info_memory = (int32_t*)device.get_gpu_memory(_gpu_mem_info, info_len);

host_info.resize(_gpu_buffer_depth);
for (int i = 0; i < _gpu_buffer_depth; i++)
host_info[i].resize(info_len / sizeof(int32_t));

record_start_event(pipestate.gpu_frame_id);

record_start_event(gpu_frame_id);
// Initialize info_memory return codes
CHECK_CUDA_ERROR(
cudaMemsetAsync(info_memory, 0xff, info_len, device.getStream(cuda_stream_id)));

CUresult err;
// A, E, s, J
const char* exc = "exception";
kernel_arg arr[5];
Expand Down Expand Up @@ -123,16 +136,25 @@ cudaEvent_t cudaBasebandBeamformer::execute(int gpu_frame_id,
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
shared_mem_bytes));

DEBUG("Running CUDA Baseband Beamformer on GPU frame {:d}", gpu_frame_id);
err = cuLaunchKernel(runtime_kernels[kernel_name], blocks_x, blocks_y, 1, threads_x, threads_y,
1, shared_mem_bytes, device.getStream(cuda_stream_id), parameters, NULL);
DEBUG("Running CUDA Baseband Beamformer on GPU frame {:d}", pipestate.gpu_frame_id);
CHECK_CU_ERROR(cuLaunchKernel(runtime_kernels[kernel_name], blocks_x, blocks_y, 1, threads_x,
threads_y, 1, shared_mem_bytes, device.getStream(cuda_stream_id),
parameters, NULL));

if (err != CUDA_SUCCESS) {
const char* errStr;
cuGetErrorString(err, &errStr);
INFO("Error number: {}", err);
ERROR("ERROR IN cuLaunchKernel: {}", errStr);
}
// Copy "info" result code back to host memory
CHECK_CUDA_ERROR(cudaMemcpyAsync(host_info[pipestate.gpu_frame_id].data(), info_memory,
info_len, cudaMemcpyDeviceToHost,
device.getStream(cuda_stream_id)));

return record_end_event(pipestate.gpu_frame_id);
}

return record_end_event(gpu_frame_id);
// Per-frame cleanup: after the base class releases this frame's CUDA events,
// scan the 'info' result codes that execute() copied back from the GPU.
// Any non-zero entry indicates a kernel-reported error at that index.
void cudaBasebandBeamformer::finalize_frame(int gpu_frame_id) {
    cudaCommand::finalize_frame(gpu_frame_id);
    for (size_t i = 0; i < host_info[gpu_frame_id].size(); i++)
        if (host_info[gpu_frame_id][i] != 0)
            // Fixed message: the two adjacent literals previously read "…indicates no" "error)",
            // which concatenated to "noerror)".
            ERROR(
                "cudaBasebandBeamformer returned 'info' value {:d} at index {:d} (zero indicates "
                "no error)",
                host_info[gpu_frame_id][i], i);
}
33 changes: 32 additions & 1 deletion lib/cuda/cudaBasebandBeamformer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,41 @@
*
* Kernel by Kendrick Smith and Erik Schnetter.
* https://github.com/eschnett/GPUIndexSpaces.jl/blob/main/output/bb1.ptx
*
* @par GPU Memory
* @gpu_mem gpu_mem_voltage Input complex voltages of size samples_per_data_set * num_elements *
* num_local_freq
* @gpu_mem_type staging
* @gpu_mem_format Array of @c int4+4 complex
* @gpu_mem gpu_mem_phase Input complex phases of size num_elements * num_local_freq * num_beams *
* 2
* @gpu_mem_type staging
* @gpu_mem_format Array of @c int8
* @gpu_mem gpu_mem_output_scaling Input number of bits to shift result by; size num_local_freq *
* num_beams * 2
* @gpu_mem_type staging
* @gpu_mem_format Array of @c int32
* @gpu_mem gpu_mem_formed_beams Output beams; size num_local_freq * num_beams *
* samples_per_data_set * 2
* @gpu_mem_type staging
* @gpu_mem_format Array of @c int4+4 complex
* @gpu_mem gpu_mem_info Output status information; size threads_x * threads_y * blocks_x
* @gpu_mem_type staging
* @gpu_mem_format Array of @c int32
*
* @conf num_elements Int. Number of dishes x polarizations.
* @conf num_local_freq Int. Number of frequencies in each frame.
* @conf samples_per_data_set Int. Number of time samples per frame.
* @conf num_beams Int. Number of beams being formed.
*/
class cudaBasebandBeamformer : public cudaCommand {
public:
cudaBasebandBeamformer(kotekan::Config& config, const std::string& unique_name,
kotekan::bufferContainer& host_buffers, cudaDeviceInterface& device);
~cudaBasebandBeamformer();
cudaEvent_t execute(int gpu_frame_id, const std::vector<cudaEvent_t>& pre_events) override;
cudaEvent_t execute(cudaPipelineState& pipestate,
const std::vector<cudaEvent_t>& pre_events) override;
virtual void finalize_frame(int gpu_frame_id) override;

protected:
private:
Expand All @@ -47,6 +75,9 @@ class cudaBasebandBeamformer : public cudaCommand {
/// GPU side memory name for the status/info output
std::string _gpu_mem_info;

// Host-side buffer array for GPU kernel status/info output
std::vector<std::vector<int32_t>> host_info;

// derived gpu array sizes
size_t voltage_len;
size_t phase_len;
Expand Down
85 changes: 61 additions & 24 deletions lib/cuda/cudaCommand.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,43 @@ using kotekan::Config;
using std::string;
using std::to_string;

// Construct per-frame pipeline state; records which GPU frame this state belongs to.
cudaPipelineState::cudaPipelineState(int _gpu_frame_id) : gpu_frame_id(_gpu_frame_id) {}

// No owned resources beyond the member maps; defaulted behavior is sufficient.
cudaPipelineState::~cudaPipelineState() {}

// Set (or overwrite) the named boolean flag for this pipeline frame.
void cudaPipelineState::set_flag(const std::string& key, bool val) {
    flags.insert_or_assign(key, val);
}

// True iff 'key' has ever been set via set_flag(), regardless of its value.
// (With C++20 this would simply be flags.contains(key).)
bool cudaPipelineState::flag_exists(const std::string& key) const {
    return flags.count(key) != 0;
}

// Return the flag's value; a flag that was never set is treated as false.
bool cudaPipelineState::flag_is_set(const std::string& key) const {
    const auto it = flags.find(key);
    return (it != flags.end()) && it->second;
}

// Set (or overwrite) the named integer value for this pipeline frame.
void cudaPipelineState::set_int(const std::string& key, int64_t val) {
    intmap.insert_or_assign(key, val);
}

// Look up a previously-stored integer value.
// Throws std::out_of_range if 'key' was never set via set_int().
int64_t cudaPipelineState::get_int(const std::string& key) const {
    return intmap.at(key);
}

cudaCommand::cudaCommand(Config& config_, const std::string& unique_name_,
bufferContainer& host_buffers_, cudaDeviceInterface& device_,
const std::string& default_kernel_command,
const std::string& default_kernel_file_name) :
gpuCommand(config_, unique_name_, host_buffers_, device_, default_kernel_command,
default_kernel_file_name),
device(device_) {
_required_flag = config.get_default<std::string>(unique_name, "required_flag", "");
start_events = (cudaEvent_t*)malloc(_gpu_buffer_depth * sizeof(cudaEvent_t));
end_events = (cudaEvent_t*)malloc(_gpu_buffer_depth * sizeof(cudaEvent_t));
for (int j = 0; j < _gpu_buffer_depth; ++j) {
Expand Down Expand Up @@ -66,26 +96,34 @@ cudaCommand::~cudaCommand() {
DEBUG("post_events Freed: {:s}", unique_name.c_str());
}

void cudaCommand::finalize_frame(int gpu_frame_id) {
if (start_events[gpu_frame_id] != nullptr) {
if (profiling) {
float exec_time;
CHECK_CUDA_ERROR(cudaEventElapsedTime(&exec_time, start_events[gpu_frame_id],
start_events[gpu_frame_id]));
double active_time = exec_time * 1e-3; // convert ms to s
excute_time->add_sample(active_time);
utilization->add_sample(active_time / frame_arrival_period);
}
if (start_events[gpu_frame_id])
CHECK_CUDA_ERROR(cudaEventDestroy(start_events[gpu_frame_id]));
start_events[gpu_frame_id] = nullptr;
// Gate execution on the (optional) config-specified pipeline flag: if a
// required_flag was configured and is not set in this frame's pipeline state,
// the stage is skipped for this frame and no event is returned.
cudaEvent_t cudaCommand::execute_base(cudaPipelineState& pipestate,
                                      const std::vector<cudaEvent_t>& pre_events) {
    const bool must_skip = !_required_flag.empty() && !pipestate.flag_is_set(_required_flag);
    if (must_skip) {
        DEBUG("Required flag \"{:s}\" is not set; skipping stage", _required_flag);
        return nullptr;
    }
    return execute(pipestate, pre_events);
}

// Per-frame cleanup: record profiling metrics (if enabled) from the start/end
// events that execute() recorded, then destroy this frame's events so fresh
// ones can be recorded next time the frame slot is reused.
//
// Bug fix: the previous version tested `if (profiling && start && end)` and put
// FATAL_ERROR in the plain `else`, so simply running with profiling disabled
// would hit the "Null end event" fatal error on every frame. The null-event
// error path must only be reachable when profiling is actually enabled.
void cudaCommand::finalize_frame(int gpu_frame_id) {
    if (profiling) {
        if ((start_events[gpu_frame_id] != nullptr) && (end_events[gpu_frame_id] != nullptr)) {
            float exec_time;
            CHECK_CUDA_ERROR(cudaEventElapsedTime(&exec_time, start_events[gpu_frame_id],
                                                  end_events[gpu_frame_id]));
            double active_time = exec_time * 1e-3; // convert ms to s
            excute_time->add_sample(active_time);
            utilization->add_sample(active_time / frame_arrival_period);
        } else {
            // NOTE(review): a stage skipped by execute_base() may legitimately have
            // recorded no events for this frame -- confirm this should be fatal.
            FATAL_ERROR("Null end event in cudaCommand {:s}, this should never happen!",
                        unique_name);
            excute_time->add_sample(0.);
            utilization->add_sample(0.);
        }
    }
    if (start_events[gpu_frame_id])
        CHECK_CUDA_ERROR(cudaEventDestroy(start_events[gpu_frame_id]));
    start_events[gpu_frame_id] = nullptr;
    if (end_events[gpu_frame_id] != nullptr)
        CHECK_CUDA_ERROR(cudaEventDestroy(end_events[gpu_frame_id]));
    end_events[gpu_frame_id] = nullptr;
}

int32_t cudaCommand::get_cuda_stream_id() {
Expand Down Expand Up @@ -217,13 +255,6 @@ void cudaCommand::build_ptx(const std::vector<std::string>& kernel_names,
FATAL_ERROR("Error reading the file: {:s}", kernel_file_name);
fclose(fp);

// Convert compiler options to a c-style array.
std::vector<char*> cstring_opts;
cstring_opts.reserve(opts.size());

for (auto& str : opts)
cstring_opts.push_back(&str[0]);

// Create the compiler
nv_res = nvPTXCompilerCreate(&compiler, program_size, program_buffer);
if (nv_res != NVPTXCOMPILE_SUCCESS) {
Expand All @@ -232,6 +263,12 @@ void cudaCommand::build_ptx(const std::vector<std::string>& kernel_names,
return;
}

// Convert compiler options to a c-style array.
std::vector<char*> cstring_opts;
cstring_opts.reserve(opts.size());
for (auto& str : opts)
cstring_opts.push_back(&str[0]);

// Compile the code
nv_res = nvPTXCompilerCompile(compiler, cstring_opts.size(), cstring_opts.data());
// TODO Abstract error checking
Expand Down
35 changes: 33 additions & 2 deletions lib/cuda/cudaCommand.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,23 @@
#include <string>
#include <vector>

/**
 * @class cudaPipelineState
 * @brief Per-frame state object passed to each CUDA stage's execute() call,
 * letting stages communicate via named boolean flags and named int64 values.
 */
class cudaPipelineState : public kotekan::kotekanLogging {
public:
    // _gpu_frame_id: the GPU frame counter value this state describes.
    cudaPipelineState(int _gpu_frame_id);
    virtual ~cudaPipelineState();
    // Set (or overwrite) a named boolean flag.
    void set_flag(const std::string&, bool val);
    // True iff the flag has been set (to any value).
    bool flag_exists(const std::string&) const;
    // The flag's value; false if it was never set.
    bool flag_is_set(const std::string&) const;
    // Set (or overwrite) a named integer value.
    void set_int(const std::string&, int64_t val);
    // Look up a named integer value; throws std::out_of_range if absent.
    int64_t get_int(const std::string&) const;

    // GPU frame this state belongs to (public: read directly by stages).
    int gpu_frame_id;

protected:
    std::map<std::string, bool> flags;
    std::map<std::string, int64_t> intmap;
};

/**
* @class cudaCommand
* @brief Base class for defining CUDA commands to execute on GPUs
Expand Down Expand Up @@ -69,13 +86,24 @@ class cudaCommand : public gpuCommand {
virtual void build_ptx(const std::vector<std::string>& kernel_names,
std::vector<std::string>& opts);

/**
* @brief Execute a kernel, with more control over the *cudaPipelineState* object.
* Most subclassers should implement *execute*.
* @param pipestate The pipeline state object.
* @param pre_events Array of the last events from each cuda stream, indexed by stream
* number.
*/
virtual cudaEvent_t execute_base(cudaPipelineState& pipestate,
const std::vector<cudaEvent_t>& pre_events);

/**
* @brief Execute a kernel, copy, etc.
* @param gpu_frame_id The bufferID associated with the GPU commands.
* @param pipestate Pipeline state for this GPU frame.
* @param pre_events Array of the last events from each cuda stream, indexed by stream
* number.
**/
virtual cudaEvent_t execute(int gpu_frame_id, const std::vector<cudaEvent_t>& pre_events) = 0;
virtual cudaEvent_t execute(cudaPipelineState& pipestate,
const std::vector<cudaEvent_t>& pre_events) = 0;

/** Releases the memory of the event chain arrays per buffer_id
* @param gpu_frame_id The bufferID to release all the memory references for.
Expand Down Expand Up @@ -104,6 +132,9 @@ class cudaCommand : public gpuCommand {
/// The ID of the cuda stream to run operations on
int32_t cuda_stream_id;

// cudaPipelineState flag required for this command to run, set from config "required_flag"
std::string _required_flag;

// Map containing the runtime kernels built with nvrtc from the kernel file (if needed)
std::map<std::string, CUfunction> runtime_kernels;
};
Expand Down
Loading

0 comments on commit 4c57d44

Please sign in to comment.