
Commit

resolving conflicts
ShreyaTalati committed Nov 22, 2023
2 parents 963d52f + 9bf75e8 commit e2b22ca
Showing 59 changed files with 3,518 additions and 1,104 deletions.
33 changes: 15 additions & 18 deletions CMakeLists.txt
@@ -23,25 +23,25 @@ include(CTest)
find_package(
Python
COMPONENTS Interpreter Development.Module
REQUIRED
)
REQUIRED)

set(CYTHON cython)
message(STATUS "Cython: " ${CYTHON})
#Test if cython is installed
# Test if cython is installed
if(NOT CYTHON)
message(
FATAL_ERROR
"Cython is required to build Parla. Please install cython and try again."
)
"Cython is required to build Parla. Please install cython and try again.")
endif()
#Test if cython exists
execute_process(COMMAND ${CYTHON} --version ERROR_QUIET OUTPUT_VARIABLE CYTHON_VERSION)
# Test if cython exists
execute_process(
COMMAND ${CYTHON} --version
ERROR_QUIET
OUTPUT_VARIABLE CYTHON_VERSION)
if(NOT CYTHON_VERSION)
message(
FATAL_ERROR
"Cython is required to build Parla. Please install cython and try again."
)
"Cython is required to build Parla. Please install cython and try again.")
endif()

message(STATUS "Python: " ${Python_EXECUTABLE})
@@ -82,10 +82,9 @@ if(PARLA_ENABLE_CUDA)
endif(PARLA_ENABLE_CUDA)

if(PARLA_ENABLE_HIP)
enable_language(HIP)
enable_language(HIP)
endif(PARLA_ENABLE_HIP)


if(PARLA_ENABLE_LOGGING)
# TODO: figure out binlog cmake support
message(STATUS "Finding binlog...")
@@ -119,11 +118,9 @@ endif(PARLA_ENABLE_LOGGING)
add_subdirectory(src/c/backend)
add_subdirectory(src/python/parla)

# if(PARLA_BUILD_TESTS)
# add_subdirectory(testing)
# endif(PARLA_BUILD_TESTS)
# if(PARLA_BUILD_TESTS) add_subdirectory(testing) endif(PARLA_BUILD_TESTS)

# set(test_path_file ${CMAKE_SOURCE_DIR}/testing/run_tests.sh)
# file(WRITE ${test_path_file} "export PARLA_TESTS=${CMAKE_BINARY_DIR}/testing\n")
# file(APPEND ${test_path_file} "py.test $PARLA_TESTS\n")
# file(APPEND ${test_path_file} "ctest --test-dir $PARLA_TESTS\n")
# set(test_path_file ${CMAKE_SOURCE_DIR}/testing/run_tests.sh) file(WRITE
# ${test_path_file} "export PARLA_TESTS=${CMAKE_BINARY_DIR}/testing\n")
# file(APPEND ${test_path_file} "py.test $PARLA_TESTS\n") file(APPEND
# ${test_path_file} "ctest --test-dir $PARLA_TESTS\n")
8 changes: 4 additions & 4 deletions benchmark/python/benchmark.py
@@ -110,7 +110,7 @@ def reduction_scalinum_gpus(fD_array_bytes, sD_array_bytes, \

run_config = RunConfig(
outer_iterations=1,
inner_iterations=1,
inner_iterations=iter,
verbose=verbose,
logfile=logpath,
num_gpus=num_gpus,
@@ -160,7 +160,7 @@ def independent_scalinum_gpus(fD_array_bytes, sD_array_bytes, num_gpus, \

run_config = RunConfig(
outer_iterations=1,
inner_iterations=1,
inner_iterations=iter,
verbose=verbose,
logfile=logpath,
num_gpus=num_gpus,
@@ -209,7 +209,7 @@ def serial_scalinum_gpus(fD_array_bytes, sD_array_bytes, num_gpus,

run_config = RunConfig(
outer_iterations=1,
inner_iterations=1,
inner_iterations=iter,
verbose=verbose,
num_gpus=num_gpus,
logfile=logpath,
@@ -263,7 +263,7 @@ def reduction_scatter_scalinum_gpus(fD_array_bytes, sD_array_bytes, \

run_config = RunConfig(
outer_iterations=1,
inner_iterations=1,
inner_iterations=iter,
verbose=verbose,
logfile=logpath,
num_gpus=num_gpus,
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -20,7 +20,7 @@ build-dir = "build"
wheel.packages = ["src/python/parla"]
wheel.license-files = []
cmake.minimum-version = "3.22.1"
cmake.build-type = "Debug"
cmake.build-type = "Release"
cmake.verbose = true
ninja.minimum-version = "1.11"

15 changes: 7 additions & 8 deletions src/c/backend/CMakeLists.txt
@@ -76,18 +76,17 @@ if(PARLA_ENABLE_CUDA)

set_target_properties(backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(backend PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)

target_compile_definitions(backend PUBLIC PARLA_ENABLE_CUDA)

target_include_directories(backend PUBLIC ${CUDAToolkit_LIBRARY_DIR})
target_include_directories(backend PUBLIC ${CUDAToolkit_LIBRARY_ROOT})
target_include_directories(backend PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
#target_include_directories(backend PUBLIC ${CUDAToolkit_LIBRARY_DIR})
#target_include_directories(backend PUBLIC ${CUDAToolkit_LIBRARY_ROOT})
#target_include_directories(backend PUBLIC ${CUDAToolkit_INCLUDE_DIRS})

target_link_directories(backend PUBLIC ${CUDAToolkit_LIBRARY_ROOT})
target_compile_options(backend PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda -DPROD>)
target_compile_options(backend PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
#target_link_directories(backend PUBLIC ${CUDAToolkit_LIBRARY_ROOT})
#target_compile_options(backend PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda -DPROD>)
#target_compile_options(backend PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)

target_link_libraries(backend PUBLIC CUDA::cudart)
#target_link_libraries(backend PUBLIC CUDA::cudart)

set(GPU_ARCH $ENV{CUDA_ARCH})
if(GPU_ARCH)
3 changes: 1 addition & 2 deletions src/c/backend/device.cpp
@@ -3,6 +3,5 @@

const bool
Device::check_resource_availability(DeviceRequirement *dev_req) const {
return get_resource_pool().check_greater<ResourceCategory::All>(
dev_req->res_req());
return get_resource_pool().check_greater<GPUResources>(dev_req->res_req());
}
14 changes: 14 additions & 0 deletions src/c/backend/impl_cuda/utility.cu
@@ -40,3 +40,17 @@ void stream_synchronize(uintptr_t stream_ptr) {
cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_ptr);
cudaStreamSynchronize(stream);
};

void set_device(int device) { cudaSetDevice(device); }

int get_device() {
int device;
cudaGetDevice(&device);
return device;
}

int get_num_devices() {
int num_devices;
cudaGetDeviceCount(&num_devices);
return num_devices;
}
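
The new host-side helpers wrap the CUDA runtime's device-selection calls (cudaSetDevice, cudaGetDevice, cudaGetDeviceCount). A minimal caller sketch, not part of this commit; the gpu_utility.hpp include path and the main() driver are assumptions for illustration:

#include <cstdio>
#include "gpu_utility.hpp" // assumed header declaring the wrappers above

int main() {
  int n = get_num_devices();      // wraps cudaGetDeviceCount
  for (int d = 0; d < n; ++d) {
    set_device(d);                // wraps cudaSetDevice
    std::printf("active device: %d\n", get_device()); // wraps cudaGetDevice
  }
  return 0;
}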
25 changes: 25 additions & 0 deletions src/c/backend/impl_hip/utility.hip
@@ -41,3 +41,28 @@ void stream_synchronize(uintptr_t stream_ptr) {
hipStream_t stream = reinterpret_cast<hipStream_t>(stream_ptr);
auto res = hipStreamSynchronize(stream);
};

void set_device(int device) {
auto res = hipSetDevice(device);
if (res != hipSuccess) {
throw std::runtime_error("hipSetDevice failed");
}
}

int get_device(){
int device;
auto res = hipGetDevice(&device);
if (res != hipSuccess) {
throw std::runtime_error("hipGetDevice failed");
}
return device;
}

int get_num_devices() {
int num_devices;
auto res = hipGetDeviceCount(&num_devices);
if (res != hipSuccess) {
throw std::runtime_error("hipGetDeviceCount failed");
}
return num_devices;
}
5 changes: 4 additions & 1 deletion src/c/backend/impl_none/utility.cpp
@@ -1,4 +1,4 @@
#include <gpu_utility.hpp>
#include "../include/gpu_utility.hpp"

void gpu_busy_sleep(const int device, const unsigned long t,
uintptr_t stream_ptr) {
@@ -8,3 +8,6 @@ void gpu_busy_sleep(const int device, const unsigned long t,
void event_synchronize(uintptr_t event_ptr){};
void event_wait(uintptr_t event_ptr, uintptr_t stream_ptr){};
void stream_synchronize(uintptr_t stream_ptr){};
void set_device(int device){};
int get_device() { return 0; };
int get_num_devices() { return 0; };
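
With neither CUDA nor HIP enabled, the stubs above keep the same entry points callable: set_device is a no-op and get_device / get_num_devices report zero. A hedged sketch of backend-agnostic calling code; the helper below is hypothetical and not part of this commit:

#include "gpu_utility.hpp" // assumed shared header for the CUDA/HIP/stub backends

// Hypothetical helper (not part of the commit): fall back to host-only work
// when the stub backend reports zero devices.
bool select_first_device_or_cpu() {
  if (get_num_devices() == 0) {
    return false;   // CPU-only build: the stub above returns 0
  }
  set_device(0);    // real backends switch the active device here
  return true;
}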
33 changes: 14 additions & 19 deletions src/c/backend/include/containers.hpp
@@ -3,8 +3,6 @@
*
*
*/

#pragma once
#ifndef PARLA_CONTAINERS_HPP
#define PARLA_CONTAINERS_HPP

@@ -46,36 +44,33 @@ template <typename T> class ProtectedVector {
std::string name;

public:
ProtectedVector() = default;
ProtectedVector() { this->name = "default"; };

ProtectedVector(std::string name) {
this->mtx.lock();
this->name = name;
this->mtx.unlock();
ProtectedVector(const ProtectedVector<T> &other) {
this->name = other.name;
this->vec = other.vec;
this->length.exchange(other.length);
}

ProtectedVector(std::string name) { this->name = name; }

[GitHub Actions / cpp-linter — containers.hpp:55:31, performance-unnecessary-value-param: the parameter 'name' is copied for each invocation but only used as a const reference; consider making it a const reference]

ProtectedVector(std::string name, std::vector<T> vec) {

[GitHub Actions / cpp-linter — containers.hpp:57:31, performance-unnecessary-value-param: the parameter 'name' is copied for each invocation but only used as a const reference; consider making it a const reference]
this->mtx.lock();
this->name = name;
this->vec = vec;
this->mtx.unlock();
}

ProtectedVector(std::string name, size_t size) {

[GitHub Actions / cpp-linter — containers.hpp:62:31, performance-unnecessary-value-param: the parameter 'name' is copied for each invocation but only used as a const reference; consider making it a const reference]
this->mtx.lock();
this->name = name;
this->vec.reserve(size);
this->mtx.unlock();
}

/// Explicit move assignment due to the atomic size member.
ProtectedVector &operator=(ProtectedVector &&other) {
this->length.exchange(other.length);
this->vec = std::move(other.vec);
// The string should be small
this->name = std::move(other.name);
return *this;
}
// ProtectedVector &operator=(const ProtectedVector<T> &&other) {
// this->length.exchange(other.length);
// this->vec = std::move(other.vec);
// // The string should be small
// this->name = std::move(other.name);
// return *this;
// }

void lock() { this->mtx.lock(); }

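For context, the reworked ProtectedVector constructors above (default, copy, name-only, name-plus-vector, name-plus-capacity) can be exercised as below. This is a usage sketch, not part of the commit; the include path is an assumption and unlock() is assumed by symmetry with the lock() shown in the diff:

#include <string>
#include <vector>
#include "containers.hpp" // assumed include path for ProtectedVector

void protected_vector_example() {
  ProtectedVector<int> a;                                     // name defaults to "default"
  ProtectedVector<int> b("ready_tasks");                      // named, empty
  ProtectedVector<int> c("seed", std::vector<int>{1, 2, 3});  // named, pre-filled
  ProtectedVector<int> d("scratch", 16);                      // named, capacity reserved
  ProtectedVector<int> e(c);                                  // the new copy constructor

  b.lock();    // manual locking around compound updates
  b.unlock();  // unlock() assumed by symmetry with lock() shown above
}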
66 changes: 26 additions & 40 deletions src/c/backend/include/device.hpp
@@ -2,7 +2,6 @@
* @brief Provides interface for abstract device object.
*/

#pragma once
#ifndef PARLA_DEVICE_HPP
#define PARLA_DEVICE_HPP

@@ -16,21 +15,28 @@
using DevID_t = uint32_t;
using MemorySz_t = Resource_t;
using VCU_t = Resource_t;
// using ResourcePool_t = ResourcePool<std::atomic<Resource_t>>;
using ResourcePool_t = ResourcePool;

using GPUResources = Resources<Resource::Memory, Resource::VCU, Resource::Copy>;
using GPUResourcePool = ResourcePool<GPUResources>;

using CPUResources = Resources<Resource::Memory, Resource::VCU, Resource::Copy>;
using CPUResourcePool = ResourcePool<CPUResources>;

// TODO(wlr): Temporarily maintain a single resource pool for all devices.
using ResourcePool_t = GPUResourcePool;

class DeviceRequirement;

/**
* @brief Architecture types for devices.
*/
enum class DeviceType { INVALID = -2, All = -1, CPU = 0, CUDA = 1 };
enum class DeviceType { INVALID = -2, All = -1, CPU = 0, GPU = 1 };

inline const constexpr std::array architecture_types{DeviceType::CPU,
DeviceType::CUDA};
DeviceType::GPU};
inline const constexpr int NUM_DEVICE_TYPES = architecture_types.size();
inline const std::array<std::string, NUM_DEVICE_TYPES> architecture_names{
"CPU", "CUDA"};
"CPU", "GPU"};

/// Devices can be distinguished from other devices
/// by a class type and its index.
@@ -43,17 +49,9 @@ class Device {
void *py_dev, int copy_engines = 2)
: py_dev_(py_dev), dev_id_(dev_id), dev_type_(arch) {

res_.set(Resource::VCU, num_vcus);
res_.set(Resource::Memory, mem_sz);
res_.set(Resource::Copy, copy_engines);

reserved_res_.set(Resource::VCU, num_vcus);
reserved_res_.set(Resource::Memory, mem_sz);
reserved_res_.set(Resource::Copy, copy_engines);

mapped_res_.set(Resource::VCU, 0);
mapped_res_.set(Resource::Memory, 0);
mapped_res_.set(Resource::Copy, 0);
res_.set<GPUResources>({mem_sz, num_vcus, copy_engines});
reserved_res_.set<GPUResources>({mem_sz, num_vcus, copy_engines});
mapped_res_.set<GPUResources>({0, 0, 0});
}

/// Return a device id.
@@ -64,16 +62,16 @@ class Device {
std::to_string(dev_id_);
}

const Resource_t query_resource(Resource type) const {
return this->res_.get(type);
template <typename Resource> const Resource_t query_max() const {
return this->res_.get<Resource>();
}

const Resource_t query_reserved_resource(Resource type) const {
return this->reserved_res_.get(type);
template <typename Resource> const Resource_t query_reserved() const {
return this->reserved_res_.get<Resource>();
}

const Resource_t query_mapped_resource(Resource type) const {
return this->mapped_res_.get(type);
template <typename Resource> const Resource_t query_mapped() const {
return this->mapped_res_.get<Resource>();
}

const DeviceType get_type() const { return dev_type_; }
@@ -111,22 +109,10 @@ class Device {
const DevID_t get_global_id() const { return dev_global_id_; }

const MemorySz_t get_memory_size() const {
return res_.get(Resource::Memory);
return res_.get<Resource::Memory>();
}

const VCU_t get_num_vcus() const { return res_.get(Resource::VCU); }

const Resource_t get_max_resource(Resource type) const {
return this->res_.get(type);
}

const Resource_t get_reserved_resource(Resource type) const {
return this->reserved_res_.get(type);
}

const Resource_t get_mapped_resource(Resource type) const {
return this->mapped_res_.get(type);
}
const VCU_t get_num_vcus() const { return res_.get<Resource::VCU>(); }

const bool check_resource_availability(DeviceRequirement *dev_req) const;

@@ -142,10 +128,10 @@
};

///
class CUDADevice : public Device {
class GPUDevice : public Device {
public:
CUDADevice(DevID_t dev_id, size_t mem_sz, size_t num_vcus, void *py_dev)
: Device(DeviceType::CUDA, dev_id, mem_sz, num_vcus, py_dev, 3) {}
GPUDevice(DevID_t dev_id, size_t mem_sz, size_t num_vcus, void *py_dev)
: Device(DeviceType::GPU, dev_id, mem_sz, num_vcus, py_dev, 3) {}

private:
};
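
The resource queries are now templated on a resource tag instead of taking a runtime Resource value, and CUDADevice is renamed GPUDevice. A call-site sketch, not part of this commit: it uses only the accessors visible in the diff above, assumes the Resource::Memory / Resource::VCU / Resource::Copy tags are types as the new templates suggest, and uses arbitrary numbers with nullptr standing in for the opaque Python device handle:

#include "device.hpp" // assumed include path

void device_query_example() {
  GPUDevice dev(/*dev_id=*/0, /*mem_sz=*/16UL << 30, /*num_vcus=*/1000,
                /*py_dev=*/nullptr);

  // Templated queries against the max, reserved, and mapped pools.
  Resource_t total_mem  = dev.query_max<Resource::Memory>();
  Resource_t rsvd_vcus  = dev.query_reserved<Resource::VCU>();
  Resource_t mapped_cpy = dev.query_mapped<Resource::Copy>();

  (void)total_mem;
  (void)rsvd_vcus;
  (void)mapped_cpy;
}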