From e5a95d5acc3ffbf90cdcb53d2bbee672336ca37f Mon Sep 17 00:00:00 2001 From: Jason Furmanek Date: Fri, 26 Aug 2022 21:29:20 +0000 Subject: [PATCH 1/9] [UVM] Allocation and move enablement front end --- aten/src/ATen/Context.cpp | 16 +++++++++++++++ aten/src/ATen/Context.h | 8 +++++++- torch/csrc/cuda/Module.cpp | 42 +++++++++++++++++++++++++++++++++++++- torch/cuda/memory.py | 32 +++++++++++++++++++++++++++++ 4 files changed, 96 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 936e9b6252863..6fc28cf4bbed0 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -55,6 +55,22 @@ void Context::setUserEnabledMkldnn(bool e) { enabled_mkldnn = e; } +bool Context::userEnabledUVM() const { + return enabled_uvm; +} + +void Context::setUserEnabledUVM(bool e) { + enabled_uvm = e; +} + +bool Context::userEnabledMove() const { + return enabled_move; +} + +void Context::setUserEnabledMove(bool e) { + enabled_move = e; +} + bool Context::deterministicCuDNN() const { return deterministic_cudnn; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 48e3c935a2c0c..ede1aeea02c7c 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -119,6 +119,10 @@ class TORCH_API Context { void setUserEnabledCuDNN(bool e); bool userEnabledMkldnn() const; void setUserEnabledMkldnn(bool e); + bool userEnabledUVM() const; + void setUserEnabledUVM(bool e); + bool userEnabledMove() const; + void setUserEnabledMove(bool e); bool benchmarkCuDNN() const; void setBenchmarkCuDNN(bool); int benchmarkLimitCuDNN() const; @@ -288,7 +292,9 @@ class TORCH_API Context { bool allow_fp16_reduction_cublas = true; bool enabled_mkldnn = true; at::LinalgBackend linalg_preferred_backend = at::LinalgBackend::Default; -#ifdef C10_MOBILE + bool enabled_uvm = false; + bool enabled_move = false; + #ifdef C10_MOBILE bool release_original_weights = true; #else bool release_original_weights = false; diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index b526f87edd75d..468571fc454ee 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -763,7 +763,43 @@ PyObject* THCPModule_attachOutOfMemoryObserver( END_HANDLE_TH_ERRORS } -PyObject* THCPModule_cudaSetSyncDebugMode(PyObject* _unused, PyObject* arg) { +PyObject* THCPModule_setUserEnabledUVM(PyObject *_unused, PyObject *arg) +{ + HANDLE_TH_ERRORS + THPUtils_assert(PyBool_Check(arg), "set_enabled_uvm expects a bool, " + "but got %s", THPUtils_typename(arg)); + at::globalContext().setUserEnabledUVM(arg == Py_True); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_userEnabledUVM(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + if (at::globalContext().userEnabledUVM()) Py_RETURN_TRUE; + else Py_RETURN_FALSE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_setUserEnabledMove(PyObject *_unused, PyObject *arg) +{ + HANDLE_TH_ERRORS + THPUtils_assert(PyBool_Check(arg), "set_enabled_move expects a bool, " + "but got %s", THPUtils_typename(arg)); + at::globalContext().setUserEnabledMove(arg == Py_True); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_userEnabledMove(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + if (at::globalContext().userEnabledMove()) Py_RETURN_TRUE; + else Py_RETURN_FALSE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_cudaSetSyncDebugMode(PyObject * _unused, PyObject * arg){ HANDLE_TH_ERRORS TORCH_WARN_ONCE( "Synchronization debug mode is a prototype feature and does not yet detect 
all " @@ -1169,6 +1205,10 @@ static struct PyMethodDef _THCPModule_methods[] = { THCPModule_attachOutOfMemoryObserver, METH_O, nullptr}, + {"_cuda_getEnabledUVM", THCPModule_userEnabledUVM, METH_NOARGS, nullptr}, + {"_cuda_setEnabledUVM", THCPModule_setUserEnabledUVM, METH_O, nullptr}, + {"_cuda_getEnabledMove", THCPModule_userEnabledMove, METH_NOARGS, nullptr}, + {"_cuda_setEnabledMove", THCPModule_setUserEnabledMove, METH_O, nullptr}, {"_cuda_cudaHostAllocator", THCPModule_cudaHostAllocator, METH_NOARGS, diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index c40d9de580406..047e393b125aa 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -26,6 +26,13 @@ torch._C.__dict__['_cuda_CUDAAllocator'] = _dummy_type('_cuda_CUDAAllocator') +from os import environ +if 'PYTORCH_UVM_ENABLE' in environ: + torch._C._cuda_setEnabledUVM(True) + +if 'PYTORCH_MOVE_ENABLE' in environ: + torch._C._cuda_setEnabledMove(True) + def _host_allocator(): _lazy_init() return torch._C._cuda_cudaHostAllocator() @@ -722,3 +729,28 @@ def _get_current_allocator() -> _CUDAAllocator: See :ref:`cuda-memory-management` for details on creating and using a custom allocator """ return _CUDAAllocator(torch._C._cuda_getAllocator()) + +def set_enabled_uvm(enable): + r"""Enable/disable Unified Virtual Memory. + Arguments: + enable (bool): desired UVM setting. + """ + torch._C._cuda_setEnabledUVM(enable) + + +def get_enabled_uvm(): + r"""Returns a bool indicating if Unified Virtual Memory is currently enabled.""" + return torch._C._cuda_getEnabledUVM() + + +def set_enabled_move(enable): + r"""Enable/disable Unified Virtual Memory. + Arguments: + enable (bool): desired UVM setting. + """ + torch._C._cuda_setEnabledMove(enable) + + +def get_enabled_move(): + r"""Returns a bool indicating if Unified Virtual Memory is currently enabled.""" + return torch._C._cuda_getEnabledMove() From c19db86e9929877c77a0891c9ca381bb6cbc3de6 Mon Sep 17 00:00:00 2001 From: Jason Furmanek Date: Fri, 26 Aug 2022 21:33:22 +0000 Subject: [PATCH 2/9] [UVM] Add Caching Managed Allocator --- .../src/ATen/cuda/CachingManagedAllocator.cpp | 1270 +++++++++++++++++ aten/src/ATen/cuda/CachingManagedAllocator.h | 121 ++ aten/src/ATen/cuda/ThrustAllocator.h | 11 +- aten/src/ATen/cuda/UvmMemoryAllocator.h | 16 + .../cuda/linalg/BatchLinearAlgebraLib.cpp | 56 +- aten/src/ATen/native/cudnn/Conv_v7.cpp | 8 +- aten/src/ATen/native/miopen/Conv_miopen.cpp | 8 +- 7 files changed, 1471 insertions(+), 19 deletions(-) create mode 100644 aten/src/ATen/cuda/CachingManagedAllocator.cpp create mode 100644 aten/src/ATen/cuda/CachingManagedAllocator.h create mode 100644 aten/src/ATen/cuda/UvmMemoryAllocator.h diff --git a/aten/src/ATen/cuda/CachingManagedAllocator.cpp b/aten/src/ATen/cuda/CachingManagedAllocator.cpp new file mode 100644 index 0000000000000..dec15566b0100 --- /dev/null +++ b/aten/src/ATen/cuda/CachingManagedAllocator.cpp @@ -0,0 +1,1270 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { + +C10_DEFINE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback); + +namespace cuda { +namespace CachingManagedAllocator { + +// +// Yet another caching allocator for CUDA device allocations. +// +// - Allocations are associated with a stream. Once freed, blocks can be +// re-allocated on the same stream, but not on any other stream. 
+// - The allocator attempts to find the smallest cached block that will fit the +// requested size. If the block is larger than the requested size, it may be +// split. If no block is found, the allocator will delegate to cudaMalloc. +// - If the cudaMalloc fails, the allocator will attempt to free one cached +// block of sufficient size that is not split and retry the allocation. +// If this also fails, the allocator will attempt to free all cached blocks +// that are not split and retry the allocation. +// - Large (>1MB) and small allocations are stored in separate pools. +// Small requests are packed into 2MB buffers. Large requests will use the +// smallest available free block or allocate a new block using cudaMalloc. +// - To reduce fragmentation, requests between 1MB and 10MB will allocate and +// split a 20MB block, if no free block of sufficient size is available. +// - To further reduce fragmentation, blocks >= 200MB are not allowed to be +// split. These oversize cached blocks will still satisfy requests within +// 20MB of the oversize cached block size. +// +// With this allocator, allocations and frees should logically be considered +// "usages" of the memory segment associated with streams, just like kernel +// launches. The programmer must insert the proper synchronization if memory +// segments are used from multiple streams. +// +// The library provides a recordStream() function to help insert the correct +// synchronization when allocations are used on multiple streams. This will +// ensure that the block is not reused before each recorded stream completes +// work. +// + +namespace { + +using stream_set = std::unordered_set; + +constexpr size_t kMinBlockSize = + 512; // all sizes are rounded to at least 512 bytes +constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB +constexpr size_t kSmallBuffer = + 2097152; // "small" allocations are packed in 2 MiB blocks +constexpr size_t kLargeBuffer = + 20971520; // "large" allocations may be packed in 20 MiB blocks +constexpr size_t kMinLargeAlloc = + 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer +constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB + +typedef std::bitset(StatType::NUM_TYPES)> StatTypes; + +void update_stat(Stat& stat, int64_t amount) { + stat.current += amount; + + TORCH_INTERNAL_ASSERT( + stat.current >= 0, + "Negative tracked stat in CUDA allocator (likely logic error)."); + + stat.peak = std::max(stat.current, stat.peak); + if (amount > 0) { + stat.allocated += amount; + } + if (amount < 0) { + stat.freed += -amount; + } +} + +void reset_accumulated_stat(Stat& stat) { + stat.allocated = 0; + stat.freed = 0; +} + +void reset_peak_stat(Stat& stat) { + stat.peak = stat.current; +} + +void update_stat_array(StatArray& stat_array, + uint64_t amount, + const StatTypes& stat_types) { + for (const auto stat_type : c10::irange(stat_types.size())) { + if (stat_types[stat_type]) { + update_stat(stat_array[stat_type], amount); + } + } +} + +struct Block; +struct PrivatePool; +typedef bool (*Comparison)(const Block*, const Block*); + +struct BlockPool { + BlockPool( + Comparison comparator, + bool small, + PrivatePool* private_pool = nullptr) + : blocks(comparator), is_small(small), owner_PrivatePool(private_pool) {} + std::set blocks; + const bool is_small; + PrivatePool* owner_PrivatePool; +}; + +struct Block { + int device; // gpu + cudaStream_t stream; // allocation stream + stream_set stream_uses; // streams on which the block was used + size_t 
size; // block size in bytes + BlockPool* pool; // owning memory pool + void* ptr; // memory address + bool allocated; // in-use flag + Block* prev; // prev block if split from a larger allocation + Block* next; // next block if split from a larger allocation + int event_count; // number of outstanding CUDA events + + Block( + int device, + cudaStream_t stream, + size_t size, + BlockPool* pool, + void* ptr) + : device(device), + stream(stream), + stream_uses(), + size(size), + pool(pool), + ptr(ptr), + allocated(0), + prev(nullptr), + next(nullptr), + event_count(0) {} + + // constructor for search key + Block(int device, cudaStream_t stream, size_t size) + : device(device), + stream(stream), + stream_uses(), + size(size), + pool(nullptr), + ptr(nullptr), + allocated(0), + prev(nullptr), + next(nullptr), + event_count(0) {} + + bool is_split() const { + return (prev != nullptr) || (next != nullptr); + } +}; + +static bool BlockComparator(const Block* a, const Block* b) { + if (a->stream != b->stream) { + return (uintptr_t)a->stream < (uintptr_t)b->stream; + } + if (a->size != b->size) { + return a->size < b->size; + } + return (uintptr_t)a->ptr < (uintptr_t)b->ptr; +} + +static std::string format_size(uint64_t size) { + std::ostringstream os; + os.precision(2); + os << std::fixed; + if (size <= 1024) { + os << size << " bytes"; + } else if (size <= 1048576) { + os << (size / 1024.0); + os << " KiB"; + } else if (size <= 1073741824ULL) { + os << size / 1048576.0; + os << " MiB"; + } else { + os << size / 1073741824.0; + os << " GiB"; + } + return os.str(); +} + +struct AllocParams { + AllocParams( + int device, + size_t size, + cudaStream_t stream, + BlockPool* pool, + size_t alloc_size, + DeviceStats& stats) + : search_key(device, stream, size), + pool(pool), + alloc_size(alloc_size), + block(nullptr), + err(cudaSuccess) {} + + int device() const { + return search_key.device; + } + cudaStream_t stream() const { + return search_key.stream; + } + size_t size() const { + return search_key.size; + } + + Block search_key; + BlockPool* pool; + size_t alloc_size; + Block* block; + StatTypes stat_types; + cudaError_t err; +}; + +struct MempoolIdHash { + std::size_t operator()(const MempoolId_t& mempool_id) const noexcept { + return mempool_id.first != 0 ? 
mempool_id.first : mempool_id.second; + } +}; + +cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) { + return cudaMallocManaged(p, size); +} + +} // namespace + +class CachingAllocatorConfig { + public: + static size_t max_split_size() { + return instance().m_max_split_size; + } + + private: + static std::once_flag s_flag; + static CachingAllocatorConfig* s_instance; + static CachingAllocatorConfig& instance() { + std::call_once(s_flag, &CachingAllocatorConfig::init); + return *s_instance; + } + static void init() { + s_instance = new CachingAllocatorConfig(); + s_instance->parseArgs(); + } + + CachingAllocatorConfig() + : m_max_split_size(std::numeric_limits::max()) {} + size_t m_max_split_size; + + void parseArgs() { + const char* val = getenv("PYTORCH_CUDA_ALLOC_CONF"); + if (val != NULL) { + const std::string config(val); + + std::regex exp("[\\s,]+"); + std::sregex_token_iterator it(config.begin(), config.end(), exp, -1); + std::sregex_token_iterator end; + std::vector options(it, end); + + for (auto option : options) { + std::regex exp2("[:]+"); + std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1); + std::sregex_token_iterator end2; + std::vector kv(it2, end2); + if (kv.size() >= 2) { + /* Maximum split size in MB. Limited to large size blocks */ + if (kv[0].compare("max_split_size_mb") == 0) { + size_t val2 = stoi(kv[1]); + TORCH_CHECK( + val2 > kLargeBuffer / (1024 * 1024), + "CachingAllocator option max_split_size_mb too small, must be > ", + kLargeBuffer / (1024 * 1024), + ""); + val2 = std::max(val2, kLargeBuffer / (1024 * 1024)); + val2 = std::min( + val2, (std::numeric_limits::max() / (1024 * 1024))); + m_max_split_size = val2 * 1024 * 1024; + } else { + TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", kv[0]); + } + } + } + } + } +}; +CachingAllocatorConfig* CachingAllocatorConfig::s_instance; +std::once_flag CachingAllocatorConfig::s_flag; + +class ManagedAllocator { + private: + // lock around all operations + mutable std::recursive_mutex mutex; + + // lock around calls to cudaFree (to prevent deadlocks with NCCL) + mutable std::mutex cuda_free_mutex; + + // device statistics + DeviceStats stats; + + // unallocated cached blocks larger than 1 MB + BlockPool large_blocks; + + // unallocated cached blocks 1 MB or smaller + BlockPool small_blocks; + + // allocated or in use by a stream. Holds all active allocations, + // whether they came from graph_pools or one of the BlockPools above. + std::unordered_set active_blocks; + + //TODO: This is going to be redundant with active_blocks. Need to merge + //the two going forward. + // allocated blocks by device pointer + std::unordered_map allocated_blocks; + + // captures_underway tracks if a capture might be underway on any stream. + // Most of the time it's zero, in which case malloc can avoid calling + // cudaStreamGetCaptureInfo in the hot path. + + // See free() for this thing's purpose + std::vector needs_events_deferred_until_no_capture; + // outstanding cuda events + std::deque> cuda_events; + + // record used memory. 
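+  // This counts bytes obtained from cudaMallocManaged() and is compared
+  // against allowed_memory_maximum in alloc_block() when set_fraction is set.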
+ size_t total_allocated_memory = 0; + + size_t allowed_memory_maximum = 0; + + bool set_fraction = false; + + // Maps a capturing stream to its assigned private pool, + // in case we want multiple captures to share the same pool + std::unordered_map capture_to_pool_map; + + void add_allocated_block(Block* block) { + std::lock_guard lock(mutex); + allocated_blocks[block->ptr] = block; + } + + public: + ManagedAllocator() + : large_blocks(BlockComparator, /*is_small=*/false), + small_blocks(BlockComparator, /*is_small=*/true) { + stats.max_split_size = CachingAllocatorConfig::max_split_size(); + } + + std::mutex* getCudaFreeMutex() const { + return &cuda_free_mutex; + } + + Block* get_allocated_block(void* ptr, bool remove = false) { + std::lock_guard lock(mutex); + auto it = allocated_blocks.find(ptr); + if (it == allocated_blocks.end()) { + return nullptr; + } + Block* block = it->second; + if (remove) { + allocated_blocks.erase(it); + } + return block; + } + + cudaError_t recordEvent(void* ptr, at::cuda::CUDAStream stream) + { + + return cudaSuccess; + } + + // All public methods (except the above) acquire the allocator mutex. + // Thus, do not call a public method from another public method. +void malloc(void** devPtr, int device, size_t size, cudaStream_t stream) { + Block* block = malloc(device, size, stream); + add_allocated_block(block); + *devPtr = (void*)block->ptr; + } + + Block* malloc(int device, size_t size, cudaStream_t stream) { + std::unique_lock lock(mutex); + process_events(); + size = round_size(size); + auto& pool = get_pool(size, stream); + const size_t alloc_size = get_allocation_size(size); + AllocParams params(device, size, stream, &pool, alloc_size, stats); + params.stat_types[static_cast(StatType::AGGREGATE)] = true; + params.stat_types[static_cast(get_stat_type_for_pool(pool))] = true; + + bool block_found = + // Search pool + get_free_block(params) + // Trigger callbacks and retry search + || (trigger_free_memory_callbacks(params) && get_free_block(params)) + // Attempt allocate + || alloc_block(params, false) + // Free enough available cached blocks to satisfy alloc and retry alloc. + || + (release_available_cached_blocks(params) && alloc_block(params, false)) + // Free all non-split cached blocks and retry alloc. + || (release_cached_blocks() && alloc_block(params, true)); + + if (!block_found) { + // For any error code other than cudaErrorMemoryAllocation, + // alloc_block should have thrown an exception already. 
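+      // Reaching this point therefore means a genuine out-of-memory condition:
+      // the caches have already been flushed and cudaMallocManaged() still
+      // could not satisfy the request.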
+ TORCH_INTERNAL_ASSERT(params.err == cudaErrorMemoryAllocation); + + size_t device_free; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + std::string allowed_info; + + if (set_fraction) { + allowed_info = format_size(allowed_memory_maximum) + " allowed; "; + } + + stats.num_ooms += 1; + + } + + TORCH_INTERNAL_ASSERT( + params.err == cudaSuccess && params.block != nullptr && + params.block->ptr != nullptr); + Block* block = params.block; + Block* remaining = nullptr; + + const bool already_split = block->is_split(); + if (should_split(block, size)) { + remaining = block; + + block = new Block(device, stream, size, &pool, block->ptr); + block->prev = remaining->prev; + if (block->prev) { + block->prev->next = block; + } + block->next = remaining; + + remaining->prev = block; + remaining->ptr = static_cast(remaining->ptr) + size; + remaining->size -= size; + bool inserted = pool.blocks.insert(remaining).second; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted); + + if (already_split) { + // An already-split inactive block is being shrunk by size bytes. + update_stat_array( + stats.inactive_split_bytes, -block->size, params.stat_types); + } else { + // A new split inactive block is being created from a previously unsplit + // block, size remaining->size bytes. + update_stat_array( + stats.inactive_split_bytes, remaining->size, params.stat_types); + update_stat_array(stats.inactive_split, 1, params.stat_types); + } + } else if (already_split) { + // An already-split block is becoming active + update_stat_array( + stats.inactive_split_bytes, -block->size, params.stat_types); + update_stat_array(stats.inactive_split, -1, params.stat_types); + } + + block->allocated = true; + bool inserted = active_blocks.insert(block).second; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted); + + update_stat_array(stats.allocation, 1, params.stat_types); + update_stat_array(stats.allocated_bytes, block->size, params.stat_types); + update_stat_array(stats.active, 1, params.stat_types); + update_stat_array(stats.active_bytes, block->size, params.stat_types); + if (block->size >= CachingAllocatorConfig::max_split_size()) + update_stat(stats.oversize_allocations, 1); + + c10::reportMemoryUsageToProfiler( + block->ptr, + block->size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + c10::Device(c10::DeviceType::CUDA, device)); + + add_allocated_block(block); + return block; + } + + void free(void* ptr) { + + std::lock_guard lock(mutex); + + // We aren't sure if a free'd block will be used by the same device + // or even the same stream, synchronize for now just in case + cudaDeviceSynchronize(); + if (!ptr) { + return; + } + Block* block = get_allocated_block(ptr, true /* remove */); + if (!block) { + TORCH_CHECK(false, "invalid device pointer: ", ptr); + } + + block->allocated = false; + + // following logic might modifying underlaying Block, causing the size + // changed. 
We store ahead for reporting + auto orig_block_ptr = block->ptr; + auto orig_block_size = block->size; + + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(*(block->pool)))] = + true; + update_stat_array(stats.allocation, -1, {stat_types}); + update_stat_array(stats.allocated_bytes, -block->size, {stat_types}); + if (block->size >= CachingAllocatorConfig::max_split_size()) + update_stat(stats.oversize_allocations, -1); + + if (!block->stream_uses.empty()) { + insert_events(block); + } else { + free_block(block); + } + + c10::reportMemoryUsageToProfiler( + orig_block_ptr, + -orig_block_size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + c10::Device(c10::DeviceType::CUDA, block->device)); + } + + void* getBaseAllocation(void* ptr, size_t* outSize) { + Block* block = get_allocated_block(ptr); + if (!block) { + TORCH_CHECK(false, "invalid device pointer: ", ptr); + } + std::lock_guard lock(mutex); + while (block->prev) { + block = block->prev; + } + void* basePtr = block->ptr; + if (outSize) { + size_t size = 0; + while (block) { + size += block->size; + block = block->next; + } + *outSize = size; + } + return basePtr; + } + + void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) { + if (!ptr.get()) { + return; + } + + // If a tensor is not allocated by this instance, simply skip + // This usually happens when CUDA tensors are shared across processes, + // we have implemented reference counting based sharing mechanism to + // guarantee tensors won't be accidentally freed by one process while + // they are still being used in another + if (ptr.get_deleter() != &raw_delete) + return; + + Block* block = get_allocated_block(ptr.get()); + if (!block) { + TORCH_CHECK(false, "invalid device pointer: ", ptr); + } + std::lock_guard lock(mutex); + if (stream.stream() == block->stream) { + // ignore uses on the allocation stream, since those don't require any + // special synchronization + return; + } + block->stream_uses.insert(stream); + } + + /** set memory fraction to limit maximum allocated memory **/ + void setMemoryFraction(double fraction) { + size_t device_free; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + allowed_memory_maximum = static_cast(fraction * device_total); + set_fraction = true; + } + + /** returns cached blocks to the system allocator **/ + void emptyCache() { + std::lock_guard lock(mutex); + release_cached_blocks(); + } + + /** Retrieves info (total size + largest block) of the memory cache **/ + void cacheInfo(size_t* total, size_t* largest) { + std::lock_guard lock(mutex); + if (*largest == + 0) { // make an initial guess if a zero *largest is passed in + size_t tmp_bytes; + cudaMemGetInfo( + largest, // Use free memory as an optimistic initial guess of *largest + &tmp_bytes); + } + cache_info_aux(large_blocks, total, largest); + cache_info_aux(small_blocks, total, largest); + } + + /** Returns a copy of the memory allocator stats **/ + DeviceStats getStats() { + std::lock_guard lock(mutex); + return stats; + } + + /** Resets the historical accumulation stats for the device **/ + void resetAccumulatedStats() { + std::lock_guard lock(mutex); + + for (const auto statType : + c10::irange(static_cast(StatType::NUM_TYPES))) { + reset_accumulated_stat(stats.allocation[statType]); + reset_accumulated_stat(stats.segment[statType]); + 
reset_accumulated_stat(stats.active[statType]); + reset_accumulated_stat(stats.inactive_split[statType]); + reset_accumulated_stat(stats.allocated_bytes[statType]); + reset_accumulated_stat(stats.reserved_bytes[statType]); + reset_accumulated_stat(stats.active_bytes[statType]); + reset_accumulated_stat(stats.inactive_split_bytes[statType]); + } + + stats.num_alloc_retries = 0; + stats.num_ooms = 0; + reset_accumulated_stat(stats.oversize_allocations); + reset_accumulated_stat(stats.oversize_segments); + } + + /** Resets the historical peak stats for the device **/ + void resetPeakStats() { + std::lock_guard lock(mutex); + + for (const auto statType : + c10::irange(static_cast(StatType::NUM_TYPES))) { + reset_peak_stat(stats.allocation[statType]); + reset_peak_stat(stats.segment[statType]); + reset_peak_stat(stats.active[statType]); + reset_peak_stat(stats.inactive_split[statType]); + reset_peak_stat(stats.allocated_bytes[statType]); + reset_peak_stat(stats.reserved_bytes[statType]); + reset_peak_stat(stats.active_bytes[statType]); + reset_peak_stat(stats.inactive_split_bytes[statType]); + } + reset_peak_stat(stats.oversize_allocations); + reset_peak_stat(stats.oversize_segments); + } + + /** Dump a complete snapshot of the memory held by the allocator. Potentially + * VERY expensive. **/ + std::vector snapshot() const { + std::lock_guard lock(mutex); + + std::vector result; + const auto all_blocks = get_all_blocks(); + + for (const Block* const head_block : all_blocks) { + if (head_block->prev != nullptr) { + continue; + } + result.emplace_back(); + SegmentInfo& segment_info = result.back(); + segment_info.device = head_block->device; + segment_info.address = reinterpret_cast(head_block->ptr); + segment_info.is_large = (!head_block->pool->is_small); + + const Block* block = head_block; + while (block != nullptr) { + segment_info.blocks.emplace_back(); + BlockInfo& block_info = segment_info.blocks.back(); + + block_info.size = block->size; + block_info.allocated = block->allocated; + block_info.active = block->allocated || (block->event_count > 0) || + !block->stream_uses.empty(); + + segment_info.total_size += block_info.size; + if (block_info.allocated) { + segment_info.allocated_size += block_info.size; + } + if (block_info.active) { + segment_info.active_size += block_info.size; + } + + block = block->next; + } + } + + std::sort( + result.begin(), + result.end(), + [](const SegmentInfo& a, const SegmentInfo& b) { + return a.address < b.address; + }); + + return result; + } + + static size_t round_size(size_t size) { + if (size < kMinBlockSize) { + return kMinBlockSize; + } else { + return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize); + } + } + + private: + // All private methods do not acquire the allocator mutex. 
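+  // They rely on the caller (a public method) already holding it.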
+ + std::vector get_all_blocks() const { + std::vector blocks; + blocks.insert( + blocks.end(), small_blocks.blocks.begin(), small_blocks.blocks.end()); + blocks.insert( + blocks.end(), large_blocks.blocks.begin(), large_blocks.blocks.end()); + blocks.insert(blocks.end(), active_blocks.begin(), active_blocks.end()); + return blocks; + } + + /** moves a block into a pool of cached free blocks */ + void free_block(Block* block) { + TORCH_INTERNAL_ASSERT( + !block->allocated && block->event_count == 0 && + block->stream_uses.empty()); + + size_t original_block_size = block->size; + + auto& pool = *block->pool; + int64_t net_change_inactive_split_blocks = 0; + int64_t net_change_inactive_split_size = 0; + + const std::array merge_candidates = {block->prev, block->next}; + for (Block* merge_candidate : merge_candidates) { + const int64_t subsumed_size = + try_merge_blocks(block, merge_candidate, pool); + if (subsumed_size > 0) { + net_change_inactive_split_blocks -= 1; + net_change_inactive_split_size -= subsumed_size; + } + } + + active_blocks.erase(block); + // Makes sure the Block* isn't already present in the pool we're freeing it + // back into. + bool inserted = pool.blocks.insert(block).second; + TORCH_INTERNAL_ASSERT(inserted); + + if (block->is_split()) { + net_change_inactive_split_blocks += 1; + net_change_inactive_split_size += block->size; + } + + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(pool))] = true; + update_stat_array( + stats.inactive_split, net_change_inactive_split_blocks, stat_types); + update_stat_array( + stats.inactive_split_bytes, net_change_inactive_split_size, stat_types); + update_stat_array(stats.active, -1, stat_types); + update_stat_array(stats.active_bytes, -original_block_size, stat_types); + } + + /** combine previously split blocks. returns the size of the subsumed block, + * or 0 on failure. */ + size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) { + if (!src || src->allocated || src->event_count > 0 || + !src->stream_uses.empty()) { + return 0; + } + + AT_ASSERT(dst->is_split() && src->is_split()); + + if (dst->prev == src) { + dst->ptr = src->ptr; + dst->prev = src->prev; + if (dst->prev) { + dst->prev->next = dst; + } + } else { + dst->next = src->next; + if (dst->next) { + dst->next->prev = dst; + } + } + + const size_t subsumed_size = src->size; + dst->size += subsumed_size; + auto erased = pool.blocks.erase(src); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(erased == 1); + delete src; + + return subsumed_size; + } + + BlockPool& get_pool(size_t size, cudaStream_t stream) { + + if (size <= kSmallSize) { + return small_blocks; + } else { + return large_blocks; + } + } + + StatType get_stat_type_for_pool(const BlockPool& pool) { + return pool.is_small ? 
StatType::SMALL_POOL : StatType::LARGE_POOL; + } + + bool should_split(const Block* block, size_t size) { + size_t remaining = block->size - size; + if (block->pool->is_small) { + return remaining >= kMinBlockSize; + } else { + return (size < CachingAllocatorConfig::max_split_size()) && + (remaining > kSmallSize); + } + } + + static size_t get_allocation_size(size_t size) { + if (size <= kSmallSize) { + return kSmallBuffer; + } else if (size < kMinLargeAlloc) { + return kLargeBuffer; + } else { + return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); + } + } + + bool get_free_block(AllocParams& p) { + BlockPool& pool = *p.pool; + auto it = pool.blocks.lower_bound(&p.search_key); + if (it == pool.blocks.end() || (*it)->stream != p.stream()) + return false; + // Do not return an oversized block for a large request + if ((p.size() < CachingAllocatorConfig::max_split_size()) && + ((*it)->size >= CachingAllocatorConfig::max_split_size())) + return false; + // Allow oversized block size to be rounded up but within a limit + if ((p.size() >= CachingAllocatorConfig::max_split_size()) && + ((*it)->size >= p.size() + kLargeBuffer)) + return false; + p.block = *it; + pool.blocks.erase(it); + return true; + } + + bool trigger_free_memory_callbacks(AllocParams& p) { + bool freed_memory = false; + for (const auto& name : FreeCudaMemoryCallbacksRegistry()->Keys()) { + freed_memory |= + FreeCudaMemoryCallbacksRegistry()->Create(name)->Execute(); + } + return freed_memory; + } + + bool alloc_block(AllocParams& p, bool isRetry) { + // Defensively checks for preexisting CUDA error state. + C10_CUDA_CHECK(cudaGetLastError()); + + size_t size = p.alloc_size; + void* ptr; + + if (isRetry) { + stats.num_alloc_retries += 1; + } + + if (set_fraction && + total_allocated_memory + size > allowed_memory_maximum) { + p.err = cudaErrorMemoryAllocation; + return false; + } else { + p.err = cudaMallocMaybeCapturing(&ptr, size); + if (p.err != cudaSuccess) { + if (p.err == cudaErrorMemoryAllocation) { + // If this is the first attempt (!isRetry), we can forgive and clear + // CUDA's + // internal error state. + // If this is the second attempt (isRetry), malloc's TORCH_CHECK_WITH + // will take + // over to throw a helpful exception. The user can choose to catch + // the exception, free some stuff in their script, and attempt their + // allocation again. In this case, we can also forgive and clear + // CUDA's internal error state. + cudaGetLastError(); + } else { + // If the error's unrelated to memory allocation, we should throw + // immediately. + C10_CUDA_CHECK(p.err); + } + return false; + } + } + + total_allocated_memory += size; + p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr); + update_stat_array(stats.segment, 1, p.stat_types); + update_stat_array(stats.reserved_bytes, size, p.stat_types); + if (size >= CachingAllocatorConfig::max_split_size()) + update_stat(stats.oversize_segments, 1); + + // p.block came from new, not cudaMalloc. It should not be nullptr here. + TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); + return true; + } + + /** Free one or more oversize blocks to the system allocator. But only enough + * **/ + /** to satisfy the target size **/ + bool release_available_cached_blocks(const AllocParams& p) { + if (CachingAllocatorConfig::max_split_size() == + std::numeric_limits::max()) + return false; + BlockPool& pool = *p.pool; + Block key = p.search_key; + key.size = (key.size < CachingAllocatorConfig::max_split_size()) + ? 
CachingAllocatorConfig::max_split_size() + : key.size; + auto it = pool.blocks.lower_bound(&key); + if (it == pool.blocks.end() || (*it)->stream != p.stream()) { + // No single block is large enough; free multiple oversize blocks, + // starting with the largest + if (it == pool.blocks.begin()) + return false; + size_t totalReleased = 0; + --it; // Back up one item. Now on the largest block for the correct + // stream + while ((totalReleased < key.size) && + ((*it)->size >= CachingAllocatorConfig::max_split_size()) && + ((*it)->stream == p.stream())) { + auto cur = it; + totalReleased += (*it)->size; + if (it != pool.blocks.begin()) { + --it; + release_block(*cur); + } else { + release_block(*cur); + break; + } + } + if (totalReleased < key.size) + return false; + } else { + release_block(*it); + } + return true; + } + + bool release_cached_blocks() { + // First ensure that all blocks that can't currently be allocated due to + // outstanding events are returned to the pool. + synchronize_and_free_events(); + + // Free all non-split cached blocks to system allocator + release_blocks(large_blocks); + release_blocks(small_blocks); + + return true; + } + + void release_block(Block* block) { + C10_CUDA_CHECK(cudaFree((void*)block->ptr)); + total_allocated_memory -= block->size; + + auto* pool = block->pool; + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(*pool))] = true; + update_stat_array(stats.segment, -1, stat_types); + update_stat_array(stats.reserved_bytes, -block->size, stat_types); + if (block->size >= CachingAllocatorConfig::max_split_size()) + update_stat(stats.oversize_segments, -1); + + pool->blocks.erase(block); + delete block; + } + + void release_blocks(BlockPool& pool) { + // Frees all non-split blocks + auto it = pool.blocks.begin(); + while (it != pool.blocks.end()) { + Block* block = *it; + ++it; + if (!block->prev && !block->next) { + release_block(block); + } + } + } + + cudaEvent_t create_event_internal() { + cudaEvent_t event; + C10_CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + return event; + } + + void free_event_internal(cudaEvent_t event) { + C10_CUDA_CHECK(cudaEventDestroy(event)); + } + + void synchronize_and_free_events() { + // Synchronize on outstanding events and then free associated blocks. 
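+    // Called from release_cached_blocks() (and therefore emptyCache()) so that
+    // blocks still waiting on stream events are returned to their pools before
+    // the pools are released back to the system.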
+ + for (auto& e : cuda_events) { + cudaEvent_t event = e.first; + Block* block = e.second; + + C10_CUDA_CHECK(cudaEventSynchronize(event)); + free_event_internal(event); + + block->event_count--; + if (block->event_count == 0) { + free_block(block); + } + } + + cuda_events.clear(); + } + + void insert_events(Block* block) { + int prev_device; + C10_CUDA_CHECK(cudaGetDevice(&prev_device)); + + stream_set streams(std::move(block->stream_uses)); + AT_ASSERT(block->stream_uses.empty()); + for (auto& stream : streams) { + C10_CUDA_CHECK(cudaSetDevice(stream.device_index())); + + cudaEvent_t event = create_event_internal(); + C10_CUDA_CHECK(cudaEventRecord(event, stream.stream())); + + block->event_count++; + cuda_events.emplace_back(event, block); + } + + C10_CUDA_CHECK(cudaSetDevice(prev_device)); + } + + void insert_events_deferred_until_no_capture() { + if (C10_UNLIKELY(needs_events_deferred_until_no_capture.size() > 0)) { + for (auto* block : needs_events_deferred_until_no_capture) { + TORCH_INTERNAL_ASSERT(!block->stream_uses.empty()); + insert_events(block); + } + needs_events_deferred_until_no_capture.clear(); + } + } + + void process_events() { + insert_events_deferred_until_no_capture(); + + // Process outstanding cudaEvents. Events that are completed are removed + // from the queue, and the 'event_count' for the corresponding allocation + // is decremented. Stops at the first event which has not been completed. + // Since events on different devices or streams may occur out of order, + // the processing of some events may be delayed. + while (!cuda_events.empty()) { + auto& e = cuda_events.front(); + cudaEvent_t event = e.first; + Block* block = e.second; + + cudaError_t err = cudaEventQuery(event); + if (err == cudaErrorNotReady) { + // ignore and clear the error if not ready + cudaGetLastError(); + break; + } else if (err != cudaSuccess) { + C10_CUDA_CHECK(err); + } + + free_event_internal(event); + + block->event_count--; + if (block->event_count == 0) { + free_block(block); + } + cuda_events.pop_front(); + } + } + + // Accumulates sizes of all memory blocks for given device in given pool + void cache_info_aux(const BlockPool& pool, size_t* total, size_t* largest) { + for (const auto& block : pool.blocks) { + const auto blocksize = block->size; + *total += blocksize; + if (blocksize > *largest) { + *largest = blocksize; + } + } + } + +}; + +static ManagedAllocator allocator; + + void setMemoryFraction(double fraction, int device) { + allocator.setMemoryFraction(fraction); + } + + void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) { + allocator.cacheInfo(cachedAndFree, largestBlock); + } + + void* getBaseAllocation(void* ptr, size_t* size) { + return allocator.getBaseAllocation(ptr, size); + } + + void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) { + allocator.recordStream(ptr, stream); + } + +std::mutex* getFreeMutex() { + return allocator.getCudaFreeMutex(); +} + +DeviceStats getDeviceStats() { + return allocator.getStats(); +} + +void resetAccumulatedStats(int device) { + allocator.resetAccumulatedStats(); +} + +void resetPeakStats(int device) { + allocator.resetPeakStats(); +} + +std::vector snapshot() { + return allocator.snapshot(); +} + +void* raw_alloc(size_t nbytes) { + if (nbytes == 0) { + return nullptr; + } + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + allocator.malloc( + &r, device, nbytes, cuda::getCurrentCUDAStream(device)); + return r; +} + +void* raw_alloc_with_stream(size_t nbytes, cudaStream_t 
stream) { + if (nbytes == 0) { + return nullptr; + } + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + allocator.malloc(&r, device, nbytes, stream); + return r; +} + +void raw_delete(void* ptr) { + allocator.free(ptr); +} + +cudaError_t recordEvent(void *ptr, at::cuda::CUDAStream stream) +{ + return allocator.recordEvent(ptr, stream); +} + +void emptyCache() +{ + allocator.emptyCache(); +} + +static void CachingManagedDeleter(void* ptr) { + allocator.free(ptr); +} +bool forceUncachedAllocator() { + static bool force_uncached = + getenv("PYTORCH_NO_CUDA_MEMORY_CACHING") != nullptr; + return force_uncached; +} + +static void uncached_delete(void* ptr) { + C10_CUDA_CHECK(cudaFree(ptr)); +} + +struct CachingManagedAllocator final : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* ptr = nullptr; + if (forceUncachedAllocator()) { + // Deliberately don't use cudaMallocMaybeCapturing here, to force an error + // if someone tries to use forceUncachedAllocator while capturing. + C10_CUDA_CHECK(cudaMallocManaged(&ptr, size)); + return {ptr, ptr, &uncached_delete, Device(DeviceType::CUDA, device)}; + } + if (size != 0) { + ptr = allocator.malloc(device, size, cuda::getCurrentCUDAStream(device))->ptr; + //C10_CUDA_CHECK(cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, device)); + } + return {ptr, ptr, &CachingManagedDeleter, Device(DeviceType::CUDA, device)}; + } + at::DeleterFnPtr raw_deleter() const override { + if (forceUncachedAllocator()) { + return &uncached_delete; + } else { + return &CachingManagedDeleter; + } + } +}; + +struct CachingManagedAllocatorCpu final : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + void* ptr = nullptr; + if (forceUncachedAllocator()) { + // Deliberately don't use cudaMallocMaybeCapturing here, to force an error + // if someone tries to use forceUncachedAllocator while capturing. + C10_CUDA_CHECK(cudaMallocManaged(&ptr, size)); + return {ptr, ptr, &uncached_delete, Device(DeviceType::CPU)}; + } + if (size != 0) { + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + int hint_device = -1; //cudaCpuDeviceId + ptr = allocator.malloc(hint_device, size, cuda::getCurrentCUDAStream(device))->ptr; + + //C10_CUDA_CHECK(cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, hint_device)); + //C10_CUDA_CHECK(cudaMemAdvise(ptr, size, cudaMemAdviseSetAccessedBy, device)); + } + return {ptr, ptr, &CachingManagedDeleter, Device(DeviceType::CPU)}; + } + at::DeleterFnPtr raw_deleter() const override { + if (forceUncachedAllocator()) { + return &uncached_delete; + } else { + return &CachingManagedDeleter; + } + } +}; + +static CachingManagedAllocator caching_managed_allocator; + +static CachingManagedAllocatorCpu caching_managed_allocator_cpu; + +at::Allocator* get(DeviceType device_type) { + if (device_type == DeviceType::CUDA) + return &caching_managed_allocator; + else + return &caching_managed_allocator_cpu; +} + +} // namespace CachingManagedAllocator + +}} // namespace at::cuda diff --git a/aten/src/ATen/cuda/CachingManagedAllocator.h b/aten/src/ATen/cuda/CachingManagedAllocator.h new file mode 100644 index 0000000000000..ff37646218c40 --- /dev/null +++ b/aten/src/ATen/cuda/CachingManagedAllocator.h @@ -0,0 +1,121 @@ +#pragma once + +#include +#include + +namespace at { +namespace cuda { + +// +// A caching allocator for UVM allocations. 
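+// Blocks are obtained with cudaMallocManaged(), so a single pointer is valid
+// on both the host and the device and pages migrate between them on demand.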
+// +// This provides a drop-in replacement for THCudaHostAllocator, which re-uses +// freed pinned (page-locked) memory allocations. This avoids device +// synchronizations due to cudaFreeHost calls. +// +// To ensure correct behavior, THCCachingHostAllocator_recordEvent must be +// called anytime a pointer from this allocator is used in a cudaMemcpyAsync +// call between host and device. We implement this for storages and tensors in +// copy_from_cpu_async_ and copy_to_cpu_async_. +// +// Note that this allocator does not split larger allocations into smaller +// blocks, unlike the caching device allocator. +// +namespace CachingManagedAllocator { + +struct Stat { + int64_t current = 0; + int64_t peak = 0; + int64_t allocated = 0; + int64_t freed = 0; +}; + +enum struct StatType : uint64_t { + AGGREGATE = 0, + SMALL_POOL = 1, + LARGE_POOL = 2, + NUM_TYPES = 3 // remember to update this whenever a new stat type is added +}; + +typedef std::array(StatType::NUM_TYPES)> StatArray; + +struct DeviceStats { + // COUNT: allocations requested by client code + StatArray allocation; + // COUNT: number of allocated segments from cudaMalloc(). + StatArray segment; + // COUNT: number of active memory blocks (allocated or used by stream) + StatArray active; + // COUNT: number of inactive, split memory blocks (unallocated but can't be + // released via cudaFree) + StatArray inactive_split; + + // SUM: bytes requested by client code + StatArray allocated_bytes; + // SUM: bytes reserved by this memory allocator (both free and used) + StatArray reserved_bytes; + // SUM: bytes within active memory blocks + StatArray active_bytes; + // SUM: bytes within inactive, split memory blocks + StatArray inactive_split_bytes; + + // COUNT: total number of failed calls to CUDA malloc necessitating cache + // flushes. + int64_t num_alloc_retries = 0; + + // COUNT: total number of OOMs (i.e. failed calls to CUDA after cache flush) + int64_t num_ooms = 0; + + // COUNT: total number of oversize blocks allocated from pool + Stat oversize_allocations; + + // COUNT: total number of oversize blocks requiring malloc + Stat oversize_segments; + + // SIZE: maximum block size that is allowed to be split. + int64_t max_split_size = 0; +}; + +// Struct containing info of an allocation block (i.e. a fractional part of a +// cudaMalloc).. +struct BlockInfo { + int64_t size = 0; + bool allocated = false; + bool active = false; +}; + +// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc). +struct SegmentInfo { + int64_t device = 0; + int64_t address = 0; + int64_t total_size = 0; + int64_t allocated_size = 0; + int64_t active_size = 0; + bool is_large = false; + std::vector blocks; +}; + +TORCH_CUDA_CPP_API c10::Allocator* get(DeviceType device_type); + +// Records an event in the specified stream. The allocation 'ptr' will not be +// re-used until the event has occurred. 
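+// Note: the current ManagedAllocator implements this as a stub and simply
+// returns cudaSuccess without recording an event.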
+TORCH_CUDA_CPP_API cudaError_t +recordEvent(void* ptr, c10::cuda::CUDAStream stream); + +// Releases cached pinned memory allocations via cudaHostFree +TORCH_CUDA_CPP_API void emptyCache(); +TORCH_CUDA_CPP_API DeviceStats getDeviceStats(); +TORCH_CUDA_CPP_API void* raw_alloc(size_t nbytes); +TORCH_CUDA_CPP_API void* raw_alloc_with_stream( + size_t nbytes, + cudaStream_t stream); +TORCH_CUDA_CPP_API void raw_delete(void* ptr); + +// inline TORCH_CUDA_CPP_API at::DataPtr ManagedAlloc(size_t size) { +// return getCachingManagedAllocator()->allocate(size); +// } + +} // namespace CachingManagedAllocator + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/ThrustAllocator.h b/aten/src/ATen/cuda/ThrustAllocator.h index 92fc5cc13ba08..eeceb462c0fd1 100644 --- a/aten/src/ATen/cuda/ThrustAllocator.h +++ b/aten/src/ATen/cuda/ThrustAllocator.h @@ -2,6 +2,7 @@ #include #include +#include namespace at { namespace cuda { @@ -13,11 +14,17 @@ class ThrustAllocator { typedef char value_type; char* allocate(std::ptrdiff_t size) { - return static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(size)); + if (at::globalContext().userEnabledUVM()) + return static_cast(at::cuda::CachingManagedAllocator::raw_alloc(size)); + else + return static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(size)); } void deallocate(char* p, size_t size) { - c10::cuda::CUDACachingAllocator::raw_delete(p); + if (at::globalContext().userEnabledUVM()) + return at::cuda::CachingManagedAllocator::raw_delete(p); + else + return c10::cuda::CUDACachingAllocator::raw_delete(p); } }; diff --git a/aten/src/ATen/cuda/UvmMemoryAllocator.h b/aten/src/ATen/cuda/UvmMemoryAllocator.h new file mode 100644 index 0000000000000..2d3cbf61b2c46 --- /dev/null +++ b/aten/src/ATen/cuda/UvmMemoryAllocator.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include + +namespace at { namespace cuda { + +inline TORCH_CUDA_CPP_API at::Allocator* getUnifiedDeviceAllocator() { + return CachingManagedAllocator::get(DeviceType::CUDA); +} + +inline TORCH_CUDA_CPP_API at::Allocator* getUnifiedDeviceAllocatorCpu() { + return CachingManagedAllocator::get(DeviceType::CPU); +} + +}} // namespace at::cuda diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp index 89c1246a32d14..ef037202c7f2e 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -940,11 +941,15 @@ inline static void apply_cholesky_cusolver_potrf_looped(const Tensor& self_worki at::cuda::solver::xpotrf_buffersize(handle, params, uplo, n, datatype, nullptr, lda, datatype, &worksize_device, &worksize_host); // allocate workspace storage - auto& device_allocator = *at::cuda::getCUDADeviceAllocator(); + auto& device_allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); + auto& host_allocator = (at::globalContext().userEnabledUVM()) ? 
+ *at::cuda::getUnifiedDeviceAllocator() : + *at::getCPUAllocator(); auto workdata_device = device_allocator.allocate(worksize_device * batch_size); void* workdata_device_ptr = workdata_device.get(); - auto& host_allocator = *at::getCPUAllocator(); auto workdata_host = host_allocator.allocate(worksize_host * batch_size); void* workdata_host_ptr = workdata_host.get(); @@ -968,7 +973,9 @@ inline static void apply_cholesky_cusolver_potrf_looped(const Tensor& self_worki handle, uplo, n_32, nullptr, lda_32, &lwork); // allocate workspace storage - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t)*lwork * batch_size); scalar_t* work_data_ptr = static_cast(work_data.get()); @@ -1215,9 +1222,13 @@ static void apply_geqrf(const Tensor& A, const Tensor& tau) { #ifdef USE_CUSOLVER_64_BIT // allocate workspace storage on device and host - auto& device_allocator = *at::cuda::getCUDADeviceAllocator(); + auto& device_allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); + auto& host_allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::getCPUAllocator(); auto work_device_data = device_allocator.allocate(worksize_device); - auto& host_allocator = *at::getCPUAllocator(); auto work_host_data = host_allocator.allocate(worksize_host); at::cuda::solver::xgeqrf( handle, @@ -1234,7 +1245,12 @@ static void apply_geqrf(const Tensor& A, const Tensor& tau) { infos_data); #else // allocate workspace storage on device - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& device_allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); + auto& host_allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::getCPUAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t) * std::max(1, lwork)); at::cuda::solver::geqrf( handle, @@ -1311,7 +1327,9 @@ static void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& ot auto handle = at::cuda::getCurrentCUDASolverDnHandle(); // allocate workspace storage - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t)*lwork); at::cuda::solver::ormqr( @@ -1388,7 +1406,9 @@ inline static void apply_orgqr(Tensor& self, const Tensor& tau) { auto handle = at::cuda::getCurrentCUDASolverDnHandle(); // allocate workspace storage - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t)*lwork); at::cuda::solver::orgqr( @@ -1465,9 +1485,13 @@ static void apply_syevd(const Tensor& values, const Tensor& vectors, const Tenso #ifdef USE_CUSOLVER_64_BIT // allocate workspace storage on device and host - auto& device_allocator = *at::cuda::getCUDADeviceAllocator(); + auto& device_allocator = (at::globalContext().userEnabledUVM()) ? 
+ *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); + auto& host_allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::getCPUAllocator(); auto work_device_data = device_allocator.allocate(worksize_device); - auto& host_allocator = *at::getCPUAllocator(); auto work_host_data = host_allocator.allocate(worksize_host); at::cuda::solver::xsyevd( handle, @@ -1485,7 +1509,9 @@ static void apply_syevd(const Tensor& values, const Tensor& vectors, const Tenso info_working_ptr); #else // allocate workspace storage on device - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t) * lwork); at::cuda::solver::syevd( handle, @@ -1541,7 +1567,9 @@ static void apply_syevj(const Tensor& values, const Tensor& vectors, const Tenso auto handle = at::cuda::getCurrentCUDASolverDnHandle(); // allocate workspace storage on device - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t) * lwork); at::cuda::solver::syevj( handle, @@ -1601,7 +1629,9 @@ static void apply_syevj_batched(const Tensor& values, const Tensor& vectors, con batch_size); // allocate workspace storage on device - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t) * lwork); at::cuda::solver::syevjBatched( handle, diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index f5c7af79a740b..3ea2c82104844 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -138,14 +139,17 @@ struct Workspace { // workspace fail with some 64bit indexing error instead of an OOM error. In such case, // we manually fail with OOM. TORCH_CHECK_WITH(OutOfMemoryError, size < 1_TiB, "Not enough memory for workspace!"); - data = c10::cuda::CUDACachingAllocator::raw_alloc(size); + data = at::globalContext().userEnabledUVM() ? at::cuda::CachingManagedAllocator::raw_alloc(size) : c10::cuda::CUDACachingAllocator::raw_alloc(size); } Workspace(const Workspace&) = delete; Workspace(Workspace&&) = default; Workspace& operator=(Workspace&&) = default; ~Workspace() { if (data) { - c10::cuda::CUDACachingAllocator::raw_delete(data); + if (at::globalContext().userEnabledUVM()) + at::cuda::CachingManagedAllocator::raw_delete(data); + else + c10::cuda::CUDACachingAllocator::raw_delete(data); } } diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 060a97d6fc1c1..8215b8119cdf5 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -148,6 +148,7 @@ at::Tensor miopen_convolution_relu( #include #include +#include #include #include @@ -291,14 +292,17 @@ BenchmarkCache bwd_filter_wssizes; struct Workspace { Workspace(size_t size) : size(size), data(NULL) { - data = c10::hip::HIPCachingAllocator::raw_alloc(size); + data = at::globalContext().userEnabledUVM() ? 
at::cuda::CachingManagedAllocator::raw_alloc(size) : c10::hip::HIPCachingAllocator::raw_alloc(size); } Workspace(const Workspace&) = delete; Workspace(Workspace&&) = default; Workspace& operator=(Workspace&&) = default; ~Workspace() { if (data) { - c10::hip::HIPCachingAllocator::raw_delete(data); + if (at::globalContext().userEnabledUVM()) + at::cuda::CachingManagedAllocator::raw_delete(data); + else + c10::hip::HIPCachingAllocator::raw_delete(data); } } From 5d2b294caf2d3bc4e0c8935a933486eb61d4ab95 Mon Sep 17 00:00:00 2001 From: Jason Furmanek Date: Wed, 7 Sep 2022 14:19:10 +0000 Subject: [PATCH 3/9] [UVM] Add CUDA Hooks, isManagedPtr, tensor.is_managed() --- aten/src/ATen/Context.h | 3 ++ aten/src/ATen/cuda/UvmMemoryAllocator.cpp | 44 ++++++++++++++++ aten/src/ATen/cuda/detail/CUDAHooks.cpp | 51 +++++++++++++++++++ aten/src/ATen/cuda/detail/CUDAHooks.h | 3 ++ aten/src/ATen/detail/CUDAHooksInterface.h | 12 +++++ aten/src/ATen/native/Memory.cpp | 11 ++++ aten/src/ATen/native/native_functions.yaml | 14 +++++ aten/src/ATen/quantized/Quantizer.cpp | 4 +- .../ATen/templates/RegisterBackendSelect.cpp | 15 ++++++ c10/core/TensorOptions.h | 41 +++++++++++++++ tools/autograd/derivatives.yaml | 3 ++ torch/csrc/StorageMethods.cpp | 14 +++++ torch/csrc/cuda/Module.cpp | 17 +++++++ torch/cuda/memory.py | 9 ++++ torch/storage.py | 9 ++++ torch/utils/hipify/cuda_to_hip_mappings.py | 1 + 16 files changed, 250 insertions(+), 1 deletion(-) create mode 100644 aten/src/ATen/cuda/UvmMemoryAllocator.cpp diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index ede1aeea02c7c..fb755cabf4c84 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -55,6 +55,9 @@ class TORCH_API Context { static bool isPinnedPtr(void* data) { return detail::getCUDAHooks().isPinnedPtr(data); } + static bool isManagedPtr(void* data) { + return detail::getCUDAHooks().isManagedPtr(data); + } static bool hasOpenMP(); static bool hasMKL(); static bool hasLAPACK(); diff --git a/aten/src/ATen/cuda/UvmMemoryAllocator.cpp b/aten/src/ATen/cuda/UvmMemoryAllocator.cpp new file mode 100644 index 0000000000000..199046b75faed --- /dev/null +++ b/aten/src/ATen/cuda/UvmMemoryAllocator.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { + +namespace native { + +bool is_managed_cuda(const Tensor& self, c10::optional device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + (!device.has_value() && device->is_cpu()) || device->is_cuda()); + // TODO: unhook this + return detail::getCUDAHooks().isManagedPtr(self.storage().data()); +} + +Tensor _manage_memory_cuda(const Tensor& self, c10::optional device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + (!device.has_value() && device->is_cpu()) || device->is_cuda()); + at::Allocator* allocator = nullptr; + if (self.is_cpu()) { + allocator = at::cuda::getUnifiedDeviceAllocatorCpu(); + } else if (self.is_cuda()) { + allocator = at::cuda::getUnifiedDeviceAllocator(); + } + size_t size_bytes = detail::computeStorageNbytes( + self.sizes(), self.strides(), self.dtype().itemsize()); + auto storage = Storage( + Storage::use_byte_size_t(), + size_bytes, + allocator, + /*resizable=*/false); + auto tensor = at::empty({0}, self.options()) + .set_(storage, 0, self.sizes(), self.strides()); + tensor.copy_(self); + return tensor; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 25e4c2b44fa99..40c6d31288618 100644 --- 
a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -123,6 +124,9 @@ bool CUDAHooks::isPinnedPtr(void* data) const { return false; } #endif +#if defined(USE_ROCM) + return attr.isManaged == cudaMemoryTypeHost; +#endif #if defined(CUDA_VERSION) && CUDA_VERSION >= 10000 return attr.type == cudaMemoryTypeHost; #else @@ -130,6 +134,45 @@ bool CUDAHooks::isPinnedPtr(void* data) const { #endif } +bool CUDAHooks::isManagedPtr(void* data) const { + // First check if driver is broken/missing, in which case PyTorch CPU + // functionalities should still work, we should report `false` here. + if (!at::cuda::is_available()) { + return false; + } + // cudaPointerGetAttributes grabs context on the current device, so we set + // device to one that already has context, if exists. + at::OptionalDeviceGuard device_guard; + auto primary_ctx_device_index = getDeviceIndexWithPrimaryContext(); + if (primary_ctx_device_index.has_value()) { + device_guard.reset_device(at::Device(at::DeviceType::CUDA, *primary_ctx_device_index)); + } + cudaPointerAttributes attr; + cudaError_t err = cudaPointerGetAttributes(&attr, data); +#if !defined(USE_ROCM) + if (err == cudaErrorInvalidValue) { + cudaGetLastError(); + return false; + } + AT_CUDA_CHECK(err); +#else + // HIP throws hipErrorUnknown here + if (err != cudaSuccess) { + cudaGetLastError(); + return false; + } +#endif +// NB: Potential temporary hack until we see a change in setting the "isManaged" attribute like CUDA does +#if defined(USE_ROCM) + return attr.isManaged; +#endif +#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000 + return attr.type == cudaMemoryTypeManaged; +#else + return attr.memoryType == cudaMemoryTypeManaged; +#endif +} + bool CUDAHooks::hasCUDA() const { return at::cuda::is_available(); } @@ -249,6 +292,14 @@ Allocator* CUDAHooks::getCUDADeviceAllocator() const { return at::cuda::getCUDADeviceAllocator(); } +Allocator* CUDAHooks::getUnifiedDeviceAllocator() const { + return at::cuda::getUnifiedDeviceAllocator(); +} + +Allocator* CUDAHooks::getUnifiedDeviceAllocatorCpu() const { + return at::cuda::getUnifiedDeviceAllocatorCpu(); +} + bool CUDAHooks::compiledWithCuDNN() const { return AT_CUDNN_ENABLED(); } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index d53276ab3bbac..3dafc8395b7d9 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -24,6 +24,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { void initCUDA() const override; Device getDeviceFromPtr(void* data) const override; bool isPinnedPtr(void* data) const override; + bool isManagedPtr(void* data) const override; const Generator& getDefaultCUDAGenerator(DeviceIndex device_index = -1) const override; bool hasCUDA() const override; bool hasMAGMA() const override; @@ -35,6 +36,8 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasPrimaryContext(int64_t device_index) const override; Allocator* getCUDADeviceAllocator() const override; Allocator* getPinnedMemoryAllocator() const override; + Allocator* getUnifiedDeviceAllocator() const override; + Allocator* getUnifiedDeviceAllocatorCpu() const override; bool compiledWithCuDNN() const override; bool compiledWithMIOpen() const override; bool supportsDilatedConvolutionWithCuDNN() const override; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 7ba8f68d94b20..d96b59defa4c2 
100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -87,6 +87,10 @@ struct TORCH_API CUDAHooksInterface { return false; } + virtual bool isManagedPtr(void* data) const { + return false; + } + virtual bool hasCUDA() const { return false; } @@ -131,6 +135,14 @@ struct TORCH_API CUDAHooksInterface { TORCH_CHECK(false, "CUDADeviceAllocator requires CUDA. ", CUDA_HELP); } + virtual Allocator* getUnifiedDeviceAllocator() const { + TORCH_CHECK(false, "Unified Device Allocator requires CUDA. ", CUDA_HELP); + } + + virtual Allocator* getUnifiedDeviceAllocatorCpu() const { + TORCH_CHECK(false, "Unified Device Allocator requires CUDA. ", CUDA_HELP); + } + virtual bool compiledWithCuDNN() const { return false; } diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp index 2b66f08933934..e83b0bf060011 100644 --- a/aten/src/ATen/native/Memory.cpp +++ b/aten/src/ATen/native/Memory.cpp @@ -37,5 +37,16 @@ Tensor pin_memory(const Tensor& self, c10::optional device) { return at::_pin_memory(self, device); } +bool is_managed_default(const Tensor& self, c10::optional device) { + return false; } + +Tensor manage_memory(const Tensor& self, c10::optional device) { + if (self.is_managed(device)) { + return self; + } + return at::_manage_memory(self, device); } + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index abf94179f70a1..ce7531727ad1d 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -4199,6 +4199,20 @@ MPS: _pin_memory_mps autogen: _pin_memory.out +- func: is_managed(Tensor self, Device? device=None) -> bool + variants: method + dispatch: + CUDA: is_managed_cuda + CompositeExplicitAutograd: is_managed_default + +- func: manage_memory(Tensor(a) self, Device? device=None) -> Tensor(a) + variants: method + +# Unlike manage_memory, this is guaranteed to give a new non-aliasing tensor +- func: _manage_memory(Tensor self, Device? device=None) -> Tensor + dispatch: + CUDA: _manage_memory_cuda + - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor variants: function, method diff --git a/aten/src/ATen/quantized/Quantizer.cpp b/aten/src/ATen/quantized/Quantizer.cpp index 54fe7082c3e79..e2afa8181756c 100644 --- a/aten/src/ATen/quantized/Quantizer.cpp +++ b/aten/src/ATen/quantized/Quantizer.cpp @@ -115,7 +115,9 @@ inline Tensor new_qtensor( auto device = options.device(); at::Allocator* allocator = nullptr; // TODO: why isn't this just using GetAllocator - if (device.is_cuda()) { + if (at::globalContext().userEnabledUVM()) { + allocator = at::detail::getCUDAHooks().getUnifiedDeviceAllocator(); + } else if (device.is_cuda()) { allocator = at::detail::getCUDAHooks().getCUDADeviceAllocator(); } else if (device.is_cpu()) { allocator = at::getCPUAllocator(); diff --git a/aten/src/ATen/templates/RegisterBackendSelect.cpp b/aten/src/ATen/templates/RegisterBackendSelect.cpp index 6463701a4939f..8373849f044fb 100644 --- a/aten/src/ATen/templates/RegisterBackendSelect.cpp +++ b/aten/src/ATen/templates/RegisterBackendSelect.cpp @@ -13,6 +13,8 @@ #else #include #include +#include +#include ${ops_headers} #endif @@ -39,10 +41,23 @@ at::Tensor _pin_memory(const Tensor& self, c10::optional device) { return at::_ops::_pin_memory::redispatch(_dk, self, device); } +bool is_managed(const Tensor& self, c10::optional device) { + // TODO: fetch scalar type from Tensor? But it doesn't really matter... 
+ DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(c10::nullopt, self.layout(), device.value_or(at::kCUDA))); + return at::_ops::is_managed::redispatch(_dk, self, device); +} + +at::Tensor _manage_memory(const Tensor& self, c10::optional device) { + DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(c10::nullopt, self.layout(), device.value_or(at::kCUDA))); + return at::_ops::_manage_memory::redispatch(_dk, self, device); +} + TORCH_LIBRARY_IMPL(aten, BackendSelect, m) { ${backend_select_function_registrations}; m.impl(TORCH_SELECTIVE_NAME("aten::is_pinned"), TORCH_FN(is_pinned)); m.impl(TORCH_SELECTIVE_NAME("aten::_pin_memory"), TORCH_FN(_pin_memory)); + m.impl(TORCH_SELECTIVE_NAME("aten::is_managed"), TORCH_FN(is_managed)); + m.impl(TORCH_SELECTIVE_NAME("aten::_manage_memory"), TORCH_FN(_manage_memory)); } } // namespace diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index 432fe4f1e4b6c..0bbac3a6651b3 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -45,6 +45,10 @@ inline bool pinned_memory_or_default(c10::optional pinned_memory) { return pinned_memory.value_or(false); } +inline bool managed_memory_or_default(c10::optional managed_memory) { + return managed_memory.value_or(false); +} + /// A class to encapsulate construction axes of an Tensor. TensorOptions was /// designed to support the Python style API for specifying construction options /// on factory functions, e.g., @@ -136,11 +140,13 @@ struct C10_API TensorOptions { TensorOptions() : requires_grad_(false), pinned_memory_(false), + managed_memory_(false), has_device_(false), has_dtype_(false), has_layout_(false), has_requires_grad_(false), has_pinned_memory_(false), + has_managed_memory_(false), has_memory_format_(false) {} /// Constructs a `TensorOptions` object with the given layout. @@ -263,6 +269,14 @@ struct C10_API TensorOptions { return r; } + /// Sets the `managed_memory` property on the `TensorOptions`. + C10_NODISCARD TensorOptions + managed_memory(c10::optional managed_memory) const noexcept { + TensorOptions r = *this; + r.set_managed_memory(managed_memory); + return r; + } + /// Sets the `memory_format` property on `TensorOptions`. C10_NODISCARD TensorOptions memory_format(c10::optional memory_format) const noexcept { @@ -351,6 +365,14 @@ struct C10_API TensorOptions { return has_pinned_memory_; } + bool managed_memory() const noexcept { + return managed_memory_or_default(managed_memory_opt()); + } + + bool has_managed_memory() const noexcept { + return has_managed_memory_; + } + /// Returns if the layout is sparse bool is_sparse() const { return layout_ == c10::Layout::Sparse; @@ -373,6 +395,11 @@ struct C10_API TensorOptions { : c10::nullopt; } + c10::optional managed_memory_opt() const noexcept { + return has_managed_memory_ ? c10::make_optional(managed_memory_) + : c10::nullopt; + } + /// Returns whether the `memory_layout` is specified bool has_memory_format() const noexcept { return has_memory_format_; @@ -417,6 +444,8 @@ struct C10_API TensorOptions { merged.set_requires_grad(options.requires_grad_opt()); if (options.has_pinned_memory()) merged.set_pinned_memory(options.pinned_memory_opt()); + if (options.has_managed_memory()) + merged.set_managed_memory(options.managed_memory_opt()); if (options.has_memory_format()) merged.set_memory_format(options.memory_format_opt()); return merged; @@ -514,6 +543,16 @@ struct C10_API TensorOptions { } } + /// Mutably set the `managed_memory` property of `TensorOptions`. 
+ void set_managed_memory(c10::optional managed_memory) & noexcept { + if (managed_memory) { + managed_memory_ = *managed_memory; + has_managed_memory_ = true; + } else { + has_managed_memory_ = false; + } + } + /// Mutably set the `memory_Format` property of `TensorOptions`. void set_memory_format(c10::optional memory_format) & noexcept { if (memory_format) { @@ -546,12 +585,14 @@ struct C10_API TensorOptions { bool requires_grad_ : 1; bool pinned_memory_ : 1; + bool managed_memory_ : 1; bool has_device_ : 1; bool has_dtype_ : 1; bool has_layout_ : 1; bool has_requires_grad_ : 1; bool has_pinned_memory_ : 1; + bool has_managed_memory_ : 1; bool has_memory_format_ : 1; }; diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 38429d8e58c86..04bdaa79d202b 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -2726,6 +2726,9 @@ - name: _pin_memory(Tensor self, Device? device=None) -> Tensor self: grad +- name: _manage_memory(Tensor self, Device? device=None) -> Tensor + self: grad + - name: _new_zeros_with_same_feature_meta(Tensor self, Tensor other, *, int self_num_batch_dims=0) -> Tensor self: non_differentiable other: non_differentiable diff --git a/torch/csrc/StorageMethods.cpp b/torch/csrc/StorageMethods.cpp index 29f0f67ce6ecb..0537842c9c2c6 100644 --- a/torch/csrc/StorageMethods.cpp +++ b/torch/csrc/StorageMethods.cpp @@ -91,6 +91,19 @@ static PyObject* THPStorage_isPinned(PyObject* _self, PyObject* noargs) { END_HANDLE_TH_ERRORS } +static PyObject* THPStorage_isManaged(PyObject* _self, PyObject* noargs) { + HANDLE_TH_ERRORS + auto self = (THPStorage*)_self; +#if defined(USE_CUDA) + return PyBool_FromLong( + at::globalContext().isManagedPtr(self->cdata->data())); +#else + Py_RETURN_FALSE; +#endif + END_HANDLE_TH_ERRORS +} + + static PyObject* THPStorage_elementSize(PyObject* _self, PyObject* noargs) { HANDLE_TH_ERRORS return THPUtils_packInt64(sizeof(uint8_t)); @@ -502,6 +515,7 @@ static PyMethodDef THPStorage_methods[] = { {"nbytes", THPStorage_nbytes, METH_NOARGS, nullptr}, {"data_ptr", THPStorage_dataPtr, METH_NOARGS, nullptr}, {"is_pinned", THPStorage_isPinned, METH_NOARGS, nullptr}, + {"is_managed", THPStorage_isManaged, METH_NOARGS, nullptr}, {"_write_file", THPStorage_writeFile, METH_VARARGS, nullptr}, {"_new_with_file", THPStorage_newWithFile, diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index 468571fc454ee..b98d53ed68bb7 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -237,6 +238,22 @@ PyObject* THCPModule_cudaCachingAllocator_raw_alloc( END_HANDLE_TH_ERRORS } +PyObject * THCPModule_cudaUnifiedDeviceAllocator(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + c10::Allocator* allocator = at::cuda::getUnifiedDeviceAllocator(); + return PyLong_FromVoidPtr(allocator); + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_cudaUnifiedDeviceAllocatorCpu(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + c10::Allocator* allocator = at::cuda::getUnifiedDeviceAllocatorCpu(); + return PyLong_FromVoidPtr(allocator); + END_HANDLE_TH_ERRORS +} + // Unpack a PyObject to at::Scalar, throw an exception if it fails at::Scalar as_scalar(PyObject* arg) { // Zero-dim tensors are converted to Scalars as-is. 
Note this doesn't diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index 047e393b125aa..b4d084abb18b2 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -37,6 +37,15 @@ def _host_allocator(): _lazy_init() return torch._C._cuda_cudaHostAllocator() +def _manage_allocator(): + _lazy_init() + return torch._C._cuda_cudaUnifiedDeviceAllocator() + + +def _manage_cpu_allocator(): + _lazy_init() + return torch._C._cuda_cudaUnifiedDeviceAllocatorCpu() + @contextlib.contextmanager def _free_mutex(): diff --git a/torch/storage.py b/torch/storage.py index 38ec5238cd735..de39213f29c5b 100644 --- a/torch/storage.py +++ b/torch/storage.py @@ -193,6 +193,15 @@ def pin_memory(self): allocator = torch.cuda.memory._host_allocator() # type: ignore[attr-defined] return type(self)(self.size(), allocator=allocator).copy_(self) + def manage_memory(self): + """Copies the storage to manage memory, if it's not already manage.""" + import torch.cuda + if self.is_cuda: + allocator = torch.cuda._manage_allocator() # type: ignore[attr-defined] + else: + allocator = torch.cuda._manage_cpu_allocator() # type: ignore[attr-defined] + return type(self)(self.size(), allocator=allocator).copy_(self) + def share_memory_(self): """Moves the storage to shared memory. diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index 1272a2fe3d00d..0b59d7aba3ecd 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -4069,6 +4069,7 @@ ("cudaHostAlloc", ("hipHostMalloc", CONV_MEM, API_RUNTIME)), ("cudaMemoryTypeHost", ("hipMemoryTypeHost", CONV_MEM, API_RUNTIME)), ("cudaMemoryTypeDevice", ("hipMemoryTypeDevice", CONV_MEM, API_RUNTIME)), + ("cudaMemoryTypeManaged", ("hipMemoryTypeUnified", CONV_MEM, API_RUNTIME)), ("make_cudaExtent", ("make_hipExtent", CONV_MEM, API_RUNTIME)), ("make_cudaPitchedPtr", ("make_hipPitchedPtr", CONV_MEM, API_RUNTIME)), ("make_cudaPos", ("make_hipPos", CONV_MEM, API_RUNTIME)), From dcebdcaed52cea9379242400341a70803cdf1f1f Mon Sep 17 00:00:00 2001 From: Jason Furmanek Date: Tue, 30 Aug 2022 14:09:44 +0000 Subject: [PATCH 4/9] [UVM] Enable UVM stats module --- torch/csrc/cuda/Module.cpp | 59 +++++++++++++++++++++++++-- torch/cuda/memory.py | 81 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 3 deletions(-) diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index b98d53ed68bb7..b3d9daeb7e4cd 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -531,9 +531,61 @@ PyObject* THCPModule_memoryStats(PyObject* _unused, PyObject* arg) { END_HANDLE_TH_ERRORS } -PyObject* THCPModule_resetAccumulatedMemoryStats( - PyObject* _unused, - PyObject* arg) { +PyObject* THCPModule_managedMemoryStats(PyObject *_unused, PyObject *arg) +{ + HANDLE_TH_ERRORS + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to memory_allocated"); + const int device = (int) THPUtils_unpackLong(arg); + + using at::cuda::CachingManagedAllocator::StatType; + using at::cuda::CachingManagedAllocator::Stat; + using at::cuda::CachingManagedAllocator::StatArray; + using at::cuda::CachingManagedAllocator::DeviceStats; + + const auto statToDict = [](const Stat& stat) { + py::dict dict; + + dict["current"] = stat.current; + dict["peak"] = stat.peak; + dict["allocated"] = stat.allocated; + dict["freed"] = stat.freed; + return dict; + }; + + const auto statArrayToDict = [=](const StatArray& statArray) { + const std::array(StatType::NUM_TYPES)> statTypeNames = { + "all", 
"small_pool", "large_pool" + }; + py::dict dict; + for (const auto i : c10::irange(statTypeNames.size())) { + dict[statTypeNames[i]] = statToDict(statArray[i]); + } + return dict; + }; + + const DeviceStats stats = at::cuda::CachingManagedAllocator::getDeviceStats(); + + py::dict result; + result["num_alloc_retries"] = stats.num_alloc_retries; + result["num_ooms"] = stats.num_ooms; + result["max_split_size"] = stats.max_split_size; + result["allocation"] = statArrayToDict(stats.allocation); + result["segment"] = statArrayToDict(stats.segment); + result["active"] = statArrayToDict(stats.active); + result["inactive_split"] = statArrayToDict(stats.inactive_split); + result["allocated_bytes"] = statArrayToDict(stats.allocated_bytes); + result["reserved_bytes"] = statArrayToDict(stats.reserved_bytes); + result["active_bytes"] = statArrayToDict(stats.active_bytes); + result["inactive_split_bytes"] = statArrayToDict(stats.inactive_split_bytes); + result["oversize_allocations"] = statToDict(stats.oversize_allocations); + result["oversize_segments"] = statToDict(stats.oversize_segments); + + return result.release().ptr(); + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_resetAccumulatedMemoryStats(PyObject *_unused, PyObject *arg) +{ HANDLE_TH_ERRORS THPUtils_assert( THPUtils_checkLong(arg), @@ -1209,6 +1261,7 @@ static struct PyMethodDef _THCPModule_methods[] = { nullptr}, {"_cuda_emptyCache", THCPModule_emptyCache, METH_NOARGS, nullptr}, {"_cuda_memoryStats", THCPModule_memoryStats, METH_O, nullptr}, + {"_cuda_managedMemoryStats", THCPModule_managedMemoryStats, METH_O, nullptr}, {"_cuda_resetAccumulatedMemoryStats", THCPModule_resetAccumulatedMemoryStats, METH_O, diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index b4d084abb18b2..b43d6df340385 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -763,3 +763,84 @@ def set_enabled_move(enable): def get_enabled_move(): r"""Returns a bool indicating if Unified Virtual Memory is currently enabled.""" return torch._C._cuda_getEnabledMove() + +def managed_memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]: + r"""Returns a dictionary of Managed memory allocator statistics for a + given device. + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + Core statistics: + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + For these core statistics, values are broken down as follows. + Pool type: + - ``all``: combined statistics across all memory pools. 
+ - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + Metric type: + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + In addition to the core statistics, we also provide some simple event + counters: + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory framentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = managed_memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def managed_memory_stats_as_nested_dict(device: Union[Device, int] = None) -> Dict[str, Any]: + r"""Returns the result of :func:`~torch.cuda.managed_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_managedMemoryStats(device) From d1cfce1727beb7f23275d71ec308e7245ac1ef33 Mon Sep 17 00:00:00 2001 From: Jason Furmanek Date: Thu, 8 Sep 2022 02:56:30 +0000 Subject: [PATCH 5/9] [UVM] Add Move kernel and associated functions --- aten/src/ATen/EmptyTensor.cpp | 14 ++- aten/src/ATen/core/TensorBase.h | 4 + aten/src/ATen/cuda/EmptyTensor.cpp | 15 ++- aten/src/ATen/native/Copy.cpp | 36 ++++++ aten/src/ATen/native/Copy.h | 4 + aten/src/ATen/native/TensorConversions.cpp | 59 +++++++++- aten/src/ATen/native/cuda/Copy.cu | 106 ++++++++++++++++++ aten/src/ATen/native/cuda/TensorShapeCUDA.cpp | 9 +- aten/src/ATen/native/native_functions.yaml | 14 +++ c10/core/TensorImpl.h | 53 +++++++++ tools/autograd/derivatives.yaml | 7 ++ tools/autograd/gen_python_functions.py | 5 +- torch/csrc/autograd/FunctionsManual.cpp | 11 ++ torch/csrc/autograd/FunctionsManual.h | 4 + 14 files changed, 332 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 55cdc09268f05..d2b883caa0d64 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -10,10 +11,17 @@ namespace at { namespace 
detail { namespace { c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { - if (pin_memory) { - return at::detail::getCUDAHooks().getPinnedMemoryAllocator(); + if (at::globalContext().userEnabledUVM()) { + // When UVM is enabled, we only use a single allocator + // for all allocations. + return detail::getCUDAHooks().getUnifiedDeviceAllocatorCpu(); + } + else { + if (pin_memory) { + return at::detail::getCUDAHooks().getPinnedMemoryAllocator(); + } + return c10::GetCPUAllocator(); } - return c10::GetCPUAllocator(); } constexpr uint64_t storage_max() { diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 0ecd4456033b0..adbebc66404ac 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -400,6 +400,10 @@ class TORCH_API TensorBase { return impl_->is_cpu(); } + inline void _set_new_device(Device dst_device) const { + impl_->_set_new_device(dst_device); + } + /// Returns if a `Tensor` has CUDA backend. bool is_cuda() const { // NB: this is not a native function to avoid dispatching overhead. diff --git a/aten/src/ATen/cuda/EmptyTensor.cpp b/aten/src/ATen/cuda/EmptyTensor.cpp index d40a245857d3b..1107b480fb2ee 100644 --- a/aten/src/ATen/cuda/EmptyTensor.cpp +++ b/aten/src/ATen/cuda/EmptyTensor.cpp @@ -1,6 +1,7 @@ #define TORCH_ASSERT_NO_OPERATORS #include #include +#include #include namespace at { @@ -15,7 +16,12 @@ TensorBase empty_cuda( const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_cuda()); const DeviceGuard device_guard(device); - auto* allocator = at::cuda::getCUDADeviceAllocator(); + at::Allocator* allocator; + if (at::globalContext().userEnabledUVM()) { + allocator = at::cuda::getUnifiedDeviceAllocator(); + } else { + allocator = at::cuda::getCUDADeviceAllocator(); + } constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA); return at::detail::empty_generic( size, allocator, cuda_dks, dtype, memory_format_opt); @@ -55,7 +61,12 @@ TensorBase empty_strided_cuda( const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_cuda()); const DeviceGuard device_guard(device); - auto* allocator = at::cuda::getCUDADeviceAllocator(); + at::Allocator* allocator; + if (at::globalContext().userEnabledUVM()) { + allocator = at::cuda::getUnifiedDeviceAllocator(); + } else { + allocator = at::cuda::getCUDADeviceAllocator(); + } constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA); return at::detail::empty_strided_generic( size, stride, allocator, cuda_dks, dtype); diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index 0c99943eb0cb0..1697cfb570d03 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -345,7 +345,43 @@ void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src) { copy_stub(iter.device_type(), iter, /*non_blocking=*/false); } +static Tensor & move_impl(Tensor & self, c10::optional dst_device, bool non_blocking) { + // TODO: this should be handled during dispatch, but that's missing... 
+ TORCH_CHECK(self.defined(), "self is undefined"); + + // We'll create a tensor iterator with just an output + // Chagne the device up front so the iterator has + // the correct device + self._set_new_device(dst_device.value()); + + auto iter = TensorIteratorConfig() + .add_output(self) + .resize_outputs(false) + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .check_all_same_device(false) + .build(); + + move_stub(kCUDA, iter, dst_device, non_blocking); + return self; +} + +Tensor& move_(Tensor& self, c10::optional dst_device, bool non_blocking) { + // auto maybe_outnames = namedinference::compute_broadcast_outnames(self, src); + { + NoNamesGuard guard; //check if needed + if (self._is_zerotensor()) { + TORCH_CHECK(false, "ZeroTensors are immutable. Please materialize the tensor using `.clone()`, if you want a mutable zero tensor."); + } + move_impl(self, dst_device, non_blocking); + } + // namedinference::propagate_names_if_nonempty(self, maybe_outnames); + return self; +} + DEFINE_DISPATCH(copy_stub); +DEFINE_DISPATCH(move_stub); +REGISTER_NO_CPU_DISPATCH(move_stub); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/Copy.h b/aten/src/ATen/native/Copy.h index 14abb32fa5ad4..4ba4085b1ebb2 100644 --- a/aten/src/ATen/native/Copy.h +++ b/aten/src/ATen/native/Copy.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include namespace at { @@ -11,8 +13,10 @@ class TensorBase; namespace native { using copy_fn = void (*)(TensorIterator&, bool non_blocking); +using move_fn = void (*)(TensorIterator&, c10::optional dst_device, bool non_blocking); DECLARE_DISPATCH(copy_fn, copy_stub); +DECLARE_DISPATCH(move_fn, move_stub); TORCH_API void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src); diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index 01f651982633b..51b43ad8959d8 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -345,6 +345,22 @@ Tensor _to_copy( return r; } +Tensor _to_move( + const Tensor& self, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory, + bool non_blocking, + c10::optional optional_memory_format) { + TORCH_CHECK(!layout.has_value() || self.layout() == layout.value(), + "to(options) doesn't support converting to a different layout, " + "but got self.layout being ", self.layout(), + " and options.layout set as ", layout.value()); + + return self.move_(device, non_blocking); +} + template static inline bool is_null_or_equal_to(const c10::optional& test, const T& value) { if (!test.has_value()) { @@ -373,12 +389,43 @@ bool to_will_alias( self.suggest_memory_format() == memory_format); } +bool to_will_move( + const Tensor& self, + c10::optional dtype, + c10::optional layout, + c10::optional device, + bool copy, + c10::optional optional_memory_format) { + auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve); + + if (!globalContext().userEnabledMove()) { + return false; + } + + // Let copy kernel handle meta tensor targets + if (device.has_value() && (device.value().type() == DeviceType::Meta)) { + return false; + } + + // Now check that we can "move" instead of copy if UVM is + // enabled, we have a manged tensor and the dtype and layout + // are the same. Device also needs to be changing (good enough?) 
+ return is_null_or_equal_to(dtype, self.dtype().toScalarType()) && + is_null_or_equal_to(layout, self.layout()) && + (device != self.device()) && + !copy && globalContext().userEnabledUVM() && + self.is_managed() && + (memory_format == MemoryFormat::Preserve || + self.suggest_memory_format() == memory_format); +} + static inline Tensor to_impl( const Tensor& self, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, + c10::optional managed_memory, bool non_blocking, bool copy, c10::optional optional_memory_format) { @@ -387,6 +434,10 @@ static inline Tensor to_impl( if (to_will_alias(self, dtype, layout, device, copy, optional_memory_format)) { return self; } + if (to_will_move(self, dtype, layout, device, copy, optional_memory_format)) { + return at::_to_move( + self, dtype, layout, device, pin_memory, non_blocking, optional_memory_format); + } return at::_to_copy( self, dtype, layout, device, pin_memory, non_blocking, optional_memory_format); } @@ -408,7 +459,7 @@ Tensor _autocast_to_reduced_precision(const Tensor& self, bool cuda_enabled, boo TORCH_INTERNAL_ASSERT(target != at::ScalarType::Undefined, "_autocast_to_reduced_precision requires legit ScalarType argument for given device"); return to_impl( - self, target, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt); + self, target, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt); } else { return self; } @@ -422,7 +473,7 @@ Tensor _autocast_to_full_precision(const Tensor& self, bool cuda_enabled, bool c (self.device().is_cpu() && cpu_enabled)) ) { return to_impl( - self, at::ScalarType::Float, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt); + self, at::ScalarType::Float, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt); } else { return self; } @@ -444,6 +495,7 @@ Tensor to( layout, ensure_has_index(device), pin_memory, + nullopt, non_blocking, copy, optional_memory_format); @@ -456,6 +508,7 @@ Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking nullopt, ensure_has_index(device), nullopt, + nullopt, non_blocking, copy, optional_memory_format); @@ -468,6 +521,7 @@ Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking, bool copy, c1 nullopt, nullopt, nullopt, + nullopt, non_blocking, copy, optional_memory_format); @@ -481,6 +535,7 @@ Tensor to(const Tensor& self, const Tensor& other, bool non_blocking, bool copy, options.layout(), options.device(), options.pinned_memory(), + options.managed_memory(), non_blocking, copy, optional_memory_format); diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index 564ecf1c12913..b8e62b6a161c8 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ b/aten/src/ATen/native/cuda/Copy.cu @@ -281,7 +281,113 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) { } } +int64_t uvm_get_guard_index(Tensor& t) { + int cuda_device_index; + if (t.is_cpu()) { + // TODO: We can to do more here once we have + // indirect contexts in place + cuda_device_index = -1; + } else { + TORCH_CHECK(t.is_cuda()); + cuda_device_index = t.get_device(); + } + return cuda_device_index; +} + +void uvm_cuda_mem_advise_preferred_location(Tensor t) { + // Call cudaMemAdvise on a managed tensor + at::cuda::OptionalCUDAGuard device_guard; + int64_t cuda_device_index = uvm_get_guard_index(t); + int hint_device; + if (t.is_cpu()) { + hint_device = -1 /*cudaCpuDeviceId*/; + } else { + hint_device = 
static_cast(cuda_device_index); + } + + void* ptr = t.data_ptr(); + size_t size_bytes = at::detail::computeStorageNbytes( + t.sizes(), t.strides(), t.dtype().itemsize()); + + device_guard.set_index(cuda_device_index); + AT_CUDA_CHECK(cudaMemAdvise( + ptr, + size_bytes, + cudaMemAdviseSetPreferredLocation, + hint_device)); + return; +} + + +void uvm_cuda_mem_prefetch_async(Tensor t, CUDAStream stream) { + // Call cudaMemPrefetchAsync on Tensor + at::cuda::OptionalCUDAGuard device_guard; + //TORCH_CHECK(t.is_cuda() || (t.is_cpu() && device_t.has_value())); + + int prefetch_device = + (t.is_cpu()) ? -1/*cudaCpuDeviceId*/ : static_cast(t.get_device()); + + void* ptr = t.data_ptr(); + size_t size_bytes = at::detail::computeStorageNbytes( + t.sizes(), t.strides(), t.dtype().itemsize()); + + device_guard.set_index(t.get_device()); + + AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, prefetch_device, stream)); + + return; +} + +static void move_kernel_cuda(TensorIterator& iter, c10::optional destination, bool non_blocking) { + //AT_ASSERT(iter.ntensors() == 1); + + Device dst_device = destination.value(); + Device src_device = iter.device(0); + + // Copy between CPU and GPU + cuda::OptionalCUDAGuard device_guard; + cudaMemcpyKind kind; + if (dst_device.is_cuda() && src_device.is_cpu()) { + device_guard.set_device(dst_device); + kind = cudaMemcpyHostToDevice; + } else if (dst_device.is_cpu() && src_device.is_cuda()) { + device_guard.set_device(src_device); + kind = cudaMemcpyDeviceToHost; + } + + CUDAStream stream = getCurrentCUDAStream(); + + // At this point we are setting up a memcopy between a + // CPU and GPU of a contiguous, same-type tensor. Since this is + // purely for device transfer, when UVM is enabled, + // we can skip the explicit copy as we have managed memory. + + // We do, however, need to call _set_new_device() which will: + // - change the device of the tensor's storage object to destination device + // - change the device of the tensor to destination + // - Edit the Tensor's dispatchKeySet + // TODO: D2D may need 2way event barrier + iter.tensor(0)._set_new_device(dst_device); + + // set device hint + // TODO: Enable memadvise after tuning + //uvm_cuda_mem_advise_preferred_location(self); + + // request a prefetch to new device + uvm_cuda_mem_prefetch_async(iter.tensor(0), stream); + + // An explicit sync is always needed when copying back to CPU + if (kind == cudaMemcpyDeviceToHost) { + AT_CUDA_CHECK(cudaStreamSynchronize(stream)); + } + // TODO: Sync both directions for now + AT_CUDA_CHECK(cudaDeviceSynchronize()); + AT_CUDA_CHECK(cudaStreamSynchronize(stream)); + +} + REGISTER_DISPATCH(copy_stub, ©_kernel_cuda); +REGISTER_CUDA_DISPATCH(move_stub, &move_kernel_cuda); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp b/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp index 0bb7eb410acf3..25a36d1f25f9c 100644 --- a/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp +++ b/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp @@ -1,6 +1,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #include @@ -18,10 +19,16 @@ namespace native { // the same as at::cuda::getCUDADeviceAllocator(). 
Tensor& set_cuda_(Tensor& result) { caffe2::TypeMeta dtype = result.dtype(); + at::Allocator* allocator; + if (at::globalContext().userEnabledUVM()) { + allocator = at::cuda::getUnifiedDeviceAllocator(); + } else { + allocator = at::cuda::getCUDADeviceAllocator(); + } Storage storage( Storage::use_byte_size_t(), 0, - at::cuda::getCUDADeviceAllocator(), + allocator, true); result.set_(storage, 0, {0}, {}); TORCH_INTERNAL_ASSERT(dtype == result.dtype()); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index ce7531727ad1d..0bb28123168d9 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1626,6 +1626,14 @@ NestedTensorCPU, NestedTensorCUDA: copy_nested_ autogen: copy.out +- func: move_(Tensor(a!) self, Device? dst_device=None, bool non_blocking=False) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: move_ + autogen: move.functional, move.out + - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor dispatch: MPS: _copy_from_mps @@ -7062,6 +7070,12 @@ autogen: _to_copy.out tags: canonical +- func: _to_move(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: _to_move + # to(Device) must not exist because all constructors of Device also works for # TensorOptions. Otherwise, an ambiguity error is thrown. # See NOTE [ TensorOptions Constructors ]. diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 3a0cce80991f7..929488186e1ef 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -1270,6 +1270,59 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } public: + void _set_new_device(Device dst_device) { + // This should only be called from the + // move kernel which is used when UVM is enabled + // TODO: enforce this + + // 1) set the device in the storage object + storage_.data_ptr().unsafe_set_device(dst_device); + + // 2) set the local device_opt + device_opt_ = storage_.device(); + + // 3) update dispatch key set + auto ks_iter = key_set_.begin(); + DispatchKeySet new_ks; + if (dst_device.is_cuda()) + { + TORCH_CHECK(!(key_set_.has(DispatchKey::MkldnnCPU))) + for (ks_iter = key_set_.begin(); ks_iter != key_set_.end(); ++ks_iter) + { + if (*ks_iter == DispatchKey::CPU) + new_ks = new_ks.add(DispatchKey::CUDA); + else if (*ks_iter == DispatchKey::SparseCPU) + new_ks = new_ks.add(DispatchKey::SparseCUDA); + else if (*ks_iter == DispatchKey::SparseCsrCPU) + new_ks = new_ks.add(DispatchKey::SparseCsrCUDA); + else if (*ks_iter == DispatchKey::QuantizedCPU) + new_ks = new_ks.add(DispatchKey::QuantizedCUDA); + else + new_ks = new_ks.add(*ks_iter); + } + } else if (dst_device.is_cpu()) { + for (ks_iter = key_set_.begin(); ks_iter != key_set_.end(); ++ks_iter) + { + if (*ks_iter == DispatchKey::CUDA) + new_ks = new_ks.add(DispatchKey::CPU); + else if (*ks_iter == DispatchKey::SparseCUDA) + new_ks = new_ks.add(DispatchKey::SparseCPU); + else if (*ks_iter == DispatchKey::SparseCsrCUDA) + new_ks = new_ks.add(DispatchKey::SparseCsrCPU); + else if (*ks_iter == DispatchKey::QuantizedCUDA) + new_ks = new_ks.add(DispatchKey::QuantizedCPU); + else + new_ks = new_ks.add(*ks_iter); + } + } else { + for (ks_iter = key_set_.begin(); ks_iter != key_set_.end(); ++ks_iter) + { + 
new_ks = new_ks.add(*ks_iter); + } + } + key_set_ = new_ks; + } + Layout layout() const { if (C10_UNLIKELY(layout_policy_)) { return layout_custom(); diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 04bdaa79d202b..2d719b28afa73 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -420,6 +420,13 @@ # (If dtype IS nullopt, we rely on the regular check that any input requires grad). output_differentiability: ["!dtype || isDifferentiableType(*dtype)"] +- name: _to_move(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor + self: _to_move_backward(grad, self.options()) + result: _to_move(self_t, dtype, layout, device, pin_memory, non_blocking, memory_format) + # The condition is: if dtype is not nullopt, then isDifferentiableType(*dtype) + # (If dtype IS nullopt, we rely on the regular check that any input requires grad). + output_differentiability: ["!dtype || isDifferentiableType(*dtype)"] + - name: _coalesce(Tensor self) -> Tensor self: grad diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index ee06a8ed12384..388d0c37841c7 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -125,8 +125,10 @@ "to", "_to_copy", "_reshape_copy", + "_to_move", "copy_sparse_to_sparse_", - "copy_", + "copy_", + "move_", "numpy_T", "matrix_H", "mT", @@ -149,6 +151,7 @@ "_reshape_alias", "replace_", # only used by the functionalization pass, doesn't need to be exposed to python "copy", # only used by the functionalization pass + "move", # only used by the functionalization pass "fill.Tensor", # only used by the functionalization pass "fill.Scalar", # only used by the functionalization pass "lift.*", diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 30593b5c00723..26f6362352513 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -6674,6 +6674,17 @@ Tensor _to_copy_backward( return grad->to(self_options, /*non_blocking=*/false, /*copy=*/false); } +Tensor _to_move_backward(const Tensor &grad_, const c10::TensorOptions &self_options) { + // Handle R->C copies without raising a warning + const auto self_type = self_options.dtype().toScalarType(); + auto grad = c10::MaybeOwned::borrowed(grad_); + if (!c10::isComplexType(self_type) && grad->is_complex()) { + grad = c10::MaybeOwned::owned(at::real(grad_)); + } + + return grad->to(self_options, /*non_blocking=*/false, /*copy=*/false); +} + std::tuple index_reduce_backward( const Tensor& grad, const Tensor& self, diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index e1475ecb2b968..7112169af48c5 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -1016,6 +1016,10 @@ std::tuple scatter_reduce_backward( Tensor _to_copy_backward( const Tensor& grad, const c10::TensorOptions& self_options); + +Tensor _to_move_backward( + const Tensor &grad, + const c10::TensorOptions &self_options); std::tuple index_reduce_backward( const Tensor& grad, From 189be3669957ec2c97700f1a7b3849dcc167ead8 Mon Sep 17 00:00:00 2001 From: Douglas Lehr Date: Wed, 21 Dec 2022 21:13:43 +0000 Subject: [PATCH 6/9] Pytorch 2.0 updates for uvm --- aten/src/ATen/native/Memory.cpp | 1 + aten/src/ATen/native/native_functions.yaml | 2 +- 
torch/csrc/StorageMethods.cpp | 2 +- torchgen/native_function_generation.py | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp index e83b0bf060011..237df64a87bfe 100644 --- a/aten/src/ATen/native/Memory.cpp +++ b/aten/src/ATen/native/Memory.cpp @@ -8,6 +8,7 @@ #else #include #include +#include #include #include #endif diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 0bb28123168d9..1e1b071b3391c 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1632,7 +1632,7 @@ device_guard: False dispatch: CompositeExplicitAutograd: move_ - autogen: move.functional, move.out + autogen: move, move.out - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor dispatch: diff --git a/torch/csrc/StorageMethods.cpp b/torch/csrc/StorageMethods.cpp index 0537842c9c2c6..1b6078335c734 100644 --- a/torch/csrc/StorageMethods.cpp +++ b/torch/csrc/StorageMethods.cpp @@ -96,7 +96,7 @@ static PyObject* THPStorage_isManaged(PyObject* _self, PyObject* noargs) { auto self = (THPStorage*)_self; #if defined(USE_CUDA) return PyBool_FromLong( - at::globalContext().isManagedPtr(self->cdata->data())); + at::globalContext().isManagedPtr(self->cdata->data())); #else Py_RETURN_FALSE; #endif diff --git a/torchgen/native_function_generation.py b/torchgen/native_function_generation.py index f1ba555be62e6..dcca88405688b 100644 --- a/torchgen/native_function_generation.py +++ b/torchgen/native_function_generation.py @@ -64,6 +64,7 @@ "equal", # returns a boolean "is_coalesced", # returns an boolean "is_pinned", # returns a boolean + "is_managed", "is_same_size", # returns a boolean "is_set_to", # returns a boolean "q_per_channel_axis", # returns an int From 0a3a46362492b054a60921519af49333ee775453 Mon Sep 17 00:00:00 2001 From: Douglas Lehr Date: Tue, 24 Jan 2023 07:46:54 +0000 Subject: [PATCH 7/9] Add CUDAMallocManagedAllocator Backend With the new CUDAAllocator class, we have created a new CUDAMallocManagedAllocator, which will handle allocator requests from both cpu and cuda device types when the backend is enabled You can enable the backend using PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocManaged And view inside PyTorch using torch.cuda.get_allocator_backend() This allocator is initially rudimentary as the performance implications of a managed allocator are still being worked out. However, the goal is to be able to swap out the backend when running without any code change required. 
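
For illustration only (not part of this change): a minimal usage sketch, assuming
the backend string and the query call behave as described above, and that the
environment variable is set before the CUDA allocator is initialized:

    # Select the managed-memory backend before torch initializes CUDA.
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocManaged"

    import torch
    t = torch.ones(1024, device="cuda")          # serviced by cudaMallocManaged
    print(torch.cuda.get_allocator_backend())    # expected: "cudaMallocManaged"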
--- c10/cuda/CMakeLists.txt | 1 + c10/cuda/CUDACachingAllocator.cpp | 10 +- c10/cuda/CUDAMallocManagedAllocator.cpp | 553 ++++++++++++++++++++++++ 3 files changed, 563 insertions(+), 1 deletion(-) create mode 100644 c10/cuda/CUDAMallocManagedAllocator.cpp diff --git a/c10/cuda/CMakeLists.txt b/c10/cuda/CMakeLists.txt index 2c26bc06f6ca4..854ff699f2a52 100644 --- a/c10/cuda/CMakeLists.txt +++ b/c10/cuda/CMakeLists.txt @@ -25,6 +25,7 @@ set(C10_CUDA_SRCS CUDAException.cpp CUDAFunctions.cpp CUDAMallocAsyncAllocator.cpp + CUDAMallocManagedAllocator.cpp CUDAMiscFunctions.cpp CUDAStream.cpp impl/CUDAGuardImpl.cpp diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index aaa647502a897..c912ddf40b91a 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -628,7 +628,7 @@ size_t CachingAllocatorConfig::parseAllocatorConfig( consumeToken(config, ++i, ':'); if (++i < config.size()) { TORCH_CHECK( - ((config[i] == "native") || (config[i] == "cudaMallocAsync")), + ((config[i] == "native") || (config[i] == "cudaMallocAsync") || (config[i] == "cudaMallocManaged")), "Unknown allocator backend, " "options are native and cudaMallocAsync"); used_cudaMallocAsync = (config[i] == "cudaMallocAsync"); @@ -2336,6 +2336,12 @@ CUDAAllocator* allocator(); } // namespace CudaMallocAsync +namespace CudaMallocManaged { +// If this is put in its own header file, it gets incorrectly renamed in HIPify. +CUDAAllocator* allocator(); + +} // namespace CudaMallocManaged + struct BackendStaticInitializer { // Parses env for backend at load time, duplicating some logic from // CachingAllocatorConfig. CachingAllocatorConfig double-checks it later (at @@ -2361,6 +2367,8 @@ struct BackendStaticInitializer { if (kv[0] == "backend") { if (kv[1] == "cudaMallocAsync") return CudaMallocAsync::allocator(); + if (kv[1] == "cudaMallocManaged") + return CudaMallocManaged::allocator(); if (kv[1] == "native") return &Native::allocator; } diff --git a/c10/cuda/CUDAMallocManagedAllocator.cpp b/c10/cuda/CUDAMallocManagedAllocator.cpp new file mode 100644 index 0000000000000..fcba528ec2218 --- /dev/null +++ b/c10/cuda/CUDAMallocManagedAllocator.cpp @@ -0,0 +1,553 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace c10 { +namespace cuda { +namespace CUDACachingAllocator { +namespace CudaMallocManaged { + +// CUDA device allocator that uses CudaMallocManaged to implement +// the same interface as CUDACachingAllocator.cpp. 
+ +// Implementation details, not declared in CUDACachingAllocator.h +namespace { + +// General helpers + +struct UsageStream { + cudaStream_t stream; + int device; + UsageStream() {} + UsageStream(cudaStream_t s, int d) : stream(s), device(d) {} + UsageStream(const UsageStream& us) : stream(us.stream), device(us.device) {} + UsageStream(const UsageStream&& us) : stream(us.stream), device(us.device) {} + UsageStream& operator=(UsageStream other) { + stream = other.stream; + device = other.device; + return *this; + } +}; + +bool operator==(const UsageStream& lhs, const UsageStream& rhs) { + return (lhs.stream == rhs.stream) && (lhs.device == rhs.device); +} + +struct UsageStreamHash { + size_t operator()(const UsageStream& us) const noexcept { + return std::hash{}(us.stream) + size_t(us.device); + } +}; + +struct PtrUsage { + + uint64_t size; + PtrUsage(uint64_t s) : size(s) {} +}; + +int device_count = 0; +// these don't need to be c10::once_flags as in CUDAGeneratorImpl.cpp +// because they'll only be flipped by functions that have locked the mutex. +bool devs_initialized_flag; + +// Possible micro-optimization: +// Some accesses to ptr_info are read-only. +// We could let those be concurrent with a shared_mutex and +// have concurrent calls take a shared_lock. +// Keeping it simple with an ordinary mutex for now. +std::mutex general_mutex; + +using PtrInfo = ska::flat_hash_map; +PtrInfo ptr_info; + + +// These two help setMemoryFraction limit the amount of memory +// used by PyTorch in particular (as opposed to other libraries +// in the same process that might be sharing the same cudaMemPool_t). +size_t pytorch_used_bytes; +size_t pytorch_memory_limits; + +bool capture_underway = false; + +// Implementation functions + +// Assumes the caller holds general_mutex +inline void lazy_init_device(int device) { + if (!devs_initialized_flag) { + pytorch_used_bytes= 0; + pytorch_memory_limits = UINT64_MAX; + + devs_initialized_flag = true; + } +} + + +void free(void* ptr) { + std::lock_guard lk(general_mutex); + + auto err = cudaGetLastError(); + C10_CUDA_CHECK(err); + auto it = ptr_info.find(ptr); + + TORCH_INTERNAL_ASSERT(it != ptr_info.end(), "ptr not found in ptr_info"); + + C10_CUDA_CHECK(cudaFree(it->first)); + + pytorch_used_bytes -= it->second.size; + + ptr_info.erase(it); +} + +// Symmetric with NativeCachingAllocator::malloc for now, +// although I don't think we absolutely need the symmetry. +void mallocManaged(void** devPtr, int device, size_t size, cudaStream_t stream) { + TORCH_INTERNAL_ASSERT( + 0 <= device && device < device_count, + "Invalid device index ", + device, + ": did you call init?"); + + CUDAGuard g(device); + + std::lock_guard lk(general_mutex); + + lazy_init_device(device); + + // Defensively checks for preexisting CUDA error state. + auto err = cudaGetLastError(); + C10_CUDA_CHECK(err); + + if (pytorch_used_bytes + size > pytorch_memory_limits) { + err = cudaErrorMemoryAllocation; + } else { + err = cudaMallocManaged(devPtr, size); + } + + if (err == cudaErrorMemoryAllocation) { + // Clears CUDA's internal error state so the user, if desired, can catch the + // OOM exception, free some stuff on the script side, and retry the + // allocation. This aligns with the behavior of alloc_block in + // CUDACachingAllocator.cpp. + cudaGetLastError(); + size_t device_free; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + TORCH_CHECK_WITH( + OutOfMemoryError, + false, + "Allocation on device ", + device, + " would exceed allowed memory. 
(out of memory)", + "\nCurrently allocated : ", + format_size(pytorch_used_bytes), + "\nRequested : ", + format_size(size), + "\nDevice limit : ", + format_size(device_total), + "\nFree (according to CUDA): ", + format_size(device_free), + "\nPyTorch limit (set by user-supplied memory fraction)" + "\n : ", + format_size(pytorch_memory_limits)); + } else { + C10_CUDA_CHECK(err); + } +auto inserted = ptr_info.emplace(*devPtr, PtrUsage(size)); + TORCH_INTERNAL_ASSERT( + inserted.second, + "address returned by cudaMallocManaged already exists " + "in ptr_info"); + pytorch_used_bytes += size; +} + +} // anonymous namespace + +void local_raw_delete(void* ptr); + +// Same pattern as CUDACachingAllocator.cpp. +struct CudaMallocManagedAllocator : public CUDAAllocator { + DataPtr allocate(size_t size) const override { + constexpr size_t one_exa_bytes = 1152921504606846976ULL; + TORCH_CHECK_WITH( + OutOfMemoryError, + size < one_exa_bytes, + "CUDA out of memory. Tried to allocate more than 1EB memory."); + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + if (size != 0) { + mallocManaged(&r, device, size, cuda::getCurrentCUDAStream(device)); + } + return {r, r, &local_raw_delete, Device(DeviceType::CUDA, device)}; + } + DeleterFnPtr raw_deleter() const override { + return &local_raw_delete; + } + + // This function should not issue any context-creating calls, + // just set up for later calls to init per-device pools based + // on the current device each later call sees. + void init(int dev_count) override { + static bool called = [](int dev_count) { + ; + // Are there external guarantees init will be called before + // any of the allocator's other functions? + // std::lock_guard lk(general_mutex); + device_count = dev_count; + return true; + }(dev_count); + (void)called; + } + + bool initialized() override { + return devs_initialized_flag; + } + + static inline void assertValidDevice(int device) { + TORCH_CHECK( + 0 <= device && device < device_count, "Invalid device argument."); + } + + void setMemoryFraction(double fraction, int device) override { + TORCH_INTERNAL_ASSERT( + 0 <= fraction && fraction <= 1, + "invalid fraction:", + fraction, + ". Please set within (0, 1)."); + + std::lock_guard lk(general_mutex); + assertValidDevice(device); + CUDAGuard g(device); + // Should setMemoryFraction be allowed to trigger a full device context and + // pool-creating lazy_init_device, or should we simply assert this device is + // already initialized, ie + // TORCH_CHECK(devs_initialized_flags[device], ...)? + lazy_init_device(device); + + size_t device_free; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + pytorch_memory_limits = + static_cast(fraction * device_total); + + // Alternative: Instead of a manual hard limit, we could use + // cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, + // &threshold); This is a soft hint: The driver allows the pool's reserved + // memory to spike above threshold in regions of high cudaMallocAsync + // demand, but opportunistically trims reserved memory back to threshold + // when the memory in use is < threshold. I don't like this because it + // introduces performance nondeterminism. 
+ } + + void emptyCache(void) override { + std::lock_guard lk(general_mutex); + + for (int dev = 0; dev < device_count; dev++) { + if (devs_initialized_flag) { + CUDAGuard g(dev); + + cudaMemPool_t mempool; + cudaDeviceGetDefaultMemPool(&mempool, dev); + cudaDeviceSynchronize(); + cudaMemPoolTrimTo(mempool, 0); + } + } + } + + void cacheInfo(int device, size_t* maxWorkspaceGuess) override { + // The only consumer of cacheInfo is getMaxWorkspaceSize in Conv_v7.cpp. + // Afaict, the role of cacheInfo is to give getMaxWorkspaceSize a reasonable + // maximum workspace size to use for an upcoming cudnnFind call. + // + // The native allocator's cacheInfo chooses to return the size of its + // largest unused block (which is the largest allocation the native + // allocator can service immediately and asynchronously without a + // cudaMalloc. + // + // Here, we use a different heuristic: figure out the max usable workspace + // size with a bit of educated trial and error. It's ok to be + // perf-inefficient because cacheInfo is a prelude to cudnnFind. + // + // The algo cache then stores the best-performing algo with workspace <= + // maxWorkspaceGuess. Later calls with the same param set hit in cache and + // try to allocate the same workspace. If, in one of those future calls, + // workspace allocation fails (ie because less ambient memory is available), + // the bindings rerun cudnnFind, including calling cacheInfo again + // beforehand to estimate a new (smaller) largest-available workspace. Over + // a few such calls, the cache should settle to the algo with a workspace + // size that's small enough to succeed every time (for that param set). + // + // So the strategy here is to return a rough, largeish guess and let the + // bindings retry to trim as needed over time. + + std::lock_guard lk(general_mutex); + assertValidDevice(device); + CUDAGuard g(device); + lazy_init_device(device); + + size_t free_upper_bound; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&free_upper_bound, &device_total)); + TORCH_INTERNAL_ASSERT( + free_upper_bound + pytorch_used_bytes <= device_total); + size_t guess = std::min( + free_upper_bound, + pytorch_memory_limits - pytorch_used_bytes); + auto stream = c10::cuda::getCurrentCUDAStream(); + void* dummy; + + // Defensively checks for preexisting CUDA error state. + auto err = cudaGetLastError(); + C10_CUDA_CHECK(err); + + while (true) { + // Duplicates some logic from mallocAsync to work with the error state + // directly instead of repeatedly catching an exception thrown by + // mallocAsync. + if (pytorch_used_bytes + guess > pytorch_memory_limits) { + err = cudaErrorMemoryAllocation; + } else { + err = cudaMallocManaged(&dummy, guess); + } + + if (err == cudaSuccess) { + cudaFree(dummy); + *maxWorkspaceGuess = guess; + return; + } else if (err == cudaErrorMemoryAllocation) { + cudaGetLastError(); // clear CUDA error + guess >>= 1; // quick and dirty: try half the size next iteration + } else { + C10_CUDA_CHECK(err); + } + } + } + + void* getBaseAllocation(void* ptr, size_t* size) override { + std::lock_guard lk(general_mutex); + + auto it = ptr_info.find(ptr); + TORCH_INTERNAL_ASSERT(it != ptr_info.end(), "ptr not found in ptr_info"); + + if (size) { + *size = it->second.size; + } + + return ptr; + } + + void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support recordStream. 
" + "If you need it, please file an issue describing your use case."); + } + + std::shared_ptr getIpcDevPtr(std::string handle) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support getIpcDevPtr. " + "If you need it, please file an issue describing your use case."); + } + + void recordHistory( + bool enabled, + CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + bool alloc_trace_record_context) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support recordHistory. " + "If you need it, please file an issue describing your use case."); + } + + void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support attachOutOfMemoryObserver. " + "If you need it, please file an issue describing your use case."); + } + + // Collects stats for device. + // If device hasn't been used yet, returns 0s without creating a context. + DeviceStats getDeviceStats(int device) override { + assertValidDevice(device); + + // Memory currently reserved by the mempool + uint64_t reserved_mem_current = 0; + // High-water mark of memory reserved by the mempool since last reset + uint64_t reserved_mem_peak = 0; + // Memory currently in use by the mempool + uint64_t used_mem_current = 0; + // High-water mark of memory + uint64_t used_mem_peak = 0; + + std::lock_guard lk(general_mutex); + + if (devs_initialized_flag) { + CUDAGuard g(device); + + cudaMemPool_t mempool; + C10_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, device)); + C10_CUDA_CHECK(cudaMemPoolGetAttribute( + mempool, cudaMemPoolAttrReservedMemCurrent, &reserved_mem_current)); + + C10_CUDA_CHECK(cudaMemPoolGetAttribute( + mempool, cudaMemPoolAttrReservedMemHigh, &reserved_mem_peak)); + + C10_CUDA_CHECK(cudaMemPoolGetAttribute( + mempool, cudaMemPoolAttrUsedMemCurrent, &used_mem_current)); + + C10_CUDA_CHECK(cudaMemPoolGetAttribute( + mempool, cudaMemPoolAttrUsedMemHigh, &used_mem_peak)); + } + + // Many stat types are specific to the native allocator. We leave these + // untouched. Their "struct Stat"s will contain zeroed values. + DeviceStats stats; + + // In the native allocator: + // allocated_bytes is the total bytes of blocks that have been malloc()ed + // and not yet free()d. + // active_bytes is the total bytes of blocks that have been malloc()ed but + // not yet released back into a free pool. In other words, it includes all + // allocated_bytes, as well as the bytes of "limbo state" blocks had have + // already been free()ed but not yet free_block()ed back into a pool due to + // outstanding stream_uses. + // + // Here, in the CudaMallocManaged allocator: + // We simply ask the driver's opinion about active memory. + // We don't bother distinguishing between allocated_bytes and active_bytes. 
+ stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current = + used_mem_current; + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].peak = + used_mem_peak; + stats.active_bytes[static_cast(StatType::AGGREGATE)].current = + used_mem_current; + stats.active_bytes[static_cast(StatType::AGGREGATE)].peak = + used_mem_peak; + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current = + reserved_mem_current; + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].peak = + reserved_mem_peak; + + return stats; + } + + void resetAccumulatedStats(int device) override { + assertValidDevice(device); + TORCH_WARN_ONCE( + "For backend:cudaMallocManaged, resetAccumulatedStats has no effect."); + } + + void resetPeakStats(int device) override { + assertValidDevice(device); + + CUDAGuard g(device); + cudaMemPool_t mempool; + C10_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, device)); + // Using zero as the reset value is the method recommended by Cuda driver + // team. Vivek Kini says: + // "Resetting to zero (which is the only valid value when setting + // ReservedMemHigh) resets it to ReservedMemCurrent inside the driver + // (same goes for UsedMemHigh/UsedMemCurrent)" + uint64_t zero = 0; + C10_CUDA_CHECK(cudaMemPoolSetAttribute( + mempool, cudaMemPoolAttrReservedMemHigh, &zero)); + C10_CUDA_CHECK( + cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrUsedMemHigh, &zero)); + } + + SnapshotInfo snapshot() override { + TORCH_CHECK( + false, + "Calling snapshot with backend:cudaMallocManaged is not supported. "); + // Alternative: TORCH_WARN + return {}; + } + + // CUDAGraph interactions + void notifyCaptureBegin( + int device, + CaptureId_t graph_id, + MempoolId_t mempool_id) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support notifyCaptureBegin. " + "If you need it, please file an issue describing your use case."); + } + + void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support notifyCaptureAboutToEnd. " + "If you need it, please file an issue describing your use case."); + } + + void notifyCaptureEnded(int device, CaptureId_t graph_id) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support notifyCaptureEnded. " + "If you need it, please file an issue describing your use case."); + } + + void notifyCaptureDestroy(int device, MempoolId_t mempool_id) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support notifyCaptureDestroy. 
" + "If you need it, please file an issue describing your use case."); + } + + void* raw_alloc(size_t nbytes) override { + if (nbytes == 0) { + return nullptr; + } + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + mallocManaged(&r, device, nbytes, cuda::getCurrentCUDAStream(device)); + return r; + } + + void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) override { + if (nbytes == 0) { + return nullptr; + } + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + mallocManaged(&r, device, nbytes, stream); + return r; + } + void raw_delete(void* ptr) override { + free(ptr); + } + bool needsPoolSpecificPeerAccess() override { + return true; + } + std::string name() override { + return "CudaMallocManaged"; + } +}; + +CudaMallocManagedAllocator device_allocator; + +void local_raw_delete(void* ptr) { + free(ptr); +} +CUDAAllocator* allocator() { + return &device_allocator; +} + + +} // namespace CudaMallocManaged +} // namespace CUDACachingAllocator +} // namespace cuda +} // namespace c10 From ad82c20351f14cf283c0f20edad65108ec7511eb Mon Sep 17 00:00:00 2001 From: Douglas Lehr Date: Tue, 7 Feb 2023 16:24:00 +0000 Subject: [PATCH 8/9] Clean up front end to use default DeviceAllocator --- aten/src/ATen/EmptyTensor.cpp | 2 +- aten/src/ATen/cuda/EmptyTensor.cpp | 14 +++------- aten/src/ATen/cuda/UvmMemoryAllocator.cpp | 8 ++---- aten/src/ATen/cuda/detail/CUDAHooks.cpp | 8 ------ aten/src/ATen/cuda/detail/CUDAHooks.h | 2 -- aten/src/ATen/detail/CUDAHooksInterface.h | 8 ------ torch/utils/hipify/cuda_to_hip_mappings.py | 30 ++++++++++++++++++++++ 7 files changed, 37 insertions(+), 35 deletions(-) diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index d2b883caa0d64..236aad5f725fa 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -14,7 +14,7 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { if (at::globalContext().userEnabledUVM()) { // When UVM is enabled, we only use a single allocator // for all allocations. 
- return detail::getCUDAHooks().getUnifiedDeviceAllocatorCpu(); + return detail::getCUDAHooks().getCUDADeviceAllocator(); } else { if (pin_memory) { diff --git a/aten/src/ATen/cuda/EmptyTensor.cpp b/aten/src/ATen/cuda/EmptyTensor.cpp index 1107b480fb2ee..2c7d2222108fd 100644 --- a/aten/src/ATen/cuda/EmptyTensor.cpp +++ b/aten/src/ATen/cuda/EmptyTensor.cpp @@ -17,11 +17,8 @@ TensorBase empty_cuda( TORCH_INTERNAL_ASSERT(device.is_cuda()); const DeviceGuard device_guard(device); at::Allocator* allocator; - if (at::globalContext().userEnabledUVM()) { - allocator = at::cuda::getUnifiedDeviceAllocator(); - } else { - allocator = at::cuda::getCUDADeviceAllocator(); - } + allocator = at::cuda::getCUDADeviceAllocator(); + constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA); return at::detail::empty_generic( size, allocator, cuda_dks, dtype, memory_format_opt); @@ -62,11 +59,8 @@ TensorBase empty_strided_cuda( TORCH_INTERNAL_ASSERT(device.is_cuda()); const DeviceGuard device_guard(device); at::Allocator* allocator; - if (at::globalContext().userEnabledUVM()) { - allocator = at::cuda::getUnifiedDeviceAllocator(); - } else { - allocator = at::cuda::getCUDADeviceAllocator(); - } + allocator = at::cuda::getCUDADeviceAllocator(); + constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA); return at::detail::empty_strided_generic( size, stride, allocator, cuda_dks, dtype); diff --git a/aten/src/ATen/cuda/UvmMemoryAllocator.cpp b/aten/src/ATen/cuda/UvmMemoryAllocator.cpp index 199046b75faed..6243e4c285b72 100644 --- a/aten/src/ATen/cuda/UvmMemoryAllocator.cpp +++ b/aten/src/ATen/cuda/UvmMemoryAllocator.cpp @@ -21,12 +21,8 @@ bool is_managed_cuda(const Tensor& self, c10::optional device) { Tensor _manage_memory_cuda(const Tensor& self, c10::optional device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( (!device.has_value() && device->is_cpu()) || device->is_cuda()); - at::Allocator* allocator = nullptr; - if (self.is_cpu()) { - allocator = at::cuda::getUnifiedDeviceAllocatorCpu(); - } else if (self.is_cuda()) { - allocator = at::cuda::getUnifiedDeviceAllocator(); - } + at::Allocator* allocator = at::detail::getCUDAHooks().getCUDADeviceAllocator(); + size_t size_bytes = detail::computeStorageNbytes( self.sizes(), self.strides(), self.dtype().itemsize()); auto storage = Storage( diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 40c6d31288618..71330af174886 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -292,14 +292,6 @@ Allocator* CUDAHooks::getCUDADeviceAllocator() const { return at::cuda::getCUDADeviceAllocator(); } -Allocator* CUDAHooks::getUnifiedDeviceAllocator() const { - return at::cuda::getUnifiedDeviceAllocator(); -} - -Allocator* CUDAHooks::getUnifiedDeviceAllocatorCpu() const { - return at::cuda::getUnifiedDeviceAllocatorCpu(); -} - bool CUDAHooks::compiledWithCuDNN() const { return AT_CUDNN_ENABLED(); } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 3dafc8395b7d9..71704d26b9a12 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -36,8 +36,6 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasPrimaryContext(int64_t device_index) const override; Allocator* getCUDADeviceAllocator() const override; Allocator* getPinnedMemoryAllocator() const override; - Allocator* getUnifiedDeviceAllocator() const override; - Allocator* getUnifiedDeviceAllocatorCpu() const override; bool 
compiledWithCuDNN() const override; bool compiledWithMIOpen() const override; bool supportsDilatedConvolutionWithCuDNN() const override; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index d96b59defa4c2..ff8fed50f57fe 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -135,14 +135,6 @@ struct TORCH_API CUDAHooksInterface { TORCH_CHECK(false, "CUDADeviceAllocator requires CUDA. ", CUDA_HELP); } - virtual Allocator* getUnifiedDeviceAllocator() const { - TORCH_CHECK(false, "Unified Device Allocator requires CUDA. ", CUDA_HELP); - } - - virtual Allocator* getUnifiedDeviceAllocatorCpu() const { - TORCH_CHECK(false, "Unified Device Allocator requires CUDA. ", CUDA_HELP); - } - virtual bool compiledWithCuDNN() const { return false; } diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index 0b59d7aba3ecd..82b0d7d83bd1a 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -4061,6 +4061,36 @@ ("hipMallocMipmappedArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), ), ("cudaMallocPitch", ("hipMallocPitch", CONV_MEM, API_RUNTIME)), + ("cudaMemPool_t", ("hipMemPool_t", CONV_MEM, API_RUNTIME)), + ("cudaMemPoolTrimTo", ("hipMemPoolTrimTo", CONV_MEM, API_RUNTIME)), + ( + "cudaDeviceGetDefaultMemPool", + ("hipDeviceGetDefaultMemPool", CONV_MEM, API_RUNTIME) + ), + ( + "cudaMemPoolGetAttribute", + ("hipMemPoolGetAttribute", CONV_MEM, API_RUNTIME) + ), + ( + "cudaMemPoolAttrUsedMemHigh", + ("hipMemPoolAttrUsedMemHigh", CONV_MEM, API_RUNTIME) + ), + ( + "cudaMemPoolSetAttribute", + ("hipMemPoolSetAttribute", CONV_MEM, API_RUNTIME) + ), + ( + "cudaMemPoolAttrReservedMemHigh", + ("hipMemPoolAttrReservedMemHigh", CONV_MEM, API_RUNTIME) + ), + ( + "cudaMemPoolAttrReservedMemCurrent", + ("hipMemPoolAttrReservedMemCurrent", CONV_MEM, API_RUNTIME) + ), + ( + "cudaMemPoolAttrUsedMemCurrent", + ("hipMemPoolAttrUsedMemCurrent", CONV_MEM, API_RUNTIME) + ), ("cudaFreeHost", ("hipHostFree", CONV_MEM, API_RUNTIME)), ("cudaFreeArray", ("hipFreeArray", CONV_MEM, API_RUNTIME)), ("cudaFree", ("hipFree", CONV_MEM, API_RUNTIME)), From 12328b5cb5a7d607210f366cc828d5836dad05d4 Mon Sep 17 00:00:00 2001 From: Jason Furmanek Date: Tue, 21 Feb 2023 16:17:46 +0000 Subject: [PATCH 9/9] Cleanup commit --- aten/src/ATen/cuda/EmptyTensor.cpp | 7 ++----- aten/src/ATen/cuda/detail/CUDAHooks.cpp | 1 - 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/cuda/EmptyTensor.cpp b/aten/src/ATen/cuda/EmptyTensor.cpp index 2c7d2222108fd..96d0734bc5722 100644 --- a/aten/src/ATen/cuda/EmptyTensor.cpp +++ b/aten/src/ATen/cuda/EmptyTensor.cpp @@ -1,7 +1,6 @@ #define TORCH_ASSERT_NO_OPERATORS #include #include -#include #include namespace at { @@ -16,8 +15,7 @@ TensorBase empty_cuda( const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_cuda()); const DeviceGuard device_guard(device); - at::Allocator* allocator; - allocator = at::cuda::getCUDADeviceAllocator(); + auto* allocator = at::cuda::getCUDADeviceAllocator(); constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA); return at::detail::empty_generic( @@ -58,8 +56,7 @@ TensorBase empty_strided_cuda( const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_cuda()); const DeviceGuard device_guard(device); - at::Allocator* allocator; - allocator = at::cuda::getCUDADeviceAllocator(); + auto* allocator = 
at::cuda::getCUDADeviceAllocator(); constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA); return at::detail::empty_strided_generic( diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 71330af174886..b264795bb9835 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include
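
For reference, a minimal standalone sketch of the default-mempool bookkeeping that
CachingManagedAllocator::getDeviceStats() reads. The attribute names and the
uint64_t value type mirror the allocator code above; the file name, error handling,
and printing are illustrative only. With the cuda_to_hip_mappings.py entries added
earlier in this series, hipify rewrites the cudaMemPool* identifiers to their
hipMemPool* equivalents for ROCm builds.

    // query_mempool_stats.cpp - build with: nvcc query_mempool_stats.cpp
    #include <cuda_runtime.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      int device = 0;
      if (cudaSetDevice(device) != cudaSuccess) {
        std::fprintf(stderr, "no CUDA device available\n");
        return 1;
      }

      cudaMemPool_t mempool;
      if (cudaDeviceGetDefaultMemPool(&mempool, device) != cudaSuccess) {
        std::fprintf(stderr, "default memory pool not supported on this device\n");
        return 1;
      }

      // The same four attributes getDeviceStats() maps into DeviceStats.
      uint64_t reserved_current = 0, reserved_peak = 0;
      uint64_t used_current = 0, used_peak = 0;
      cudaMemPoolGetAttribute(mempool, cudaMemPoolAttrReservedMemCurrent, &reserved_current);
      cudaMemPoolGetAttribute(mempool, cudaMemPoolAttrReservedMemHigh, &reserved_peak);
      cudaMemPoolGetAttribute(mempool, cudaMemPoolAttrUsedMemCurrent, &used_current);
      cudaMemPoolGetAttribute(mempool, cudaMemPoolAttrUsedMemHigh, &used_peak);

      std::printf("reserved %llu bytes (peak %llu), used %llu bytes (peak %llu)\n",
                  (unsigned long long)reserved_current, (unsigned long long)reserved_peak,
                  (unsigned long long)used_current, (unsigned long long)used_peak);
      return 0;
    }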