From e5a95d5acc3ffbf90cdcb53d2bbee672336ca37f Mon Sep 17 00:00:00 2001 From: Jason Furmanek Date: Fri, 26 Aug 2022 21:29:20 +0000 Subject: [PATCH 1/9] [UVM] Allocation and move enablement front end --- aten/src/ATen/Context.cpp | 16 +++++++++++++++ aten/src/ATen/Context.h | 8 +++++++- torch/csrc/cuda/Module.cpp | 42 +++++++++++++++++++++++++++++++++++++- torch/cuda/memory.py | 32 +++++++++++++++++++++++++++++ 4 files changed, 96 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 936e9b6252863..6fc28cf4bbed0 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -55,6 +55,22 @@ void Context::setUserEnabledMkldnn(bool e) { enabled_mkldnn = e; } +bool Context::userEnabledUVM() const { + return enabled_uvm; +} + +void Context::setUserEnabledUVM(bool e) { + enabled_uvm = e; +} + +bool Context::userEnabledMove() const { + return enabled_move; +} + +void Context::setUserEnabledMove(bool e) { + enabled_move = e; +} + bool Context::deterministicCuDNN() const { return deterministic_cudnn; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 48e3c935a2c0c..ede1aeea02c7c 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -119,6 +119,10 @@ class TORCH_API Context { void setUserEnabledCuDNN(bool e); bool userEnabledMkldnn() const; void setUserEnabledMkldnn(bool e); + bool userEnabledUVM() const; + void setUserEnabledUVM(bool e); + bool userEnabledMove() const; + void setUserEnabledMove(bool e); bool benchmarkCuDNN() const; void setBenchmarkCuDNN(bool); int benchmarkLimitCuDNN() const; @@ -288,7 +292,9 @@ class TORCH_API Context { bool allow_fp16_reduction_cublas = true; bool enabled_mkldnn = true; at::LinalgBackend linalg_preferred_backend = at::LinalgBackend::Default; -#ifdef C10_MOBILE + bool enabled_uvm = false; + bool enabled_move = false; + #ifdef C10_MOBILE bool release_original_weights = true; #else bool release_original_weights = false; diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index b526f87edd75d..468571fc454ee 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -763,7 +763,43 @@ PyObject* THCPModule_attachOutOfMemoryObserver( END_HANDLE_TH_ERRORS } -PyObject* THCPModule_cudaSetSyncDebugMode(PyObject* _unused, PyObject* arg) { +PyObject* THCPModule_setUserEnabledUVM(PyObject *_unused, PyObject *arg) +{ + HANDLE_TH_ERRORS + THPUtils_assert(PyBool_Check(arg), "set_enabled_uvm expects a bool, " + "but got %s", THPUtils_typename(arg)); + at::globalContext().setUserEnabledUVM(arg == Py_True); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_userEnabledUVM(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + if (at::globalContext().userEnabledUVM()) Py_RETURN_TRUE; + else Py_RETURN_FALSE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_setUserEnabledMove(PyObject *_unused, PyObject *arg) +{ + HANDLE_TH_ERRORS + THPUtils_assert(PyBool_Check(arg), "set_enabled_move expects a bool, " + "but got %s", THPUtils_typename(arg)); + at::globalContext().setUserEnabledMove(arg == Py_True); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_userEnabledMove(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + if (at::globalContext().userEnabledMove()) Py_RETURN_TRUE; + else Py_RETURN_FALSE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_cudaSetSyncDebugMode(PyObject * _unused, PyObject * arg){ HANDLE_TH_ERRORS TORCH_WARN_ONCE( "Synchronization debug mode is a prototype feature and does not yet detect 
all " @@ -1169,6 +1205,10 @@ static struct PyMethodDef _THCPModule_methods[] = { THCPModule_attachOutOfMemoryObserver, METH_O, nullptr}, + {"_cuda_getEnabledUVM", THCPModule_userEnabledUVM, METH_NOARGS, nullptr}, + {"_cuda_setEnabledUVM", THCPModule_setUserEnabledUVM, METH_O, nullptr}, + {"_cuda_getEnabledMove", THCPModule_userEnabledMove, METH_NOARGS, nullptr}, + {"_cuda_setEnabledMove", THCPModule_setUserEnabledMove, METH_O, nullptr}, {"_cuda_cudaHostAllocator", THCPModule_cudaHostAllocator, METH_NOARGS, diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index c40d9de580406..047e393b125aa 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -26,6 +26,13 @@ torch._C.__dict__['_cuda_CUDAAllocator'] = _dummy_type('_cuda_CUDAAllocator') +from os import environ +if 'PYTORCH_UVM_ENABLE' in environ: + torch._C._cuda_setEnabledUVM(True) + +if 'PYTORCH_MOVE_ENABLE' in environ: + torch._C._cuda_setEnabledMove(True) + def _host_allocator(): _lazy_init() return torch._C._cuda_cudaHostAllocator() @@ -722,3 +729,28 @@ def _get_current_allocator() -> _CUDAAllocator: See :ref:`cuda-memory-management` for details on creating and using a custom allocator """ return _CUDAAllocator(torch._C._cuda_getAllocator()) + +def set_enabled_uvm(enable): + r"""Enable/disable Unified Virtual Memory. + Arguments: + enable (bool): desired UVM setting. + """ + torch._C._cuda_setEnabledUVM(enable) + + +def get_enabled_uvm(): + r"""Returns a bool indicating if Unified Virtual Memory is currently enabled.""" + return torch._C._cuda_getEnabledUVM() + + +def set_enabled_move(enable): + r"""Enable/disable Unified Virtual Memory. + Arguments: + enable (bool): desired UVM setting. + """ + torch._C._cuda_setEnabledMove(enable) + + +def get_enabled_move(): + r"""Returns a bool indicating if Unified Virtual Memory is currently enabled.""" + return torch._C._cuda_getEnabledMove() From c19db86e9929877c77a0891c9ca381bb6cbc3de6 Mon Sep 17 00:00:00 2001 From: Jason Furmanek Date: Fri, 26 Aug 2022 21:33:22 +0000 Subject: [PATCH 2/9] [UVM] Add Caching Managed Allocator --- .../src/ATen/cuda/CachingManagedAllocator.cpp | 1270 +++++++++++++++++ aten/src/ATen/cuda/CachingManagedAllocator.h | 121 ++ aten/src/ATen/cuda/ThrustAllocator.h | 11 +- aten/src/ATen/cuda/UvmMemoryAllocator.h | 16 + .../cuda/linalg/BatchLinearAlgebraLib.cpp | 56 +- aten/src/ATen/native/cudnn/Conv_v7.cpp | 8 +- aten/src/ATen/native/miopen/Conv_miopen.cpp | 8 +- 7 files changed, 1471 insertions(+), 19 deletions(-) create mode 100644 aten/src/ATen/cuda/CachingManagedAllocator.cpp create mode 100644 aten/src/ATen/cuda/CachingManagedAllocator.h create mode 100644 aten/src/ATen/cuda/UvmMemoryAllocator.h diff --git a/aten/src/ATen/cuda/CachingManagedAllocator.cpp b/aten/src/ATen/cuda/CachingManagedAllocator.cpp new file mode 100644 index 0000000000000..dec15566b0100 --- /dev/null +++ b/aten/src/ATen/cuda/CachingManagedAllocator.cpp @@ -0,0 +1,1270 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { + +C10_DEFINE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback); + +namespace cuda { +namespace CachingManagedAllocator { + +// +// Yet another caching allocator for CUDA device allocations. +// +// - Allocations are associated with a stream. Once freed, blocks can be +// re-allocated on the same stream, but not on any other stream. 
+// - The allocator attempts to find the smallest cached block that will fit the +// requested size. If the block is larger than the requested size, it may be +// split. If no block is found, the allocator will delegate to cudaMalloc. +// - If the cudaMalloc fails, the allocator will attempt to free one cached +// block of sufficient size that is not split and retry the allocation. +// If this also fails, the allocator will attempt to free all cached blocks +// that are not split and retry the allocation. +// - Large (>1MB) and small allocations are stored in separate pools. +// Small requests are packed into 2MB buffers. Large requests will use the +// smallest available free block or allocate a new block using cudaMalloc. +// - To reduce fragmentation, requests between 1MB and 10MB will allocate and +// split a 20MB block, if no free block of sufficient size is available. +// - To further reduce fragmentation, blocks >= 200MB are not allowed to be +// split. These oversize cached blocks will still satisfy requests within +// 20MB of the oversize cached block size. +// +// With this allocator, allocations and frees should logically be considered +// "usages" of the memory segment associated with streams, just like kernel +// launches. The programmer must insert the proper synchronization if memory +// segments are used from multiple streams. +// +// The library provides a recordStream() function to help insert the correct +// synchronization when allocations are used on multiple streams. This will +// ensure that the block is not reused before each recorded stream completes +// work. +// + +namespace { + +using stream_set = std::unordered_set; + +constexpr size_t kMinBlockSize = + 512; // all sizes are rounded to at least 512 bytes +constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB +constexpr size_t kSmallBuffer = + 2097152; // "small" allocations are packed in 2 MiB blocks +constexpr size_t kLargeBuffer = + 20971520; // "large" allocations may be packed in 20 MiB blocks +constexpr size_t kMinLargeAlloc = + 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer +constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB + +typedef std::bitset(StatType::NUM_TYPES)> StatTypes; + +void update_stat(Stat& stat, int64_t amount) { + stat.current += amount; + + TORCH_INTERNAL_ASSERT( + stat.current >= 0, + "Negative tracked stat in CUDA allocator (likely logic error)."); + + stat.peak = std::max(stat.current, stat.peak); + if (amount > 0) { + stat.allocated += amount; + } + if (amount < 0) { + stat.freed += -amount; + } +} + +void reset_accumulated_stat(Stat& stat) { + stat.allocated = 0; + stat.freed = 0; +} + +void reset_peak_stat(Stat& stat) { + stat.peak = stat.current; +} + +void update_stat_array(StatArray& stat_array, + uint64_t amount, + const StatTypes& stat_types) { + for (const auto stat_type : c10::irange(stat_types.size())) { + if (stat_types[stat_type]) { + update_stat(stat_array[stat_type], amount); + } + } +} + +struct Block; +struct PrivatePool; +typedef bool (*Comparison)(const Block*, const Block*); + +struct BlockPool { + BlockPool( + Comparison comparator, + bool small, + PrivatePool* private_pool = nullptr) + : blocks(comparator), is_small(small), owner_PrivatePool(private_pool) {} + std::set blocks; + const bool is_small; + PrivatePool* owner_PrivatePool; +}; + +struct Block { + int device; // gpu + cudaStream_t stream; // allocation stream + stream_set stream_uses; // streams on which the block was used + size_t 
size; // block size in bytes + BlockPool* pool; // owning memory pool + void* ptr; // memory address + bool allocated; // in-use flag + Block* prev; // prev block if split from a larger allocation + Block* next; // next block if split from a larger allocation + int event_count; // number of outstanding CUDA events + + Block( + int device, + cudaStream_t stream, + size_t size, + BlockPool* pool, + void* ptr) + : device(device), + stream(stream), + stream_uses(), + size(size), + pool(pool), + ptr(ptr), + allocated(0), + prev(nullptr), + next(nullptr), + event_count(0) {} + + // constructor for search key + Block(int device, cudaStream_t stream, size_t size) + : device(device), + stream(stream), + stream_uses(), + size(size), + pool(nullptr), + ptr(nullptr), + allocated(0), + prev(nullptr), + next(nullptr), + event_count(0) {} + + bool is_split() const { + return (prev != nullptr) || (next != nullptr); + } +}; + +static bool BlockComparator(const Block* a, const Block* b) { + if (a->stream != b->stream) { + return (uintptr_t)a->stream < (uintptr_t)b->stream; + } + if (a->size != b->size) { + return a->size < b->size; + } + return (uintptr_t)a->ptr < (uintptr_t)b->ptr; +} + +static std::string format_size(uint64_t size) { + std::ostringstream os; + os.precision(2); + os << std::fixed; + if (size <= 1024) { + os << size << " bytes"; + } else if (size <= 1048576) { + os << (size / 1024.0); + os << " KiB"; + } else if (size <= 1073741824ULL) { + os << size / 1048576.0; + os << " MiB"; + } else { + os << size / 1073741824.0; + os << " GiB"; + } + return os.str(); +} + +struct AllocParams { + AllocParams( + int device, + size_t size, + cudaStream_t stream, + BlockPool* pool, + size_t alloc_size, + DeviceStats& stats) + : search_key(device, stream, size), + pool(pool), + alloc_size(alloc_size), + block(nullptr), + err(cudaSuccess) {} + + int device() const { + return search_key.device; + } + cudaStream_t stream() const { + return search_key.stream; + } + size_t size() const { + return search_key.size; + } + + Block search_key; + BlockPool* pool; + size_t alloc_size; + Block* block; + StatTypes stat_types; + cudaError_t err; +}; + +struct MempoolIdHash { + std::size_t operator()(const MempoolId_t& mempool_id) const noexcept { + return mempool_id.first != 0 ? 
mempool_id.first : mempool_id.second; + } +}; + +cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) { + return cudaMallocManaged(p, size); +} + +} // namespace + +class CachingAllocatorConfig { + public: + static size_t max_split_size() { + return instance().m_max_split_size; + } + + private: + static std::once_flag s_flag; + static CachingAllocatorConfig* s_instance; + static CachingAllocatorConfig& instance() { + std::call_once(s_flag, &CachingAllocatorConfig::init); + return *s_instance; + } + static void init() { + s_instance = new CachingAllocatorConfig(); + s_instance->parseArgs(); + } + + CachingAllocatorConfig() + : m_max_split_size(std::numeric_limits::max()) {} + size_t m_max_split_size; + + void parseArgs() { + const char* val = getenv("PYTORCH_CUDA_ALLOC_CONF"); + if (val != NULL) { + const std::string config(val); + + std::regex exp("[\\s,]+"); + std::sregex_token_iterator it(config.begin(), config.end(), exp, -1); + std::sregex_token_iterator end; + std::vector options(it, end); + + for (auto option : options) { + std::regex exp2("[:]+"); + std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1); + std::sregex_token_iterator end2; + std::vector kv(it2, end2); + if (kv.size() >= 2) { + /* Maximum split size in MB. Limited to large size blocks */ + if (kv[0].compare("max_split_size_mb") == 0) { + size_t val2 = stoi(kv[1]); + TORCH_CHECK( + val2 > kLargeBuffer / (1024 * 1024), + "CachingAllocator option max_split_size_mb too small, must be > ", + kLargeBuffer / (1024 * 1024), + ""); + val2 = std::max(val2, kLargeBuffer / (1024 * 1024)); + val2 = std::min( + val2, (std::numeric_limits::max() / (1024 * 1024))); + m_max_split_size = val2 * 1024 * 1024; + } else { + TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", kv[0]); + } + } + } + } + } +}; +CachingAllocatorConfig* CachingAllocatorConfig::s_instance; +std::once_flag CachingAllocatorConfig::s_flag; + +class ManagedAllocator { + private: + // lock around all operations + mutable std::recursive_mutex mutex; + + // lock around calls to cudaFree (to prevent deadlocks with NCCL) + mutable std::mutex cuda_free_mutex; + + // device statistics + DeviceStats stats; + + // unallocated cached blocks larger than 1 MB + BlockPool large_blocks; + + // unallocated cached blocks 1 MB or smaller + BlockPool small_blocks; + + // allocated or in use by a stream. Holds all active allocations, + // whether they came from graph_pools or one of the BlockPools above. + std::unordered_set active_blocks; + + //TODO: This is going to be redundant with active_blocks. Need to merge + //the two going forward. + // allocated blocks by device pointer + std::unordered_map allocated_blocks; + + // captures_underway tracks if a capture might be underway on any stream. + // Most of the time it's zero, in which case malloc can avoid calling + // cudaStreamGetCaptureInfo in the hot path. + + // See free() for this thing's purpose + std::vector needs_events_deferred_until_no_capture; + // outstanding cuda events + std::deque> cuda_events; + + // record used memory. 
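+  // This counts bytes obtained from cudaMallocManaged() and is compared
+  // against allowed_memory_maximum in alloc_block() when set_fraction is set.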
+ size_t total_allocated_memory = 0; + + size_t allowed_memory_maximum = 0; + + bool set_fraction = false; + + // Maps a capturing stream to its assigned private pool, + // in case we want multiple captures to share the same pool + std::unordered_map capture_to_pool_map; + + void add_allocated_block(Block* block) { + std::lock_guard lock(mutex); + allocated_blocks[block->ptr] = block; + } + + public: + ManagedAllocator() + : large_blocks(BlockComparator, /*is_small=*/false), + small_blocks(BlockComparator, /*is_small=*/true) { + stats.max_split_size = CachingAllocatorConfig::max_split_size(); + } + + std::mutex* getCudaFreeMutex() const { + return &cuda_free_mutex; + } + + Block* get_allocated_block(void* ptr, bool remove = false) { + std::lock_guard lock(mutex); + auto it = allocated_blocks.find(ptr); + if (it == allocated_blocks.end()) { + return nullptr; + } + Block* block = it->second; + if (remove) { + allocated_blocks.erase(it); + } + return block; + } + + cudaError_t recordEvent(void* ptr, at::cuda::CUDAStream stream) + { + + return cudaSuccess; + } + + // All public methods (except the above) acquire the allocator mutex. + // Thus, do not call a public method from another public method. +void malloc(void** devPtr, int device, size_t size, cudaStream_t stream) { + Block* block = malloc(device, size, stream); + add_allocated_block(block); + *devPtr = (void*)block->ptr; + } + + Block* malloc(int device, size_t size, cudaStream_t stream) { + std::unique_lock lock(mutex); + process_events(); + size = round_size(size); + auto& pool = get_pool(size, stream); + const size_t alloc_size = get_allocation_size(size); + AllocParams params(device, size, stream, &pool, alloc_size, stats); + params.stat_types[static_cast(StatType::AGGREGATE)] = true; + params.stat_types[static_cast(get_stat_type_for_pool(pool))] = true; + + bool block_found = + // Search pool + get_free_block(params) + // Trigger callbacks and retry search + || (trigger_free_memory_callbacks(params) && get_free_block(params)) + // Attempt allocate + || alloc_block(params, false) + // Free enough available cached blocks to satisfy alloc and retry alloc. + || + (release_available_cached_blocks(params) && alloc_block(params, false)) + // Free all non-split cached blocks and retry alloc. + || (release_cached_blocks() && alloc_block(params, true)); + + if (!block_found) { + // For any error code other than cudaErrorMemoryAllocation, + // alloc_block should have thrown an exception already. 
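+      // Reaching this point therefore means a genuine out-of-memory condition:
+      // the caches have already been flushed and cudaMallocManaged() still
+      // could not satisfy the request.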
+ TORCH_INTERNAL_ASSERT(params.err == cudaErrorMemoryAllocation); + + size_t device_free; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + std::string allowed_info; + + if (set_fraction) { + allowed_info = format_size(allowed_memory_maximum) + " allowed; "; + } + + stats.num_ooms += 1; + + } + + TORCH_INTERNAL_ASSERT( + params.err == cudaSuccess && params.block != nullptr && + params.block->ptr != nullptr); + Block* block = params.block; + Block* remaining = nullptr; + + const bool already_split = block->is_split(); + if (should_split(block, size)) { + remaining = block; + + block = new Block(device, stream, size, &pool, block->ptr); + block->prev = remaining->prev; + if (block->prev) { + block->prev->next = block; + } + block->next = remaining; + + remaining->prev = block; + remaining->ptr = static_cast(remaining->ptr) + size; + remaining->size -= size; + bool inserted = pool.blocks.insert(remaining).second; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted); + + if (already_split) { + // An already-split inactive block is being shrunk by size bytes. + update_stat_array( + stats.inactive_split_bytes, -block->size, params.stat_types); + } else { + // A new split inactive block is being created from a previously unsplit + // block, size remaining->size bytes. + update_stat_array( + stats.inactive_split_bytes, remaining->size, params.stat_types); + update_stat_array(stats.inactive_split, 1, params.stat_types); + } + } else if (already_split) { + // An already-split block is becoming active + update_stat_array( + stats.inactive_split_bytes, -block->size, params.stat_types); + update_stat_array(stats.inactive_split, -1, params.stat_types); + } + + block->allocated = true; + bool inserted = active_blocks.insert(block).second; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted); + + update_stat_array(stats.allocation, 1, params.stat_types); + update_stat_array(stats.allocated_bytes, block->size, params.stat_types); + update_stat_array(stats.active, 1, params.stat_types); + update_stat_array(stats.active_bytes, block->size, params.stat_types); + if (block->size >= CachingAllocatorConfig::max_split_size()) + update_stat(stats.oversize_allocations, 1); + + c10::reportMemoryUsageToProfiler( + block->ptr, + block->size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + c10::Device(c10::DeviceType::CUDA, device)); + + add_allocated_block(block); + return block; + } + + void free(void* ptr) { + + std::lock_guard lock(mutex); + + // We aren't sure if a free'd block will be used by the same device + // or even the same stream, synchronize for now just in case + cudaDeviceSynchronize(); + if (!ptr) { + return; + } + Block* block = get_allocated_block(ptr, true /* remove */); + if (!block) { + TORCH_CHECK(false, "invalid device pointer: ", ptr); + } + + block->allocated = false; + + // following logic might modifying underlaying Block, causing the size + // changed. 
We store ahead for reporting + auto orig_block_ptr = block->ptr; + auto orig_block_size = block->size; + + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(*(block->pool)))] = + true; + update_stat_array(stats.allocation, -1, {stat_types}); + update_stat_array(stats.allocated_bytes, -block->size, {stat_types}); + if (block->size >= CachingAllocatorConfig::max_split_size()) + update_stat(stats.oversize_allocations, -1); + + if (!block->stream_uses.empty()) { + insert_events(block); + } else { + free_block(block); + } + + c10::reportMemoryUsageToProfiler( + orig_block_ptr, + -orig_block_size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + c10::Device(c10::DeviceType::CUDA, block->device)); + } + + void* getBaseAllocation(void* ptr, size_t* outSize) { + Block* block = get_allocated_block(ptr); + if (!block) { + TORCH_CHECK(false, "invalid device pointer: ", ptr); + } + std::lock_guard lock(mutex); + while (block->prev) { + block = block->prev; + } + void* basePtr = block->ptr; + if (outSize) { + size_t size = 0; + while (block) { + size += block->size; + block = block->next; + } + *outSize = size; + } + return basePtr; + } + + void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) { + if (!ptr.get()) { + return; + } + + // If a tensor is not allocated by this instance, simply skip + // This usually happens when CUDA tensors are shared across processes, + // we have implemented reference counting based sharing mechanism to + // guarantee tensors won't be accidentally freed by one process while + // they are still being used in another + if (ptr.get_deleter() != &raw_delete) + return; + + Block* block = get_allocated_block(ptr.get()); + if (!block) { + TORCH_CHECK(false, "invalid device pointer: ", ptr); + } + std::lock_guard lock(mutex); + if (stream.stream() == block->stream) { + // ignore uses on the allocation stream, since those don't require any + // special synchronization + return; + } + block->stream_uses.insert(stream); + } + + /** set memory fraction to limit maximum allocated memory **/ + void setMemoryFraction(double fraction) { + size_t device_free; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + allowed_memory_maximum = static_cast(fraction * device_total); + set_fraction = true; + } + + /** returns cached blocks to the system allocator **/ + void emptyCache() { + std::lock_guard lock(mutex); + release_cached_blocks(); + } + + /** Retrieves info (total size + largest block) of the memory cache **/ + void cacheInfo(size_t* total, size_t* largest) { + std::lock_guard lock(mutex); + if (*largest == + 0) { // make an initial guess if a zero *largest is passed in + size_t tmp_bytes; + cudaMemGetInfo( + largest, // Use free memory as an optimistic initial guess of *largest + &tmp_bytes); + } + cache_info_aux(large_blocks, total, largest); + cache_info_aux(small_blocks, total, largest); + } + + /** Returns a copy of the memory allocator stats **/ + DeviceStats getStats() { + std::lock_guard lock(mutex); + return stats; + } + + /** Resets the historical accumulation stats for the device **/ + void resetAccumulatedStats() { + std::lock_guard lock(mutex); + + for (const auto statType : + c10::irange(static_cast(StatType::NUM_TYPES))) { + reset_accumulated_stat(stats.allocation[statType]); + reset_accumulated_stat(stats.segment[statType]); + 
reset_accumulated_stat(stats.active[statType]); + reset_accumulated_stat(stats.inactive_split[statType]); + reset_accumulated_stat(stats.allocated_bytes[statType]); + reset_accumulated_stat(stats.reserved_bytes[statType]); + reset_accumulated_stat(stats.active_bytes[statType]); + reset_accumulated_stat(stats.inactive_split_bytes[statType]); + } + + stats.num_alloc_retries = 0; + stats.num_ooms = 0; + reset_accumulated_stat(stats.oversize_allocations); + reset_accumulated_stat(stats.oversize_segments); + } + + /** Resets the historical peak stats for the device **/ + void resetPeakStats() { + std::lock_guard lock(mutex); + + for (const auto statType : + c10::irange(static_cast(StatType::NUM_TYPES))) { + reset_peak_stat(stats.allocation[statType]); + reset_peak_stat(stats.segment[statType]); + reset_peak_stat(stats.active[statType]); + reset_peak_stat(stats.inactive_split[statType]); + reset_peak_stat(stats.allocated_bytes[statType]); + reset_peak_stat(stats.reserved_bytes[statType]); + reset_peak_stat(stats.active_bytes[statType]); + reset_peak_stat(stats.inactive_split_bytes[statType]); + } + reset_peak_stat(stats.oversize_allocations); + reset_peak_stat(stats.oversize_segments); + } + + /** Dump a complete snapshot of the memory held by the allocator. Potentially + * VERY expensive. **/ + std::vector snapshot() const { + std::lock_guard lock(mutex); + + std::vector result; + const auto all_blocks = get_all_blocks(); + + for (const Block* const head_block : all_blocks) { + if (head_block->prev != nullptr) { + continue; + } + result.emplace_back(); + SegmentInfo& segment_info = result.back(); + segment_info.device = head_block->device; + segment_info.address = reinterpret_cast(head_block->ptr); + segment_info.is_large = (!head_block->pool->is_small); + + const Block* block = head_block; + while (block != nullptr) { + segment_info.blocks.emplace_back(); + BlockInfo& block_info = segment_info.blocks.back(); + + block_info.size = block->size; + block_info.allocated = block->allocated; + block_info.active = block->allocated || (block->event_count > 0) || + !block->stream_uses.empty(); + + segment_info.total_size += block_info.size; + if (block_info.allocated) { + segment_info.allocated_size += block_info.size; + } + if (block_info.active) { + segment_info.active_size += block_info.size; + } + + block = block->next; + } + } + + std::sort( + result.begin(), + result.end(), + [](const SegmentInfo& a, const SegmentInfo& b) { + return a.address < b.address; + }); + + return result; + } + + static size_t round_size(size_t size) { + if (size < kMinBlockSize) { + return kMinBlockSize; + } else { + return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize); + } + } + + private: + // All private methods do not acquire the allocator mutex. 
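+  // They rely on the caller (a public method) already holding it.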
+ + std::vector get_all_blocks() const { + std::vector blocks; + blocks.insert( + blocks.end(), small_blocks.blocks.begin(), small_blocks.blocks.end()); + blocks.insert( + blocks.end(), large_blocks.blocks.begin(), large_blocks.blocks.end()); + blocks.insert(blocks.end(), active_blocks.begin(), active_blocks.end()); + return blocks; + } + + /** moves a block into a pool of cached free blocks */ + void free_block(Block* block) { + TORCH_INTERNAL_ASSERT( + !block->allocated && block->event_count == 0 && + block->stream_uses.empty()); + + size_t original_block_size = block->size; + + auto& pool = *block->pool; + int64_t net_change_inactive_split_blocks = 0; + int64_t net_change_inactive_split_size = 0; + + const std::array merge_candidates = {block->prev, block->next}; + for (Block* merge_candidate : merge_candidates) { + const int64_t subsumed_size = + try_merge_blocks(block, merge_candidate, pool); + if (subsumed_size > 0) { + net_change_inactive_split_blocks -= 1; + net_change_inactive_split_size -= subsumed_size; + } + } + + active_blocks.erase(block); + // Makes sure the Block* isn't already present in the pool we're freeing it + // back into. + bool inserted = pool.blocks.insert(block).second; + TORCH_INTERNAL_ASSERT(inserted); + + if (block->is_split()) { + net_change_inactive_split_blocks += 1; + net_change_inactive_split_size += block->size; + } + + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(pool))] = true; + update_stat_array( + stats.inactive_split, net_change_inactive_split_blocks, stat_types); + update_stat_array( + stats.inactive_split_bytes, net_change_inactive_split_size, stat_types); + update_stat_array(stats.active, -1, stat_types); + update_stat_array(stats.active_bytes, -original_block_size, stat_types); + } + + /** combine previously split blocks. returns the size of the subsumed block, + * or 0 on failure. */ + size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) { + if (!src || src->allocated || src->event_count > 0 || + !src->stream_uses.empty()) { + return 0; + } + + AT_ASSERT(dst->is_split() && src->is_split()); + + if (dst->prev == src) { + dst->ptr = src->ptr; + dst->prev = src->prev; + if (dst->prev) { + dst->prev->next = dst; + } + } else { + dst->next = src->next; + if (dst->next) { + dst->next->prev = dst; + } + } + + const size_t subsumed_size = src->size; + dst->size += subsumed_size; + auto erased = pool.blocks.erase(src); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(erased == 1); + delete src; + + return subsumed_size; + } + + BlockPool& get_pool(size_t size, cudaStream_t stream) { + + if (size <= kSmallSize) { + return small_blocks; + } else { + return large_blocks; + } + } + + StatType get_stat_type_for_pool(const BlockPool& pool) { + return pool.is_small ? 
StatType::SMALL_POOL : StatType::LARGE_POOL; + } + + bool should_split(const Block* block, size_t size) { + size_t remaining = block->size - size; + if (block->pool->is_small) { + return remaining >= kMinBlockSize; + } else { + return (size < CachingAllocatorConfig::max_split_size()) && + (remaining > kSmallSize); + } + } + + static size_t get_allocation_size(size_t size) { + if (size <= kSmallSize) { + return kSmallBuffer; + } else if (size < kMinLargeAlloc) { + return kLargeBuffer; + } else { + return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); + } + } + + bool get_free_block(AllocParams& p) { + BlockPool& pool = *p.pool; + auto it = pool.blocks.lower_bound(&p.search_key); + if (it == pool.blocks.end() || (*it)->stream != p.stream()) + return false; + // Do not return an oversized block for a large request + if ((p.size() < CachingAllocatorConfig::max_split_size()) && + ((*it)->size >= CachingAllocatorConfig::max_split_size())) + return false; + // Allow oversized block size to be rounded up but within a limit + if ((p.size() >= CachingAllocatorConfig::max_split_size()) && + ((*it)->size >= p.size() + kLargeBuffer)) + return false; + p.block = *it; + pool.blocks.erase(it); + return true; + } + + bool trigger_free_memory_callbacks(AllocParams& p) { + bool freed_memory = false; + for (const auto& name : FreeCudaMemoryCallbacksRegistry()->Keys()) { + freed_memory |= + FreeCudaMemoryCallbacksRegistry()->Create(name)->Execute(); + } + return freed_memory; + } + + bool alloc_block(AllocParams& p, bool isRetry) { + // Defensively checks for preexisting CUDA error state. + C10_CUDA_CHECK(cudaGetLastError()); + + size_t size = p.alloc_size; + void* ptr; + + if (isRetry) { + stats.num_alloc_retries += 1; + } + + if (set_fraction && + total_allocated_memory + size > allowed_memory_maximum) { + p.err = cudaErrorMemoryAllocation; + return false; + } else { + p.err = cudaMallocMaybeCapturing(&ptr, size); + if (p.err != cudaSuccess) { + if (p.err == cudaErrorMemoryAllocation) { + // If this is the first attempt (!isRetry), we can forgive and clear + // CUDA's + // internal error state. + // If this is the second attempt (isRetry), malloc's TORCH_CHECK_WITH + // will take + // over to throw a helpful exception. The user can choose to catch + // the exception, free some stuff in their script, and attempt their + // allocation again. In this case, we can also forgive and clear + // CUDA's internal error state. + cudaGetLastError(); + } else { + // If the error's unrelated to memory allocation, we should throw + // immediately. + C10_CUDA_CHECK(p.err); + } + return false; + } + } + + total_allocated_memory += size; + p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr); + update_stat_array(stats.segment, 1, p.stat_types); + update_stat_array(stats.reserved_bytes, size, p.stat_types); + if (size >= CachingAllocatorConfig::max_split_size()) + update_stat(stats.oversize_segments, 1); + + // p.block came from new, not cudaMalloc. It should not be nullptr here. + TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); + return true; + } + + /** Free one or more oversize blocks to the system allocator. But only enough + * **/ + /** to satisfy the target size **/ + bool release_available_cached_blocks(const AllocParams& p) { + if (CachingAllocatorConfig::max_split_size() == + std::numeric_limits::max()) + return false; + BlockPool& pool = *p.pool; + Block key = p.search_key; + key.size = (key.size < CachingAllocatorConfig::max_split_size()) + ? 
CachingAllocatorConfig::max_split_size() + : key.size; + auto it = pool.blocks.lower_bound(&key); + if (it == pool.blocks.end() || (*it)->stream != p.stream()) { + // No single block is large enough; free multiple oversize blocks, + // starting with the largest + if (it == pool.blocks.begin()) + return false; + size_t totalReleased = 0; + --it; // Back up one item. Now on the largest block for the correct + // stream + while ((totalReleased < key.size) && + ((*it)->size >= CachingAllocatorConfig::max_split_size()) && + ((*it)->stream == p.stream())) { + auto cur = it; + totalReleased += (*it)->size; + if (it != pool.blocks.begin()) { + --it; + release_block(*cur); + } else { + release_block(*cur); + break; + } + } + if (totalReleased < key.size) + return false; + } else { + release_block(*it); + } + return true; + } + + bool release_cached_blocks() { + // First ensure that all blocks that can't currently be allocated due to + // outstanding events are returned to the pool. + synchronize_and_free_events(); + + // Free all non-split cached blocks to system allocator + release_blocks(large_blocks); + release_blocks(small_blocks); + + return true; + } + + void release_block(Block* block) { + C10_CUDA_CHECK(cudaFree((void*)block->ptr)); + total_allocated_memory -= block->size; + + auto* pool = block->pool; + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(*pool))] = true; + update_stat_array(stats.segment, -1, stat_types); + update_stat_array(stats.reserved_bytes, -block->size, stat_types); + if (block->size >= CachingAllocatorConfig::max_split_size()) + update_stat(stats.oversize_segments, -1); + + pool->blocks.erase(block); + delete block; + } + + void release_blocks(BlockPool& pool) { + // Frees all non-split blocks + auto it = pool.blocks.begin(); + while (it != pool.blocks.end()) { + Block* block = *it; + ++it; + if (!block->prev && !block->next) { + release_block(block); + } + } + } + + cudaEvent_t create_event_internal() { + cudaEvent_t event; + C10_CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + return event; + } + + void free_event_internal(cudaEvent_t event) { + C10_CUDA_CHECK(cudaEventDestroy(event)); + } + + void synchronize_and_free_events() { + // Synchronize on outstanding events and then free associated blocks. 
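+    // Called from release_cached_blocks() (and therefore emptyCache()) so that
+    // blocks still waiting on stream events are returned to their pools before
+    // the pools are released back to the system.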
+ + for (auto& e : cuda_events) { + cudaEvent_t event = e.first; + Block* block = e.second; + + C10_CUDA_CHECK(cudaEventSynchronize(event)); + free_event_internal(event); + + block->event_count--; + if (block->event_count == 0) { + free_block(block); + } + } + + cuda_events.clear(); + } + + void insert_events(Block* block) { + int prev_device; + C10_CUDA_CHECK(cudaGetDevice(&prev_device)); + + stream_set streams(std::move(block->stream_uses)); + AT_ASSERT(block->stream_uses.empty()); + for (auto& stream : streams) { + C10_CUDA_CHECK(cudaSetDevice(stream.device_index())); + + cudaEvent_t event = create_event_internal(); + C10_CUDA_CHECK(cudaEventRecord(event, stream.stream())); + + block->event_count++; + cuda_events.emplace_back(event, block); + } + + C10_CUDA_CHECK(cudaSetDevice(prev_device)); + } + + void insert_events_deferred_until_no_capture() { + if (C10_UNLIKELY(needs_events_deferred_until_no_capture.size() > 0)) { + for (auto* block : needs_events_deferred_until_no_capture) { + TORCH_INTERNAL_ASSERT(!block->stream_uses.empty()); + insert_events(block); + } + needs_events_deferred_until_no_capture.clear(); + } + } + + void process_events() { + insert_events_deferred_until_no_capture(); + + // Process outstanding cudaEvents. Events that are completed are removed + // from the queue, and the 'event_count' for the corresponding allocation + // is decremented. Stops at the first event which has not been completed. + // Since events on different devices or streams may occur out of order, + // the processing of some events may be delayed. + while (!cuda_events.empty()) { + auto& e = cuda_events.front(); + cudaEvent_t event = e.first; + Block* block = e.second; + + cudaError_t err = cudaEventQuery(event); + if (err == cudaErrorNotReady) { + // ignore and clear the error if not ready + cudaGetLastError(); + break; + } else if (err != cudaSuccess) { + C10_CUDA_CHECK(err); + } + + free_event_internal(event); + + block->event_count--; + if (block->event_count == 0) { + free_block(block); + } + cuda_events.pop_front(); + } + } + + // Accumulates sizes of all memory blocks for given device in given pool + void cache_info_aux(const BlockPool& pool, size_t* total, size_t* largest) { + for (const auto& block : pool.blocks) { + const auto blocksize = block->size; + *total += blocksize; + if (blocksize > *largest) { + *largest = blocksize; + } + } + } + +}; + +static ManagedAllocator allocator; + + void setMemoryFraction(double fraction, int device) { + allocator.setMemoryFraction(fraction); + } + + void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) { + allocator.cacheInfo(cachedAndFree, largestBlock); + } + + void* getBaseAllocation(void* ptr, size_t* size) { + return allocator.getBaseAllocation(ptr, size); + } + + void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) { + allocator.recordStream(ptr, stream); + } + +std::mutex* getFreeMutex() { + return allocator.getCudaFreeMutex(); +} + +DeviceStats getDeviceStats() { + return allocator.getStats(); +} + +void resetAccumulatedStats(int device) { + allocator.resetAccumulatedStats(); +} + +void resetPeakStats(int device) { + allocator.resetPeakStats(); +} + +std::vector snapshot() { + return allocator.snapshot(); +} + +void* raw_alloc(size_t nbytes) { + if (nbytes == 0) { + return nullptr; + } + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + allocator.malloc( + &r, device, nbytes, cuda::getCurrentCUDAStream(device)); + return r; +} + +void* raw_alloc_with_stream(size_t nbytes, cudaStream_t 
stream) { + if (nbytes == 0) { + return nullptr; + } + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + allocator.malloc(&r, device, nbytes, stream); + return r; +} + +void raw_delete(void* ptr) { + allocator.free(ptr); +} + +cudaError_t recordEvent(void *ptr, at::cuda::CUDAStream stream) +{ + return allocator.recordEvent(ptr, stream); +} + +void emptyCache() +{ + allocator.emptyCache(); +} + +static void CachingManagedDeleter(void* ptr) { + allocator.free(ptr); +} +bool forceUncachedAllocator() { + static bool force_uncached = + getenv("PYTORCH_NO_CUDA_MEMORY_CACHING") != nullptr; + return force_uncached; +} + +static void uncached_delete(void* ptr) { + C10_CUDA_CHECK(cudaFree(ptr)); +} + +struct CachingManagedAllocator final : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* ptr = nullptr; + if (forceUncachedAllocator()) { + // Deliberately don't use cudaMallocMaybeCapturing here, to force an error + // if someone tries to use forceUncachedAllocator while capturing. + C10_CUDA_CHECK(cudaMallocManaged(&ptr, size)); + return {ptr, ptr, &uncached_delete, Device(DeviceType::CUDA, device)}; + } + if (size != 0) { + ptr = allocator.malloc(device, size, cuda::getCurrentCUDAStream(device))->ptr; + //C10_CUDA_CHECK(cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, device)); + } + return {ptr, ptr, &CachingManagedDeleter, Device(DeviceType::CUDA, device)}; + } + at::DeleterFnPtr raw_deleter() const override { + if (forceUncachedAllocator()) { + return &uncached_delete; + } else { + return &CachingManagedDeleter; + } + } +}; + +struct CachingManagedAllocatorCpu final : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + void* ptr = nullptr; + if (forceUncachedAllocator()) { + // Deliberately don't use cudaMallocMaybeCapturing here, to force an error + // if someone tries to use forceUncachedAllocator while capturing. + C10_CUDA_CHECK(cudaMallocManaged(&ptr, size)); + return {ptr, ptr, &uncached_delete, Device(DeviceType::CPU)}; + } + if (size != 0) { + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + int hint_device = -1; //cudaCpuDeviceId + ptr = allocator.malloc(hint_device, size, cuda::getCurrentCUDAStream(device))->ptr; + + //C10_CUDA_CHECK(cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, hint_device)); + //C10_CUDA_CHECK(cudaMemAdvise(ptr, size, cudaMemAdviseSetAccessedBy, device)); + } + return {ptr, ptr, &CachingManagedDeleter, Device(DeviceType::CPU)}; + } + at::DeleterFnPtr raw_deleter() const override { + if (forceUncachedAllocator()) { + return &uncached_delete; + } else { + return &CachingManagedDeleter; + } + } +}; + +static CachingManagedAllocator caching_managed_allocator; + +static CachingManagedAllocatorCpu caching_managed_allocator_cpu; + +at::Allocator* get(DeviceType device_type) { + if (device_type == DeviceType::CUDA) + return &caching_managed_allocator; + else + return &caching_managed_allocator_cpu; +} + +} // namespace CachingManagedAllocator + +}} // namespace at::cuda diff --git a/aten/src/ATen/cuda/CachingManagedAllocator.h b/aten/src/ATen/cuda/CachingManagedAllocator.h new file mode 100644 index 0000000000000..ff37646218c40 --- /dev/null +++ b/aten/src/ATen/cuda/CachingManagedAllocator.h @@ -0,0 +1,121 @@ +#pragma once + +#include +#include + +namespace at { +namespace cuda { + +// +// A caching allocator for UVM allocations. 
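+// Blocks are obtained with cudaMallocManaged(), so a single pointer is valid
+// on both the host and the device and pages migrate between them on demand.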
+// +// This provides a drop-in replacement for THCudaHostAllocator, which re-uses +// freed pinned (page-locked) memory allocations. This avoids device +// synchronizations due to cudaFreeHost calls. +// +// To ensure correct behavior, THCCachingHostAllocator_recordEvent must be +// called anytime a pointer from this allocator is used in a cudaMemcpyAsync +// call between host and device. We implement this for storages and tensors in +// copy_from_cpu_async_ and copy_to_cpu_async_. +// +// Note that this allocator does not split larger allocations into smaller +// blocks, unlike the caching device allocator. +// +namespace CachingManagedAllocator { + +struct Stat { + int64_t current = 0; + int64_t peak = 0; + int64_t allocated = 0; + int64_t freed = 0; +}; + +enum struct StatType : uint64_t { + AGGREGATE = 0, + SMALL_POOL = 1, + LARGE_POOL = 2, + NUM_TYPES = 3 // remember to update this whenever a new stat type is added +}; + +typedef std::array(StatType::NUM_TYPES)> StatArray; + +struct DeviceStats { + // COUNT: allocations requested by client code + StatArray allocation; + // COUNT: number of allocated segments from cudaMalloc(). + StatArray segment; + // COUNT: number of active memory blocks (allocated or used by stream) + StatArray active; + // COUNT: number of inactive, split memory blocks (unallocated but can't be + // released via cudaFree) + StatArray inactive_split; + + // SUM: bytes requested by client code + StatArray allocated_bytes; + // SUM: bytes reserved by this memory allocator (both free and used) + StatArray reserved_bytes; + // SUM: bytes within active memory blocks + StatArray active_bytes; + // SUM: bytes within inactive, split memory blocks + StatArray inactive_split_bytes; + + // COUNT: total number of failed calls to CUDA malloc necessitating cache + // flushes. + int64_t num_alloc_retries = 0; + + // COUNT: total number of OOMs (i.e. failed calls to CUDA after cache flush) + int64_t num_ooms = 0; + + // COUNT: total number of oversize blocks allocated from pool + Stat oversize_allocations; + + // COUNT: total number of oversize blocks requiring malloc + Stat oversize_segments; + + // SIZE: maximum block size that is allowed to be split. + int64_t max_split_size = 0; +}; + +// Struct containing info of an allocation block (i.e. a fractional part of a +// cudaMalloc).. +struct BlockInfo { + int64_t size = 0; + bool allocated = false; + bool active = false; +}; + +// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc). +struct SegmentInfo { + int64_t device = 0; + int64_t address = 0; + int64_t total_size = 0; + int64_t allocated_size = 0; + int64_t active_size = 0; + bool is_large = false; + std::vector blocks; +}; + +TORCH_CUDA_CPP_API c10::Allocator* get(DeviceType device_type); + +// Records an event in the specified stream. The allocation 'ptr' will not be +// re-used until the event has occurred. 
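+// Note: the current ManagedAllocator implements this as a stub and simply
+// returns cudaSuccess without recording an event.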
+TORCH_CUDA_CPP_API cudaError_t +recordEvent(void* ptr, c10::cuda::CUDAStream stream); + +// Releases cached pinned memory allocations via cudaHostFree +TORCH_CUDA_CPP_API void emptyCache(); +TORCH_CUDA_CPP_API DeviceStats getDeviceStats(); +TORCH_CUDA_CPP_API void* raw_alloc(size_t nbytes); +TORCH_CUDA_CPP_API void* raw_alloc_with_stream( + size_t nbytes, + cudaStream_t stream); +TORCH_CUDA_CPP_API void raw_delete(void* ptr); + +// inline TORCH_CUDA_CPP_API at::DataPtr ManagedAlloc(size_t size) { +// return getCachingManagedAllocator()->allocate(size); +// } + +} // namespace CachingManagedAllocator + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/ThrustAllocator.h b/aten/src/ATen/cuda/ThrustAllocator.h index 92fc5cc13ba08..eeceb462c0fd1 100644 --- a/aten/src/ATen/cuda/ThrustAllocator.h +++ b/aten/src/ATen/cuda/ThrustAllocator.h @@ -2,6 +2,7 @@ #include #include +#include namespace at { namespace cuda { @@ -13,11 +14,17 @@ class ThrustAllocator { typedef char value_type; char* allocate(std::ptrdiff_t size) { - return static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(size)); + if (at::globalContext().userEnabledUVM()) + return static_cast(at::cuda::CachingManagedAllocator::raw_alloc(size)); + else + return static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(size)); } void deallocate(char* p, size_t size) { - c10::cuda::CUDACachingAllocator::raw_delete(p); + if (at::globalContext().userEnabledUVM()) + return at::cuda::CachingManagedAllocator::raw_delete(p); + else + return c10::cuda::CUDACachingAllocator::raw_delete(p); } }; diff --git a/aten/src/ATen/cuda/UvmMemoryAllocator.h b/aten/src/ATen/cuda/UvmMemoryAllocator.h new file mode 100644 index 0000000000000..2d3cbf61b2c46 --- /dev/null +++ b/aten/src/ATen/cuda/UvmMemoryAllocator.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include + +namespace at { namespace cuda { + +inline TORCH_CUDA_CPP_API at::Allocator* getUnifiedDeviceAllocator() { + return CachingManagedAllocator::get(DeviceType::CUDA); +} + +inline TORCH_CUDA_CPP_API at::Allocator* getUnifiedDeviceAllocatorCpu() { + return CachingManagedAllocator::get(DeviceType::CPU); +} + +}} // namespace at::cuda diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp index 89c1246a32d14..ef037202c7f2e 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -940,11 +941,15 @@ inline static void apply_cholesky_cusolver_potrf_looped(const Tensor& self_worki at::cuda::solver::xpotrf_buffersize(handle, params, uplo, n, datatype, nullptr, lda, datatype, &worksize_device, &worksize_host); // allocate workspace storage - auto& device_allocator = *at::cuda::getCUDADeviceAllocator(); + auto& device_allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); + auto& host_allocator = (at::globalContext().userEnabledUVM()) ? 
+ *at::cuda::getUnifiedDeviceAllocator() : + *at::getCPUAllocator(); auto workdata_device = device_allocator.allocate(worksize_device * batch_size); void* workdata_device_ptr = workdata_device.get(); - auto& host_allocator = *at::getCPUAllocator(); auto workdata_host = host_allocator.allocate(worksize_host * batch_size); void* workdata_host_ptr = workdata_host.get(); @@ -968,7 +973,9 @@ inline static void apply_cholesky_cusolver_potrf_looped(const Tensor& self_worki handle, uplo, n_32, nullptr, lda_32, &lwork); // allocate workspace storage - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t)*lwork * batch_size); scalar_t* work_data_ptr = static_cast(work_data.get()); @@ -1215,9 +1222,13 @@ static void apply_geqrf(const Tensor& A, const Tensor& tau) { #ifdef USE_CUSOLVER_64_BIT // allocate workspace storage on device and host - auto& device_allocator = *at::cuda::getCUDADeviceAllocator(); + auto& device_allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); + auto& host_allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::getCPUAllocator(); auto work_device_data = device_allocator.allocate(worksize_device); - auto& host_allocator = *at::getCPUAllocator(); auto work_host_data = host_allocator.allocate(worksize_host); at::cuda::solver::xgeqrf( handle, @@ -1234,7 +1245,12 @@ static void apply_geqrf(const Tensor& A, const Tensor& tau) { infos_data); #else // allocate workspace storage on device - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& device_allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); + auto& host_allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::getCPUAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t) * std::max(1, lwork)); at::cuda::solver::geqrf( handle, @@ -1311,7 +1327,9 @@ static void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& ot auto handle = at::cuda::getCurrentCUDASolverDnHandle(); // allocate workspace storage - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t)*lwork); at::cuda::solver::ormqr( @@ -1388,7 +1406,9 @@ inline static void apply_orgqr(Tensor& self, const Tensor& tau) { auto handle = at::cuda::getCurrentCUDASolverDnHandle(); // allocate workspace storage - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t)*lwork); at::cuda::solver::orgqr( @@ -1465,9 +1485,13 @@ static void apply_syevd(const Tensor& values, const Tensor& vectors, const Tenso #ifdef USE_CUSOLVER_64_BIT // allocate workspace storage on device and host - auto& device_allocator = *at::cuda::getCUDADeviceAllocator(); + auto& device_allocator = (at::globalContext().userEnabledUVM()) ? 
+ *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); + auto& host_allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::getCPUAllocator(); auto work_device_data = device_allocator.allocate(worksize_device); - auto& host_allocator = *at::getCPUAllocator(); auto work_host_data = host_allocator.allocate(worksize_host); at::cuda::solver::xsyevd( handle, @@ -1485,7 +1509,9 @@ static void apply_syevd(const Tensor& values, const Tensor& vectors, const Tenso info_working_ptr); #else // allocate workspace storage on device - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t) * lwork); at::cuda::solver::syevd( handle, @@ -1541,7 +1567,9 @@ static void apply_syevj(const Tensor& values, const Tensor& vectors, const Tenso auto handle = at::cuda::getCurrentCUDASolverDnHandle(); // allocate workspace storage on device - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t) * lwork); at::cuda::solver::syevj( handle, @@ -1601,7 +1629,9 @@ static void apply_syevj_batched(const Tensor& values, const Tensor& vectors, con batch_size); // allocate workspace storage on device - auto& allocator = *at::cuda::getCUDADeviceAllocator(); + auto& allocator = (at::globalContext().userEnabledUVM()) ? + *at::cuda::getUnifiedDeviceAllocator() : + *at::cuda::getCUDADeviceAllocator(); auto work_data = allocator.allocate(sizeof(scalar_t) * lwork); at::cuda::solver::syevjBatched( handle, diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index f5c7af79a740b..3ea2c82104844 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -138,14 +139,17 @@ struct Workspace { // workspace fail with some 64bit indexing error instead of an OOM error. In such case, // we manually fail with OOM. TORCH_CHECK_WITH(OutOfMemoryError, size < 1_TiB, "Not enough memory for workspace!"); - data = c10::cuda::CUDACachingAllocator::raw_alloc(size); + data = at::globalContext().userEnabledUVM() ? at::cuda::CachingManagedAllocator::raw_alloc(size) : c10::cuda::CUDACachingAllocator::raw_alloc(size); } Workspace(const Workspace&) = delete; Workspace(Workspace&&) = default; Workspace& operator=(Workspace&&) = default; ~Workspace() { if (data) { - c10::cuda::CUDACachingAllocator::raw_delete(data); + if (at::globalContext().userEnabledUVM()) + at::cuda::CachingManagedAllocator::raw_delete(data); + else + c10::cuda::CUDACachingAllocator::raw_delete(data); } } diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 060a97d6fc1c1..8215b8119cdf5 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -148,6 +148,7 @@ at::Tensor miopen_convolution_relu( #include #include +#include #include #include @@ -291,14 +292,17 @@ BenchmarkCache bwd_filter_wssizes; struct Workspace { Workspace(size_t size) : size(size), data(NULL) { - data = c10::hip::HIPCachingAllocator::raw_alloc(size); + data = at::globalContext().userEnabledUVM() ? 
at::cuda::CachingManagedAllocator::raw_alloc(size) : c10::hip::HIPCachingAllocator::raw_alloc(size); } Workspace(const Workspace&) = delete; Workspace(Workspace&&) = default; Workspace& operator=(Workspace&&) = default; ~Workspace() { if (data) { - c10::hip::HIPCachingAllocator::raw_delete(data); + if (at::globalContext().userEnabledUVM()) + at::cuda::CachingManagedAllocator::raw_delete(data); + else + c10::hip::HIPCachingAllocator::raw_delete(data); } } From 5d2b294caf2d3bc4e0c8935a933486eb61d4ab95 Mon Sep 17 00:00:00 2001 From: Jason Furmanek Date: Wed, 7 Sep 2022 14:19:10 +0000 Subject: [PATCH 3/9] [UVM] Add CUDA Hooks, isManagedPtr, tensor.is_managed() --- aten/src/ATen/Context.h | 3 ++ aten/src/ATen/cuda/UvmMemoryAllocator.cpp | 44 ++++++++++++++++ aten/src/ATen/cuda/detail/CUDAHooks.cpp | 51 +++++++++++++++++++ aten/src/ATen/cuda/detail/CUDAHooks.h | 3 ++ aten/src/ATen/detail/CUDAHooksInterface.h | 12 +++++ aten/src/ATen/native/Memory.cpp | 11 ++++ aten/src/ATen/native/native_functions.yaml | 14 +++++ aten/src/ATen/quantized/Quantizer.cpp | 4 +- .../ATen/templates/RegisterBackendSelect.cpp | 15 ++++++ c10/core/TensorOptions.h | 41 +++++++++++++++ tools/autograd/derivatives.yaml | 3 ++ torch/csrc/StorageMethods.cpp | 14 +++++ torch/csrc/cuda/Module.cpp | 17 +++++++ torch/cuda/memory.py | 9 ++++ torch/storage.py | 9 ++++ torch/utils/hipify/cuda_to_hip_mappings.py | 1 + 16 files changed, 250 insertions(+), 1 deletion(-) create mode 100644 aten/src/ATen/cuda/UvmMemoryAllocator.cpp diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index ede1aeea02c7c..fb755cabf4c84 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -55,6 +55,9 @@ class TORCH_API Context { static bool isPinnedPtr(void* data) { return detail::getCUDAHooks().isPinnedPtr(data); } + static bool isManagedPtr(void* data) { + return detail::getCUDAHooks().isManagedPtr(data); + } static bool hasOpenMP(); static bool hasMKL(); static bool hasLAPACK(); diff --git a/aten/src/ATen/cuda/UvmMemoryAllocator.cpp b/aten/src/ATen/cuda/UvmMemoryAllocator.cpp new file mode 100644 index 0000000000000..199046b75faed --- /dev/null +++ b/aten/src/ATen/cuda/UvmMemoryAllocator.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { + +namespace native { + +bool is_managed_cuda(const Tensor& self, c10::optional device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + (!device.has_value() && device->is_cpu()) || device->is_cuda()); + // TODO: unhook this + return detail::getCUDAHooks().isManagedPtr(self.storage().data()); +} + +Tensor _manage_memory_cuda(const Tensor& self, c10::optional device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + (!device.has_value() && device->is_cpu()) || device->is_cuda()); + at::Allocator* allocator = nullptr; + if (self.is_cpu()) { + allocator = at::cuda::getUnifiedDeviceAllocatorCpu(); + } else if (self.is_cuda()) { + allocator = at::cuda::getUnifiedDeviceAllocator(); + } + size_t size_bytes = detail::computeStorageNbytes( + self.sizes(), self.strides(), self.dtype().itemsize()); + auto storage = Storage( + Storage::use_byte_size_t(), + size_bytes, + allocator, + /*resizable=*/false); + auto tensor = at::empty({0}, self.options()) + .set_(storage, 0, self.sizes(), self.strides()); + tensor.copy_(self); + return tensor; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 25e4c2b44fa99..40c6d31288618 100644 --- 
a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -123,6 +124,9 @@ bool CUDAHooks::isPinnedPtr(void* data) const { return false; } #endif +#if defined(USE_ROCM) + return attr.isManaged == cudaMemoryTypeHost; +#endif #if defined(CUDA_VERSION) && CUDA_VERSION >= 10000 return attr.type == cudaMemoryTypeHost; #else @@ -130,6 +134,45 @@ bool CUDAHooks::isPinnedPtr(void* data) const { #endif } +bool CUDAHooks::isManagedPtr(void* data) const { + // First check if driver is broken/missing, in which case PyTorch CPU + // functionalities should still work, we should report `false` here. + if (!at::cuda::is_available()) { + return false; + } + // cudaPointerGetAttributes grabs context on the current device, so we set + // device to one that already has context, if exists. + at::OptionalDeviceGuard device_guard; + auto primary_ctx_device_index = getDeviceIndexWithPrimaryContext(); + if (primary_ctx_device_index.has_value()) { + device_guard.reset_device(at::Device(at::DeviceType::CUDA, *primary_ctx_device_index)); + } + cudaPointerAttributes attr; + cudaError_t err = cudaPointerGetAttributes(&attr, data); +#if !defined(USE_ROCM) + if (err == cudaErrorInvalidValue) { + cudaGetLastError(); + return false; + } + AT_CUDA_CHECK(err); +#else + // HIP throws hipErrorUnknown here + if (err != cudaSuccess) { + cudaGetLastError(); + return false; + } +#endif +// NB: Potential temporary hack until we see a change in setting the "isManaged" attribute like CUDA does +#if defined(USE_ROCM) + return attr.isManaged; +#endif +#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000 + return attr.type == cudaMemoryTypeManaged; +#else + return attr.memoryType == cudaMemoryTypeManaged; +#endif +} + bool CUDAHooks::hasCUDA() const { return at::cuda::is_available(); } @@ -249,6 +292,14 @@ Allocator* CUDAHooks::getCUDADeviceAllocator() const { return at::cuda::getCUDADeviceAllocator(); } +Allocator* CUDAHooks::getUnifiedDeviceAllocator() const { + return at::cuda::getUnifiedDeviceAllocator(); +} + +Allocator* CUDAHooks::getUnifiedDeviceAllocatorCpu() const { + return at::cuda::getUnifiedDeviceAllocatorCpu(); +} + bool CUDAHooks::compiledWithCuDNN() const { return AT_CUDNN_ENABLED(); } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index d53276ab3bbac..3dafc8395b7d9 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -24,6 +24,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { void initCUDA() const override; Device getDeviceFromPtr(void* data) const override; bool isPinnedPtr(void* data) const override; + bool isManagedPtr(void* data) const override; const Generator& getDefaultCUDAGenerator(DeviceIndex device_index = -1) const override; bool hasCUDA() const override; bool hasMAGMA() const override; @@ -35,6 +36,8 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasPrimaryContext(int64_t device_index) const override; Allocator* getCUDADeviceAllocator() const override; Allocator* getPinnedMemoryAllocator() const override; + Allocator* getUnifiedDeviceAllocator() const override; + Allocator* getUnifiedDeviceAllocatorCpu() const override; bool compiledWithCuDNN() const override; bool compiledWithMIOpen() const override; bool supportsDilatedConvolutionWithCuDNN() const override; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 7ba8f68d94b20..d96b59defa4c2 
100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -87,6 +87,10 @@ struct TORCH_API CUDAHooksInterface { return false; } + virtual bool isManagedPtr(void* data) const { + return false; + } + virtual bool hasCUDA() const { return false; } @@ -131,6 +135,14 @@ struct TORCH_API CUDAHooksInterface { TORCH_CHECK(false, "CUDADeviceAllocator requires CUDA. ", CUDA_HELP); } + virtual Allocator* getUnifiedDeviceAllocator() const { + TORCH_CHECK(false, "Unified Device Allocator requires CUDA. ", CUDA_HELP); + } + + virtual Allocator* getUnifiedDeviceAllocatorCpu() const { + TORCH_CHECK(false, "Unified Device Allocator requires CUDA. ", CUDA_HELP); + } + virtual bool compiledWithCuDNN() const { return false; } diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp index 2b66f08933934..e83b0bf060011 100644 --- a/aten/src/ATen/native/Memory.cpp +++ b/aten/src/ATen/native/Memory.cpp @@ -37,5 +37,16 @@ Tensor pin_memory(const Tensor& self, c10::optional device) { return at::_pin_memory(self, device); } +bool is_managed_default(const Tensor& self, c10::optional device) { + return false; } + +Tensor manage_memory(const Tensor& self, c10::optional device) { + if (self.is_managed(device)) { + return self; + } + return at::_manage_memory(self, device); } + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index abf94179f70a1..ce7531727ad1d 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -4199,6 +4199,20 @@ MPS: _pin_memory_mps autogen: _pin_memory.out +- func: is_managed(Tensor self, Device? device=None) -> bool + variants: method + dispatch: + CUDA: is_managed_cuda + CompositeExplicitAutograd: is_managed_default + +- func: manage_memory(Tensor(a) self, Device? device=None) -> Tensor(a) + variants: method + +# Unlike manage_memory, this is guaranteed to give a new non-aliasing tensor +- func: _manage_memory(Tensor self, Device? device=None) -> Tensor + dispatch: + CUDA: _manage_memory_cuda + - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor variants: function, method diff --git a/aten/src/ATen/quantized/Quantizer.cpp b/aten/src/ATen/quantized/Quantizer.cpp index 54fe7082c3e79..e2afa8181756c 100644 --- a/aten/src/ATen/quantized/Quantizer.cpp +++ b/aten/src/ATen/quantized/Quantizer.cpp @@ -115,7 +115,9 @@ inline Tensor new_qtensor( auto device = options.device(); at::Allocator* allocator = nullptr; // TODO: why isn't this just using GetAllocator - if (device.is_cuda()) { + if (at::globalContext().userEnabledUVM()) { + allocator = at::detail::getCUDAHooks().getUnifiedDeviceAllocator(); + } else if (device.is_cuda()) { allocator = at::detail::getCUDAHooks().getCUDADeviceAllocator(); } else if (device.is_cpu()) { allocator = at::getCPUAllocator(); diff --git a/aten/src/ATen/templates/RegisterBackendSelect.cpp b/aten/src/ATen/templates/RegisterBackendSelect.cpp index 6463701a4939f..8373849f044fb 100644 --- a/aten/src/ATen/templates/RegisterBackendSelect.cpp +++ b/aten/src/ATen/templates/RegisterBackendSelect.cpp @@ -13,6 +13,8 @@ #else #include #include +#include +#include ${ops_headers} #endif @@ -39,10 +41,23 @@ at::Tensor _pin_memory(const Tensor& self, c10::optional device) { return at::_ops::_pin_memory::redispatch(_dk, self, device); } +bool is_managed(const Tensor& self, c10::optional device) { + // TODO: fetch scalar type from Tensor? But it doesn't really matter... 
+ DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(c10::nullopt, self.layout(), device.value_or(at::kCUDA))); + return at::_ops::is_managed::redispatch(_dk, self, device); +} + +at::Tensor _manage_memory(const Tensor& self, c10::optional device) { + DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(c10::nullopt, self.layout(), device.value_or(at::kCUDA))); + return at::_ops::_manage_memory::redispatch(_dk, self, device); +} + TORCH_LIBRARY_IMPL(aten, BackendSelect, m) { ${backend_select_function_registrations}; m.impl(TORCH_SELECTIVE_NAME("aten::is_pinned"), TORCH_FN(is_pinned)); m.impl(TORCH_SELECTIVE_NAME("aten::_pin_memory"), TORCH_FN(_pin_memory)); + m.impl(TORCH_SELECTIVE_NAME("aten::is_managed"), TORCH_FN(is_managed)); + m.impl(TORCH_SELECTIVE_NAME("aten::_manage_memory"), TORCH_FN(_manage_memory)); } } // namespace diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index 432fe4f1e4b6c..0bbac3a6651b3 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -45,6 +45,10 @@ inline bool pinned_memory_or_default(c10::optional pinned_memory) { return pinned_memory.value_or(false); } +inline bool managed_memory_or_default(c10::optional managed_memory) { + return managed_memory.value_or(false); +} + /// A class to encapsulate construction axes of an Tensor. TensorOptions was /// designed to support the Python style API for specifying construction options /// on factory functions, e.g., @@ -136,11 +140,13 @@ struct C10_API TensorOptions { TensorOptions() : requires_grad_(false), pinned_memory_(false), + managed_memory_(false), has_device_(false), has_dtype_(false), has_layout_(false), has_requires_grad_(false), has_pinned_memory_(false), + has_managed_memory_(false), has_memory_format_(false) {} /// Constructs a `TensorOptions` object with the given layout. @@ -263,6 +269,14 @@ struct C10_API TensorOptions { return r; } + /// Sets the `managed_memory` property on the `TensorOptions`. + C10_NODISCARD TensorOptions + managed_memory(c10::optional managed_memory) const noexcept { + TensorOptions r = *this; + r.set_managed_memory(managed_memory); + return r; + } + /// Sets the `memory_format` property on `TensorOptions`. C10_NODISCARD TensorOptions memory_format(c10::optional memory_format) const noexcept { @@ -351,6 +365,14 @@ struct C10_API TensorOptions { return has_pinned_memory_; } + bool managed_memory() const noexcept { + return managed_memory_or_default(managed_memory_opt()); + } + + bool has_managed_memory() const noexcept { + return has_managed_memory_; + } + /// Returns if the layout is sparse bool is_sparse() const { return layout_ == c10::Layout::Sparse; @@ -373,6 +395,11 @@ struct C10_API TensorOptions { : c10::nullopt; } + c10::optional managed_memory_opt() const noexcept { + return has_managed_memory_ ? c10::make_optional(managed_memory_) + : c10::nullopt; + } + /// Returns whether the `memory_layout` is specified bool has_memory_format() const noexcept { return has_memory_format_; @@ -417,6 +444,8 @@ struct C10_API TensorOptions { merged.set_requires_grad(options.requires_grad_opt()); if (options.has_pinned_memory()) merged.set_pinned_memory(options.pinned_memory_opt()); + if (options.has_managed_memory()) + merged.set_managed_memory(options.managed_memory_opt()); if (options.has_memory_format()) merged.set_memory_format(options.memory_format_opt()); return merged; @@ -514,6 +543,16 @@ struct C10_API TensorOptions { } } + /// Mutably set the `managed_memory` property of `TensorOptions`. 
+ void set_managed_memory(c10::optional managed_memory) & noexcept { + if (managed_memory) { + managed_memory_ = *managed_memory; + has_managed_memory_ = true; + } else { + has_managed_memory_ = false; + } + } + /// Mutably set the `memory_Format` property of `TensorOptions`. void set_memory_format(c10::optional memory_format) & noexcept { if (memory_format) { @@ -546,12 +585,14 @@ struct C10_API TensorOptions { bool requires_grad_ : 1; bool pinned_memory_ : 1; + bool managed_memory_ : 1; bool has_device_ : 1; bool has_dtype_ : 1; bool has_layout_ : 1; bool has_requires_grad_ : 1; bool has_pinned_memory_ : 1; + bool has_managed_memory_ : 1; bool has_memory_format_ : 1; }; diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 38429d8e58c86..04bdaa79d202b 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -2726,6 +2726,9 @@ - name: _pin_memory(Tensor self, Device? device=None) -> Tensor self: grad +- name: _manage_memory(Tensor self, Device? device=None) -> Tensor + self: grad + - name: _new_zeros_with_same_feature_meta(Tensor self, Tensor other, *, int self_num_batch_dims=0) -> Tensor self: non_differentiable other: non_differentiable diff --git a/torch/csrc/StorageMethods.cpp b/torch/csrc/StorageMethods.cpp index 29f0f67ce6ecb..0537842c9c2c6 100644 --- a/torch/csrc/StorageMethods.cpp +++ b/torch/csrc/StorageMethods.cpp @@ -91,6 +91,19 @@ static PyObject* THPStorage_isPinned(PyObject* _self, PyObject* noargs) { END_HANDLE_TH_ERRORS } +static PyObject* THPStorage_isManaged(PyObject* _self, PyObject* noargs) { + HANDLE_TH_ERRORS + auto self = (THPStorage*)_self; +#if defined(USE_CUDA) + return PyBool_FromLong( + at::globalContext().isManagedPtr(self->cdata->data())); +#else + Py_RETURN_FALSE; +#endif + END_HANDLE_TH_ERRORS +} + + static PyObject* THPStorage_elementSize(PyObject* _self, PyObject* noargs) { HANDLE_TH_ERRORS return THPUtils_packInt64(sizeof(uint8_t)); @@ -502,6 +515,7 @@ static PyMethodDef THPStorage_methods[] = { {"nbytes", THPStorage_nbytes, METH_NOARGS, nullptr}, {"data_ptr", THPStorage_dataPtr, METH_NOARGS, nullptr}, {"is_pinned", THPStorage_isPinned, METH_NOARGS, nullptr}, + {"is_managed", THPStorage_isManaged, METH_NOARGS, nullptr}, {"_write_file", THPStorage_writeFile, METH_VARARGS, nullptr}, {"_new_with_file", THPStorage_newWithFile, diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index 468571fc454ee..b98d53ed68bb7 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -237,6 +238,22 @@ PyObject* THCPModule_cudaCachingAllocator_raw_alloc( END_HANDLE_TH_ERRORS } +PyObject * THCPModule_cudaUnifiedDeviceAllocator(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + c10::Allocator* allocator = at::cuda::getUnifiedDeviceAllocator(); + return PyLong_FromVoidPtr(allocator); + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_cudaUnifiedDeviceAllocatorCpu(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + c10::Allocator* allocator = at::cuda::getUnifiedDeviceAllocatorCpu(); + return PyLong_FromVoidPtr(allocator); + END_HANDLE_TH_ERRORS +} + // Unpack a PyObject to at::Scalar, throw an exception if it fails at::Scalar as_scalar(PyObject* arg) { // Zero-dim tensors are converted to Scalars as-is. 
Note this doesn't diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index 047e393b125aa..b4d084abb18b2 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -37,6 +37,15 @@ def _host_allocator(): _lazy_init() return torch._C._cuda_cudaHostAllocator() +def _manage_allocator(): + _lazy_init() + return torch._C._cuda_cudaUnifiedDeviceAllocator() + + +def _manage_cpu_allocator(): + _lazy_init() + return torch._C._cuda_cudaUnifiedDeviceAllocatorCpu() + @contextlib.contextmanager def _free_mutex(): diff --git a/torch/storage.py b/torch/storage.py index 38ec5238cd735..de39213f29c5b 100644 --- a/torch/storage.py +++ b/torch/storage.py @@ -193,6 +193,15 @@ def pin_memory(self): allocator = torch.cuda.memory._host_allocator() # type: ignore[attr-defined] return type(self)(self.size(), allocator=allocator).copy_(self) + def manage_memory(self): + """Copies the storage to manage memory, if it's not already manage.""" + import torch.cuda + if self.is_cuda: + allocator = torch.cuda._manage_allocator() # type: ignore[attr-defined] + else: + allocator = torch.cuda._manage_cpu_allocator() # type: ignore[attr-defined] + return type(self)(self.size(), allocator=allocator).copy_(self) + def share_memory_(self): """Moves the storage to shared memory. diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index 1272a2fe3d00d..0b59d7aba3ecd 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -4069,6 +4069,7 @@ ("cudaHostAlloc", ("hipHostMalloc", CONV_MEM, API_RUNTIME)), ("cudaMemoryTypeHost", ("hipMemoryTypeHost", CONV_MEM, API_RUNTIME)), ("cudaMemoryTypeDevice", ("hipMemoryTypeDevice", CONV_MEM, API_RUNTIME)), + ("cudaMemoryTypeManaged", ("hipMemoryTypeUnified", CONV_MEM, API_RUNTIME)), ("make_cudaExtent", ("make_hipExtent", CONV_MEM, API_RUNTIME)), ("make_cudaPitchedPtr", ("make_hipPitchedPtr", CONV_MEM, API_RUNTIME)), ("make_cudaPos", ("make_hipPos", CONV_MEM, API_RUNTIME)), From dcebdcaed52cea9379242400341a70803cdf1f1f Mon Sep 17 00:00:00 2001 From: Jason Furmanek Date: Tue, 30 Aug 2022 14:09:44 +0000 Subject: [PATCH 4/9] [UVM] Enable UVM stats module --- torch/csrc/cuda/Module.cpp | 59 +++++++++++++++++++++++++-- torch/cuda/memory.py | 81 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 3 deletions(-) diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index b98d53ed68bb7..b3d9daeb7e4cd 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -531,9 +531,61 @@ PyObject* THCPModule_memoryStats(PyObject* _unused, PyObject* arg) { END_HANDLE_TH_ERRORS } -PyObject* THCPModule_resetAccumulatedMemoryStats( - PyObject* _unused, - PyObject* arg) { +PyObject* THCPModule_managedMemoryStats(PyObject *_unused, PyObject *arg) +{ + HANDLE_TH_ERRORS + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to memory_allocated"); + const int device = (int) THPUtils_unpackLong(arg); + + using at::cuda::CachingManagedAllocator::StatType; + using at::cuda::CachingManagedAllocator::Stat; + using at::cuda::CachingManagedAllocator::StatArray; + using at::cuda::CachingManagedAllocator::DeviceStats; + + const auto statToDict = [](const Stat& stat) { + py::dict dict; + + dict["current"] = stat.current; + dict["peak"] = stat.peak; + dict["allocated"] = stat.allocated; + dict["freed"] = stat.freed; + return dict; + }; + + const auto statArrayToDict = [=](const StatArray& statArray) { + const std::array(StatType::NUM_TYPES)> statTypeNames = { + "all", 
"small_pool", "large_pool" + }; + py::dict dict; + for (const auto i : c10::irange(statTypeNames.size())) { + dict[statTypeNames[i]] = statToDict(statArray[i]); + } + return dict; + }; + + const DeviceStats stats = at::cuda::CachingManagedAllocator::getDeviceStats(); + + py::dict result; + result["num_alloc_retries"] = stats.num_alloc_retries; + result["num_ooms"] = stats.num_ooms; + result["max_split_size"] = stats.max_split_size; + result["allocation"] = statArrayToDict(stats.allocation); + result["segment"] = statArrayToDict(stats.segment); + result["active"] = statArrayToDict(stats.active); + result["inactive_split"] = statArrayToDict(stats.inactive_split); + result["allocated_bytes"] = statArrayToDict(stats.allocated_bytes); + result["reserved_bytes"] = statArrayToDict(stats.reserved_bytes); + result["active_bytes"] = statArrayToDict(stats.active_bytes); + result["inactive_split_bytes"] = statArrayToDict(stats.inactive_split_bytes); + result["oversize_allocations"] = statToDict(stats.oversize_allocations); + result["oversize_segments"] = statToDict(stats.oversize_segments); + + return result.release().ptr(); + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_resetAccumulatedMemoryStats(PyObject *_unused, PyObject *arg) +{ HANDLE_TH_ERRORS THPUtils_assert( THPUtils_checkLong(arg), @@ -1209,6 +1261,7 @@ static struct PyMethodDef _THCPModule_methods[] = { nullptr}, {"_cuda_emptyCache", THCPModule_emptyCache, METH_NOARGS, nullptr}, {"_cuda_memoryStats", THCPModule_memoryStats, METH_O, nullptr}, + {"_cuda_managedMemoryStats", THCPModule_managedMemoryStats, METH_O, nullptr}, {"_cuda_resetAccumulatedMemoryStats", THCPModule_resetAccumulatedMemoryStats, METH_O, diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index b4d084abb18b2..b43d6df340385 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -763,3 +763,84 @@ def set_enabled_move(enable): def get_enabled_move(): r"""Returns a bool indicating if Unified Virtual Memory is currently enabled.""" return torch._C._cuda_getEnabledMove() + +def managed_memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]: + r"""Returns a dictionary of Managed memory allocator statistics for a + given device. + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + Core statistics: + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + For these core statistics, values are broken down as follows. + Pool type: + - ``all``: combined statistics across all memory pools. 
+ - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + Metric type: + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + In addition to the core statistics, we also provide some simple event + counters: + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory framentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = managed_memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def managed_memory_stats_as_nested_dict(device: Union[Device, int] = None) -> Dict[str, Any]: + r"""Returns the result of :func:`~torch.cuda.managed_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_managedMemoryStats(device) From d1cfce1727beb7f23275d71ec308e7245ac1ef33 Mon Sep 17 00:00:00 2001 From: Jason Furmanek Date: Thu, 8 Sep 2022 02:56:30 +0000 Subject: [PATCH 5/9] [UVM] Add Move kernel and associated functions --- aten/src/ATen/EmptyTensor.cpp | 14 ++- aten/src/ATen/core/TensorBase.h | 4 + aten/src/ATen/cuda/EmptyTensor.cpp | 15 ++- aten/src/ATen/native/Copy.cpp | 36 ++++++ aten/src/ATen/native/Copy.h | 4 + aten/src/ATen/native/TensorConversions.cpp | 59 +++++++++- aten/src/ATen/native/cuda/Copy.cu | 106 ++++++++++++++++++ aten/src/ATen/native/cuda/TensorShapeCUDA.cpp | 9 +- aten/src/ATen/native/native_functions.yaml | 14 +++ c10/core/TensorImpl.h | 53 +++++++++ tools/autograd/derivatives.yaml | 7 ++ tools/autograd/gen_python_functions.py | 5 +- torch/csrc/autograd/FunctionsManual.cpp | 11 ++ torch/csrc/autograd/FunctionsManual.h | 4 + 14 files changed, 332 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 55cdc09268f05..d2b883caa0d64 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -10,10 +11,17 @@ namespace at { namespace 
detail { namespace { c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { - if (pin_memory) { - return at::detail::getCUDAHooks().getPinnedMemoryAllocator(); + if (at::globalContext().userEnabledUVM()) { + // When UVM is enabled, we only use a single allocator + // for all allocations. + return detail::getCUDAHooks().getUnifiedDeviceAllocatorCpu(); + } + else { + if (pin_memory) { + return at::detail::getCUDAHooks().getPinnedMemoryAllocator(); + } + return c10::GetCPUAllocator(); } - return c10::GetCPUAllocator(); } constexpr uint64_t storage_max() { diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 0ecd4456033b0..adbebc66404ac 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -400,6 +400,10 @@ class TORCH_API TensorBase { return impl_->is_cpu(); } + inline void _set_new_device(Device dst_device) const { + impl_->_set_new_device(dst_device); + } + /// Returns if a `Tensor` has CUDA backend. bool is_cuda() const { // NB: this is not a native function to avoid dispatching overhead. diff --git a/aten/src/ATen/cuda/EmptyTensor.cpp b/aten/src/ATen/cuda/EmptyTensor.cpp index d40a245857d3b..1107b480fb2ee 100644 --- a/aten/src/ATen/cuda/EmptyTensor.cpp +++ b/aten/src/ATen/cuda/EmptyTensor.cpp @@ -1,6 +1,7 @@ #define TORCH_ASSERT_NO_OPERATORS #include #include +#include #include namespace at { @@ -15,7 +16,12 @@ TensorBase empty_cuda( const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_cuda()); const DeviceGuard device_guard(device); - auto* allocator = at::cuda::getCUDADeviceAllocator(); + at::Allocator* allocator; + if (at::globalContext().userEnabledUVM()) { + allocator = at::cuda::getUnifiedDeviceAllocator(); + } else { + allocator = at::cuda::getCUDADeviceAllocator(); + } constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA); return at::detail::empty_generic( size, allocator, cuda_dks, dtype, memory_format_opt); @@ -55,7 +61,12 @@ TensorBase empty_strided_cuda( const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_cuda()); const DeviceGuard device_guard(device); - auto* allocator = at::cuda::getCUDADeviceAllocator(); + at::Allocator* allocator; + if (at::globalContext().userEnabledUVM()) { + allocator = at::cuda::getUnifiedDeviceAllocator(); + } else { + allocator = at::cuda::getCUDADeviceAllocator(); + } constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA); return at::detail::empty_strided_generic( size, stride, allocator, cuda_dks, dtype); diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index 0c99943eb0cb0..1697cfb570d03 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -345,7 +345,43 @@ void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src) { copy_stub(iter.device_type(), iter, /*non_blocking=*/false); } +static Tensor & move_impl(Tensor & self, c10::optional dst_device, bool non_blocking) { + // TODO: this should be handled during dispatch, but that's missing... 
+ TORCH_CHECK(self.defined(), "self is undefined"); + + // We'll create a tensor iterator with just an output + // Chagne the device up front so the iterator has + // the correct device + self._set_new_device(dst_device.value()); + + auto iter = TensorIteratorConfig() + .add_output(self) + .resize_outputs(false) + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .check_all_same_device(false) + .build(); + + move_stub(kCUDA, iter, dst_device, non_blocking); + return self; +} + +Tensor& move_(Tensor& self, c10::optional dst_device, bool non_blocking) { + // auto maybe_outnames = namedinference::compute_broadcast_outnames(self, src); + { + NoNamesGuard guard; //check if needed + if (self._is_zerotensor()) { + TORCH_CHECK(false, "ZeroTensors are immutable. Please materialize the tensor using `.clone()`, if you want a mutable zero tensor."); + } + move_impl(self, dst_device, non_blocking); + } + // namedinference::propagate_names_if_nonempty(self, maybe_outnames); + return self; +} + DEFINE_DISPATCH(copy_stub); +DEFINE_DISPATCH(move_stub); +REGISTER_NO_CPU_DISPATCH(move_stub); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/Copy.h b/aten/src/ATen/native/Copy.h index 14abb32fa5ad4..4ba4085b1ebb2 100644 --- a/aten/src/ATen/native/Copy.h +++ b/aten/src/ATen/native/Copy.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include namespace at { @@ -11,8 +13,10 @@ class TensorBase; namespace native { using copy_fn = void (*)(TensorIterator&, bool non_blocking); +using move_fn = void (*)(TensorIterator&, c10::optional dst_device, bool non_blocking); DECLARE_DISPATCH(copy_fn, copy_stub); +DECLARE_DISPATCH(move_fn, move_stub); TORCH_API void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src); diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index 01f651982633b..51b43ad8959d8 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -345,6 +345,22 @@ Tensor _to_copy( return r; } +Tensor _to_move( + const Tensor& self, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory, + bool non_blocking, + c10::optional optional_memory_format) { + TORCH_CHECK(!layout.has_value() || self.layout() == layout.value(), + "to(options) doesn't support converting to a different layout, " + "but got self.layout being ", self.layout(), + " and options.layout set as ", layout.value()); + + return self.move_(device, non_blocking); +} + template static inline bool is_null_or_equal_to(const c10::optional& test, const T& value) { if (!test.has_value()) { @@ -373,12 +389,43 @@ bool to_will_alias( self.suggest_memory_format() == memory_format); } +bool to_will_move( + const Tensor& self, + c10::optional dtype, + c10::optional layout, + c10::optional device, + bool copy, + c10::optional optional_memory_format) { + auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve); + + if (!globalContext().userEnabledMove()) { + return false; + } + + // Let copy kernel handle meta tensor targets + if (device.has_value() && (device.value().type() == DeviceType::Meta)) { + return false; + } + + // Now check that we can "move" instead of copy if UVM is + // enabled, we have a manged tensor and the dtype and layout + // are the same. Device also needs to be changing (good enough?) 
+ return is_null_or_equal_to(dtype, self.dtype().toScalarType()) && + is_null_or_equal_to(layout, self.layout()) && + (device != self.device()) && + !copy && globalContext().userEnabledUVM() && + self.is_managed() && + (memory_format == MemoryFormat::Preserve || + self.suggest_memory_format() == memory_format); +} + static inline Tensor to_impl( const Tensor& self, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, + c10::optional managed_memory, bool non_blocking, bool copy, c10::optional optional_memory_format) { @@ -387,6 +434,10 @@ static inline Tensor to_impl( if (to_will_alias(self, dtype, layout, device, copy, optional_memory_format)) { return self; } + if (to_will_move(self, dtype, layout, device, copy, optional_memory_format)) { + return at::_to_move( + self, dtype, layout, device, pin_memory, non_blocking, optional_memory_format); + } return at::_to_copy( self, dtype, layout, device, pin_memory, non_blocking, optional_memory_format); } @@ -408,7 +459,7 @@ Tensor _autocast_to_reduced_precision(const Tensor& self, bool cuda_enabled, boo TORCH_INTERNAL_ASSERT(target != at::ScalarType::Undefined, "_autocast_to_reduced_precision requires legit ScalarType argument for given device"); return to_impl( - self, target, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt); + self, target, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt); } else { return self; } @@ -422,7 +473,7 @@ Tensor _autocast_to_full_precision(const Tensor& self, bool cuda_enabled, bool c (self.device().is_cpu() && cpu_enabled)) ) { return to_impl( - self, at::ScalarType::Float, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt); + self, at::ScalarType::Float, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt); } else { return self; } @@ -444,6 +495,7 @@ Tensor to( layout, ensure_has_index(device), pin_memory, + nullopt, non_blocking, copy, optional_memory_format); @@ -456,6 +508,7 @@ Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking nullopt, ensure_has_index(device), nullopt, + nullopt, non_blocking, copy, optional_memory_format); @@ -468,6 +521,7 @@ Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking, bool copy, c1 nullopt, nullopt, nullopt, + nullopt, non_blocking, copy, optional_memory_format); @@ -481,6 +535,7 @@ Tensor to(const Tensor& self, const Tensor& other, bool non_blocking, bool copy, options.layout(), options.device(), options.pinned_memory(), + options.managed_memory(), non_blocking, copy, optional_memory_format); diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index 564ecf1c12913..b8e62b6a161c8 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ b/aten/src/ATen/native/cuda/Copy.cu @@ -281,7 +281,113 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) { } } +int64_t uvm_get_guard_index(Tensor& t) { + int cuda_device_index; + if (t.is_cpu()) { + // TODO: We can to do more here once we have + // indirect contexts in place + cuda_device_index = -1; + } else { + TORCH_CHECK(t.is_cuda()); + cuda_device_index = t.get_device(); + } + return cuda_device_index; +} + +void uvm_cuda_mem_advise_preferred_location(Tensor t) { + // Call cudaMemAdvise on a managed tensor + at::cuda::OptionalCUDAGuard device_guard; + int64_t cuda_device_index = uvm_get_guard_index(t); + int hint_device; + if (t.is_cpu()) { + hint_device = -1 /*cudaCpuDeviceId*/; + } else { + hint_device = 
static_cast(cuda_device_index); + } + + void* ptr = t.data_ptr(); + size_t size_bytes = at::detail::computeStorageNbytes( + t.sizes(), t.strides(), t.dtype().itemsize()); + + device_guard.set_index(cuda_device_index); + AT_CUDA_CHECK(cudaMemAdvise( + ptr, + size_bytes, + cudaMemAdviseSetPreferredLocation, + hint_device)); + return; +} + + +void uvm_cuda_mem_prefetch_async(Tensor t, CUDAStream stream) { + // Call cudaMemPrefetchAsync on Tensor + at::cuda::OptionalCUDAGuard device_guard; + //TORCH_CHECK(t.is_cuda() || (t.is_cpu() && device_t.has_value())); + + int prefetch_device = + (t.is_cpu()) ? -1/*cudaCpuDeviceId*/ : static_cast(t.get_device()); + + void* ptr = t.data_ptr(); + size_t size_bytes = at::detail::computeStorageNbytes( + t.sizes(), t.strides(), t.dtype().itemsize()); + + device_guard.set_index(t.get_device()); + + AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, prefetch_device, stream)); + + return; +} + +static void move_kernel_cuda(TensorIterator& iter, c10::optional destination, bool non_blocking) { + //AT_ASSERT(iter.ntensors() == 1); + + Device dst_device = destination.value(); + Device src_device = iter.device(0); + + // Copy between CPU and GPU + cuda::OptionalCUDAGuard device_guard; + cudaMemcpyKind kind; + if (dst_device.is_cuda() && src_device.is_cpu()) { + device_guard.set_device(dst_device); + kind = cudaMemcpyHostToDevice; + } else if (dst_device.is_cpu() && src_device.is_cuda()) { + device_guard.set_device(src_device); + kind = cudaMemcpyDeviceToHost; + } + + CUDAStream stream = getCurrentCUDAStream(); + + // At this point we are setting up a memcopy between a + // CPU and GPU of a contiguous, same-type tensor. Since this is + // purely for device transfer, when UVM is enabled, + // we can skip the explicit copy as we have managed memory. + + // We do, however, need to call _set_new_device() which will: + // - change the device of the tensor's storage object to destination device + // - change the device of the tensor to destination + // - Edit the Tensor's dispatchKeySet + // TODO: D2D may need 2way event barrier + iter.tensor(0)._set_new_device(dst_device); + + // set device hint + // TODO: Enable memadvise after tuning + //uvm_cuda_mem_advise_preferred_location(self); + + // request a prefetch to new device + uvm_cuda_mem_prefetch_async(iter.tensor(0), stream); + + // An explicit sync is always needed when copying back to CPU + if (kind == cudaMemcpyDeviceToHost) { + AT_CUDA_CHECK(cudaStreamSynchronize(stream)); + } + // TODO: Sync both directions for now + AT_CUDA_CHECK(cudaDeviceSynchronize()); + AT_CUDA_CHECK(cudaStreamSynchronize(stream)); + +} + REGISTER_DISPATCH(copy_stub, ©_kernel_cuda); +REGISTER_CUDA_DISPATCH(move_stub, &move_kernel_cuda); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp b/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp index 0bb7eb410acf3..25a36d1f25f9c 100644 --- a/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp +++ b/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp @@ -1,6 +1,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #include @@ -18,10 +19,16 @@ namespace native { // the same as at::cuda::getCUDADeviceAllocator(). 
Tensor& set_cuda_(Tensor& result) { caffe2::TypeMeta dtype = result.dtype(); + at::Allocator* allocator; + if (at::globalContext().userEnabledUVM()) { + allocator = at::cuda::getUnifiedDeviceAllocator(); + } else { + allocator = at::cuda::getCUDADeviceAllocator(); + } Storage storage( Storage::use_byte_size_t(), 0, - at::cuda::getCUDADeviceAllocator(), + allocator, true); result.set_(storage, 0, {0}, {}); TORCH_INTERNAL_ASSERT(dtype == result.dtype()); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index ce7531727ad1d..0bb28123168d9 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1626,6 +1626,14 @@ NestedTensorCPU, NestedTensorCUDA: copy_nested_ autogen: copy.out +- func: move_(Tensor(a!) self, Device? dst_device=None, bool non_blocking=False) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: move_ + autogen: move.functional, move.out + - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor dispatch: MPS: _copy_from_mps @@ -7062,6 +7070,12 @@ autogen: _to_copy.out tags: canonical +- func: _to_move(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: _to_move + # to(Device) must not exist because all constructors of Device also works for # TensorOptions. Otherwise, an ambiguity error is thrown. # See NOTE [ TensorOptions Constructors ]. diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 3a0cce80991f7..929488186e1ef 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -1270,6 +1270,59 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } public: + void _set_new_device(Device dst_device) { + // This should only be called from the + // move kernel which is used when UVM is enabled + // TODO: enforce this + + // 1) set the device in the storage object + storage_.data_ptr().unsafe_set_device(dst_device); + + // 2) set the local device_opt + device_opt_ = storage_.device(); + + // 3) update dispatch key set + auto ks_iter = key_set_.begin(); + DispatchKeySet new_ks; + if (dst_device.is_cuda()) + { + TORCH_CHECK(!(key_set_.has(DispatchKey::MkldnnCPU))) + for (ks_iter = key_set_.begin(); ks_iter != key_set_.end(); ++ks_iter) + { + if (*ks_iter == DispatchKey::CPU) + new_ks = new_ks.add(DispatchKey::CUDA); + else if (*ks_iter == DispatchKey::SparseCPU) + new_ks = new_ks.add(DispatchKey::SparseCUDA); + else if (*ks_iter == DispatchKey::SparseCsrCPU) + new_ks = new_ks.add(DispatchKey::SparseCsrCUDA); + else if (*ks_iter == DispatchKey::QuantizedCPU) + new_ks = new_ks.add(DispatchKey::QuantizedCUDA); + else + new_ks = new_ks.add(*ks_iter); + } + } else if (dst_device.is_cpu()) { + for (ks_iter = key_set_.begin(); ks_iter != key_set_.end(); ++ks_iter) + { + if (*ks_iter == DispatchKey::CUDA) + new_ks = new_ks.add(DispatchKey::CPU); + else if (*ks_iter == DispatchKey::SparseCUDA) + new_ks = new_ks.add(DispatchKey::SparseCPU); + else if (*ks_iter == DispatchKey::SparseCsrCUDA) + new_ks = new_ks.add(DispatchKey::SparseCsrCPU); + else if (*ks_iter == DispatchKey::QuantizedCUDA) + new_ks = new_ks.add(DispatchKey::QuantizedCPU); + else + new_ks = new_ks.add(*ks_iter); + } + } else { + for (ks_iter = key_set_.begin(); ks_iter != key_set_.end(); ++ks_iter) + { + 
new_ks = new_ks.add(*ks_iter); + } + } + key_set_ = new_ks; + } + Layout layout() const { if (C10_UNLIKELY(layout_policy_)) { return layout_custom(); diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 04bdaa79d202b..2d719b28afa73 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -420,6 +420,13 @@ # (If dtype IS nullopt, we rely on the regular check that any input requires grad). output_differentiability: ["!dtype || isDifferentiableType(*dtype)"] +- name: _to_move(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor + self: _to_move_backward(grad, self.options()) + result: _to_move(self_t, dtype, layout, device, pin_memory, non_blocking, memory_format) + # The condition is: if dtype is not nullopt, then isDifferentiableType(*dtype) + # (If dtype IS nullopt, we rely on the regular check that any input requires grad). + output_differentiability: ["!dtype || isDifferentiableType(*dtype)"] + - name: _coalesce(Tensor self) -> Tensor self: grad diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index ee06a8ed12384..388d0c37841c7 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -125,8 +125,10 @@ "to", "_to_copy", "_reshape_copy", + "_to_move", "copy_sparse_to_sparse_", - "copy_", + "copy_", + "move_", "numpy_T", "matrix_H", "mT", @@ -149,6 +151,7 @@ "_reshape_alias", "replace_", # only used by the functionalization pass, doesn't need to be exposed to python "copy", # only used by the functionalization pass + "move", # only used by the functionalization pass "fill.Tensor", # only used by the functionalization pass "fill.Scalar", # only used by the functionalization pass "lift.*", diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 30593b5c00723..26f6362352513 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -6674,6 +6674,17 @@ Tensor _to_copy_backward( return grad->to(self_options, /*non_blocking=*/false, /*copy=*/false); } +Tensor _to_move_backward(const Tensor &grad_, const c10::TensorOptions &self_options) { + // Handle R->C copies without raising a warning + const auto self_type = self_options.dtype().toScalarType(); + auto grad = c10::MaybeOwned::borrowed(grad_); + if (!c10::isComplexType(self_type) && grad->is_complex()) { + grad = c10::MaybeOwned::owned(at::real(grad_)); + } + + return grad->to(self_options, /*non_blocking=*/false, /*copy=*/false); +} + std::tuple index_reduce_backward( const Tensor& grad, const Tensor& self, diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index e1475ecb2b968..7112169af48c5 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -1016,6 +1016,10 @@ std::tuple scatter_reduce_backward( Tensor _to_copy_backward( const Tensor& grad, const c10::TensorOptions& self_options); + +Tensor _to_move_backward( + const Tensor &grad, + const c10::TensorOptions &self_options); std::tuple index_reduce_backward( const Tensor& grad, From 189be3669957ec2c97700f1a7b3849dcc167ead8 Mon Sep 17 00:00:00 2001 From: Douglas Lehr Date: Wed, 21 Dec 2022 21:13:43 +0000 Subject: [PATCH 6/9] Pytorch 2.0 updates for uvm --- aten/src/ATen/native/Memory.cpp | 1 + aten/src/ATen/native/native_functions.yaml | 2 +- 
torch/csrc/StorageMethods.cpp | 2 +- torchgen/native_function_generation.py | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp index e83b0bf060011..237df64a87bfe 100644 --- a/aten/src/ATen/native/Memory.cpp +++ b/aten/src/ATen/native/Memory.cpp @@ -8,6 +8,7 @@ #else #include #include +#include #include #include #endif diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 0bb28123168d9..1e1b071b3391c 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1632,7 +1632,7 @@ device_guard: False dispatch: CompositeExplicitAutograd: move_ - autogen: move.functional, move.out + autogen: move, move.out - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor dispatch: diff --git a/torch/csrc/StorageMethods.cpp b/torch/csrc/StorageMethods.cpp index 0537842c9c2c6..1b6078335c734 100644 --- a/torch/csrc/StorageMethods.cpp +++ b/torch/csrc/StorageMethods.cpp @@ -96,7 +96,7 @@ static PyObject* THPStorage_isManaged(PyObject* _self, PyObject* noargs) { auto self = (THPStorage*)_self; #if defined(USE_CUDA) return PyBool_FromLong( - at::globalContext().isManagedPtr(self->cdata->data())); + at::globalContext().isManagedPtr(self->cdata->data())); #else Py_RETURN_FALSE; #endif diff --git a/torchgen/native_function_generation.py b/torchgen/native_function_generation.py index f1ba555be62e6..dcca88405688b 100644 --- a/torchgen/native_function_generation.py +++ b/torchgen/native_function_generation.py @@ -64,6 +64,7 @@ "equal", # returns a boolean "is_coalesced", # returns an boolean "is_pinned", # returns a boolean + "is_managed", "is_same_size", # returns a boolean "is_set_to", # returns a boolean "q_per_channel_axis", # returns an int From 0a3a46362492b054a60921519af49333ee775453 Mon Sep 17 00:00:00 2001 From: Douglas Lehr Date: Tue, 24 Jan 2023 07:46:54 +0000 Subject: [PATCH 7/9] Add CUDAMallocManagedAllocator Backend With the new CUDAAllocator class, we have created a new CUDAMallocManagedAllocator, which will handle allocator requests from both cpu and cuda device types when the backend is enabled You can enable the backend using PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocManaged And view inside PyTorch using torch.cuda.get_allocator_backend() This allocator is initially rudimentary as the performance implications of a managed allocator are still being worked out. However, the goal is to be able to swap out the backend when running without any code change required. 
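
For illustration only (not part of this change): a minimal usage sketch, assuming
the backend string and the query call behave as described above, and that the
environment variable is set before the CUDA allocator is initialized:

    # Select the managed-memory backend before torch initializes CUDA.
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocManaged"

    import torch
    t = torch.ones(1024, device="cuda")          # serviced by cudaMallocManaged
    print(torch.cuda.get_allocator_backend())    # expected: "cudaMallocManaged"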
--- c10/cuda/CMakeLists.txt | 1 + c10/cuda/CUDACachingAllocator.cpp | 10 +- c10/cuda/CUDAMallocManagedAllocator.cpp | 553 ++++++++++++++++++++++++ 3 files changed, 563 insertions(+), 1 deletion(-) create mode 100644 c10/cuda/CUDAMallocManagedAllocator.cpp diff --git a/c10/cuda/CMakeLists.txt b/c10/cuda/CMakeLists.txt index 2c26bc06f6ca4..854ff699f2a52 100644 --- a/c10/cuda/CMakeLists.txt +++ b/c10/cuda/CMakeLists.txt @@ -25,6 +25,7 @@ set(C10_CUDA_SRCS CUDAException.cpp CUDAFunctions.cpp CUDAMallocAsyncAllocator.cpp + CUDAMallocManagedAllocator.cpp CUDAMiscFunctions.cpp CUDAStream.cpp impl/CUDAGuardImpl.cpp diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index aaa647502a897..c912ddf40b91a 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -628,7 +628,7 @@ size_t CachingAllocatorConfig::parseAllocatorConfig( consumeToken(config, ++i, ':'); if (++i < config.size()) { TORCH_CHECK( - ((config[i] == "native") || (config[i] == "cudaMallocAsync")), + ((config[i] == "native") || (config[i] == "cudaMallocAsync") || (config[i] == "cudaMallocManaged")), "Unknown allocator backend, " "options are native and cudaMallocAsync"); used_cudaMallocAsync = (config[i] == "cudaMallocAsync"); @@ -2336,6 +2336,12 @@ CUDAAllocator* allocator(); } // namespace CudaMallocAsync +namespace CudaMallocManaged { +// If this is put in its own header file, it gets incorrectly renamed in HIPify. +CUDAAllocator* allocator(); + +} // namespace CudaMallocManaged + struct BackendStaticInitializer { // Parses env for backend at load time, duplicating some logic from // CachingAllocatorConfig. CachingAllocatorConfig double-checks it later (at @@ -2361,6 +2367,8 @@ struct BackendStaticInitializer { if (kv[0] == "backend") { if (kv[1] == "cudaMallocAsync") return CudaMallocAsync::allocator(); + if (kv[1] == "cudaMallocManaged") + return CudaMallocManaged::allocator(); if (kv[1] == "native") return &Native::allocator; } diff --git a/c10/cuda/CUDAMallocManagedAllocator.cpp b/c10/cuda/CUDAMallocManagedAllocator.cpp new file mode 100644 index 0000000000000..fcba528ec2218 --- /dev/null +++ b/c10/cuda/CUDAMallocManagedAllocator.cpp @@ -0,0 +1,553 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace c10 { +namespace cuda { +namespace CUDACachingAllocator { +namespace CudaMallocManaged { + +// CUDA device allocator that uses CudaMallocManaged to implement +// the same interface as CUDACachingAllocator.cpp. 
+ +// Implementation details, not declared in CUDACachingAllocator.h +namespace { + +// General helpers + +struct UsageStream { + cudaStream_t stream; + int device; + UsageStream() {} + UsageStream(cudaStream_t s, int d) : stream(s), device(d) {} + UsageStream(const UsageStream& us) : stream(us.stream), device(us.device) {} + UsageStream(const UsageStream&& us) : stream(us.stream), device(us.device) {} + UsageStream& operator=(UsageStream other) { + stream = other.stream; + device = other.device; + return *this; + } +}; + +bool operator==(const UsageStream& lhs, const UsageStream& rhs) { + return (lhs.stream == rhs.stream) && (lhs.device == rhs.device); +} + +struct UsageStreamHash { + size_t operator()(const UsageStream& us) const noexcept { + return std::hash{}(us.stream) + size_t(us.device); + } +}; + +struct PtrUsage { + + uint64_t size; + PtrUsage(uint64_t s) : size(s) {} +}; + +int device_count = 0; +// these don't need to be c10::once_flags as in CUDAGeneratorImpl.cpp +// because they'll only be flipped by functions that have locked the mutex. +bool devs_initialized_flag; + +// Possible micro-optimization: +// Some accesses to ptr_info are read-only. +// We could let those be concurrent with a shared_mutex and +// have concurrent calls take a shared_lock. +// Keeping it simple with an ordinary mutex for now. +std::mutex general_mutex; + +using PtrInfo = ska::flat_hash_map; +PtrInfo ptr_info; + + +// These two help setMemoryFraction limit the amount of memory +// used by PyTorch in particular (as opposed to other libraries +// in the same process that might be sharing the same cudaMemPool_t). +size_t pytorch_used_bytes; +size_t pytorch_memory_limits; + +bool capture_underway = false; + +// Implementation functions + +// Assumes the caller holds general_mutex +inline void lazy_init_device(int device) { + if (!devs_initialized_flag) { + pytorch_used_bytes= 0; + pytorch_memory_limits = UINT64_MAX; + + devs_initialized_flag = true; + } +} + + +void free(void* ptr) { + std::lock_guard lk(general_mutex); + + auto err = cudaGetLastError(); + C10_CUDA_CHECK(err); + auto it = ptr_info.find(ptr); + + TORCH_INTERNAL_ASSERT(it != ptr_info.end(), "ptr not found in ptr_info"); + + C10_CUDA_CHECK(cudaFree(it->first)); + + pytorch_used_bytes -= it->second.size; + + ptr_info.erase(it); +} + +// Symmetric with NativeCachingAllocator::malloc for now, +// although I don't think we absolutely need the symmetry. +void mallocManaged(void** devPtr, int device, size_t size, cudaStream_t stream) { + TORCH_INTERNAL_ASSERT( + 0 <= device && device < device_count, + "Invalid device index ", + device, + ": did you call init?"); + + CUDAGuard g(device); + + std::lock_guard lk(general_mutex); + + lazy_init_device(device); + + // Defensively checks for preexisting CUDA error state. + auto err = cudaGetLastError(); + C10_CUDA_CHECK(err); + + if (pytorch_used_bytes + size > pytorch_memory_limits) { + err = cudaErrorMemoryAllocation; + } else { + err = cudaMallocManaged(devPtr, size); + } + + if (err == cudaErrorMemoryAllocation) { + // Clears CUDA's internal error state so the user, if desired, can catch the + // OOM exception, free some stuff on the script side, and retry the + // allocation. This aligns with the behavior of alloc_block in + // CUDACachingAllocator.cpp. + cudaGetLastError(); + size_t device_free; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + TORCH_CHECK_WITH( + OutOfMemoryError, + false, + "Allocation on device ", + device, + " would exceed allowed memory. 
(out of memory)", + "\nCurrently allocated : ", + format_size(pytorch_used_bytes), + "\nRequested : ", + format_size(size), + "\nDevice limit : ", + format_size(device_total), + "\nFree (according to CUDA): ", + format_size(device_free), + "\nPyTorch limit (set by user-supplied memory fraction)" + "\n : ", + format_size(pytorch_memory_limits)); + } else { + C10_CUDA_CHECK(err); + } +auto inserted = ptr_info.emplace(*devPtr, PtrUsage(size)); + TORCH_INTERNAL_ASSERT( + inserted.second, + "address returned by cudaMallocManaged already exists " + "in ptr_info"); + pytorch_used_bytes += size; +} + +} // anonymous namespace + +void local_raw_delete(void* ptr); + +// Same pattern as CUDACachingAllocator.cpp. +struct CudaMallocManagedAllocator : public CUDAAllocator { + DataPtr allocate(size_t size) const override { + constexpr size_t one_exa_bytes = 1152921504606846976ULL; + TORCH_CHECK_WITH( + OutOfMemoryError, + size < one_exa_bytes, + "CUDA out of memory. Tried to allocate more than 1EB memory."); + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + if (size != 0) { + mallocManaged(&r, device, size, cuda::getCurrentCUDAStream(device)); + } + return {r, r, &local_raw_delete, Device(DeviceType::CUDA, device)}; + } + DeleterFnPtr raw_deleter() const override { + return &local_raw_delete; + } + + // This function should not issue any context-creating calls, + // just set up for later calls to init per-device pools based + // on the current device each later call sees. + void init(int dev_count) override { + static bool called = [](int dev_count) { + ; + // Are there external guarantees init will be called before + // any of the allocator's other functions? + // std::lock_guard lk(general_mutex); + device_count = dev_count; + return true; + }(dev_count); + (void)called; + } + + bool initialized() override { + return devs_initialized_flag; + } + + static inline void assertValidDevice(int device) { + TORCH_CHECK( + 0 <= device && device < device_count, "Invalid device argument."); + } + + void setMemoryFraction(double fraction, int device) override { + TORCH_INTERNAL_ASSERT( + 0 <= fraction && fraction <= 1, + "invalid fraction:", + fraction, + ". Please set within (0, 1)."); + + std::lock_guard lk(general_mutex); + assertValidDevice(device); + CUDAGuard g(device); + // Should setMemoryFraction be allowed to trigger a full device context and + // pool-creating lazy_init_device, or should we simply assert this device is + // already initialized, ie + // TORCH_CHECK(devs_initialized_flags[device], ...)? + lazy_init_device(device); + + size_t device_free; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + pytorch_memory_limits = + static_cast(fraction * device_total); + + // Alternative: Instead of a manual hard limit, we could use + // cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, + // &threshold); This is a soft hint: The driver allows the pool's reserved + // memory to spike above threshold in regions of high cudaMallocAsync + // demand, but opportunistically trims reserved memory back to threshold + // when the memory in use is < threshold. I don't like this because it + // introduces performance nondeterminism. 
+ } + + void emptyCache(void) override { + std::lock_guard lk(general_mutex); + + for (int dev = 0; dev < device_count; dev++) { + if (devs_initialized_flag) { + CUDAGuard g(dev); + + cudaMemPool_t mempool; + cudaDeviceGetDefaultMemPool(&mempool, dev); + cudaDeviceSynchronize(); + cudaMemPoolTrimTo(mempool, 0); + } + } + } + + void cacheInfo(int device, size_t* maxWorkspaceGuess) override { + // The only consumer of cacheInfo is getMaxWorkspaceSize in Conv_v7.cpp. + // Afaict, the role of cacheInfo is to give getMaxWorkspaceSize a reasonable + // maximum workspace size to use for an upcoming cudnnFind call. + // + // The native allocator's cacheInfo chooses to return the size of its + // largest unused block (which is the largest allocation the native + // allocator can service immediately and asynchronously without a + // cudaMalloc. + // + // Here, we use a different heuristic: figure out the max usable workspace + // size with a bit of educated trial and error. It's ok to be + // perf-inefficient because cacheInfo is a prelude to cudnnFind. + // + // The algo cache then stores the best-performing algo with workspace <= + // maxWorkspaceGuess. Later calls with the same param set hit in cache and + // try to allocate the same workspace. If, in one of those future calls, + // workspace allocation fails (ie because less ambient memory is available), + // the bindings rerun cudnnFind, including calling cacheInfo again + // beforehand to estimate a new (smaller) largest-available workspace. Over + // a few such calls, the cache should settle to the algo with a workspace + // size that's small enough to succeed every time (for that param set). + // + // So the strategy here is to return a rough, largeish guess and let the + // bindings retry to trim as needed over time. + + std::lock_guard lk(general_mutex); + assertValidDevice(device); + CUDAGuard g(device); + lazy_init_device(device); + + size_t free_upper_bound; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&free_upper_bound, &device_total)); + TORCH_INTERNAL_ASSERT( + free_upper_bound + pytorch_used_bytes <= device_total); + size_t guess = std::min( + free_upper_bound, + pytorch_memory_limits - pytorch_used_bytes); + auto stream = c10::cuda::getCurrentCUDAStream(); + void* dummy; + + // Defensively checks for preexisting CUDA error state. + auto err = cudaGetLastError(); + C10_CUDA_CHECK(err); + + while (true) { + // Duplicates some logic from mallocAsync to work with the error state + // directly instead of repeatedly catching an exception thrown by + // mallocAsync. + if (pytorch_used_bytes + guess > pytorch_memory_limits) { + err = cudaErrorMemoryAllocation; + } else { + err = cudaMallocManaged(&dummy, guess); + } + + if (err == cudaSuccess) { + cudaFree(dummy); + *maxWorkspaceGuess = guess; + return; + } else if (err == cudaErrorMemoryAllocation) { + cudaGetLastError(); // clear CUDA error + guess >>= 1; // quick and dirty: try half the size next iteration + } else { + C10_CUDA_CHECK(err); + } + } + } + + void* getBaseAllocation(void* ptr, size_t* size) override { + std::lock_guard lk(general_mutex); + + auto it = ptr_info.find(ptr); + TORCH_INTERNAL_ASSERT(it != ptr_info.end(), "ptr not found in ptr_info"); + + if (size) { + *size = it->second.size; + } + + return ptr; + } + + void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support recordStream. 
" + "If you need it, please file an issue describing your use case."); + } + + std::shared_ptr getIpcDevPtr(std::string handle) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support getIpcDevPtr. " + "If you need it, please file an issue describing your use case."); + } + + void recordHistory( + bool enabled, + CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + bool alloc_trace_record_context) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support recordHistory. " + "If you need it, please file an issue describing your use case."); + } + + void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support attachOutOfMemoryObserver. " + "If you need it, please file an issue describing your use case."); + } + + // Collects stats for device. + // If device hasn't been used yet, returns 0s without creating a context. + DeviceStats getDeviceStats(int device) override { + assertValidDevice(device); + + // Memory currently reserved by the mempool + uint64_t reserved_mem_current = 0; + // High-water mark of memory reserved by the mempool since last reset + uint64_t reserved_mem_peak = 0; + // Memory currently in use by the mempool + uint64_t used_mem_current = 0; + // High-water mark of memory + uint64_t used_mem_peak = 0; + + std::lock_guard lk(general_mutex); + + if (devs_initialized_flag) { + CUDAGuard g(device); + + cudaMemPool_t mempool; + C10_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, device)); + C10_CUDA_CHECK(cudaMemPoolGetAttribute( + mempool, cudaMemPoolAttrReservedMemCurrent, &reserved_mem_current)); + + C10_CUDA_CHECK(cudaMemPoolGetAttribute( + mempool, cudaMemPoolAttrReservedMemHigh, &reserved_mem_peak)); + + C10_CUDA_CHECK(cudaMemPoolGetAttribute( + mempool, cudaMemPoolAttrUsedMemCurrent, &used_mem_current)); + + C10_CUDA_CHECK(cudaMemPoolGetAttribute( + mempool, cudaMemPoolAttrUsedMemHigh, &used_mem_peak)); + } + + // Many stat types are specific to the native allocator. We leave these + // untouched. Their "struct Stat"s will contain zeroed values. + DeviceStats stats; + + // In the native allocator: + // allocated_bytes is the total bytes of blocks that have been malloc()ed + // and not yet free()d. + // active_bytes is the total bytes of blocks that have been malloc()ed but + // not yet released back into a free pool. In other words, it includes all + // allocated_bytes, as well as the bytes of "limbo state" blocks had have + // already been free()ed but not yet free_block()ed back into a pool due to + // outstanding stream_uses. + // + // Here, in the CudaMallocManaged allocator: + // We simply ask the driver's opinion about active memory. + // We don't bother distinguishing between allocated_bytes and active_bytes. 
+ stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current = + used_mem_current; + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].peak = + used_mem_peak; + stats.active_bytes[static_cast(StatType::AGGREGATE)].current = + used_mem_current; + stats.active_bytes[static_cast(StatType::AGGREGATE)].peak = + used_mem_peak; + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current = + reserved_mem_current; + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].peak = + reserved_mem_peak; + + return stats; + } + + void resetAccumulatedStats(int device) override { + assertValidDevice(device); + TORCH_WARN_ONCE( + "For backend:cudaMallocManaged, resetAccumulatedStats has no effect."); + } + + void resetPeakStats(int device) override { + assertValidDevice(device); + + CUDAGuard g(device); + cudaMemPool_t mempool; + C10_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, device)); + // Using zero as the reset value is the method recommended by Cuda driver + // team. Vivek Kini says: + // "Resetting to zero (which is the only valid value when setting + // ReservedMemHigh) resets it to ReservedMemCurrent inside the driver + // (same goes for UsedMemHigh/UsedMemCurrent)" + uint64_t zero = 0; + C10_CUDA_CHECK(cudaMemPoolSetAttribute( + mempool, cudaMemPoolAttrReservedMemHigh, &zero)); + C10_CUDA_CHECK( + cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrUsedMemHigh, &zero)); + } + + SnapshotInfo snapshot() override { + TORCH_CHECK( + false, + "Calling snapshot with backend:cudaMallocManaged is not supported. "); + // Alternative: TORCH_WARN + return {}; + } + + // CUDAGraph interactions + void notifyCaptureBegin( + int device, + CaptureId_t graph_id, + MempoolId_t mempool_id) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support notifyCaptureBegin. " + "If you need it, please file an issue describing your use case."); + } + + void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support notifyCaptureAboutToEnd. " + "If you need it, please file an issue describing your use case."); + } + + void notifyCaptureEnded(int device, CaptureId_t graph_id) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support notifyCaptureEnded. " + "If you need it, please file an issue describing your use case."); + } + + void notifyCaptureDestroy(int device, MempoolId_t mempool_id) override { + TORCH_CHECK( + false, + "cudaMallocManaged does not yet support notifyCaptureDestroy. 
" + "If you need it, please file an issue describing your use case."); + } + + void* raw_alloc(size_t nbytes) override { + if (nbytes == 0) { + return nullptr; + } + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + mallocManaged(&r, device, nbytes, cuda::getCurrentCUDAStream(device)); + return r; + } + + void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) override { + if (nbytes == 0) { + return nullptr; + } + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + mallocManaged(&r, device, nbytes, stream); + return r; + } + void raw_delete(void* ptr) override { + free(ptr); + } + bool needsPoolSpecificPeerAccess() override { + return true; + } + std::string name() override { + return "CudaMallocManaged"; + } +}; + +CudaMallocManagedAllocator device_allocator; + +void local_raw_delete(void* ptr) { + free(ptr); +} +CUDAAllocator* allocator() { + return &device_allocator; +} + + +} // namespace CudaMallocManaged +} // namespace CUDACachingAllocator +} // namespace cuda +} // namespace c10 From ad82c20351f14cf283c0f20edad65108ec7511eb Mon Sep 17 00:00:00 2001 From: Douglas Lehr Date: Tue, 7 Feb 2023 16:24:00 +0000 Subject: [PATCH 8/9] Clean up front end to use default DeviceAllocator --- aten/src/ATen/EmptyTensor.cpp | 2 +- aten/src/ATen/cuda/EmptyTensor.cpp | 14 +++------- aten/src/ATen/cuda/UvmMemoryAllocator.cpp | 8 ++---- aten/src/ATen/cuda/detail/CUDAHooks.cpp | 8 ------ aten/src/ATen/cuda/detail/CUDAHooks.h | 2 -- aten/src/ATen/detail/CUDAHooksInterface.h | 8 ------ torch/utils/hipify/cuda_to_hip_mappings.py | 30 ++++++++++++++++++++++ 7 files changed, 37 insertions(+), 35 deletions(-) diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index d2b883caa0d64..236aad5f725fa 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -14,7 +14,7 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { if (at::globalContext().userEnabledUVM()) { // When UVM is enabled, we only use a single allocator // for all allocations. 
- return detail::getCUDAHooks().getUnifiedDeviceAllocatorCpu(); + return detail::getCUDAHooks().getCUDADeviceAllocator(); } else { if (pin_memory) { diff --git a/aten/src/ATen/cuda/EmptyTensor.cpp b/aten/src/ATen/cuda/EmptyTensor.cpp index 1107b480fb2ee..2c7d2222108fd 100644 --- a/aten/src/ATen/cuda/EmptyTensor.cpp +++ b/aten/src/ATen/cuda/EmptyTensor.cpp @@ -17,11 +17,8 @@ TensorBase empty_cuda( TORCH_INTERNAL_ASSERT(device.is_cuda()); const DeviceGuard device_guard(device); at::Allocator* allocator; - if (at::globalContext().userEnabledUVM()) { - allocator = at::cuda::getUnifiedDeviceAllocator(); - } else { - allocator = at::cuda::getCUDADeviceAllocator(); - } + allocator = at::cuda::getCUDADeviceAllocator(); + constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA); return at::detail::empty_generic( size, allocator, cuda_dks, dtype, memory_format_opt); @@ -62,11 +59,8 @@ TensorBase empty_strided_cuda( TORCH_INTERNAL_ASSERT(device.is_cuda()); const DeviceGuard device_guard(device); at::Allocator* allocator; - if (at::globalContext().userEnabledUVM()) { - allocator = at::cuda::getUnifiedDeviceAllocator(); - } else { - allocator = at::cuda::getCUDADeviceAllocator(); - } + allocator = at::cuda::getCUDADeviceAllocator(); + constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA); return at::detail::empty_strided_generic( size, stride, allocator, cuda_dks, dtype); diff --git a/aten/src/ATen/cuda/UvmMemoryAllocator.cpp b/aten/src/ATen/cuda/UvmMemoryAllocator.cpp index 199046b75faed..6243e4c285b72 100644 --- a/aten/src/ATen/cuda/UvmMemoryAllocator.cpp +++ b/aten/src/ATen/cuda/UvmMemoryAllocator.cpp @@ -21,12 +21,8 @@ bool is_managed_cuda(const Tensor& self, c10::optional device) { Tensor _manage_memory_cuda(const Tensor& self, c10::optional device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( (!device.has_value() && device->is_cpu()) || device->is_cuda()); - at::Allocator* allocator = nullptr; - if (self.is_cpu()) { - allocator = at::cuda::getUnifiedDeviceAllocatorCpu(); - } else if (self.is_cuda()) { - allocator = at::cuda::getUnifiedDeviceAllocator(); - } + at::Allocator* allocator = at::detail::getCUDAHooks().getCUDADeviceAllocator(); + size_t size_bytes = detail::computeStorageNbytes( self.sizes(), self.strides(), self.dtype().itemsize()); auto storage = Storage( diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 40c6d31288618..71330af174886 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -292,14 +292,6 @@ Allocator* CUDAHooks::getCUDADeviceAllocator() const { return at::cuda::getCUDADeviceAllocator(); } -Allocator* CUDAHooks::getUnifiedDeviceAllocator() const { - return at::cuda::getUnifiedDeviceAllocator(); -} - -Allocator* CUDAHooks::getUnifiedDeviceAllocatorCpu() const { - return at::cuda::getUnifiedDeviceAllocatorCpu(); -} - bool CUDAHooks::compiledWithCuDNN() const { return AT_CUDNN_ENABLED(); } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 3dafc8395b7d9..71704d26b9a12 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -36,8 +36,6 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasPrimaryContext(int64_t device_index) const override; Allocator* getCUDADeviceAllocator() const override; Allocator* getPinnedMemoryAllocator() const override; - Allocator* getUnifiedDeviceAllocator() const override; - Allocator* getUnifiedDeviceAllocatorCpu() const override; bool 
compiledWithCuDNN() const override; bool compiledWithMIOpen() const override; bool supportsDilatedConvolutionWithCuDNN() const override; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index d96b59defa4c2..ff8fed50f57fe 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -135,14 +135,6 @@ struct TORCH_API CUDAHooksInterface { TORCH_CHECK(false, "CUDADeviceAllocator requires CUDA. ", CUDA_HELP); } - virtual Allocator* getUnifiedDeviceAllocator() const { - TORCH_CHECK(false, "Unified Device Allocator requires CUDA. ", CUDA_HELP); - } - - virtual Allocator* getUnifiedDeviceAllocatorCpu() const { - TORCH_CHECK(false, "Unified Device Allocator requires CUDA. ", CUDA_HELP); - } - virtual bool compiledWithCuDNN() const { return false; } diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index 0b59d7aba3ecd..82b0d7d83bd1a 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -4061,6 +4061,36 @@ ("hipMallocMipmappedArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), ), ("cudaMallocPitch", ("hipMallocPitch", CONV_MEM, API_RUNTIME)), + ("cudaMemPool_t", ("hipMemPool_t", CONV_MEM, API_RUNTIME)), + ("cudaMemPoolTrimTo", ("hipMemPoolTrimTo", CONV_MEM, API_RUNTIME)), + ( + "cudaDeviceGetDefaultMemPool", + ("hipDeviceGetDefaultMemPool", CONV_MEM, API_RUNTIME) + ), + ( + "cudaMemPoolGetAttribute", + ("hipMemPoolGetAttribute", CONV_MEM, API_RUNTIME) + ), + ( + "cudaMemPoolAttrUsedMemHigh", + ("hipMemPoolAttrUsedMemHigh", CONV_MEM, API_RUNTIME) + ), + ( + "cudaMemPoolSetAttribute", + ("hipMemPoolSetAttribute", CONV_MEM, API_RUNTIME) + ), + ( + "cudaMemPoolAttrReservedMemHigh", + ("hipMemPoolAttrReservedMemHigh", CONV_MEM, API_RUNTIME) + ), + ( + "cudaMemPoolAttrReservedMemCurrent", + ("hipMemPoolAttrReservedMemCurrent", CONV_MEM, API_RUNTIME) + ), + ( + "cudaMemPoolAttrUsedMemCurrent", + ("hipMemPoolAttrUsedMemCurrent", CONV_MEM, API_RUNTIME) + ), ("cudaFreeHost", ("hipHostFree", CONV_MEM, API_RUNTIME)), ("cudaFreeArray", ("hipFreeArray", CONV_MEM, API_RUNTIME)), ("cudaFree", ("hipFree", CONV_MEM, API_RUNTIME)), From 12328b5cb5a7d607210f366cc828d5836dad05d4 Mon Sep 17 00:00:00 2001 From: Jason Furmanek Date: Tue, 21 Feb 2023 16:17:46 +0000 Subject: [PATCH 9/9] Cleanup commit --- aten/src/ATen/cuda/EmptyTensor.cpp | 7 ++----- aten/src/ATen/cuda/detail/CUDAHooks.cpp | 1 - 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/cuda/EmptyTensor.cpp b/aten/src/ATen/cuda/EmptyTensor.cpp index 2c7d2222108fd..96d0734bc5722 100644 --- a/aten/src/ATen/cuda/EmptyTensor.cpp +++ b/aten/src/ATen/cuda/EmptyTensor.cpp @@ -1,7 +1,6 @@ #define TORCH_ASSERT_NO_OPERATORS #include #include -#include #include namespace at { @@ -16,8 +15,7 @@ TensorBase empty_cuda( const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_cuda()); const DeviceGuard device_guard(device); - at::Allocator* allocator; - allocator = at::cuda::getCUDADeviceAllocator(); + auto* allocator = at::cuda::getCUDADeviceAllocator(); constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA); return at::detail::empty_generic( @@ -58,8 +56,7 @@ TensorBase empty_strided_cuda( const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_cuda()); const DeviceGuard device_guard(device); - at::Allocator* allocator; - allocator = at::cuda::getCUDADeviceAllocator(); + auto* allocator = 
at::cuda::getCUDADeviceAllocator(); constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA); return at::detail::empty_strided_generic( diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 71330af174886..b264795bb9835 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include
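
For reference, a minimal standalone sketch of the default-mempool bookkeeping that
CachingManagedAllocator::getDeviceStats() reads. The attribute names and the
uint64_t value type mirror the allocator code above; the file name, error handling,
and printing are illustrative only. With the cuda_to_hip_mappings.py entries added
earlier in this series, hipify rewrites the cudaMemPool* identifiers to their
hipMemPool* equivalents for ROCm builds.

    // query_mempool_stats.cpp - build with: nvcc query_mempool_stats.cpp
    #include <cuda_runtime.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      int device = 0;
      if (cudaSetDevice(device) != cudaSuccess) {
        std::fprintf(stderr, "no CUDA device available\n");
        return 1;
      }

      cudaMemPool_t mempool;
      if (cudaDeviceGetDefaultMemPool(&mempool, device) != cudaSuccess) {
        std::fprintf(stderr, "default memory pool not supported on this device\n");
        return 1;
      }

      // The same four attributes getDeviceStats() maps into DeviceStats.
      uint64_t reserved_current = 0, reserved_peak = 0;
      uint64_t used_current = 0, used_peak = 0;
      cudaMemPoolGetAttribute(mempool, cudaMemPoolAttrReservedMemCurrent, &reserved_current);
      cudaMemPoolGetAttribute(mempool, cudaMemPoolAttrReservedMemHigh, &reserved_peak);
      cudaMemPoolGetAttribute(mempool, cudaMemPoolAttrUsedMemCurrent, &used_current);
      cudaMemPoolGetAttribute(mempool, cudaMemPoolAttrUsedMemHigh, &used_peak);

      std::printf("reserved %llu bytes (peak %llu), used %llu bytes (peak %llu)\n",
                  (unsigned long long)reserved_current, (unsigned long long)reserved_peak,
                  (unsigned long long)used_current, (unsigned long long)used_peak);
      return 0;
    }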