Commit fc62b41

Changes: Torch > 1.12.0 compat for __torch_function__ as a classmethod
Adds: pin_memory_device as a field for FakeLoader/DataLoader
Adds: foreach param unit tests for optimizers for torch version >= 1.12
Adds: nightly torch comment option for docker-compose.yml
1 parent 26f1232 commit fc62b41

9 files changed, +147 -77 lines

docker-compose.yml (+1)

@@ -13,6 +13,7 @@ services:
     volumes:
       - .:/data/
     environment:
+      # - LIB_INSTALL_TYPE=.[dev] && pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu --upgrade #optionally change this locally to .[dev] and install nighty torch
       - LIB_INSTALL_TYPE=. #optionally change this locally to .[dev] to install dev packages as well

   notebook:

fastai/data/load.py (+7 -5)

@@ -28,9 +28,9 @@ def _fn_noops(self, x=None, *args, **kwargs): return x
     _index_sampler,generator,prefetch_factor = Inf.count,None,2
     dataset_kind = _dataset_kind = _DatasetKind.Iterable

-    def __init__(self, d, pin_memory, num_workers, timeout, persistent_workers):
-        self.dataset,self.default,self.worker_init_fn = self,d,_wif
-        store_attr('d,pin_memory,num_workers,timeout,persistent_workers')
+    def __init__(self, d, pin_memory, num_workers, timeout, persistent_workers,pin_memory_device):
+        self.dataset,self.default,self.worker_init_fn,self.pin_memory_device = self,d,_wif,pin_memory_device
+        store_attr('d,pin_memory,num_workers,timeout,persistent_workers,pin_memory_device')

     def __iter__(self): return iter(self.d.create_batches(self.d.sample()))

@@ -92,7 +92,8 @@ class DataLoader(GetAttr):
         get_idxs sample shuffle_fn do_batch create_batch'.split()
     _default = 'dataset'
     def __init__(self, dataset=None, bs=None, num_workers=0, pin_memory=False, timeout=0, batch_size=None,
-                 shuffle=False, drop_last=False, indexed=None, n=None, device=None, persistent_workers=False, **kwargs):
+                 shuffle=False, drop_last=False, indexed=None, n=None, device=None, persistent_workers=False,
+                 pin_memory_device='', **kwargs):
         if batch_size is not None: bs = batch_size # PyTorch compatibility
         assert not (bs is None and drop_last)
         if indexed is None: indexed = (hasattr(dataset,'__getitem__')
@@ -107,7 +108,8 @@ def __init__(self, dataset=None, bs=None, num_workers=0, pin_memory=False, timeo
             print("Due to IPython and Windows limitation, python multiprocessing isn't available now.")
             print("So `number_workers` is changed to 0 to avoid getting stuck")
             num_workers = 0
-        self.fake_l = _FakeLoader(self, pin_memory, num_workers, timeout, persistent_workers=persistent_workers)
+        self.fake_l = _FakeLoader(self, pin_memory, num_workers, timeout, persistent_workers=persistent_workers,
+                                  pin_memory_device=pin_memory_device)

     def __len__(self):
         if self.n is None: raise TypeError
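
For reference, pin_memory_device mirrors the argument of the same name that torch.utils.data.DataLoader gained in PyTorch 1.12. A minimal sketch of passing it through fastai's DataLoader after this change (assumes PyTorch >= 1.12 and a CUDA device; the toy list dataset is illustrative only):

from fastai.data.load import DataLoader

# Pinned host buffers are associated with cuda:0; pinning only happens while iterating.
dl = DataLoader(list(range(64)), bs=16, pin_memory=True, pin_memory_device='cuda:0')
for b in dl:
    pass  # each batch comes back in host memory pinned for cuda:0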

fastai/distributed.py (+3 -1)

@@ -87,7 +87,9 @@ def __init__(self,dl,rank=None,world_size=None):
             pin_memory=dl.pin_memory, timeout=dl.timeout, shuffle=shuffle, drop_last=dl.drop_last, persistent_workers=dl.persistent_workers)
         self.bs,self.device,self.drop_last,self.dataset,fake,self.num_workers,self.offs,self.pin_memory = \
             attrgetter('bs','device','drop_last','dataset','fake_l','num_workers','offs','pin_memory')(self.dl)
-        self.fake_l = _FakeLoader(self, fake.pin_memory, fake.num_workers, fake.timeout, persistent_workers=fake.persistent_workers)
+        self.fake_l = _FakeLoader(self, fake.pin_memory, fake.num_workers, fake.timeout,
+                                  persistent_workers=fake.persistent_workers,
+                                  pin_memory_device=fake.pin_memory_device)

     def _broadcast(self,t,rank):
         "Broadcasts t from rank `rank` to all other ranks. Returns t so t is same for all ranks after call."

fastai/optimizer.py (+6 -1)

@@ -12,6 +12,7 @@
 # Cell
 #nbdev_comment from __future__ import annotations
 from .torch_basics import *
+from packaging import version

 # Cell
 class _BaseOptimizer():
@@ -378,7 +379,11 @@ def set_item_pg(pg, k, v):
     return pg

 # Cell
-pytorch_hp_map = {'momentum': 'mom', 'weight_decay': 'wd', 'alpha': 'sqr_mom', 'betas__0': 'mom', 'betas__1': 'sqr_mom'}
+pytorch_hp_map = {'momentum': 'mom', 'weight_decay': 'wd', 'alpha': 'sqr_mom', 'betas__0': 'mom',
+                  'betas__1': 'sqr_mom'}
+if version.parse(torch.version.__version__)>version.parse('1.12.0'):
+    # Torch>=1.12 has a foreach param
+    pytorch_hp_map = merge(*(pytorch_hp_map,{'foreach': 'foreach'}))

 # Cell
 def _convert_params(o:list) -> list:
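
The 12_optimizer notebook tests further down exercise the same gate; a condensed sketch of the resulting behaviour (assumes fastai is installed; the strict > '1.12.0' comparison follows the commit's own check):

import torch
from packaging import version
from fastai.optimizer import OptimWrapper

opt = OptimWrapper([torch.tensor([1., 2., 3.])], torch.optim.Adam, lr=1e-2, betas=(0.9, 0.99))
hyper = opt.hypers[0]                      # torch hyperparameter names remapped via pytorch_hp_map
assert hyper['mom'] == 0.9 and hyper['sqr_mom'] == 0.99
if version.parse(torch.__version__) > version.parse('1.12.0'):
    assert 'foreach' in hyper              # surfaced by the new 'foreach' mapping entry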

fastai/torch_core.py (+8 -7)

@@ -353,13 +353,14 @@ def __reduce_ex__(self,proto):
     @classmethod
     def register_func(cls, func, *oks): cls._opt[func].append(oks)

-    def __torch_function__(self, func, types, args=(), kwargs=None):
-        if self.debug and func.__name__ not in ('__str__','__repr__'): print(func, types, args, kwargs)
-        convert=False
-        if _torch_handled(args, self._opt, func): convert,types = type(self),(torch.Tensor,)
-        res = super().__torch_function__(func, types, args=args, kwargs=kwargs)
-        if convert: res = convert(res)
-        if isinstance(res, TensorBase): res.set_meta(self, as_copy=True)
+    @classmethod
+    def __torch_function__(cls, func, types, args=(), kwargs=None):
+        if cls.debug and func.__name__ not in ('__str__','__repr__'): print(func, types, args, kwargs)
+        if is_listy(args[0]) and args[0]: dict_objs = [a for a in args[0] if hasattr(a,'__dict__')]
+        else: dict_objs = [a for a in args if hasattr(a,'__dict__')]
+        if _torch_handled(args, cls._opt, func): types = (torch.Tensor,)
+        res = super().__torch_function__(func, types, args, ifnone(kwargs, {}))
+        if issubclass(type(res),TensorBase) and dict_objs: res.set_meta(dict_objs[0],as_copy=True)
         return res

     def new_tensor(self, size, dtype=None, device=None, requires_grad=False):
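
A minimal sketch of the behaviour the classmethod rewrite is meant to keep intact: metadata stored on a TensorBase instance still survives torch ops, because set_meta now copies it from the first argument that has a __dict__ rather than from self. The img_size attribute here is just illustrative metadata; assumes fastai is installed.

import torch
from fastai.torch_core import TensorBase

t = TensorBase(torch.randn(3))
t.img_size = 224                      # arbitrary metadata held in the tensor's __dict__
out = t + 1                           # dispatches through TensorBase.__torch_function__
assert isinstance(out, TensorBase)    # subclass preserved by the classmethod dispatch
assert out.img_size == 224            # metadata copied over via set_meta(..., as_copy=True)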

nbs/00_torch_core.ipynb (+63 -29)

Large diffs are not rendered by default.

nbs/02_data.load.ipynb (+26 -23)

@@ -97,9 +97,9 @@
 "    _index_sampler,generator,prefetch_factor = Inf.count,None,2\n",
 "    dataset_kind = _dataset_kind = _DatasetKind.Iterable\n",
 "    \n",
-"    def __init__(self, d, pin_memory, num_workers, timeout, persistent_workers):\n",
-"        self.dataset,self.default,self.worker_init_fn = self,d,_wif\n",
-"        store_attr('d,pin_memory,num_workers,timeout,persistent_workers')\n",
+"    def __init__(self, d, pin_memory, num_workers, timeout, persistent_workers,pin_memory_device):\n",
+"        self.dataset,self.default,self.worker_init_fn,self.pin_memory_device = self,d,_wif,pin_memory_device\n",
+"        store_attr('d,pin_memory,num_workers,timeout,persistent_workers,pin_memory_device')\n",
 "\n",
 "    def __iter__(self): return iter(self.d.create_batches(self.d.sample()))\n",
 "\n",
@@ -274,7 +274,8 @@
 "        get_idxs sample shuffle_fn do_batch create_batch'.split()\n",
 "    _default = 'dataset'\n",
 "    def __init__(self, dataset=None, bs=None, num_workers=0, pin_memory=False, timeout=0, batch_size=None,\n",
-"                 shuffle=False, drop_last=False, indexed=None, n=None, device=None, persistent_workers=False, **kwargs):\n",
+"                 shuffle=False, drop_last=False, indexed=None, n=None, device=None, persistent_workers=False,\n",
+"                 pin_memory_device='', **kwargs):\n",
 "        if batch_size is not None: bs = batch_size # PyTorch compatibility\n",
 "        assert not (bs is None and drop_last)\n",
 "        if indexed is None: indexed = (hasattr(dataset,'__getitem__')\n",
@@ -289,7 +290,8 @@
 "            print(\"Due to IPython and Windows limitation, python multiprocessing isn't available now.\")\n",
 "            print(\"So `number_workers` is changed to 0 to avoid getting stuck\")\n",
 "            num_workers = 0 \n",
-"        self.fake_l = _FakeLoader(self, pin_memory, num_workers, timeout, persistent_workers=persistent_workers)\n",
+"        self.fake_l = _FakeLoader(self, pin_memory, num_workers, timeout, persistent_workers=persistent_workers,\n",
+"                                  pin_memory_device=pin_memory_device)\n",
 "\n",
 "    def __len__(self):\n",
 "        if self.n is None: raise TypeError\n",
@@ -423,7 +425,7 @@
 {
 "data": {
 "text/plain": [
-"(#40) [0.6220516703202649,0.38347354268972134,0.36273911288359706,0.4314958642862322,0.48170868503127295,0.1755075234373844,0.26036103499878493,0.16037428323147251,0.8350911770957413,0.4347179239514216...]"
+"(#0) []"
 ]
 },
 "execution_count": null,
@@ -448,7 +450,7 @@
 {
 "data": {
 "text/plain": [
-"(#1) [4]"
+"(#11) [4,4,4,4,4,4,4,4,4,4...]"
 ]
 },
 "execution_count": null,
@@ -468,7 +470,7 @@
 {
 "data": {
 "text/plain": [
-"(#21) [4,4,4,4,4,4,4,4,4,4...]"
+"(#10) [4,4,4,4,4,4,4,4,4,4]"
 ]
 },
 "execution_count": null,
@@ -503,7 +505,7 @@
 {
 "data": {
 "text/plain": [
-"(#2) [0.6192763059885179,0.33021254121031707]"
+"(#7) [0.41917548410987093,0.5197100010284023,0.7706771870574884,0.6479314353871329,0.43661079462005437,0.6094953292136542,0.4985993416362957]"
 ]
 },
 "execution_count": null,
@@ -631,18 +633,18 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"CPU times: user 4.27 ms, sys: 1.05 ms, total: 5.32 ms\n",
-"Wall time: 316 ms\n",
-"CPU times: user 12.7 ms, sys: 11.9 ms, total: 24.5 ms\n",
-"Wall time: 197 ms\n",
-"CPU times: user 14.5 ms, sys: 16.2 ms, total: 30.7 ms\n",
-"Wall time: 127 ms\n"
+"CPU times: user 6.97 ms, sys: 0 ns, total: 6.97 ms\n",
+"Wall time: 309 ms\n",
+"CPU times: user 12.2 ms, sys: 12.8 ms, total: 25 ms\n",
+"Wall time: 277 ms\n",
+"CPU times: user 21.9 ms, sys: 23.9 ms, total: 45.7 ms\n",
+"Wall time: 325 ms\n"
 ]
 },
 {
 "data": {
 "text/plain": [
-"(#26) ['r','c','q','n','j','s','l','p','b','y'...]"
+"(#26) ['i','x','t','y','p','u','j','n','f','k'...]"
 ]
 },
 "execution_count": null,
@@ -677,8 +679,8 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"CPU times: user 12 ms, sys: 22.3 ms, total: 34.3 ms\n",
-"Wall time: 130 ms\n"
+"CPU times: user 19 ms, sys: 22.1 ms, total: 41 ms\n",
+"Wall time: 295 ms\n"
 ]
 }
 ],
@@ -728,9 +730,9 @@
 {
 "data": {
 "text/plain": [
-"[tensor([16, 14, 5, 1, 39, 49, 10, 40, 7, 36, 28, 42, 32, 24, 43, 46, 4, 3,\n",
-" 11, 48, 26, 35, 15, 25, 23, 8, 44, 47, 0, 34, 21, 17]),\n",
-" tensor([45, 41, 6, 20, 38, 19, 29, 37, 13, 18, 2, 27, 30, 12, 33, 22, 9, 31])]"
+"[tensor([29, 10, 19, 23, 36, 5, 31, 1, 40, 22, 24, 47, 34, 9, 2, 33, 39, 30,\n",
+" 42, 49, 14, 17, 18, 35, 15, 27, 13, 48, 3, 32, 4, 8]),\n",
+" tensor([11, 25, 45, 28, 38, 7, 6, 37, 44, 0, 26, 12, 41, 43, 21, 16, 20, 46])]"
 ]
 },
 "execution_count": null,
@@ -831,6 +833,7 @@
 "Converted 21_vision.learner.ipynb.\n",
 "Converted 22_tutorial.imagenette.ipynb.\n",
 "Converted 23_tutorial.vision.ipynb.\n",
+"Converted 24_tutorial.image_sequence.ipynb.\n",
 "Converted 24_tutorial.siamese.ipynb.\n",
 "Converted 24_vision.gan.ipynb.\n",
 "Converted 30_text.core.ipynb.\n",
@@ -839,7 +842,6 @@
 "Converted 33_text.models.core.ipynb.\n",
 "Converted 34_callback.rnn.ipynb.\n",
 "Converted 35_tutorial.wikitext.ipynb.\n",
-"Converted 36_text.models.qrnn.ipynb.\n",
 "Converted 37_text.learner.ipynb.\n",
 "Converted 38_tutorial.text.ipynb.\n",
 "Converted 39_tutorial.transformers.ipynb.\n",
@@ -858,7 +860,7 @@
 "Converted 71_callback.tensorboard.ipynb.\n",
 "Converted 72_callback.neptune.ipynb.\n",
 "Converted 73_callback.captum.ipynb.\n",
-"Converted 74_callback.azureml.ipynb.\n",
+"Converted 74_huggingface.ipynb.\n",
 "Converted 97_test_utils.ipynb.\n",
 "Converted 99_pytorch_doc.ipynb.\n",
 "Converted dev-setup.ipynb.\n",
@@ -868,6 +870,7 @@
 "Converted migrating_ignite.ipynb.\n",
 "Converted migrating_lightning.ipynb.\n",
 "Converted migrating_pytorch.ipynb.\n",
+"Converted migrating_pytorch_verbose.ipynb.\n",
 "Converted ulmfit.ipynb.\n",
 "Converted index.ipynb.\n",
 "Converted quick_start.ipynb.\n",

nbs/12_optimizer.ipynb (+30 -10)

@@ -28,7 +28,8 @@
 "source": [
 "#|export\n",
 "from __future__ import annotations\n",
-"from fastai.torch_basics import *"
+"from fastai.torch_basics import *\n",
+"from packaging import version"
 ]
 },
 {
@@ -456,7 +457,7 @@
 "text/markdown": [
 "<h4 id=\"Optimizer.step\" class=\"doc_header\"><code>Optimizer.step</code><a href=\"__main__.py#L24\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n",
 "\n",
-"> <code>Optimizer.step</code>()\n",
+"> <code>Optimizer.step</code>(**`closure`**=*`None`*)\n",
 "\n",
 "Standard PyTorch API: Update the stats and execute the steppers in on all parameters that have a grad"
 ],
@@ -865,7 +866,7 @@
 {
 "data": {
 "text/markdown": [
-"<h4 id=\"Optimizer.state_dict\" class=\"doc_header\"><code>Optimizer.state_dict</code><a href=\"__main__.py#L33\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n",
+"<h4 id=\"Optimizer.state_dict\" class=\"doc_header\"><code>Optimizer.state_dict</code><a href=\"__main__.py#L34\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n",
 "\n",
 "> <code>Optimizer.state_dict</code>()\n",
 "\n",
@@ -891,7 +892,7 @@
 {
 "data": {
 "text/markdown": [
-"<h4 id=\"Optimizer.load_state_dict\" class=\"doc_header\"><code>Optimizer.load_state_dict</code><a href=\"__main__.py#L37\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n",
+"<h4 id=\"Optimizer.load_state_dict\" class=\"doc_header\"><code>Optimizer.load_state_dict</code><a href=\"__main__.py#L38\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n",
 "\n",
 "> <code>Optimizer.load_state_dict</code>(**`sd`**:`dict`)\n",
 "\n",
@@ -943,7 +944,7 @@
 {
 "data": {
 "text/markdown": [
-"<h4 id=\"Optimizer.clear_state\" class=\"doc_header\"><code>Optimizer.clear_state</code><a href=\"__main__.py#L29\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n",
+"<h4 id=\"Optimizer.clear_state\" class=\"doc_header\"><code>Optimizer.clear_state</code><a href=\"__main__.py#L30\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n",
 "\n",
 "> <code>Optimizer.clear_state</code>()\n",
 "\n",
@@ -1768,7 +1769,11 @@
 "outputs": [],
 "source": [
 "#|export\n",
-"pytorch_hp_map = {'momentum': 'mom', 'weight_decay': 'wd', 'alpha': 'sqr_mom', 'betas__0': 'mom', 'betas__1': 'sqr_mom'}"
+"pytorch_hp_map = {'momentum': 'mom', 'weight_decay': 'wd', 'alpha': 'sqr_mom', 'betas__0': 'mom',\n",
+"                  'betas__1': 'sqr_mom'}\n",
+"if version.parse(torch.version.__version__)>version.parse('1.12.0'):\n",
+"    # Torch>=1.12 has a foreach param\n",
+"    pytorch_hp_map = merge(*(pytorch_hp_map,{'foreach': 'foreach'}))"
 ]
 },
 {
@@ -1847,6 +1852,10 @@
 "test_eq(tst_sgd.opt.param_groups[0]['params'], [tensor(4,5,6)])\n",
 "#Access to hypers\n",
 "_xtra_hypers = dict(dampening=0., nesterov=False, maximize=False)\n",
+"\n",
+"if version.parse(torch.version.__version__)>version.parse('1.12.0'):\n",
+"    _xtra_hypers = merge(*(_xtra_hypers,dict(foreach=None)))\n",
+"    \n",
 "test_eq(tst_sgd.hypers, [{**sgd.hypers[0], **_xtra_hypers}])\n",
 "#Set hypers\n",
 "tst_sgd.set_hyper('mom', 0.95)\n",
@@ -1912,17 +1921,28 @@
 "#|hide\n",
 "#check it works with tuply hp names like in Adam\n",
 "tst_adam = OptimWrapper([tensor([1,2,3])], torch.optim.Adam, lr=1e-2, betas=(0.9, 0.99))\n",
-"test_eq(tst_adam.hypers, [{\n",
-"    'lr': 0.01, 'mom': 0.9, 'sqr_mom': 0.99, 'eps': 1e-08, 'wd': 0, 'amsgrad': False, 'maximize':False}])\n",
+"\n",
+"tst_hypers = {'lr': 0.01, 'mom': 0.9, 'sqr_mom': 0.99, 'eps': 1e-08, 'wd': 0, \n",
+"              'amsgrad': False, 'maximize':False}\n",
+"if version.parse(torch.version.__version__)>version.parse('1.12.0'):\n",
+"    tst_hypers = merge(*(tst_hypers,dict(foreach=None)))\n",
+"\n",
+"test_eq(tst_adam.hypers, [tst_hypers])\n",
 "tst_adam.set_hyper('mom', 0.95)\n",
 "test_eq(tst_adam.opt.param_groups[0]['betas'], (0.95, 0.99))\n",
 "tst_adam.set_hyper('sqr_mom', 0.9)\n",
 "test_eq(tst_adam.opt.param_groups[0]['betas'], (0.95, 0.9))\n",
 "\n",
 "tst_adam = torch.optim.Adam([tensor([1,2,3])], lr=1e-2, betas=(0.9, 0.99))\n",
 "tst_adam = OptimWrapper(opt=tst_adam)\n",
-"test_eq(tst_adam.hypers, [{\n",
-"    'lr': 0.01, 'mom': 0.9, 'sqr_mom': 0.99, 'eps': 1e-08, 'wd': 0, 'amsgrad': False, 'maximize':False}])\n",
+"\n",
+"tst_hypers = {'lr': 0.01, 'mom': 0.9, 'sqr_mom': 0.99, 'eps': 1e-08, 'wd': 0, 'amsgrad': False, \n",
+"              'maximize':False}\n",
+"\n",
+"if version.parse(torch.version.__version__)>version.parse('1.12.0'):\n",
+"    tst_hypers = merge(*(tst_hypers,dict(foreach=None)))\n",
+"\n",
+"test_eq(tst_adam.hypers, [tst_hypers])\n",
 "tst_adam.set_hyper('mom', 0.95)\n",
 "test_eq(tst_adam.opt.param_groups[0]['betas'], (0.95, 0.99))\n",
 "tst_adam.set_hyper('sqr_mom', 0.9)\n",

nbs/20a_distributed.ipynb (+3 -1)

@@ -240,7 +240,9 @@
 "            pin_memory=dl.pin_memory, timeout=dl.timeout, shuffle=shuffle, drop_last=dl.drop_last, persistent_workers=dl.persistent_workers)\n",
 "        self.bs,self.device,self.drop_last,self.dataset,fake,self.num_workers,self.offs,self.pin_memory = \\\n",
 "            attrgetter('bs','device','drop_last','dataset','fake_l','num_workers','offs','pin_memory')(self.dl)\n",
-"        self.fake_l = _FakeLoader(self, fake.pin_memory, fake.num_workers, fake.timeout, persistent_workers=fake.persistent_workers)\n",
+"        self.fake_l = _FakeLoader(self, fake.pin_memory, fake.num_workers, fake.timeout, \n",
+"                                  persistent_workers=fake.persistent_workers, \n",
+"                                  pin_memory_device=fake.pin_memory_device)\n",
 "        \n",
 "    def _broadcast(self,t,rank):\n",
 "        \"Broadcasts t from rank `rank` to all other ranks. Returns t so t is same for all ranks after call.\"\n",
