vicolab · igorbb · Aug 28, 2020 · Aug 28, 2020 · Aug 28, 2020 · Sep 18, 2020
diff --git a/examples/torch-parallel.ipynb b/examples/torch-parallel.ipynb
@@ -0,0 +1,329 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import os\n",
+    "import requests\n",
+    "import pyxis as px"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Let us start by downloading the data\n",
+    "\n",
+    "We will use the MNIST dataset. This version is already stored as NumPy arrays and is hosted by TensorFlow."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Download the dataset once. The response is streamed into a temporary file\n",
+    "# that is renamed only after the download completes, so a failed or interrupted\n",
+    "# request never leaves a truncated `mnist.npz` that the existence check below\n",
+    "# would mistake for a valid archive on the next run.\n",
+    "if not os.path.exists(\"mnist.npz\"):\n",
+    "    url = \"https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\"\n",
+    "    # The context manager releases the connection even if an error is raised.\n",
+    "    with requests.get(url, stream=True, timeout=60) as response:\n",
+    "        response.raise_for_status()\n",
+    "        with open(\"mnist.npz.part\", 'wb') as fout:\n",
+    "            for block in response.iter_content(4096):\n",
+    "                fout.write(block)\n",
+    "    os.replace(\"mnist.npz.part\", \"mnist.npz\")\n",
+    "\n",
+    "# Load the training images and labels from the archive.\n",
+    "with np.load(\"./mnist.npz\") as f:\n",
+    "    x_train, y_train = f['x_train'], f['y_train']\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Creates train database\n",
+    "\n",
+    "We will begin by creating a small dataset to test with. It will consist of the `60000` samples from the training partition of MNIST."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Labels are stored as 64-bit integers. `np.int64` replaces the `np.long` alias,\n",
+    "# which was deprecated in NumPy 1.20 and removed in NumPy 1.24.\n",
+    "with px.Writer(dirpath='mnist_train', map_size_limit=256, ram_gb_limit=1) as db:\n",
+    "    db.put_samples('X', x_train.astype(np.float32), 'y', y_train.astype(np.int64))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Checks dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "pyxis.Reader\n",
+      "Location:\t\t'mnist_train'\n",
+      "Number of samples:\t60000\n",
+      "Data keys (0th sample):\n",
+      "\t'X' <- dtype: float32, shape: (28, 28)\n",
+      "\t'y' <- dtype: int64, shape: ()\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Reopen the freshly written database and print its summary.\n",
+    "with px.Reader('mnist_train') as reader:\n",
+    "    print(reader)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train 1 epoch using pytorch dataloader\n",
+    "\n",
+    "We will train 1 epoch using the regular `pxt.TorchDataset` together with `torch.utils.data.DataLoader`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    import torch\n",
+    "    import torch.utils.data\n",
+    "except ImportError:\n",
+    "    raise ImportError('Could not import the PyTorch library `torch` or '\n",
+    "                      '`torch.utils.data`. Please refer to '\n",
+    "                      'https://pytorch.org/ for installation instructions.')\n",
+    "import pyxis.torch as pxt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Initialize pytorch methods"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use the GPU when one is available, otherwise fall back to the CPU.\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "\n",
+    "# Not optimal network, but big enough to simulate some gradient load\n",
+    "hidden = 32\n",
+    "layers = [\n",
+    "    torch.nn.Flatten(),\n",
+    "    torch.nn.Linear(28 * 28, hidden),\n",
+    "    torch.nn.ReLU(),\n",
+    "    torch.nn.Linear(hidden, hidden),\n",
+    "    torch.nn.ReLU(),\n",
+    "    torch.nn.Linear(hidden, hidden),\n",
+    "    torch.nn.ReLU(),\n",
+    "    torch.nn.Linear(hidden, 10),\n",
+    "]\n",
+    "network = torch.nn.Sequential(*layers).to(device)\n",
+    "\n",
+    "# Loss and optimizer shared by the training function.\n",
+    "criterion = torch.nn.CrossEntropyLoss()\n",
+    "optimizer = torch.optim.RMSprop(network.parameters(), lr=1e-5, weight_decay=1e-2)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Initialize classical dataloaders"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = pxt.TorchDataset('mnist_train')\n",
+    "\n",
+    "# Worker processes and pinned memory are only requested when CUDA is available.\n",
+    "use_cuda = torch.cuda.is_available()\n",
+    "if use_cuda:\n",
+    "    kwargs = {\"num_workers\": 2, \"pin_memory\": True}\n",
+    "else:\n",
+    "    kwargs = {}\n",
+    "loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False, **kwargs)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train_epoch():\n",
+    "    \"\"\"Run one pass over `loader`, then print a summary for the final batch.\"\"\"\n",
+    "    for step, batch in enumerate(loader):\n",
+    "        # Move the batch tensors to the training device.\n",
+    "        data = batch['X'].to(device)\n",
+    "        target = batch['y'].to(device)\n",
+    "        optimizer.zero_grad()\n",
+    "        loss = criterion(network(data), target)\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "    # `step`, `data` and `loss` still hold the values from the last iteration.\n",
+    "    summary = 'Train[{}/{} ({:.0f}%)]\\tLoss: {:.6f}'\n",
+    "    print(summary.format(\n",
+    "        step * len(data), len(loader.dataset),\n",
+    "        100. * step / len(loader), loss.item()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train[59968/60000 (100%)]\tLoss: 0.682663\n",
+      "Train[59968/60000 (100%)]\tLoss: 0.292598\n",
+      "Train[59968/60000 (100%)]\tLoss: 0.182662\n",
+      "Train[59968/60000 (100%)]\tLoss: 0.138513\n",
+      "Train[59968/60000 (100%)]\tLoss: 0.125213\n",
+      "Train[59968/60000 (100%)]\tLoss: 0.114652\n",
+      "Train[59968/60000 (100%)]\tLoss: 0.108128\n",
+      "Train[59968/60000 (100%)]\tLoss: 0.103621\n",
+      "4.44 s ± 62.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit train_epoch()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Initialize pytorch methods again "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rebuild the device handle, model, loss and optimizer for the second benchmark;\n",
+    "# this replaces the `network` and `optimizer` objects used by the first one.\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "\n",
+    "# Not optimal network, but big enough to simulate some gradient load\n",
+    "width = 32\n",
+    "modules = [\n",
+    "    torch.nn.Flatten(),\n",
+    "    torch.nn.Linear(28 * 28, width),\n",
+    "    torch.nn.ReLU(),\n",
+    "    torch.nn.Linear(width, width),\n",
+    "    torch.nn.ReLU(),\n",
+    "    torch.nn.Linear(width, width),\n",
+    "    torch.nn.ReLU(),\n",
+    "    torch.nn.Linear(width, 10),\n",
+    "]\n",
+    "network = torch.nn.Sequential(*modules).to(device)\n",
+    "criterion = torch.nn.CrossEntropyLoss()\n",
+    "optimizer = torch.optim.RMSprop(network.parameters(), lr=1e-5, weight_decay=1e-2)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Initialize pyxis iterator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build the pyxis iterator over the same database and with the same batch size\n",
+    "# as the DataLoader benchmark; batches are requested on `device`.\n",
+    "iterator = pxt.TorchIterator(\n",
+    "    dir_path='mnist_train',\n",
+    "    keys=('X', 'y'),\n",
+    "    batch_size=32,\n",
+    "    device=device,\n",
+    "    num_worker=2,\n",
+    "    pre_fetcher_queue=100,\n",
+    "    device_transfer_queue=2,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train_epoch_with_pyxis():\n",
+    "    \"\"\"Run one pass over `iterator`, then print a summary for the final batch.\n",
+    "\n",
+    "    NOTE(review): the summary reads the dataset size and batch count from\n",
+    "    `loader`, the DataLoader built for the other benchmark, so that cell must\n",
+    "    have been run first.\n",
+    "    \"\"\"\n",
+    "    for step, batch in enumerate(iterator):\n",
+    "        # Batches are used as yielded; no explicit device transfer is done here.\n",
+    "        data, target = batch\n",
+    "        optimizer.zero_grad()\n",
+    "        loss = criterion(network(data), target)\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "    # `step`, `data` and `loss` still hold the values from the last iteration.\n",
+    "    summary = 'Train[{}/{} ({:.0f}%)]\\tLoss: {:.6f}'\n",
+    "    print(summary.format(\n",
+    "        step * len(data), len(loader.dataset),\n",
+    "        100. * step / len(loader), loss.item()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train[599680/60000 (100%)]\tLoss: 0.418909\n",
+      "Train[599680/60000 (100%)]\tLoss: 0.345567\n",
+      "Train[599680/60000 (100%)]\tLoss: 0.131335\n",
+      "Train[599680/60000 (100%)]\tLoss: 0.163470\n",
+      "Train[599680/60000 (100%)]\tLoss: 0.200821\n",
+      "Train[599680/60000 (100%)]\tLoss: 0.117903\n",
+      "Train[599680/60000 (100%)]\tLoss: 0.149772\n",
+      "Train[599680/60000 (100%)]\tLoss: 0.104580\n",
+      "3.6 s ± 67.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit train_epoch_with_pyxis()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/pyxis/__init__.py b/pyxis/__init__.py
@@ -6,4 +6,4 @@
 from .pyxis import *
 from .iterators import *
 
-__version__ = "0.4.dev0"
+__version__ = "0.5.dev0"