OSU-Nowlab · Quentin-Anthony · Aug 15, 2023 · Jul 3, 2023 · Jul 3, 2023 · Jul 3, 2023
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+models/__pycache__/
+now-dl.egg-info/
+torchgems/__pycache__/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
 - repo: https://github.com/psf/black
-  rev: stable
+  rev: 23.3.0
   hooks:
   - id: black
     name: black-format-test

diff --git a/README.md b/README.md
@@ -79,10 +79,32 @@ Python=3.9.16, cuda=11.6, gcc=10.3.0, cmake=3.22.2, PyTorch=1.12.0a0+git35202d2,
 cd torch-gems
 python setup.py install
 ```
-Example to run Amoebanet model with partition size for model as two, spatial partition as four and spatial size (i.e. number of model partition which will use spatial partition) as 1
+Example to run AmoebaNet model with partition size for model as two, spatial partition as four and spatial size (i.e. number of model partition which will use spatial partition) as 1
 ```bash
 $MV2_HOME/bin/mpirun_rsh --export-all -np 5 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python benchmarks/spatial/model/amoebanet_run.py --image-size 512 --num-spatial-parts 4 --slice-method "vertical" --split-size 2 --spatial-size 1
 ```
+
+Refer [Spatial Parallelism Benchmarks](https://github.com/OSU-Nowlab/now-dl/tree/main/benchmarks/spatial) for more details.
+
+## Experimental Results:
+
+#### Using Spatial, Model and Pipeline Parallelism, where the model is split into two parts and utilizes spatial parallelism by dividing the image into four parts
+
+- AmeobaNet Model 
+
+<div align="center">
+<img src="docs/assets/images/AmeobaNet_img_size_1024.png" width="400"> 
+<img src="docs/assets/images/AmeobaNet_img_size_2048.png" width="400"> 
+</div>
+
+- ResNet Model 
+
+<div align="center">
+<img src="docs/assets/images/ResNet_img_size_1024.png" width="400"> 
+<img src="docs/assets/images/ResNet_img_size_2048.png" width="400"> 
+</div>
+
+
 ## References:
 1. Arpan Jain, Ammar Ahmad Awan, Asmaa M. Aljuhani, Jahanzeb Maqbool Hashmi, Quentin G. Anthony, Hari Subramoni, Dhableswar K. Panda, Raghu Machiraju, and Anil Parwani. 2020. GEMS: <u>G</u>PU-<u>e</u>nabled <u>m</u>emory-aware model-parallelism <u>s</u>ystem for distributed DNN training. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '20). IEEE Press, Article 45, 1–15.
 2. Arpan Jain, Aamir Shafi, Quentin Anthony, Pouya Kousha, Hari Subramoni, and Dhableswar K. Panda. 2022. Hy-Fi: Hybrid Five-Dimensional Parallel DNN Training on High-Performance GPU Clusters. In High Performance Computing: 37th International Conference, ISC High Performance 2022, Hamburg, Germany, May 29 – June 2, 2022, Proceedings. Springer-Verlag, Berlin, Heidelberg, 109–130. https://doi.org/10.1007/978-3-031-07312-0_6

diff --git a/benchmarks/layer_parallelism/amoebanet_lp.py b/benchmarks/layer_parallelism/amoebanet_lp.py
@@ -0,0 +1,207 @@
+import torch
+import torchvision.transforms as transforms
+import torchvision
+import numpy as np
+import sys
+import math
+import logging
+from torchgems import parser
+import time
+from torchgems.mp_pipeline import model_generator, train_model
+from models import amoebanet
+import torchgems.comm as gems_comm
+
+
+parser_obj = parser.get_parser()
+args = parser_obj.parse_args()
+
+if args.verbose:
+    logging.basicConfig(level=logging.DEBUG)
+
+gems_comm.initialize_cuda()
+
+
+class Unbuffered(object):
+    def __init__(self, stream):
+        self.stream = stream
+
+    def write(self, data):
+        self.stream.write(data)
+        self.stream.flush()
+
+    def writelines(self, datas):
+        self.stream.writelines(datas)
+        self.stream.flush()
+
+    def __getattr__(self, attr):
+        return getattr(self.stream, attr)
+
+
+sys.stdout = Unbuffered(sys.stdout)
+
+np.random.seed(seed=1405)
+ENABLE_ASYNC = True
+ENABLE_APP = False
+parts = args.parts
+batch_size = args.batch_size
+epoch = args.num_epochs
+image_size = int(args.image_size)
+num_layers = args.num_layers
+num_filters = args.num_filters
+balance = args.balance
+mp_size = args.split_size
+times = args.times
+datapath = args.datapath
+
+image_size_seq = 512
+num_classes = 1000
+steps = 100
+
+mpi_comm = gems_comm.MPIComm(split_size=mp_size, ENABLE_MASTER=False)
+rank = mpi_comm.rank
+
+local_rank = rank % mp_size
+
+if balance is not None:
+    balance = [int(i) for i in balance.split(",")]
+
+model = amoebanet.amoebanetd(
+    num_classes=1000, num_layers=args.num_layers, num_filters=args.num_filters
+)
+
+model_gen = model_generator(
+    model=model,
+    split_size=mp_size,
+    input_size=(int(batch_size / parts), 3, image_size_seq, image_size_seq),
+    balance=balance,
+)
+model_gen.ready_model(split_rank=local_rank, GET_SHAPES_ON_CUDA=True)
+
+
+image_size_times = int(image_size / image_size_seq)
+amoebanet_shapes_list = []
+for output_shape in model_gen.shape_list:
+    if isinstance(output_shape, list):
+        temp_shape = []
+        for shape_tuple in output_shape:
+            x = (
+                shape_tuple[0],
+                shape_tuple[1],
+                int(shape_tuple[2] * image_size_times),
+                int(shape_tuple[3] * image_size_times),
+            )
+            temp_shape.append(x)
+        amoebanet_shapes_list.append(temp_shape)
+    else:
+        if len(output_shape) == 2:
+            amoebanet_shapes_list.append(output_shape)
+        else:
+            x = (
+                output_shape[0],
+                output_shape[1],
+                int(output_shape[2] * image_size_times),
+                int(output_shape[3] * image_size_times),
+            )
+            amoebanet_shapes_list.append(x)
+
+model_gen.shape_list = amoebanet_shapes_list
+print("local_ran:", local_rank, " Shapes:", model_gen.shape_list)
+
+
+del model_gen
+del model
+torch.cuda.ipc_collect()
+
+model = amoebanet.amoebanetd(
+    num_classes=1000, num_layers=args.num_layers, num_filters=args.num_filters
+)
+
+
+model_gen = model_generator(
+    model=model,
+    split_size=mp_size,
+    input_size=(int(batch_size / parts), 3, image_size, image_size),
+    balance=balance,
+    shape_list=amoebanet_shapes_list,
+)
+model_gen.ready_model(split_rank=local_rank, GET_SHAPES_ON_CUDA=True)
+
+tm = train_model(
+    model_gen,
+    local_rank,
+    batch_size,
+    epoch,
+    criterion=None,
+    optimizer=None,
+    parts=parts,
+    ASYNC=ENABLE_ASYNC,
+)
+
+# Dataset
+transform = transforms.Compose(
+    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
+)
+torch.manual_seed(0)
+if ENABLE_APP == True:
+    trainset = torchvision.datasets.ImageFolder(
+        datapath, transform=transform, target_transform=None
+    )
+    my_dataloader = torch.utils.data.DataLoader(
+        trainset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True
+    )
+else:
+    my_dataset = torchvision.datasets.FakeData(
+        size=10 * batch_size,
+        image_size=(3, image_size, image_size),
+        num_classes=num_classes,
+        transform=transform,
+        target_transform=None,
+        random_offset=0,
+    )
+    my_dataloader = torch.utils.data.DataLoader(
+        my_dataset,
+        batch_size=batch_size * times,
+        shuffle=False,
+        num_workers=0,
+        pin_memory=True,
+    )
+    size_dataset = 10 * batch_size
+
+perf = []
+
+
+def run_epoch():
+    for i_e in range(epoch):
+        loss = 0
+        t = time.time()
+        for i, data in enumerate(my_dataloader, 0):
+            start_event = torch.cuda.Event(enable_timing=True, blocking=True)
+            end_event = torch.cuda.Event(enable_timing=True, blocking=True)
+            start_event.record()
+
+            if i > math.floor(size_dataset / (times * batch_size)) - 1:
+                break
+            inputs, labels = data
+
+            temp_loss = tm.run_step(inputs, labels)
+            loss += temp_loss
+            tm.update()
+
+            end_event.record()
+            torch.cuda.synchronize()
+            t = start_event.elapsed_time(end_event) / 1000
+
+            if local_rank == mp_size - 1:
+                logging.info(f"Step :{i}, LOSS: {temp_loss}, Global loss: {loss/(i+1)}")
+
+            if local_rank == 0:
+                print("Epoch: {} images per sec:{}".format(i_e, batch_size / t))
+                perf.append(batch_size / t)
+
+            t = time.time()
+
+
+run_epoch()
+
+if local_rank == 0:
+    print("Mean {} Median {}".format(sum(perf) / len(perf), np.median(perf)))