From 9e7302fc67a84863fbd3c95b2f265c5d4c73309a Mon Sep 17 00:00:00 2001
From: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
Date: Mon, 18 Sep 2023 14:50:31 -0500
Subject: [PATCH 01/11] Initial Changes for GEMS + SPATIAL

Signed-off-by: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
---
 .../benchmark_amoebanet_gems+spatial.py       | 452 ++++++++++++++++++
 src/torchgems/comm.py                         |  19 +-
 src/torchgems/mp_pipeline.py                  |  39 +-
 src/torchgems/parser.py                       |   7 +
 src/torchgems/train_spatial.py                |  77 ++-
 src/torchgems/train_spatial_master.py         |  12 +-
 6 files changed, 596 insertions(+), 10 deletions(-)
 create mode 100644 benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py

diff --git a/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py b/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
new file mode 100644
index 00000000..78f10a02
--- /dev/null
+++ b/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
@@ -0,0 +1,452 @@
+import torch
+import torch.distributed as dist
+import torchvision.transforms as transforms
+import torchvision
+import numpy as np
+import time
+import sys
+import math
+from torchgems import parser
+from torchgems.mp_pipeline import model_generator
+from torchgems.train_spatial import get_shapes_spatial, split_input
+from torchgems.train_spatial_master import train_spatial_model_master
+import torchgems.comm as gems_comm
+
+parser_obj = parser.get_parser()
+args = parser_obj.parse_args()
+
+if args.halo_d2:
+    from models import amoebanet
+    from models import amoebanet_d2
+
+else:
+    from models import amoebanet
+
+gems_comm.initialize_cuda()
+
+
+class Unbuffered(object):
+    def __init__(self, stream):
+        self.stream = stream
+
+    def write(self, data):
+        self.stream.write(data)
+        self.stream.flush()
+
+    def writelines(self, datas):
+        self.stream.writelines(datas)
+        self.stream.flush()
+
+    def __getattr__(self, attr):
+        return getattr(self.stream, attr)
+
+
+def init_processes(backend="tcp"):
+    """Initialize the distributed environment."""
+    dist.init_process_group(backend)
+    size = dist.get_world_size()
+    rank = dist.get_rank()
+    return size, rank
+
+
+def get_depth(version, n):
+    if version == 1:
+        return n * 6 + 2
+    elif version == 2:
+        return n * 9 + 2
+
+
+sys.stdout = Unbuffered(sys.stdout)
+
+# torch.set_num_threads(1)
+np.random.seed(seed=1405)
+parts = args.parts
+batch_size = args.batch_size
+resnet_n = 12
+epoch = args.num_epochs
+ENABLE_ASYNC = True
+
+# APP
+# 1: Medical
+# 2: Cifar
+# 3: synthetic
+APP = 3
+amoebanet_test = False
+image_size = int(args.image_size)
+print("image size", image_size)
+steps = 100
+num_layers = args.num_layers
+num_filters = args.num_filters
+balance = args.balance
+split_size = args.split_size
+spatial_size = args.spatial_size
+ENABLE_MASTER_OPT = args.enable_master_comm_opt
+
+temp_num_spatial_parts = args.num_spatial_parts.split(",")
+
+if len(temp_num_spatial_parts) == 1:
+    num_spatial_parts_list = [int(temp_num_spatial_parts[0])]
+    num_spatial_parts = int(temp_num_spatial_parts[0])
+else:
+    num_spatial_parts = [int(i) for i in temp_num_spatial_parts]
+    num_spatial_parts_list = num_spatial_parts
+
+times = args.times
+num_classes = 1000
+LOCAL_DP_LP = args.local_DP
+
+
+mpi_comm_first = gems_comm.MPIComm(
+    split_size=split_size,
+    ENABLE_MASTER=False,
+    ENABLE_SPATIAL=True,
+    num_spatial_parts=num_spatial_parts,
+    spatial_size=spatial_size,
+    LOCAL_DP_LP=LOCAL_DP_LP,
+)
+mpi_comm_second = gems_comm.MPIComm(
+    split_size=split_size,
+    ENABLE_MASTER=True,
+    ENABLE_SPATIAL=True,
+    num_spatial_parts=num_spatial_parts,
+    spatial_size=spatial_size,
+    LOCAL_DP_LP=LOCAL_DP_LP,
+    DISABLE_INIT=True,
+)
+
+gems_comm.sync_comms_for_master(mpi_comm_first, mpi_comm_second)
+comm_size = mpi_comm_first.size
+# rank = mpi_comm.local_rank
+# comm_size = mpi_comm.size
+# local_rank = rank
+
+# split_rank = mpi_comm.split_rank
+
+
+# --balance is optional: a comma-separated list of integers, or absent (None).
+if args.balance is not None:
+    balance = [int(j) for j in args.balance.split(",")]
+else:
+    balance = None
+
+
+image_size_seq = 512
+
+model_seq = amoebanet.amoebanetd(
+    num_layers=num_layers, num_filters=num_filters, num_classes=num_classes
+)
+print("length", len(model_seq), balance)
+model_gen_seq = model_generator(
+    model=model_seq,
+    split_size=split_size,
+    input_size=(int(batch_size / parts), 3, image_size_seq, image_size_seq),
+    balance=balance,
+)
+model_gen_seq.ready_model(
+    split_rank=mpi_comm_second.split_rank, GET_SHAPES_ON_CUDA=True
+)
+
+image_size_times = int(image_size / image_size_seq)
+
+resnet_shapes_list = get_shapes_spatial(
+    shape_list=model_gen_seq.shape_list,
+    slice_method=args.slice_method,
+    spatial_size=spatial_size,
+    num_spatial_parts_list=num_spatial_parts_list,
+    image_size_times=image_size_times,
+)
+
+print(model_gen_seq.shape_list, resnet_shapes_list)
+
+del model_seq
+del model_gen_seq
+torch.cuda.ipc_collect()
+
+
+if args.halo_d2:
+    model1 = amoebanet_d2.amoebanetd_spatial(
+        local_rank=mpi_comm_first.local_rank % mpi_comm_first.total_spatial_processes,
+        spatial_size=spatial_size,
+        num_spatial_parts=num_spatial_parts,
+        mp_size=split_size,
+        balance=balance,
+        slice_method="square",
+        num_classes=num_classes,
+        num_layers=num_layers,
+        num_filters=num_filters,
+    )
+
+    model2 = amoebanet_d2.amoebanetd_spatial(
+        local_rank=mpi_comm_second.local_rank % mpi_comm_second.total_spatial_processes,
+        spatial_size=spatial_size,
+        num_spatial_parts=num_spatial_parts,
+        mp_size=split_size,
+        balance=balance,
+        slice_method="square",
+        num_classes=num_classes,
+        num_layers=num_layers,
+        num_filters=num_filters,
+    )
+else:
+    model1 = amoebanet.amoebanetd_spatial(
+        local_rank=mpi_comm_first.local_rank % mpi_comm_first.total_spatial_processes,
+        spatial_size=spatial_size,
+        num_spatial_parts=num_spatial_parts,
+        mp_size=split_size,
+        balance=balance,
+        slice_method="square",
+        num_classes=num_classes,
+        num_layers=num_layers,
+        num_filters=num_filters,
+    )
+
+    model2 = amoebanet.amoebanetd_spatial(
+        local_rank=mpi_comm_second.local_rank % mpi_comm_second.total_spatial_processes,
+        spatial_size=spatial_size,
+        num_spatial_parts=num_spatial_parts,
+        mp_size=split_size,
+        balance=balance,
+        slice_method="square",
+        num_classes=num_classes,
+        num_layers=num_layers,
+        num_filters=num_filters,
+    )
+
+
+model_gen1 = model_generator(
+    model=model1,
+    split_size=split_size,
+    input_size=(int(batch_size / parts), 3, image_size, image_size),
+    balance=balance,
+    shape_list=resnet_shapes_list,
+)
+model_gen1.ready_model(split_rank=mpi_comm_first.split_rank)
+# model_gen1.DDP_model(mpi_comm_first, num_spatial_parts, spatial_size, bucket_size=25, local_rank = mpi_comm_first.local_rank)
+
+
+model_gen2 = model_generator(
+    model=model2,
+    split_size=split_size,
+    input_size=(int(batch_size / parts), 3, image_size, image_size),
+    balance=balance,
+    shape_list=resnet_shapes_list,
+)
+model_gen2.ready_model(split_rank=mpi_comm_second.split_rank)
+# model_gen2.DDP_model(mpi_comm_second, num_spatial_parts, spatial_size, bucket_size=25, local_rank = mpi_comm_second.local_rank)
+
+
+# model_gen.mp_size = 5
+print("Shape list", resnet_shapes_list)
+
+
+# t_s1 = train_model_spatial(model_gen1, mpi_comm_first.local_rank,batch_size,epochs=1, spatial_size=spatial_size, num_spatial_parts=num_spatial_parts ,criterion=None,optimizer=None,parts=parts,ASYNC=True,GEMS_INVERSE=False, slice_method = args.slice_method,
+# 							LOCAL_DP_LP=LOCAL_DP_LP,
+# 							mpi_comm = mpi_comm_first)
+
+
+# t_s2 = train_model_spatial(model_gen2, mpi_comm_second.local_rank,batch_size,epochs=1, spatial_size=spatial_size, num_spatial_parts=num_spatial_parts ,criterion=None,optimizer=None,parts=parts,ASYNC=True,GEMS_INVERSE=True, slice_method = args.slice_method,
+# 							LOCAL_DP_LP=LOCAL_DP_LP,
+# 							mpi_comm = mpi_comm_second)
+
+t_s_master = train_spatial_model_master(
+    model_gen1,
+    model_gen2,
+    batch_size,
+    spatial_size,
+    num_spatial_parts,
+    args.slice_method,
+    mpi_comm_first,
+    mpi_comm_second,
+    LOCAL_DP_LP=LOCAL_DP_LP,
+    criterion=None,
+    optimizer=None,
+    parts=parts,
+    ASYNC=True,
+    replications=int(args.times / 2),
+)
+
+x = torch.zeros(
+    (batch_size, 3, int(image_size / 2), int(image_size / 2)), device="cuda"
+)
+y = torch.zeros((batch_size,), dtype=torch.long, device="cuda")
+
+
+transform = transforms.Compose(
+    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
+)
+torch.manual_seed(0)
+
+if APP == 1:
+    trainset = torchvision.datasets.ImageFolder(
+        "/usr/workspace/jain8/project/cancer/1024_1024_5/train",
+        transform=transform,
+        target_transform=None,
+    )
+    my_dataloader = torch.utils.data.DataLoader(
+        trainset,
+        batch_size=times * batch_size,
+        shuffle=True,
+        num_workers=0,
+        pin_memory=True,
+    )
+    size_dataset = 1030
+elif APP == 2:
+    trainset = torchvision.datasets.CIFAR10(
+        root="./data", train=True, download=True, transform=transform
+    )
+    my_dataloader = torch.utils.data.DataLoader(
+        trainset,
+        batch_size=times * batch_size,
+        shuffle=False,
+        num_workers=0,
+        pin_memory=True,
+    )
+    size_dataset = 50000
+else:
+    my_dataset = torchvision.datasets.FakeData(
+        size=10 * batch_size * args.times,
+        image_size=(3, image_size, image_size),
+        num_classes=num_classes,
+        transform=transform,
+        target_transform=None,
+        random_offset=0,
+    )
+    my_dataloader = torch.utils.data.DataLoader(
+        my_dataset,
+        batch_size=batch_size * args.times,
+        shuffle=False,
+        num_workers=0,
+        pin_memory=True,
+    )
+    size_dataset = 10 * batch_size
+
+
+# sync_allreduce.sync_model_spatial(model_gen)
+perf = []
+
+sync_comm = gems_comm.SyncAllreduce(mpi_comm_first)
+
+
+MASTER = args.times
+
+print("ENABLE_MASTER_OPT", ENABLE_MASTER_OPT)
+
+
+def run_epoch():
+    """Train for `epoch` epochs; rank 0 appends each step's images/sec to `perf`."""
+    for i_e in range(epoch):
+        loss = 0
+        correct = 0
+        n_steps = 0  # steps actually run; divisor for the epoch accuracy
+        for i, data in enumerate(my_dataloader, 0):
+            start_event = torch.cuda.Event(enable_timing=True, blocking=True)
+            end_event = torch.cuda.Event(enable_timing=True, blocking=True)
+            start_event.record()
+            if i > math.floor(size_dataset / (times * batch_size)) - 1:
+                break
+            # inputs=data_x
+            # labels = data_y
+            inputs, labels = data
+
+            # inputs = inputs.to(device)
+            # labels = labels.to(device)
+
+            # t= time.time()
+            if mpi_comm_first.local_rank < num_spatial_parts_list[0]:
+                x = split_input(
+                    inputs=inputs,
+                    image_size=image_size,
+                    slice_method=args.slice_method,
+                    local_rank=mpi_comm_first.local_rank,
+                    num_spatial_parts_list=num_spatial_parts_list,
+                )
+            elif mpi_comm_second.local_rank < num_spatial_parts_list[0]:
+                x = split_input(
+                    inputs=inputs,
+                    image_size=image_size,
+                    slice_method=args.slice_method,
+                    local_rank=mpi_comm_second.local_rank,
+                    num_spatial_parts_list=num_spatial_parts_list,
+                )
+            else:
+                x = inputs
+
+            # for j in range(MASTER):
+
+            # 	temp_loss,temp_correct = t_s1.run_step(x,labels)
+            # 	temp_loss,temp_correct = t_s2.run_step(x,labels)
+
+            if ENABLE_MASTER_OPT:
+                temp_loss, temp_correct = t_s_master.run_step_allreduce(
+                    x, labels, i % 2 == 1
+                )
+            else:
+                temp_loss, temp_correct = t_s_master.run_step(x, labels)
+
+            loss += temp_loss
+            correct += temp_correct
+
+            start_event_allreduce = torch.cuda.Event(enable_timing=True, blocking=True)
+            end_event_allreduce = torch.cuda.Event(enable_timing=True, blocking=True)
+            start_event_allreduce.record()
+            t_allreduce_temp = time.time()
+
+            if ENABLE_MASTER_OPT == False:
+                sync_comm.apply_allreduce_master_master(
+                    model_gen1, model_gen2, mpi_comm_first, mpi_comm_second
+                )
+
+            """
+			if(local_rank < spatial_size * num_spatial_parts):
+				None
+				#No need for this as, DDP is now used
+				# sync_allreduce.apply_allreduce(model_gen,mpi_comm.spatial_allreduce_grp)
+			"""
+            torch.cuda.synchronize()
+
+            if ENABLE_MASTER_OPT:
+                if i % 2 == 1:
+                    t_s_master.train_model1.update()
+                else:
+                    t_s_master.train_model2.update()
+            else:
+                t_s_master.train_model1.update()
+                t_s_master.train_model2.update()
+
+            end_event_allreduce.record()
+            torch.cuda.synchronize()
+            t_allreduce = start_event_allreduce.elapsed_time(end_event_allreduce) / 1000
+            t_allreduce = time.time() - t_allreduce_temp
+
+            if mpi_comm_second.local_rank == comm_size - 1:
+                None
+                # print("Step",i," LOSS",temp_loss, " Global loss:",loss/(i+1), " Acc:",temp_correct)
+
+            if ENABLE_MASTER_OPT:
+                torch.distributed.barrier()
+
+            end_event.record()
+            torch.cuda.synchronize()
+            t = start_event.elapsed_time(end_event) / 1000
+            if mpi_comm_second.local_rank == 0:
+                print(
+                    "images per sec:",
+                    batch_size / t,
+                    "Time:",
+                    t,
+                    " Time Allreduce:",
+                    t_allreduce,
+                )
+                perf.append(batch_size / t)
+
+            n_steps += 1
+        if mpi_comm_second.local_rank == comm_size - 1:
+            print("epoch", i_e, " Global loss:", loss, " acc", correct / max(n_steps, 1))
+
+
+run_epoch()
+
+# Skip the summary when no step was timed: an empty `perf` would divide by zero.
+if mpi_comm_second.local_rank == 0 and perf:
+    print("Mean {} Median {}".format(sum(perf) / len(perf), np.median(perf)))
+exit()
diff --git a/src/torchgems/comm.py b/src/torchgems/comm.py
index 9a5fbf88..6f1eba55 100644
--- a/src/torchgems/comm.py
+++ b/src/torchgems/comm.py
@@ -65,6 +65,7 @@ def __init__(
                 - spatial_size
                 + (split_size - spatial_size) * (LOCAL_DP_LP - 1)
             )
+        print("MP_SIZE : ", self.mp_size)
 
         if DISABLE_INIT:
             self.rank = dist.get_rank()
@@ -213,19 +214,28 @@ def create_allreduce_comm_spatial(self):
 
             if self.ENABLE_MASTER:
                 for i in range(len(ranks)):
-                    ranks.append(self.mp_size - 1 - ranks[i])
-
+                    ranks[i] = self.mp_size - 1 - ranks[i]
+                    # ranks.append(self.mp_size - 1 - ranks[i])
+            print("RANKS:", ranks)
             temp_spatial_allreduce_grp = torch.distributed.new_group(ranks=ranks)
 
             if self.ENABLE_MASTER:
                 if self.spatial_size == 1 and first_local_rank < self.num_spatial_parts:
                     self.first_spatial_allreduce_grp = temp_spatial_allreduce_grp
-
-                elif (
+                    print(
+                        "first_spatial_allreduce_grp", self.rank, self.local_rank, ranks
+                    )
+                if (
                     self.spatial_size == 1
                     and second_local_rank < self.num_spatial_parts
                 ):
                     self.second_spatial_allreduce_grp = temp_spatial_allreduce_grp
+                    print(
+                        "second_spatial_allreduce_grp",
+                        self.rank,
+                        self.local_rank,
+                        ranks,
+                    )
 
                 elif self.spatial_size > 1:
                     if first_local_rank < np.sum(
@@ -313,6 +323,7 @@ def sync_comms_for_master(comm1, comm2):
     # MASTER related communicators are in comm2
     first_local_rank = comm1.local_rank
     second_local_rank = comm2.local_rank
+    print("sync_comms_for_master", first_local_rank, second_local_rank)
 
     if first_local_rank < comm1.total_spatial_processes:
         comm1.spatial_allreduce_grp = comm2.first_spatial_allreduce_grp
diff --git a/src/torchgems/mp_pipeline.py b/src/torchgems/mp_pipeline.py
index 2cdb16ab..b0fb54e7 100644
--- a/src/torchgems/mp_pipeline.py
+++ b/src/torchgems/mp_pipeline.py
@@ -437,6 +437,7 @@ def forward_pass(self, data_x, data_y, part_number=0):
         # part_number: part number between 0 and self.parts-1 used to find right input recv buffer
 
         # Receive inputs if local is not 0
+        print("mp_pipeline:forward_pass: START", data_x.size(), data_y.size())
         if self.split_rank == 0:
             input_x = data_x
         else:
@@ -457,6 +458,11 @@ def forward_pass(self, data_x, data_y, part_number=0):
 
         torch.cuda.synchronize()
 
+        print(
+            "mp_pipeline:forward_pass: SEND_INPUT_OR_CAL_LOSS",
+            data_x.size(),
+            data_y.size(),
+        )
         if self.split_rank != self.split_size - 1:
             if self.ENABLE_ASYNC == True:
                 self.send_input_async(y)
@@ -465,7 +471,7 @@ def forward_pass(self, data_x, data_y, part_number=0):
 
         else:
             loss = self.criterion(y, data_y)
-
+        print("mp_pipeline:forward_pass: END", data_x.size(), data_y.size())
         if self.split_rank == self.split_size - 1:
             corrects = (data_y.eq(torch.argmax(y, dim=-1).long())).sum().float()
             return loss, corrects / self.batch_size
@@ -518,19 +524,50 @@ def run_step(self, data_x, data_y):
         for i in range(self.parts):
             start = i * parts_size
             end = (i + 1) * parts_size
+            print(
+                "mp_pipeline:train_model:run_step : START FORWARD PASS",
+                self.local_rank,
+                self.parts,
+                start,
+                end,
+            )
             temp_y, temp_correct = self.forward_pass(
                 data_x[start:end], data_y[start:end], part_number=i
             )
+
+            print(
+                "mp_pipeline:train_model:run_step : END FORWARD PASS",
+                self.local_rank,
+                self.parts,
+                start,
+                end,
+            )
+
             y_list.append(temp_y)
 
             if self.split_rank == self.split_size - 1:
                 loss += temp_y.item()
                 corrects += temp_correct.item()
 
+        print(
+            "mp_pipeline:train_model:run_step : START BACKWARD PASS",
+            self.local_rank,
+            self.parts,
+            start,
+            end,
+        )
         for i in range(self.parts):
             None
             self.backward_pass(y_list[i], part_number=i)
 
+        print(
+            "mp_pipeline:train_model:run_step : END BACKWARD PASS",
+            self.local_rank,
+            self.parts,
+            start,
+            end,
+        )
+
         return loss, corrects
 
     def update(self):
diff --git a/src/torchgems/parser.py b/src/torchgems/parser.py
index 4df14797..ca8c8bd5 100644
--- a/src/torchgems/parser.py
+++ b/src/torchgems/parser.py
@@ -125,5 +125,12 @@ def get_parser():
         default="./train",
         help="local Dataset path",
     )
+    parser.add_argument(
+        "--enable-master-comm-opt",
+        dest="enable_master_comm_opt",
+        action="store_true",
+        default=False,
+        help="Enable communication optimization for MASTER in Spatial",
+    )
 
     return parser
diff --git a/src/torchgems/train_spatial.py b/src/torchgems/train_spatial.py
index 4b5cda42..721510be 100644
--- a/src/torchgems/train_spatial.py
+++ b/src/torchgems/train_spatial.py
@@ -182,6 +182,68 @@ def get_shapes_spatial(
     return spatial_shapes_list
 
 
+def split_input_2(inputs, image_size, slice_method, local_rank):
+    """Return the half of `inputs` that belongs to `local_rank` (0 or 1).
+
+    "square" and "vertical" both cut axis 3 in half; "horizontal" cuts
+    axis 2 in half. `inputs` is indexed with four subscripts.
+    Raises ValueError for any other method or rank (was: returned None).
+    """
+    if local_rank not in (0, 1):
+        raise ValueError("local_rank must be 0 or 1, got {}".format(local_rank))
+    half = int(image_size / 2)
+    # Rank r owns [r * half, (r + 1) * half) along the sliced axis.
+    lo, hi = local_rank * half, (local_rank + 1) * half
+    if slice_method in ("square", "vertical"):
+        return inputs[:, :, :, lo:hi]
+    if slice_method == "horizontal":
+        return inputs[:, :, lo:hi, :]
+    raise ValueError("unsupported slice_method: {!r}".format(slice_method))
+
+
+def split_input_4(inputs, image_size, slice_method, local_rank):
+    """Return the quarter of `inputs` that belongs to `local_rank` (0 to 3).
+
+    `inputs` is indexed with four subscripts; only the last two are sliced.
+
+    * "square": 2 x 2 grid. Ranks 0 and 1 take the first half of axis 2,
+      ranks 2 and 3 the rest; even ranks take the first half of axis 3,
+      odd ranks the rest.
+    * "vertical": four strips of int(image_size / 4) along axis 3.
+    * "horizontal": four strips of int(image_size / 4) along axis 2.
+
+    Raises ValueError for an unknown slice method or rank instead of
+    silently returning None.
+    """
+    if local_rank not in (0, 1, 2, 3):
+        raise ValueError("local_rank must be in 0..3, got {}".format(local_rank))
+
+    if slice_method == "square":
+        half = int(image_size / 2)
+        # The second half is open-ended so no trailing row/column is dropped.
+        first, second = slice(None, half), slice(half, None)
+        rows = first if local_rank < 2 else second
+        cols = first if local_rank % 2 == 0 else second
+        return inputs[:, :, rows, cols]
+
+    quarter = int(image_size / 4)
+    # Rank r owns [r * quarter, (r + 1) * quarter) along the sliced axis.
+    strip = slice(local_rank * quarter, (local_rank + 1) * quarter)
+    if slice_method == "vertical":
+        return inputs[:, :, :, strip]
+    if slice_method == "horizontal":
+        return inputs[:, :, strip, :]
+    raise ValueError("unsupported slice_method: {!r}".format(slice_method))
+
+
+def split_input(inputs, image_size, slice_method, local_rank, num_spatial_parts_list):
+    """Slice `inputs` for `local_rank`; the first entry of the list must be 2 or 4."""
+    splitter = {2: split_input_2, 4: split_input_4}.get(num_spatial_parts_list[0])
+    if splitter is None:  # previously fell through and returned None
+        raise ValueError("need 2 or 4 parts, got {}".format(num_spatial_parts_list[0]))
+    return splitter(inputs, image_size, slice_method, local_rank)
+
+
 class train_model_spatial(train_model):
     def __init__(
         self,
@@ -1149,7 +1211,12 @@ def forward_pass(self, data_x, data_y, part_number=0):
         # data_x: input data
         # data_y: labels
         # part_number: part number between 0 and self.parts-1 used to find right input recv buffer
-
+        print(
+            "train_spatial:forward_pass: START",
+            data_x.size(),
+            data_y.size(),
+            self.local_rank,
+        )
         # Receive inputs if local is not 0
         if self.split_rank == 0:
             input_x = data_x
@@ -1183,6 +1250,7 @@ def forward_pass(self, data_x, data_y, part_number=0):
                 else:
                     input_x = self.input_x_list[part_number]
 
+        print("train_spatial:forward_pass: RECEIVED INPUTS", self.local_rank)
         # Apply forward pass
 
         torch.cuda.synchronize()
@@ -1200,7 +1268,7 @@ def forward_pass(self, data_x, data_y, part_number=0):
             y = self.models(input_x)
 
         torch.cuda.synchronize()
-
+        print("train_spatial:forward_pass: CALCULATED_Y", self.local_rank)
         if self.split_rank != self.split_size - 1:
             if self.ENABLE_ASYNC == True:
                 if self.split_rank == self.spatial_size - 1 and self.ENABLE_LOCAL_DP_LP:
@@ -1212,6 +1280,7 @@ def forward_pass(self, data_x, data_y, part_number=0):
                     self.send_input_spatial_MP_joint_LP_DP(y)
                 else:
                     self.send_input_sync(y)
+            print("train_spatial:forward_pass: SENT_Y", self.local_rank)
 
         else:
             pos = self.local_rank - (self.mp_size - self.LOCAL_DP_LP)
@@ -1231,6 +1300,10 @@ def forward_pass(self, data_x, data_y, part_number=0):
             else:
                 loss = self.criterion(y, data_y)
 
+            print("train_spatial:forward_pass: CALCULATED_LOSS", self.local_rank)
+
+        print("train_spatial:forward_pass: END", self.local_rank)
+
         if self.split_rank == self.split_size - 1:
             corrects = (data_y.eq(torch.argmax(y, dim=-1).long())).sum().float()
             return loss, corrects / self.batch_size
diff --git a/src/torchgems/train_spatial_master.py b/src/torchgems/train_spatial_master.py
index 18151985..07844ab7 100644
--- a/src/torchgems/train_spatial_master.py
+++ b/src/torchgems/train_spatial_master.py
@@ -395,23 +395,25 @@ def run_step_allreduce(self, inputs, labels, odd_iteration):
     def run_step(self, inputs, labels):
         loss, correct = 0, 0
         # torch.cuda.empty_cache()
+        print("START RUN_STEP MODEL1")
 
         # self.train_model1.models = self.train_model1.models.to('cuda')
         temp_loss, temp_correct = self.train_model1.run_step(
             inputs[: self.batch_size], labels[: self.batch_size]
         )
+        print("END RUN_STEP MODEL1")
         loss += temp_loss
         correct += temp_correct
 
         # torch.cuda.empty_cache()
-
+        print("START RUN_STEP MODEL2")
         # self.train_model1.models = self.train_model1.models.to('cpu')
         # self.train_model2.models = self.train_model2.models.to('cuda')
         temp_loss, temp_correct = self.train_model2.run_step(
             inputs[self.batch_size : 2 * self.batch_size],
             labels[self.batch_size : 2 * self.batch_size],
         )
-
+        print("END RUN_STEP MODEL2")
         # self.train_model2.models = self.train_model2.models.to('cpu')
 
         # torch.cuda.empty_cache()
@@ -422,17 +424,21 @@ def run_step(self, inputs, labels):
         torch.cuda.synchronize()
         for times in range(self.replications - 1):
             index = (2 * times) + 2
+            print("Times :", times)
+            print("START RUN_STEP MODEL1")
             temp_loss, temp_correct = self.train_model1.run_step(
                 inputs[index * self.batch_size : (index + 1) * self.batch_size],
                 labels[index * self.batch_size : (index + 1) * self.batch_size],
             )
+            print("END RUN_STEP MODEL1")
             loss += temp_loss
             correct += temp_correct
-
+            print("START RUN_STEP MODEL2")
             temp_loss, temp_correct = self.train_model2.run_step(
                 inputs[(index + 1) * self.batch_size : (index + 2) * self.batch_size],
                 labels[(index + 1) * self.batch_size : (index + 2) * self.batch_size],
             )
+            print("END RUN_STEP MODEL2")
 
             loss += temp_loss
             correct += temp_correct

From 46014d98444a28ee1fbadf6e8fce7779fc3bb817 Mon Sep 17 00:00:00 2001
From: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
Date: Wed, 18 Oct 2023 11:37:12 -0500
Subject: [PATCH 02/11] Debug comments

Signed-off-by: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
---
 .../benchmark_amoebanet_gems+spatial.py       |  19 +++
 src/torchgems/mp_pipeline.py                  |  12 ++
 src/torchgems/train_spatial.py                | 114 +++++++++++++++++-
 src/torchgems/train_spatial_master.py         |  10 +-
 4 files changed, 145 insertions(+), 10 deletions(-)

diff --git a/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py b/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
index 78f10a02..1f6316c6 100644
--- a/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
+++ b/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
@@ -58,6 +58,25 @@ def get_depth(version, n):
 
 sys.stdout = Unbuffered(sys.stdout)
 
+# Example of GEMS + SPATIAL split_size = 2, spatial_size = 1, num_spatial_parts = 4
+#
+#  Model 1:
+# _______________             ____
+# |   0(0)|  1(1) |           |    |
+# |-------|-------| --------->|4(4)|
+# |  2(2) |  3(3) |           |    |
+# |_______|_______|           |____|
+#
+# Model 2 (INVERSE GEMS):
+# _______________             ____
+# |  0(4) |  1(3) |           |    |
+# |-------|-------| --------->|4(0)|
+# |  2(2) |  3(1) |           |    |
+# |_______|_______|           |____|
+#
+# Numbers inside the brackets () refer to World Rank
+# whereas outside numbers refer to local rank for each model
+
 # torch.set_num_threads(1)
 np.random.seed(seed=1405)
 parts = args.parts
diff --git a/src/torchgems/mp_pipeline.py b/src/torchgems/mp_pipeline.py
index b0fb54e7..b0b5479c 100644
--- a/src/torchgems/mp_pipeline.py
+++ b/src/torchgems/mp_pipeline.py
@@ -526,7 +526,10 @@ def run_step(self, data_x, data_y):
             end = (i + 1) * parts_size
             print(
                 "mp_pipeline:train_model:run_step : START FORWARD PASS",
+                " rank :",
                 self.local_rank,
+                " inverse : ",
+                self.GEMS_INVERSE,
                 self.parts,
                 start,
                 end,
@@ -537,7 +540,10 @@ def run_step(self, data_x, data_y):
 
             print(
                 "mp_pipeline:train_model:run_step : END FORWARD PASS",
+                " rank :",
                 self.local_rank,
+                " inverse : ",
+                self.GEMS_INVERSE,
                 self.parts,
                 start,
                 end,
@@ -551,7 +557,10 @@ def run_step(self, data_x, data_y):
 
         print(
             "mp_pipeline:train_model:run_step : START BACKWARD PASS",
+            " rank :",
             self.local_rank,
+            " inverse : ",
+            self.GEMS_INVERSE,
             self.parts,
             start,
             end,
@@ -562,7 +571,10 @@ def run_step(self, data_x, data_y):
 
         print(
             "mp_pipeline:train_model:run_step : END BACKWARD PASS",
+            " rank :",
             self.local_rank,
+            " inverse : ",
+            self.GEMS_INVERSE,
             self.parts,
             start,
             end,
diff --git a/src/torchgems/train_spatial.py b/src/torchgems/train_spatial.py
index 721510be..60cb39a7 100644
--- a/src/torchgems/train_spatial.py
+++ b/src/torchgems/train_spatial.py
@@ -645,10 +645,16 @@ def receive_input_async_joint(self, part_number, ranks):
         ranks = [
             self.local_rank - 1 - i for i in range(self.num_spatial_parts - 1, -1, -1)
         ]
-
+        print("receive_input_async_joint", " rank : ", self.local_rank, ranks)
         if self.GEMS_INVERSE:
             for i in range(len(ranks)):
                 ranks[i] = self.mp_size - 1 - ranks[i]
+            print(
+                "receive_input_async_joint if self.GEMS_INVERSE",
+                " rank : ",
+                self.local_rank,
+                ranks,
+            )
 
         reqs = []
 
@@ -1216,27 +1222,49 @@ def forward_pass(self, data_x, data_y, part_number=0):
             data_x.size(),
             data_y.size(),
             self.local_rank,
+            self.GEMS_INVERSE,
+            "self.ENABLE_ASYNC",
+            self.ENABLE_ASYNC,
+            "self.split_rank",
+            self.split_rank,
+            "self.spatial_size",
+            self.spatial_size,
+            " self.total_spatial_processes",
+            self.total_spatial_processes,
         )
         # Receive inputs if local is not 0
         if self.split_rank == 0:
             input_x = data_x
         else:
             if self.ENABLE_ASYNC == True:
+                print("self.ENABLE_ASYNC  == True")
                 if self.split_rank == self.spatial_size:
                     if self.ENABLE_LOCAL_DP_LP:
+                        print(
+                            "Calling recv_input_MP_joint_LP_DP",
+                            " rank : ",
+                            self.local_rank,
+                        )
                         self.recv_input_MP_joint_LP_DP(part_number)
                     else:
+                        print("Calling recv_inputs_joint", " rank : ", self.local_rank)
                         self.recv_inputs_joint(part_number)
                 elif self.SKEWED_RECV_SPATIAL:
+                    print("Calling recv_input_spatial", " rank : ", self.local_rank)
                     self.recv_input_spatial(part_number)
                 else:
+                    print("Calling receive_input_async", " rank : ", self.local_rank)
                     self.receive_input_async(part_number)
             else:
+                print("self.ENABLE_ASYNC  == False")
                 if self.local_rank == self.total_spatial_processes:
+                    print("Calling recv_inputs_joint", " rank : ", self.local_rank)
                     self.recv_inputs_joint(part_number)
                 elif self.SKEWED_RECV_SPATIAL:
+                    print("Calling recv_input_spatial", " rank : ", self.local_rank)
                     self.recv_input_spatial(part_number)
                 else:
+                    print("Calling receive_input_sync", " rank : ", self.local_rank)
                     self.receive_input_sync(part_number)
 
             # join spatial inputs
@@ -1250,7 +1278,13 @@ def forward_pass(self, data_x, data_y, part_number=0):
                 else:
                     input_x = self.input_x_list[part_number]
 
-        print("train_spatial:forward_pass: RECEIVED INPUTS", self.local_rank)
+        print(
+            "train_spatial:forward_pass: RECEIVED INPUTS",
+            " rank :",
+            self.local_rank,
+            self.GEMS_INVERSE,
+        )
+
         # Apply forward pass
 
         torch.cuda.synchronize()
@@ -1262,25 +1296,83 @@ def forward_pass(self, data_x, data_y, part_number=0):
             )
             and part_number != self.parts - 1
         ):
+            print(
+                "train_spatial:forward_pass: DP",
+                " rank :",
+                self.local_rank,
+                self.GEMS_INVERSE,
+                part_number,
+                self.parts,
+            )
             with self.models.no_sync():
+                # print("MODEL :", " rank ", self.local_rank, self.models)
                 y = self.models(input_x)
         else:
+            print(
+                "train_spatial:forward_pass: no_DP",
+                " rank :",
+                self.local_rank,
+                self.GEMS_INVERSE,
+                part_number,
+                self.parts,
+            )
+            # print("MODEL :", " rank ", self.local_rank, self.models)
             y = self.models(input_x)
 
+        print(
+            "train_spatial:forward_pass: DONE_MODEL_TRAIN",
+            " rank :",
+            self.local_rank,
+            self.GEMS_INVERSE,
+        )
         torch.cuda.synchronize()
-        print("train_spatial:forward_pass: CALCULATED_Y", self.local_rank)
+        print(
+            "train_spatial:forward_pass: CALCULATED_Y",
+            " rank :",
+            self.local_rank,
+            self.GEMS_INVERSE,
+        )
         if self.split_rank != self.split_size - 1:
             if self.ENABLE_ASYNC == True:
                 if self.split_rank == self.spatial_size - 1 and self.ENABLE_LOCAL_DP_LP:
+                    print(
+                        "train_spatial:forward_pass: calling self.ENABLE_ASYNC send_input_spatial_MP_joint_LP_DP",
+                        " rank :",
+                        self.local_rank,
+                        self.GEMS_INVERSE,
+                    )
                     self.send_input_spatial_MP_joint_LP_DP(y)
                 else:
+                    print(
+                        "train_spatial:forward_pass: calling self.ENABLE_ASYNC send_input_async",
+                        " rank :",
+                        self.local_rank,
+                        self.GEMS_INVERSE,
+                    )
                     self.send_input_async(y)
             else:
                 if self.split_rank == self.spatial_size - 1 and self.ENABLE_LOCAL_DP_LP:
+                    print(
+                        "train_spatial:forward_pass: calling send_input_spatial_MP_joint_LP_DP",
+                        " rank :",
+                        self.local_rank,
+                        self.GEMS_INVERSE,
+                    )
                     self.send_input_spatial_MP_joint_LP_DP(y)
                 else:
+                    print(
+                        "train_spatial:forward_pass: calling send_input_sync",
+                        " rank :",
+                        self.local_rank,
+                        self.GEMS_INVERSE,
+                    )
                     self.send_input_sync(y)
-            print("train_spatial:forward_pass: SENT_Y", self.local_rank)
+            print(
+                "train_spatial:forward_pass: SENT_Y",
+                " rank :",
+                self.local_rank,
+                self.GEMS_INVERSE,
+            )
 
         else:
             pos = self.local_rank - (self.mp_size - self.LOCAL_DP_LP)
@@ -1300,9 +1392,19 @@ def forward_pass(self, data_x, data_y, part_number=0):
             else:
                 loss = self.criterion(y, data_y)
 
-            print("train_spatial:forward_pass: CALCULATED_LOSS", self.local_rank)
+            print(
+                "train_spatial:forward_pass: CALCULATED_LOSS",
+                " rank :",
+                self.local_rank,
+                self.GEMS_INVERSE,
+            )
 
-        print("train_spatial:forward_pass: END", self.local_rank)
+        print(
+            "train_spatial:forward_pass: END",
+            " rank :",
+            self.local_rank,
+            self.GEMS_INVERSE,
+        )
 
         if self.split_rank == self.split_size - 1:
             corrects = (data_y.eq(torch.argmax(y, dim=-1).long())).sum().float()
diff --git a/src/torchgems/train_spatial_master.py b/src/torchgems/train_spatial_master.py
index 07844ab7..ab44ad09 100644
--- a/src/torchgems/train_spatial_master.py
+++ b/src/torchgems/train_spatial_master.py
@@ -395,25 +395,25 @@ def run_step_allreduce(self, inputs, labels, odd_iteration):
     def run_step(self, inputs, labels):
         loss, correct = 0, 0
         # torch.cuda.empty_cache()
-        print("START RUN_STEP MODEL1")
+        print("START RUN_STEP MODEL1", "rank ", self.local_rank)
 
         # self.train_model1.models = self.train_model1.models.to('cuda')
         temp_loss, temp_correct = self.train_model1.run_step(
             inputs[: self.batch_size], labels[: self.batch_size]
         )
-        print("END RUN_STEP MODEL1")
+        print("END RUN_STEP MODEL1", "rank ", self.local_rank)
         loss += temp_loss
         correct += temp_correct
 
         # torch.cuda.empty_cache()
-        print("START RUN_STEP MODEL2")
+        print("START RUN_STEP MODEL2", "rank ", self.local_rank)
         # self.train_model1.models = self.train_model1.models.to('cpu')
         # self.train_model2.models = self.train_model2.models.to('cuda')
         temp_loss, temp_correct = self.train_model2.run_step(
             inputs[self.batch_size : 2 * self.batch_size],
             labels[self.batch_size : 2 * self.batch_size],
         )
-        print("END RUN_STEP MODEL2")
+        print("END RUN_STEP MODEL2", "rank ", self.local_rank)
         # self.train_model2.models = self.train_model2.models.to('cpu')
 
         # torch.cuda.empty_cache()
@@ -421,6 +421,8 @@ def run_step(self, inputs, labels):
         loss += temp_loss
         correct += temp_correct
 
+        print("Calculated loss and accuracy for MODEL1 AND MODEL2")
+
         torch.cuda.synchronize()
         for times in range(self.replications - 1):
             index = (2 * times) + 2

From d6236c6fc487cad8bd596e2fff590b836e358943 Mon Sep 17 00:00:00 2001
From: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
Date: Wed, 18 Oct 2023 11:39:51 -0500
Subject: [PATCH 03/11] Fix alignment of GEMS + SPATIAL diagram comments

Signed-off-by: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
---
 benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py b/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
index 1f6316c6..60bb354e 100644
--- a/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
+++ b/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
@@ -61,14 +61,14 @@ def get_depth(version, n):
 # Example of GEMS + SPATIAL split_size = 2, spatial_size = 1, num_spatial_parts = 4
 #
 #  Model 1:
-# _______________             ____
+#  _______________             ____
 # |   0(0)|  1(1) |           |    |
 # |-------|-------| --------->|4(4)|
 # |  2(2) |  3(3) |           |    |
 # |_______|_______|           |____|
 #
 # Model 2 (INVERSE GEMS):
-# _______________             ____
+#  _______________             ____
 # |  0(4) |  1(3) |           |    |
 # |-------|-------| --------->|4(0)|
 # |  2(2) |  3(1) |           |    |

From 2de55b6aeb28a7ef25725b1ca895795e9a1fffb0 Mon Sep 17 00:00:00 2001
From: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
Date: Wed, 18 Oct 2023 11:39:51 -0500
Subject: [PATCH 04/11] Add ResNet GEMS + SPATIAL benchmark

Signed-off-by: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
---
 .../benchmark_amoebanet_gems+spatial.py       |   5 +-
 .../benchmark_resnet_gems+spatial.py          | 436 ++++++++++++++++++
 2 files changed, 439 insertions(+), 2 deletions(-)
 create mode 100644 benchmarks/gems_model/benchmark_resnet_gems+spatial.py

diff --git a/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py b/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
index 1f6316c6..a8e52f34 100644
--- a/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
+++ b/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
@@ -61,14 +61,14 @@ def get_depth(version, n):
 # Example of GEMS + SPATIAL split_size = 2, spatial_size = 1, num_spatial_parts = 4
 #
 #  Model 1:
-# _______________             ____
+#  _______________             ____
 # |   0(0)|  1(1) |           |    |
 # |-------|-------| --------->|4(4)|
 # |  2(2) |  3(3) |           |    |
 # |_______|_______|           |____|
 #
 # Model 2 (INVERSE GEMS):
-# _______________             ____
+#  _______________             ____
 # |  0(4) |  1(3) |           |    |
 # |-------|-------| --------->|4(0)|
 # |  2(2) |  3(1) |           |    |
@@ -410,6 +410,7 @@ def run_epoch():
             t_allreduce_temp = time.time()
 
             if ENABLE_MASTER_OPT == False:
+                print("benchmark_amoebanet_gems+spatial : START ALL REDUCE OPERATION")
                 sync_comm.apply_allreduce_master_master(
                     model_gen1, model_gen2, mpi_comm_first, mpi_comm_second
                 )
diff --git a/benchmarks/gems_model/benchmark_resnet_gems+spatial.py b/benchmarks/gems_model/benchmark_resnet_gems+spatial.py
new file mode 100644
index 00000000..df6cf38e
--- /dev/null
+++ b/benchmarks/gems_model/benchmark_resnet_gems+spatial.py
@@ -0,0 +1,436 @@
+import torch
+import torch.distributed as dist
+import torchvision.transforms as transforms
+import torchvision
+import numpy as np
+import time
+import sys
+import math
+import logging
+from models import resnet
+from torchgems import parser
+from torchgems.mp_pipeline import model_generator
+
+# from torchgems.gems_master import train_model_master
+# from torchgems.train_spatial import get_shapes_spatial
+from torchgems.train_spatial import split_input
+from torchgems.train_spatial_master import train_spatial_model_master
+import torchgems.comm as gems_comm
+
+parser_obj = parser.get_parser()
+args = parser_obj.parse_args()
+
+if args.verbose:
+    logging.basicConfig(level=logging.DEBUG)
+
+gems_comm.initialize_cuda()
+
+
+class Unbuffered(object):
+    def __init__(self, stream):
+        self.stream = stream
+
+    def write(self, data):
+        self.stream.write(data)
+        self.stream.flush()
+
+    def writelines(self, datas):
+        self.stream.writelines(datas)
+        self.stream.flush()
+
+    def __getattr__(self, attr):
+        return getattr(self.stream, attr)
+
+
+def init_processes(backend="tcp"):
+    """Initialize the distributed environment."""
+    dist.init_process_group(backend)
+    size = dist.get_world_size()
+    rank = dist.get_rank()
+    return size, rank
+
+
+sys.stdout = Unbuffered(sys.stdout)
+
+np.random.seed(seed=1405)
+parts = args.parts
+batch_size = args.batch_size
+epoch = args.num_epochs
+
+# APP
+# 1: Medical
+# 2: Cifar
+# 3: synthetic
+APP = args.app
+image_size = int(args.image_size)
+num_layers = args.num_layers
+num_filters = args.num_filters
+balance = args.balance
+mp_size = args.split_size
+datapath = args.datapath
+num_classes = args.num_classes
+split_size = args.split_size
+spatial_size = args.spatial_size
+ENABLE_MASTER_OPT = args.enable_master_comm_opt
+
+temp_num_spatial_parts = args.num_spatial_parts.split(",")
+
+if len(temp_num_spatial_parts) == 1:
+    num_spatial_parts_list = [int(temp_num_spatial_parts[0])]
+    num_spatial_parts = int(temp_num_spatial_parts[0])
+else:
+    num_spatial_parts = [int(i) for i in temp_num_spatial_parts]
+    num_spatial_parts_list = num_spatial_parts
+
+
+times = args.times
+num_classes = 1000
+LOCAL_DP_LP = args.local_DP
+
+################## ResNet model specific parameters/functions ##################
+
+image_size_seq = 32
+ENABLE_ASYNC = True
+resnet_n = 12
+
+
+def get_depth(version, n):
+    if version == 1:
+        return n * 6 + 2
+    elif version == 2:
+        return n * 9 + 2
+
+
+###############################################################################
+
+mpi_comm_first = gems_comm.MPIComm(
+    split_size=split_size,
+    ENABLE_MASTER=False,
+    ENABLE_SPATIAL=True,
+    num_spatial_parts=num_spatial_parts,
+    spatial_size=spatial_size,
+    LOCAL_DP_LP=LOCAL_DP_LP,
+)
+mpi_comm_second = gems_comm.MPIComm(
+    split_size=split_size,
+    ENABLE_MASTER=True,
+    ENABLE_SPATIAL=True,
+    num_spatial_parts=num_spatial_parts,
+    spatial_size=spatial_size,
+    LOCAL_DP_LP=LOCAL_DP_LP,
+    DISABLE_INIT=True,
+)
+
+gems_comm.sync_comms_for_master(mpi_comm_first, mpi_comm_second)
+comm_size = mpi_comm_first.size
+
+if args.balance != None:
+    balance = args.balance.split(",")
+    balance = [int(j) for j in balance]
+else:
+    balance = None
+# local_rank = rank % mp_size
+
+# Initialize ResNet model
+model = resnet.get_resnet_v2(
+    (int(batch_size / parts), 3, image_size_seq, image_size_seq),
+    depth=get_depth(2, resnet_n),
+    num_classes=num_classes,
+)
+
+mul_shape = int(args.image_size / image_size_seq)
+
+# Initialize parameters for Model Parallelism
+model_gen = model_generator(
+    model=model,
+    split_size=mp_size,
+    input_size=(int(batch_size / parts), 3, image_size_seq, image_size_seq),
+    balance=balance,
+)
+
+# Get the shape of model on each split rank for image_size_seq and move it to device
+# Note : we take shape w.r.t image_size_seq as model w.r.t image_size may not be
+# able to fit in memory
+model_gen.ready_model(split_rank=mpi_comm_second.split_rank, GET_SHAPES_ON_CUDA=True)
+
+# Get the shape of model on each split rank for image_size
+image_size_times = int(image_size / image_size_seq)
+resnet_shapes_list = []
+for output_shape in model_gen.shape_list:
+    if isinstance(output_shape, list):
+        temp_shape = []
+        for shape_tuple in output_shape:
+            x = (
+                shape_tuple[0],
+                shape_tuple[1],
+                int(shape_tuple[2] * image_size_times),
+                int(shape_tuple[3] * image_size_times),
+            )
+            temp_shape.append(x)
+        resnet_shapes_list.append(temp_shape)
+    else:
+        if len(output_shape) == 2:
+            resnet_shapes_list.append(output_shape)
+        else:
+            x = (
+                output_shape[0],
+                output_shape[1],
+                int(output_shape[2] * image_size_times),
+                int(output_shape[3] * image_size_times),
+            )
+            resnet_shapes_list.append(x)
+
+model_gen.shape_list = resnet_shapes_list
+# logging.info(f"Shape of model on local_rank {local_rank} : {model_gen.shape_list}")
+
+
+del model_gen
+del model
+torch.cuda.ipc_collect()
+
+model = resnet.get_resnet_v2(
+    (int(batch_size / parts), 3, image_size, image_size), get_depth(2, resnet_n)
+)
+
+# GEMS Model 1
+model_gen1 = model_generator(
+    model=model,
+    split_size=mp_size,
+    input_size=(int(batch_size / parts), 3, image_size, image_size),
+    balance=None,
+    shape_list=resnet_shapes_list,
+)
+model_gen1.ready_model(split_rank=mpi_comm_first.split_rank)
+
+
+model = resnet.get_resnet_v2(
+    (int(batch_size / parts), 3, image_size, image_size), get_depth(2, resnet_n)
+)
+
+# GEMS Model 2
+model_gen2 = model_generator(
+    model=model,
+    split_size=mp_size,
+    input_size=(int(batch_size / parts), 3, image_size, image_size),
+    balance=None,
+    shape_list=model_gen1.shape_list,
+)
+model_gen2.ready_model(split_rank=mpi_comm_second.split_rank)
+print("Shape list", resnet_shapes_list)
+
+# tm_master = train_model_master(
+#     model_gen1,
+#     model_gen2,
+#     local_rank,
+#     batch_size,
+#     epoch,
+#     criterion=None,
+#     optimizer=None,
+#     parts=parts,
+#     ASYNC=ENABLE_ASYNC,
+# )
+t_s_master = train_spatial_model_master(
+    model_gen1,
+    model_gen2,
+    batch_size,
+    spatial_size,
+    num_spatial_parts,
+    args.slice_method,
+    mpi_comm_first,
+    mpi_comm_second,
+    LOCAL_DP_LP=LOCAL_DP_LP,
+    criterion=None,
+    optimizer=None,
+    parts=parts,
+    ASYNC=True,
+    replications=int(args.times / 2),
+)
+
+# sync_allreduce = gems_comm.SyncAllreduce(mpi_comm)
+
+
+############################## Dataset Definition ##############################
+
+transform = transforms.Compose(
+    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
+)
+torch.manual_seed(0)
+
+if APP == 1:
+    trainset = torchvision.datasets.ImageFolder(
+        datapath,
+        transform=transform,
+        target_transform=None,
+    )
+    my_dataloader = torch.utils.data.DataLoader(
+        trainset,
+        batch_size=times * batch_size,
+        shuffle=True,
+        num_workers=0,
+        pin_memory=True,
+    )
+    size_dataset = len(my_dataloader.dataset)
+elif APP == 2:
+    trainset = torchvision.datasets.CIFAR10(
+        root=datapath, train=True, download=True, transform=transform
+    )
+    my_dataloader = torch.utils.data.DataLoader(
+        trainset,
+        batch_size=times * batch_size,
+        shuffle=False,
+        num_workers=0,
+        pin_memory=True,
+    )
+    size_dataset = len(my_dataloader.dataset)
+else:
+    my_dataset = torchvision.datasets.FakeData(
+        size=10 * batch_size,
+        image_size=(3, image_size, image_size),
+        num_classes=num_classes,
+        transform=transform,
+        target_transform=None,
+        random_offset=0,
+    )
+    my_dataloader = torch.utils.data.DataLoader(
+        my_dataset,
+        batch_size=batch_size * times,
+        shuffle=False,
+        num_workers=0,
+        pin_memory=True,
+    )
+    size_dataset = 10 * batch_size
+
+
+################################################################################
+
+# sync_allreduce.sync_model(model_gen1, model_gen2)
+
+perf = []
+
+sync_comm = gems_comm.SyncAllreduce(mpi_comm_first)
+
+
+MASTER = args.times
+
+print("ENABLE_MASTER_OPT", ENABLE_MASTER_OPT)
+
+perf = []
+
+
+def run_epoch():
+    for i_e in range(epoch):
+        loss = 0
+        correct = 0
+        size = len(my_dataloader.dataset)
+        t = time.time()
+        for i, data in enumerate(my_dataloader, 0):
+            start_event = torch.cuda.Event(enable_timing=True, blocking=True)
+            end_event = torch.cuda.Event(enable_timing=True, blocking=True)
+            start_event.record()
+
+            if i > math.floor(size_dataset / (times * batch_size)) - 1:
+                break
+
+            inputs, labels = data
+
+            if mpi_comm_first.local_rank < num_spatial_parts_list[0]:
+                x = split_input(
+                    inputs=inputs,
+                    image_size=image_size,
+                    slice_method=args.slice_method,
+                    local_rank=mpi_comm_first.local_rank,
+                    num_spatial_parts_list=num_spatial_parts_list,
+                )
+            elif mpi_comm_second.local_rank < num_spatial_parts_list[0]:
+                x = split_input(
+                    inputs=inputs,
+                    image_size=image_size,
+                    slice_method=args.slice_method,
+                    local_rank=mpi_comm_second.local_rank,
+                    num_spatial_parts_list=num_spatial_parts_list,
+                )
+            else:
+                x = inputs
+
+            # for j in range(MASTER):
+
+            # 	temp_loss,temp_correct = t_s1.run_step(x,labels)
+            # 	temp_loss,temp_correct = t_s2.run_step(x,labels)
+
+            if ENABLE_MASTER_OPT:
+                temp_loss, temp_correct = t_s_master.run_step_allreduce(
+                    x, labels, i % 2 == 1
+                )
+            else:
+                temp_loss, temp_correct = t_s_master.run_step(x, labels)
+
+            loss += temp_loss
+            correct += temp_correct
+
+            start_event_allreduce = torch.cuda.Event(enable_timing=True, blocking=True)
+            end_event_allreduce = torch.cuda.Event(enable_timing=True, blocking=True)
+            start_event_allreduce.record()
+            t_allreduce_temp = time.time()
+
+            if ENABLE_MASTER_OPT == False:
+                print("benchmark_amoebanet_gems+spatial : START ALL REDUCE OPERATION")
+                sync_comm.apply_allreduce_master_master(
+                    model_gen1, model_gen2, mpi_comm_first, mpi_comm_second
+                )
+
+            """
+			if(local_rank < spatial_size * num_spatial_parts):
+				None
+				#No need for this as, DDP is now used
+				# sync_allreduce.apply_allreduce(model_gen,mpi_comm.spatial_allreduce_grp)
+			"""
+            torch.cuda.synchronize()
+
+            if ENABLE_MASTER_OPT:
+                if i % 2 == 1:
+                    t_s_master.train_model1.update()
+                else:
+                    t_s_master.train_model2.update()
+            else:
+                t_s_master.train_model1.update()
+                t_s_master.train_model2.update()
+
+            end_event_allreduce.record()
+            torch.cuda.synchronize()
+            t_allreduce = start_event_allreduce.elapsed_time(end_event_allreduce) / 1000
+            t_allreduce = time.time() - t_allreduce_temp
+
+            if mpi_comm_second.local_rank == comm_size - 1:
+                None
+                # print("Step",i," LOSS",temp_loss, " Global loss:",loss/(i+1), " Acc:",temp_correct)
+
+            if ENABLE_MASTER_OPT:
+                torch.distributed.barrier()
+
+            end_event.record()
+            torch.cuda.synchronize()
+            t = start_event.elapsed_time(end_event) / 1000
+            if mpi_comm_second.local_rank == 0:
+                None
+                print(
+                    "images per sec:",
+                    batch_size / t,
+                    "Time:",
+                    t,
+                    " Time Allreduce:",
+                    t_allreduce,
+                )
+                perf.append(batch_size / t)
+
+            t = time.time()
+        if mpi_comm_second.local_rank == comm_size - 1:
+            print("epoch", i_e, " Global loss:", loss, " acc", correct / i)
+
+
+run_epoch()
+
+if mpi_comm_second.local_rank == 0:
+    print(f"Mean {sum(perf) / len(perf)} Median {np.median(perf)}")
+
+exit()
+################################################################################

From f30db787fa9ae944a5aceaa7880f99462d685515 Mon Sep 17 00:00:00 2001
From: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
Date: Thu, 2 Nov 2023 18:19:47 -0400
Subject: [PATCH 05/11] Resolve SP_GEMS issue.

Ranks assigned to the spatial parts of model1 and model2 must not overlap. Added verify_spatial_master_config() to validate the configuration and avoid this error.

Signed-off-by: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
---
 .../benchmark_amoebanet_gems+spatial.py       |  27 +-
 .../benchmark_resnet_gems+spatial.py          | 307 ++++++++++--------
 src/torchgems/comm.py                         |   6 +-
 src/torchgems/train_spatial.py                |  49 +++
 src/torchgems/train_spatial_master.py         |  41 ++-
 5 files changed, 280 insertions(+), 150 deletions(-)

diff --git a/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py b/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
index a8e52f34..6bd627bb 100644
--- a/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
+++ b/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
@@ -9,7 +9,10 @@
 from torchgems import parser
 from torchgems.mp_pipeline import model_generator
 from torchgems.train_spatial import get_shapes_spatial, split_input
-from torchgems.train_spatial_master import train_spatial_model_master
+from torchgems.train_spatial_master import (
+    train_spatial_model_master,
+    verify_spatial_master_config,
+)
 import torchgems.comm as gems_comm
 
 parser_obj = parser.get_parser()
@@ -99,6 +102,7 @@ def get_depth(version, n):
 balance = args.balance
 split_size = args.split_size
 spatial_size = args.spatial_size
+slice_method = args.slice_method
 ENABLE_MASTER_OPT = args.enable_master_comm_opt
 
 temp_num_spatial_parts = args.num_spatial_parts.split(",")
@@ -110,8 +114,10 @@ def get_depth(version, n):
     num_spatial_parts = [int(i) for i in temp_num_spatial_parts]
     num_spatial_parts_list = num_spatial_parts
 
+spatial_part_size = num_spatial_parts_list[0]  # Partition size for spatial parallelism
+
 times = args.times
-num_classes = 1000
+num_classes = args.num_classes
 LOCAL_DP_LP = args.local_DP
 
 
@@ -123,6 +129,15 @@ def get_depth(version, n):
     spatial_size=spatial_size,
     LOCAL_DP_LP=LOCAL_DP_LP,
 )
+
+verify_spatial_master_config(
+    slice_method,
+    image_size,
+    num_spatial_parts_list,
+    spatial_size,
+    mpi_comm_first.mp_size,
+)
+
 mpi_comm_second = gems_comm.MPIComm(
     split_size=split_size,
     ENABLE_MASTER=True,
@@ -169,7 +184,7 @@ def get_depth(version, n):
 
 resnet_shapes_list = get_shapes_spatial(
     shape_list=model_gen_seq.shape_list,
-    slice_method=args.slice_method,
+    slice_method=slice_method,
     spatial_size=spatial_size,
     num_spatial_parts_list=num_spatial_parts_list,
     image_size_times=image_size_times,
@@ -273,7 +288,7 @@ def get_depth(version, n):
     batch_size,
     spatial_size,
     num_spatial_parts,
-    args.slice_method,
+    slice_method,
     mpi_comm_first,
     mpi_comm_second,
     LOCAL_DP_LP=LOCAL_DP_LP,
@@ -374,7 +389,7 @@ def run_epoch():
                 x = split_input(
                     inputs=inputs,
                     image_size=image_size,
-                    slice_method=args.slice_method,
+                    slice_method=slice_method,
                     local_rank=mpi_comm_first.local_rank,
                     num_spatial_parts_list=num_spatial_parts_list,
                 )
@@ -382,7 +397,7 @@ def run_epoch():
                 x = split_input(
                     inputs=inputs,
                     image_size=image_size,
-                    slice_method=args.slice_method,
+                    slice_method=slice_method,
                     local_rank=mpi_comm_second.local_rank,
                     num_spatial_parts_list=num_spatial_parts_list,
                 )
diff --git a/benchmarks/gems_model/benchmark_resnet_gems+spatial.py b/benchmarks/gems_model/benchmark_resnet_gems+spatial.py
index df6cf38e..743e2b61 100644
--- a/benchmarks/gems_model/benchmark_resnet_gems+spatial.py
+++ b/benchmarks/gems_model/benchmark_resnet_gems+spatial.py
@@ -6,22 +6,25 @@
 import time
 import sys
 import math
-import logging
-from models import resnet
 from torchgems import parser
 from torchgems.mp_pipeline import model_generator
-
-# from torchgems.gems_master import train_model_master
-# from torchgems.train_spatial import get_shapes_spatial
-from torchgems.train_spatial import split_input
-from torchgems.train_spatial_master import train_spatial_model_master
+from torchgems.train_spatial import get_shapes_spatial, split_input
+from torchgems.train_spatial_master import (
+    train_spatial_model_master,
+    verify_spatial_master_config,
+)
 import torchgems.comm as gems_comm
+from models import resnet
+
 
 parser_obj = parser.get_parser()
 args = parser_obj.parse_args()
 
-if args.verbose:
-    logging.basicConfig(level=logging.DEBUG)
+if args.halo_d2:
+    # from models import resnet
+    from models import resnet_spatial_d2 as resnet_spatial
+else:
+    from models import resnet_spatial
 
 gems_comm.initialize_cuda()
 
@@ -50,27 +53,56 @@ def init_processes(backend="tcp"):
     return size, rank
 
 
+def get_depth(version, n):
+    if version == 1:
+        return n * 6 + 2
+    elif version == 2:
+        return n * 9 + 2
+
+
 sys.stdout = Unbuffered(sys.stdout)
 
+# Example of GEMS + SPATIAL split_size = 2, spatial_size = 1, num_spatial_parts = 4
+#
+#  Model 1:
+#  _______________             ____
+# |   0(0)|  1(1) |           |    |
+# |-------|-------| --------->|4(4)|
+# |  2(2) |  3(3) |           |    |
+# |_______|_______|           |____|
+#
+# Model 2 (INVERSE GEMS):
+#  _______________             ____
+# |  0(4) |  1(3) |           |    |
+# |-------|-------| --------->|4(0)|
+# |  2(2) |  3(1) |           |    |
+# |_______|_______|           |____|
+#
+# Numbers inside the brackets () refer to World Rank
+# whereas outside numbers refer to local rank for each model
+
+# torch.set_num_threads(1)
 np.random.seed(seed=1405)
 parts = args.parts
 batch_size = args.batch_size
+resnet_n = 12
 epoch = args.num_epochs
+ENABLE_ASYNC = True
 
 # APP
 # 1: Medical
 # 2: Cifar
 # 3: synthetic
-APP = args.app
+APP = 3
 image_size = int(args.image_size)
+print("image size", image_size)
+steps = 100
 num_layers = args.num_layers
 num_filters = args.num_filters
 balance = args.balance
-mp_size = args.split_size
-datapath = args.datapath
-num_classes = args.num_classes
 split_size = args.split_size
 spatial_size = args.spatial_size
+slice_method = args.slice_method
 ENABLE_MASTER_OPT = args.enable_master_comm_opt
 
 temp_num_spatial_parts = args.num_spatial_parts.split(",")
@@ -82,26 +114,10 @@ def init_processes(backend="tcp"):
     num_spatial_parts = [int(i) for i in temp_num_spatial_parts]
     num_spatial_parts_list = num_spatial_parts
 
-
 times = args.times
-num_classes = 1000
+num_classes = args.num_classes
 LOCAL_DP_LP = args.local_DP
 
-################## ResNet model specific parameters/functions ##################
-
-image_size_seq = 32
-ENABLE_ASYNC = True
-resnet_n = 12
-
-
-def get_depth(version, n):
-    if version == 1:
-        return n * 6 + 2
-    elif version == 2:
-        return n * 9 + 2
-
-
-###############################################################################
 
 mpi_comm_first = gems_comm.MPIComm(
     split_size=split_size,
@@ -111,6 +127,15 @@ def get_depth(version, n):
     spatial_size=spatial_size,
     LOCAL_DP_LP=LOCAL_DP_LP,
 )
+
+verify_spatial_master_config(
+    slice_method,
+    image_size,
+    num_spatial_parts_list,
+    spatial_size,
+    mpi_comm_first.mp_size,
+)
+
 mpi_comm_second = gems_comm.MPIComm(
     split_size=split_size,
     ENABLE_MASTER=True,
@@ -123,119 +148,140 @@ def get_depth(version, n):
 
 gems_comm.sync_comms_for_master(mpi_comm_first, mpi_comm_second)
 comm_size = mpi_comm_first.size
+# rank = mpi_comm.local_rank
+# comm_size = mpi_comm.size
+# local_rank = rank
+
+# split_rank = mpi_comm.split_rank
+
 
 if args.balance != None:
     balance = args.balance.split(",")
     balance = [int(j) for j in balance]
 else:
     balance = None
-# local_rank = rank % mp_size
 
-# Initialize ResNet model
-model = resnet.get_resnet_v2(
+
+image_size_seq = 32
+
+model_seq = resnet.get_resnet_v2(
     (int(batch_size / parts), 3, image_size_seq, image_size_seq),
     depth=get_depth(2, resnet_n),
-    num_classes=num_classes,
 )
-
-mul_shape = int(args.image_size / image_size_seq)
-
-# Initialize parameters for Model Parallelism
-model_gen = model_generator(
-    model=model,
-    split_size=mp_size,
+print("length", len(model_seq), balance)
+model_gen_seq = model_generator(
+    model=model_seq,
+    split_size=split_size,
     input_size=(int(batch_size / parts), 3, image_size_seq, image_size_seq),
     balance=balance,
 )
+model_gen_seq.ready_model(
+    split_rank=mpi_comm_second.split_rank, GET_SHAPES_ON_CUDA=True
+)
 
-# Get the shape of model on each split rank for image_size_seq and move it to device
-# Note : we take shape w.r.t image_size_seq as model w.r.t image_size may not be
-# able to fit in memory
-model_gen.ready_model(split_rank=mpi_comm_second.split_rank, GET_SHAPES_ON_CUDA=True)
-
-# Get the shape of model on each split rank for image_size
 image_size_times = int(image_size / image_size_seq)
-resnet_shapes_list = []
-for output_shape in model_gen.shape_list:
-    if isinstance(output_shape, list):
-        temp_shape = []
-        for shape_tuple in output_shape:
-            x = (
-                shape_tuple[0],
-                shape_tuple[1],
-                int(shape_tuple[2] * image_size_times),
-                int(shape_tuple[3] * image_size_times),
-            )
-            temp_shape.append(x)
-        resnet_shapes_list.append(temp_shape)
-    else:
-        if len(output_shape) == 2:
-            resnet_shapes_list.append(output_shape)
-        else:
-            x = (
-                output_shape[0],
-                output_shape[1],
-                int(output_shape[2] * image_size_times),
-                int(output_shape[3] * image_size_times),
-            )
-            resnet_shapes_list.append(x)
-
-model_gen.shape_list = resnet_shapes_list
-# logging.info(f"Shape of model on local_rank {local_rank} : {model_gen.shape_list}")
-
-
-del model_gen
-del model
-torch.cuda.ipc_collect()
 
-model = resnet.get_resnet_v2(
-    (int(batch_size / parts), 3, image_size, image_size), get_depth(2, resnet_n)
+resnet_shapes_list = get_shapes_spatial(
+    shape_list=model_gen_seq.shape_list,
+    slice_method=slice_method,
+    spatial_size=spatial_size,
+    num_spatial_parts_list=num_spatial_parts_list,
+    image_size_times=image_size_times,
 )
 
-# GEMS Model 1
+print(model_gen_seq.shape_list, resnet_shapes_list)
+
+del model_seq
+del model_gen_seq
+torch.cuda.ipc_collect()
+
+
+if args.halo_d2:
+    model1, balance = resnet_spatial.get_resnet_v2(
+        input_shape=(batch_size / parts, 3, image_size, image_size),
+        depth=get_depth(2, 12),
+        local_rank=mpi_comm_first.local_rank % mpi_comm_first.total_spatial_processes,
+        mp_size=split_size,
+        balance=balance,
+        spatial_size=spatial_size,
+        num_spatial_parts=num_spatial_parts,
+        num_classes=num_classes,
+        fused_layers=args.fused_layers,
+        slice_method=slice_method,
+    )
+
+    model2, balance = resnet_spatial.get_resnet_v2(
+        input_shape=(batch_size / parts, 3, image_size, image_size),
+        depth=get_depth(2, 12),
+        local_rank=mpi_comm_second.local_rank % mpi_comm_second.total_spatial_processes,
+        mp_size=split_size,
+        balance=balance,
+        spatial_size=spatial_size,
+        num_spatial_parts=num_spatial_parts,
+        num_classes=num_classes,
+        fused_layers=args.fused_layers,
+        slice_method=slice_method,
+    )
+else:
+    model1 = resnet_spatial.get_resnet_v2(
+        input_shape=(batch_size / parts, 3, image_size, image_size),
+        depth=get_depth(2, 12),
+        local_rank=mpi_comm_first.local_rank % mpi_comm_first.total_spatial_processes,
+        mp_size=split_size,
+        balance=balance,
+        spatial_size=spatial_size,
+        num_spatial_parts=num_spatial_parts,
+        num_classes=num_classes,
+        fused_layers=args.fused_layers,
+        slice_method=slice_method,
+    )
+
+    model2 = resnet_spatial.get_resnet_v2(
+        input_shape=(batch_size / parts, 3, image_size, image_size),
+        depth=get_depth(2, 12),
+        local_rank=mpi_comm_second.local_rank % mpi_comm_second.total_spatial_processes,
+        mp_size=split_size,
+        balance=balance,
+        spatial_size=spatial_size,
+        num_spatial_parts=num_spatial_parts,
+        num_classes=num_classes,
+        fused_layers=args.fused_layers,
+        slice_method=slice_method,
+    )
+
+
 model_gen1 = model_generator(
-    model=model,
-    split_size=mp_size,
+    model=model1,
+    split_size=split_size,
     input_size=(int(batch_size / parts), 3, image_size, image_size),
-    balance=None,
+    balance=balance,
     shape_list=resnet_shapes_list,
 )
 model_gen1.ready_model(split_rank=mpi_comm_first.split_rank)
+# model_gen1.DDP_model(mpi_comm_first, num_spatial_parts, spatial_size, bucket_size=25, local_rank = mpi_comm_first.local_rank)
 
 
-model = resnet.get_resnet_v2(
-    (int(batch_size / parts), 3, image_size, image_size), get_depth(2, resnet_n)
-)
-
-# GEMS Model 2
 model_gen2 = model_generator(
-    model=model,
-    split_size=mp_size,
+    model=model2,
+    split_size=split_size,
     input_size=(int(batch_size / parts), 3, image_size, image_size),
-    balance=None,
-    shape_list=model_gen1.shape_list,
+    balance=balance,
+    shape_list=resnet_shapes_list,
 )
 model_gen2.ready_model(split_rank=mpi_comm_second.split_rank)
+# model_gen2.DDP_model(mpi_comm_second, num_spatial_parts, spatial_size, bucket_size=25, local_rank = mpi_comm_second.local_rank)
+
+
+# model_gen.mp_size = 5
 print("Shape list", resnet_shapes_list)
 
-# tm_master = train_model_master(
-#     model_gen1,
-#     model_gen2,
-#     local_rank,
-#     batch_size,
-#     epoch,
-#     criterion=None,
-#     optimizer=None,
-#     parts=parts,
-#     ASYNC=ENABLE_ASYNC,
-# )
 t_s_master = train_spatial_model_master(
     model_gen1,
     model_gen2,
     batch_size,
     spatial_size,
     num_spatial_parts,
-    args.slice_method,
+    slice_method,
     mpi_comm_first,
     mpi_comm_second,
     LOCAL_DP_LP=LOCAL_DP_LP,
@@ -246,10 +292,11 @@ def get_depth(version, n):
     replications=int(args.times / 2),
 )
 
-# sync_allreduce = gems_comm.SyncAllreduce(mpi_comm)
-
+x = torch.zeros(
+    (batch_size, 3, int(image_size / 2), int(image_size / 2)), device="cuda"
+)
+y = torch.zeros((batch_size,), dtype=torch.long, device="cuda")
 
-############################## Dataset Definition ##############################
 
 transform = transforms.Compose(
     [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
@@ -258,7 +305,7 @@ def get_depth(version, n):
 
 if APP == 1:
     trainset = torchvision.datasets.ImageFolder(
-        datapath,
+        "/usr/workspace/jain8/project/cancer/1024_1024_5/train",
         transform=transform,
         target_transform=None,
     )
@@ -269,10 +316,10 @@ def get_depth(version, n):
         num_workers=0,
         pin_memory=True,
     )
-    size_dataset = len(my_dataloader.dataset)
+    size_dataset = 1030
 elif APP == 2:
     trainset = torchvision.datasets.CIFAR10(
-        root=datapath, train=True, download=True, transform=transform
+        root="./data", train=True, download=True, transform=transform
     )
     my_dataloader = torch.utils.data.DataLoader(
         trainset,
@@ -281,10 +328,10 @@ def get_depth(version, n):
         num_workers=0,
         pin_memory=True,
     )
-    size_dataset = len(my_dataloader.dataset)
+    size_dataset = 50000
 else:
     my_dataset = torchvision.datasets.FakeData(
-        size=10 * batch_size,
+        size=10 * batch_size * args.times,
         image_size=(3, image_size, image_size),
         num_classes=num_classes,
         transform=transform,
@@ -293,7 +340,7 @@ def get_depth(version, n):
     )
     my_dataloader = torch.utils.data.DataLoader(
         my_dataset,
-        batch_size=batch_size * times,
+        batch_size=batch_size * args.times,
         shuffle=False,
         num_workers=0,
         pin_memory=True,
@@ -301,10 +348,7 @@ def get_depth(version, n):
     size_dataset = 10 * batch_size
 
 
-################################################################################
-
-# sync_allreduce.sync_model(model_gen1, model_gen2)
-
+# sync_allreduce.sync_model_spatial(model_gen)
 perf = []
 
 sync_comm = gems_comm.SyncAllreduce(mpi_comm_first)
@@ -314,30 +358,31 @@ def get_depth(version, n):
 
 print("ENABLE_MASTER_OPT", ENABLE_MASTER_OPT)
 
-perf = []
-
 
 def run_epoch():
     for i_e in range(epoch):
         loss = 0
         correct = 0
-        size = len(my_dataloader.dataset)
         t = time.time()
         for i, data in enumerate(my_dataloader, 0):
             start_event = torch.cuda.Event(enable_timing=True, blocking=True)
             end_event = torch.cuda.Event(enable_timing=True, blocking=True)
             start_event.record()
-
             if i > math.floor(size_dataset / (times * batch_size)) - 1:
                 break
-
+            # inputs=data_x
+            # labels = data_y
             inputs, labels = data
 
+            # inputs = inputs.to(device)
+            # labels = labels.to(device)
+
+            # t= time.time()
             if mpi_comm_first.local_rank < num_spatial_parts_list[0]:
                 x = split_input(
                     inputs=inputs,
                     image_size=image_size,
-                    slice_method=args.slice_method,
+                    slice_method=slice_method,
                     local_rank=mpi_comm_first.local_rank,
                     num_spatial_parts_list=num_spatial_parts_list,
                 )
@@ -345,18 +390,13 @@ def run_epoch():
                 x = split_input(
                     inputs=inputs,
                     image_size=image_size,
-                    slice_method=args.slice_method,
+                    slice_method=slice_method,
                     local_rank=mpi_comm_second.local_rank,
                     num_spatial_parts_list=num_spatial_parts_list,
                 )
             else:
                 x = inputs
 
-            # for j in range(MASTER):
-
-            # 	temp_loss,temp_correct = t_s1.run_step(x,labels)
-            # 	temp_loss,temp_correct = t_s2.run_step(x,labels)
-
             if ENABLE_MASTER_OPT:
                 temp_loss, temp_correct = t_s_master.run_step_allreduce(
                     x, labels, i % 2 == 1
@@ -373,7 +413,7 @@ def run_epoch():
             t_allreduce_temp = time.time()
 
             if ENABLE_MASTER_OPT == False:
-                print("benchmark_amoebanet_gems+spatial : START ALL REDUCE OPERATION")
+                print("benchmark_resnet_gems+spatial : START ALL REDUCE OPERATION")
                 sync_comm.apply_allreduce_master_master(
                     model_gen1, model_gen2, mpi_comm_first, mpi_comm_second
                 )
@@ -430,7 +470,6 @@ def run_epoch():
 run_epoch()
 
 if mpi_comm_second.local_rank == 0:
-    print(f"Mean {sum(perf) / len(perf)} Median {np.median(perf)}")
+    print("Mean {} Median {}".format(sum(perf) / len(perf), np.median(perf)))
 
 exit()
-################################################################################
diff --git a/src/torchgems/comm.py b/src/torchgems/comm.py
index 6f1eba55..6cc9be46 100644
--- a/src/torchgems/comm.py
+++ b/src/torchgems/comm.py
@@ -214,8 +214,8 @@ def create_allreduce_comm_spatial(self):
 
             if self.ENABLE_MASTER:
                 for i in range(len(ranks)):
-                    ranks[i] = self.mp_size - 1 - ranks[i]
-                    # ranks.append(self.mp_size - 1 - ranks[i])
+                    # ranks[i] = self.mp_size - 1 - ranks[i]
+                    ranks.append(self.mp_size - 1 - ranks[i])
             print("RANKS:", ranks)
             temp_spatial_allreduce_grp = torch.distributed.new_group(ranks=ranks)
 
@@ -225,7 +225,7 @@ def create_allreduce_comm_spatial(self):
                     print(
                         "first_spatial_allreduce_grp", self.rank, self.local_rank, ranks
                     )
-                if (
+                elif (
                     self.spatial_size == 1
                     and second_local_rank < self.num_spatial_parts
                 ):
diff --git a/src/torchgems/train_spatial.py b/src/torchgems/train_spatial.py
index 60cb39a7..9f6c0a1e 100644
--- a/src/torchgems/train_spatial.py
+++ b/src/torchgems/train_spatial.py
@@ -22,6 +22,46 @@
 import torch.distributed as dist
 
 
+def isPowerTwo(num):
+    return not (num & (num - 1))
+
+
+"""
+For SP, image size and image size after partitioning should be power of two.
+As, while performing convolution operations at different layers, odd input size
+(i.e. image size which is not power of 2) will lead to truncation of input. Thus,
+other GPU devices will receive truncated input with unexpected input size.
+"""
+
+
+def verify_spatial_config(slice_method, image_size, num_spatial_parts_list):
+    spatial_part_size = num_spatial_parts_list[
+        0
+    ]  # Partition size for spatial parallelism
+
+    assert slice_method in [
+        "square",
+        "vertical",
+        "horizontal",
+    ], "Possible slice methods are ['square', 'vertical', 'horizontal']"
+
+    assert isPowerTwo(int(image_size)), "Image size should be power of Two"
+
+    if slice_method == "square":
+        assert isPowerTwo(
+            int(image_size / math.sqrt(spatial_part_size))
+        ), "Image size of each partition should be power of Two"
+    else:
+        assert isPowerTwo(
+            int(image_size / spatial_part_size)
+        ), "Image size of each partition should be power of Two"
+
+    for each_part_size in num_spatial_parts_list:
+        assert (
+            each_part_size == spatial_part_size
+        ), "Size of each SP partition should be same"
+
+
 def get_shapes_spatial(
     shape_list, slice_method, spatial_size, num_spatial_parts_list, image_size_times
 ):
@@ -1317,7 +1357,16 @@ def forward_pass(self, data_x, data_y, part_number=0):
                 self.parts,
             )
             # print("MODEL :", " rank ", self.local_rank, self.models)
+            num_gpus = torch.cuda.device_count()
+
+            # Print information about each GPU device
+            for i in range(num_gpus):
+                gpu_properties = torch.cuda.get_device_properties(i)
+                print(f"GPU {i}: {gpu_properties.name}")
+                print(f"  Total Memory: {gpu_properties.total_memory / (1024 ** 2)} MB")
+
             y = self.models(input_x)
+            # NVMN -?GPU MEMORY  UTILIZATIOON
 
         print(
             "train_spatial:forward_pass: DONE_MODEL_TRAIN",
diff --git a/src/torchgems/train_spatial_master.py b/src/torchgems/train_spatial_master.py
index ab44ad09..2492cec6 100644
--- a/src/torchgems/train_spatial_master.py
+++ b/src/torchgems/train_spatial_master.py
@@ -17,11 +17,38 @@
 # limitations under the License.
 
 
-from torchgems.train_spatial import train_model_spatial
+from torchgems.train_spatial import train_model_spatial, verify_spatial_config
 import torch
 import torch.distributed as dist
 
 
+def isPowerTwo(num):
+    return not (num & (num - 1))
+
+
+"""
+TBD : Update comments
+For SP+MASTER, image size and image size after partitioning should be power of two.
+As, while performing convolution operations at different layers, odd input size
+(i.e. image size which is not power of 2) will lead to truncation of input. Thus,
+other GPU devices will receive truncated input with unexpected input size.
+"""
+
+
+def verify_spatial_master_config(
+    slice_method, image_size, num_spatial_parts_list, spatial_size, mp_size
+):
+    spatial_part_size = num_spatial_parts_list[
+        0
+    ]  # Partition size for spatial parallelism
+
+    verify_spatial_config(slice_method, image_size, num_spatial_parts_list)
+
+    assert mp_size >= 2 * (
+        spatial_part_size
+    ), "Spatial parts from each models i.e. model1 and model2 should use different ranks (cuda devices); To avoid this, increase the split size by keeping other configuration same."
+
+
 class train_spatial_model_master:
     def __init__(
         self,
@@ -397,7 +424,7 @@ def run_step(self, inputs, labels):
         # torch.cuda.empty_cache()
         print("START RUN_STEP MODEL1", "rank ", self.local_rank)
 
-        # self.train_model1.models = self.train_model1.models.to('cuda')
+        self.train_model1.models = self.train_model1.models.to("cuda")
         temp_loss, temp_correct = self.train_model1.run_step(
             inputs[: self.batch_size], labels[: self.batch_size]
         )
@@ -405,18 +432,18 @@ def run_step(self, inputs, labels):
         loss += temp_loss
         correct += temp_correct
 
-        # torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
         print("START RUN_STEP MODEL2", "rank ", self.local_rank)
-        # self.train_model1.models = self.train_model1.models.to('cpu')
-        # self.train_model2.models = self.train_model2.models.to('cuda')
+        self.train_model1.models = self.train_model1.models.to("cpu")
+        self.train_model2.models = self.train_model2.models.to("cuda")
         temp_loss, temp_correct = self.train_model2.run_step(
             inputs[self.batch_size : 2 * self.batch_size],
             labels[self.batch_size : 2 * self.batch_size],
         )
         print("END RUN_STEP MODEL2", "rank ", self.local_rank)
-        # self.train_model2.models = self.train_model2.models.to('cpu')
+        self.train_model2.models = self.train_model2.models.to("cpu")
 
-        # torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
 
         loss += temp_loss
         correct += temp_correct

From 3bef14bc1dbe27e5c5f9adfbc6df8e9e6bd1e3e2 Mon Sep 17 00:00:00 2001
From: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
Date: Fri, 3 Nov 2023 11:17:14 -0400
Subject: [PATCH 06/11] Add comments

Signed-off-by: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
---
 src/torchgems/train_spatial_master.py | 40 +++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/src/torchgems/train_spatial_master.py b/src/torchgems/train_spatial_master.py
index 2492cec6..6c5bad2d 100644
--- a/src/torchgems/train_spatial_master.py
+++ b/src/torchgems/train_spatial_master.py
@@ -44,6 +44,46 @@ def verify_spatial_master_config(
 
     verify_spatial_config(slice_method, image_size, num_spatial_parts_list)
 
+    # Spatial parts from each models i.e. model1 and model2 should use different ranks (cuda devices):
+    # Example =>
+    # Consider following configurations.
+    # split_size = 2, spatial_size = 1, num_spatial_parts = 4
+    # This configuration is not valid, as ranks 1, 2, 3 are used by spatial parts of both models.
+    #  Model 1:
+    #  _______________        ____
+    # |   0(0)|  1(1) |      |    |
+    # |-------|-------|----->|4(4)|
+    # |  2(2) |  3(3) |      |    |
+    # |_______|_______|      |____|
+    #
+    # Model 2 (INVERSE GEMS):
+    #  _______________        ____
+    # |  0(4) |  1(3) |      |    |
+    # |-------|-------|----->|4(0)|
+    # |  2(2) |  3(1) |      |    |
+    # |_______|_______|      |____|
+    #
+    # Numbers inside the brackets () refer to World Rank
+    # whereas outside numbers refer to local rank for each model
+    #
+    # Valid configurations :
+    # split_size = 5, spatial_size = 1, num_spatial_parts = 4 is valid, as no rank is used by spatial parts of both models.
+    #  Model 1:
+    #  _______________        ____        ____        ____        ____
+    # |  0(0) |  1(1) |      |    |      |    |      |    |      |    |
+    # |-------|-------|----->|4(4)|----->|5(5)|----->|6(6)|----->|7(7)|
+    # |  2(2) |  3(3) |      |    |      |    |      |    |      |    |
+    # |_______|_______|      |____|      |____|      |____|      |____|
+    #
+    # Model 2 (INVERSE GEMS):
+    #  _______________        ____        ____        ____        ____
+    # |  0(7) |  1(6) |      |    |      |    |      |    |      |    |
+    # |-------|-------|----->|4(3)|----->|5(2)|----->|6(1)|----->|7(0)|
+    # |  2(5) |  3(4) |      |    |      |    |      |    |      |    |
+    # |_______|_______|      |____|      |____|      |____|      |____|
+    #
+    # Numbers inside the brackets () refer to World Rank
+    # whereas outside numbers refer to local rank for each model
     assert mp_size >= 2 * (
         spatial_part_size
     ), "Spatial parts from each models i.e. model1 and model2 should use different ranks (cuda devices); To avoid this, increase the split size by keeping other configuration same."

From 7d0d186d3e780e9fe886f794172993cd98771c5e Mon Sep 17 00:00:00 2001
From: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
Date: Sun, 5 Nov 2023 03:21:45 -0500
Subject: [PATCH 07/11] Refactor the code

Signed-off-by: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
---
 .../benchmark_amoebanet_gems+spatial.py       | 199 +++++++-------
 .../benchmark_resnet_gems+spatial.py          | 242 ++++++++++--------
 .../benchmark_resnet_sp.py                    |   4 +-
 src/torchgems/comm.py                         |  15 +-
 src/torchgems/mp_pipeline.py                  |  51 +---
 src/torchgems/train_spatial.py                | 128 +--------
 src/torchgems/train_spatial_master.py         |  15 +-
 7 files changed, 242 insertions(+), 412 deletions(-)

diff --git a/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py b/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
index 6bd627bb..43817ccc 100644
--- a/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
+++ b/benchmarks/gems_model/benchmark_amoebanet_gems+spatial.py
@@ -1,3 +1,21 @@
+# Copyright 2023, The Ohio State University. All rights reserved.
+# The MPI4DL software package is developed by the team members of
+# The Ohio State University's Network-Based Computing Laboratory (NBCL),
+# headed by Professor Dhabaleswar K. (DK) Panda.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 import torch.distributed as dist
 import torchvision.transforms as transforms
@@ -6,6 +24,7 @@
 import time
 import sys
 import math
+import logging
 from torchgems import parser
 from torchgems.mp_pipeline import model_generator
 from torchgems.train_spatial import get_shapes_spatial, split_input
@@ -18,6 +37,9 @@
 parser_obj = parser.get_parser()
 args = parser_obj.parse_args()
 
+if args.verbose:
+    logging.basicConfig(level=logging.DEBUG)
+
 if args.halo_d2:
     from models import amoebanet
     from models import amoebanet_d2
@@ -63,46 +85,47 @@ def get_depth(version, n):
 
 # Example of GEMS + SPATIAL split_size = 2, spatial_size = 1, num_spatial_parts = 4
 #
+# The diagram below shows split_size = 5, spatial_size = 1, num_spatial_parts = 4, which is valid because no world rank hosts spatial parts of both models.
 #  Model 1:
-#  _______________             ____
-# |   0(0)|  1(1) |           |    |
-# |-------|-------| --------->|4(4)|
-# |  2(2) |  3(3) |           |    |
-# |_______|_______|           |____|
+#  _______________        ____        ____        ____        ____
+# |  0(0) |  1(1) |      |    |      |    |      |    |      |    |
+# |-------|-------|----->|4(4)|----->|5(5)|----->|6(6)|----->|7(7)|
+# |  2(2) |  3(3) |      |    |      |    |      |    |      |    |
+# |_______|_______|      |____|      |____|      |____|      |____|
 #
 # Model 2 (INVERSE GEMS):
-#  _______________             ____
-# |  0(4) |  1(3) |           |    |
-# |-------|-------| --------->|4(0)|
-# |  2(2) |  3(1) |           |    |
-# |_______|_______|           |____|
+#  _______________        ____        ____        ____        ____
+# |  0(7) |  1(6) |      |    |      |    |      |    |      |    |
+# |-------|-------|----->|4(3)|----->|5(2)|----->|6(1)|----->|7(0)|
+# |  2(5) |  3(4) |      |    |      |    |      |    |      |    |
+# |_______|_______|      |____|      |____|      |____|      |____|
 #
 # Numbers inside the brackets () refer to World Rank
 # whereas outside numbers refer to local rank for each model
 
-# torch.set_num_threads(1)
 np.random.seed(seed=1405)
 parts = args.parts
 batch_size = args.batch_size
 resnet_n = 12
-epoch = args.num_epochs
+epochs = args.num_epochs
 ENABLE_ASYNC = True
 
 # APP
 # 1: Medical
 # 2: Cifar
 # 3: synthetic
-APP = 3
-amoebanet_test = False
+APP = args.app
 image_size = int(args.image_size)
-print("image size", image_size)
-steps = 100
 num_layers = args.num_layers
 num_filters = args.num_filters
 balance = args.balance
 split_size = args.split_size
 spatial_size = args.spatial_size
 slice_method = args.slice_method
+times = args.times
+datapath = args.datapath
+num_classes = args.num_classes
+LOCAL_DP_LP = args.local_DP
 ENABLE_MASTER_OPT = args.enable_master_comm_opt
 
 temp_num_spatial_parts = args.num_spatial_parts.split(",")
@@ -116,11 +139,6 @@ def get_depth(version, n):
 
 spatial_part_size = num_spatial_parts_list[0]  # Partition size for spatial parallelism
 
-times = args.times
-num_classes = args.num_classes
-LOCAL_DP_LP = args.local_DP
-
-
 mpi_comm_first = gems_comm.MPIComm(
     split_size=split_size,
     ENABLE_MASTER=False,
@@ -150,12 +168,6 @@ def get_depth(version, n):
 
 gems_comm.sync_comms_for_master(mpi_comm_first, mpi_comm_second)
 comm_size = mpi_comm_first.size
-# rank = mpi_comm.local_rank
-# comm_size = mpi_comm.size
-# local_rank = rank
-
-# split_rank = mpi_comm.split_rank
-
 
 if args.balance != None:
     balance = args.balance.split(",")
@@ -164,24 +176,38 @@ def get_depth(version, n):
     balance = None
 
 
+##################### AmoebaNet model specific parameters #####################
+
+"""
+"image_size_seq" is required to determine the output shape after spatial partitioning of images.
+The shape of the output will be determined for each model partition based on the values in "image_size_seq."
+These values will then be used to calculate the output shape for a given input size and spatial partition.
+"""
 image_size_seq = 512
 
+###############################################################################
+
+# Initialize AmoebaNet model
 model_seq = amoebanet.amoebanetd(
     num_layers=num_layers, num_filters=num_filters, num_classes=num_classes
 )
-print("length", len(model_seq), balance)
+
+# Initialize parameters for Model Parallelism
 model_gen_seq = model_generator(
     model=model_seq,
     split_size=split_size,
     input_size=(int(batch_size / parts), 3, image_size_seq, image_size_seq),
     balance=balance,
 )
+# Get the shape of model on each split rank for image_size_seq and move it to device
+# Note : we take shape w.r.t image_size_seq as model w.r.t image_size may not be
+# able to fit in memory
 model_gen_seq.ready_model(
     split_rank=mpi_comm_second.split_rank, GET_SHAPES_ON_CUDA=True
 )
 
+# Get the shape of model on each split rank for image_size and number of spatial parts
 image_size_times = int(image_size / image_size_seq)
-
 resnet_shapes_list = get_shapes_spatial(
     shape_list=model_gen_seq.shape_list,
     slice_method=slice_method,
@@ -190,13 +216,11 @@ def get_depth(version, n):
     image_size_times=image_size_times,
 )
 
-print(model_gen_seq.shape_list, resnet_shapes_list)
-
 del model_seq
 del model_gen_seq
 torch.cuda.ipc_collect()
 
-
+# Initialize AmoebaNet model with Spatial and Model Parallelism support
 if args.halo_d2:
     model1 = amoebanet_d2.amoebanetd_spatial(
         local_rank=mpi_comm_first.local_rank % mpi_comm_first.total_spatial_processes,
@@ -254,9 +278,12 @@ def get_depth(version, n):
     balance=balance,
     shape_list=resnet_shapes_list,
 )
-model_gen1.ready_model(split_rank=mpi_comm_first.split_rank)
-# model_gen1.DDP_model(mpi_comm_first, num_spatial_parts, spatial_size, bucket_size=25, local_rank = mpi_comm_first.local_rank)
 
+# Move the model to its respective devices
+model_gen1.ready_model(split_rank=mpi_comm_first.split_rank)
+logging.info(
+    f"Shape of model1 on local_rank {mpi_comm_first.local_rank } : {model_gen1.shape_list}"
+)
 
 model_gen2 = model_generator(
     model=model2,
@@ -265,22 +292,13 @@ def get_depth(version, n):
     balance=balance,
     shape_list=resnet_shapes_list,
 )
-model_gen2.ready_model(split_rank=mpi_comm_second.split_rank)
-# model_gen2.DDP_model(mpi_comm_second, num_spatial_parts, spatial_size, bucket_size=25, local_rank = mpi_comm_second.local_rank)
-
-
-# model_gen.mp_size = 5
-print("Shape list", resnet_shapes_list)
-
-
-# t_s1 = train_model_spatial(model_gen1, mpi_comm_first.local_rank,batch_size,epochs=1, spatial_size=spatial_size, num_spatial_parts=num_spatial_parts ,criterion=None,optimizer=None,parts=parts,ASYNC=True,GEMS_INVERSE=False, slice_method = args.slice_method,
-# 							LOCAL_DP_LP=LOCAL_DP_LP,
-# 							mpi_comm = mpi_comm_first)
 
+# Move the model to its respective devices
+model_gen2.ready_model(split_rank=mpi_comm_second.split_rank)
+logging.info(
+    f"Shape of model2 on local_rank {mpi_comm_second.local_rank } : {model_gen2.shape_list}"
+)  # model2 is readied with mpi_comm_second, so report that communicator's local rank
 
-# t_s2 = train_model_spatial(model_gen2, mpi_comm_second.local_rank,batch_size,epochs=1, spatial_size=spatial_size, num_spatial_parts=num_spatial_parts ,criterion=None,optimizer=None,parts=parts,ASYNC=True,GEMS_INVERSE=True, slice_method = args.slice_method,
-# 							LOCAL_DP_LP=LOCAL_DP_LP,
-# 							mpi_comm = mpi_comm_second)
 
 t_s_master = train_spatial_model_master(
     model_gen1,
@@ -299,11 +317,7 @@ def get_depth(version, n):
     replications=int(args.times / 2),
 )
 
-x = torch.zeros(
-    (batch_size, 3, int(image_size / 2), int(image_size / 2)), device="cuda"
-)
-y = torch.zeros((batch_size,), dtype=torch.long, device="cuda")
-
+############################## Dataset Definition ##############################
 
 transform = transforms.Compose(
     [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
@@ -312,7 +326,7 @@ def get_depth(version, n):
 
 if APP == 1:
     trainset = torchvision.datasets.ImageFolder(
-        "/usr/workspace/jain8/project/cancer/1024_1024_5/train",
+        datapath,
         transform=transform,
         target_transform=None,
     )
@@ -325,8 +339,16 @@ def get_depth(version, n):
     )
     size_dataset = 1030
 elif APP == 2:
+    transform = transforms.Compose(
+        [
+            transforms.Resize((512, 512)),
+            transforms.ToTensor(),
+            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+        ]
+    )
+    torch.manual_seed(0)
     trainset = torchvision.datasets.CIFAR10(
-        root="./data", train=True, download=True, transform=transform
+        root=datapath, train=True, download=True, transform=transform
     )
     my_dataloader = torch.utils.data.DataLoader(
         trainset,
@@ -335,7 +357,7 @@ def get_depth(version, n):
         num_workers=0,
         pin_memory=True,
     )
-    size_dataset = 50000
+    size_dataset = len(my_dataloader.dataset)
 else:
     my_dataset = torchvision.datasets.FakeData(
         size=10 * batch_size * args.times,
@@ -354,37 +376,30 @@ def get_depth(version, n):
     )
     size_dataset = 10 * batch_size
 
-
-# sync_allreduce.sync_model_spatial(model_gen)
-perf = []
+################################################################################
 
 sync_comm = gems_comm.SyncAllreduce(mpi_comm_first)
 
+################################# Train Model ##################################
 
-MASTER = args.times
-
-print("ENABLE_MASTER_OPT", ENABLE_MASTER_OPT)
+perf = []
 
 
 def run_epoch():
-    for i_e in range(epoch):
+    for i_e in range(epochs):
         loss = 0
         correct = 0
         t = time.time()
-        for i, data in enumerate(my_dataloader, 0):
+        size = len(my_dataloader.dataset)
+        for batch, data in enumerate(my_dataloader, 0):
             start_event = torch.cuda.Event(enable_timing=True, blocking=True)
             end_event = torch.cuda.Event(enable_timing=True, blocking=True)
             start_event.record()
-            if i > math.floor(size_dataset / (times * batch_size)) - 1:
+            if batch > math.floor(size_dataset / (times * batch_size)) - 1:
                 break
-            # inputs=data_x
-            # labels = data_y
-            inputs, labels = data
 
-            # inputs = inputs.to(device)
-            # labels = labels.to(device)
+            inputs, labels = data
 
-            # t= time.time()
             if mpi_comm_first.local_rank < num_spatial_parts_list[0]:
                 x = split_input(
                     inputs=inputs,
@@ -404,20 +419,15 @@ def run_epoch():
             else:
                 x = inputs
 
-            # for j in range(MASTER):
-
-            # 	temp_loss,temp_correct = t_s1.run_step(x,labels)
-            # 	temp_loss,temp_correct = t_s2.run_step(x,labels)
-
             if ENABLE_MASTER_OPT:
-                temp_loss, temp_correct = t_s_master.run_step_allreduce(
-                    x, labels, i % 2 == 1
+                local_loss, local_correct = t_s_master.run_step_allreduce(
+                    x, labels, batch % 2 == 1
                 )
             else:
-                temp_loss, temp_correct = t_s_master.run_step(x, labels)
+                local_loss, local_correct = t_s_master.run_step(x, labels)
 
-            loss += temp_loss
-            correct += temp_correct
+            loss += local_loss
+            correct += local_correct
 
             start_event_allreduce = torch.cuda.Event(enable_timing=True, blocking=True)
             end_event_allreduce = torch.cuda.Event(enable_timing=True, blocking=True)
@@ -425,21 +435,13 @@ def run_epoch():
             t_allreduce_temp = time.time()
 
             if ENABLE_MASTER_OPT == False:
-                print("benchmark_amoebanet_gems+spatial : START ALL REDUCE OPERATION")
                 sync_comm.apply_allreduce_master_master(
                     model_gen1, model_gen2, mpi_comm_first, mpi_comm_second
                 )
-
-            """
-			if(local_rank < spatial_size * num_spatial_parts):
-				None
-				#No need for this as, DDP is now used
-				# sync_allreduce.apply_allreduce(model_gen,mpi_comm.spatial_allreduce_grp)
-			"""
             torch.cuda.synchronize()
 
             if ENABLE_MASTER_OPT:
-                if i % 2 == 1:
+                if batch % 2 == 1:
                     t_s_master.train_model1.update()
                 else:
                     t_s_master.train_model2.update()
@@ -453,8 +455,9 @@ def run_epoch():
             t_allreduce = time.time() - t_allreduce_temp
 
             if mpi_comm_second.local_rank == comm_size - 1:
-                None
-                # print("Step",i," LOSS",temp_loss, " Global loss:",loss/(i+1), " Acc:",temp_correct)
+                logging.info(
+                    f"Step :{batch}, LOSS: {local_loss}, Global loss: {loss/(batch+1)} Acc: {local_correct} [{batch * len(inputs):>5d}/{size:>5d}]"
+                )
 
             if ENABLE_MASTER_OPT:
                 torch.distributed.barrier()
@@ -463,24 +466,20 @@ def run_epoch():
             torch.cuda.synchronize()
             t = start_event.elapsed_time(end_event) / 1000
             if mpi_comm_second.local_rank == 0:
-                None
                 print(
-                    "images per sec:",
-                    batch_size / t,
-                    "Time:",
-                    t,
-                    " Time Allreduce:",
-                    t_allreduce,
+                    f"Epoch: {i_e} images per sec:{batch_size / t} Time:{t} Time Allreduce:{t_allreduce}"
                 )
                 perf.append(batch_size / t)
 
             t = time.time()
         if mpi_comm_second.local_rank == comm_size - 1:
-            print("epoch", i_e, " Global loss:", loss, " acc", correct / i)
+            print(f"Epoch {i_e} Global loss: {loss / batch} Acc {correct / batch}")
 
 
 run_epoch()
 
+################################################################################
+
 if mpi_comm_second.local_rank == 0:
     print("Mean {} Median {}".format(sum(perf) / len(perf), np.median(perf)))
 
diff --git a/benchmarks/gems_model/benchmark_resnet_gems+spatial.py b/benchmarks/gems_model/benchmark_resnet_gems+spatial.py
index 743e2b61..039a66be 100644
--- a/benchmarks/gems_model/benchmark_resnet_gems+spatial.py
+++ b/benchmarks/gems_model/benchmark_resnet_gems+spatial.py
@@ -1,3 +1,21 @@
+# Copyright 2023, The Ohio State University. All rights reserved.
+# The MPI4DL software package is developed by the team members of
+# The Ohio State University's Network-Based Computing Laboratory (NBCL),
+# headed by Professor Dhabaleswar K. (DK) Panda.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 import torch.distributed as dist
 import torchvision.transforms as transforms
@@ -6,6 +24,7 @@
 import time
 import sys
 import math
+import logging
 from torchgems import parser
 from torchgems.mp_pipeline import model_generator
 from torchgems.train_spatial import get_shapes_spatial, split_input
@@ -16,12 +35,33 @@
 import torchgems.comm as gems_comm
 from models import resnet
 
+# Example of GEMS + SPATIAL split_size = 5, spatial_size = 1, num_spatial_parts = 4
+#
+# split_size = 2, spatial_size = 1, num_spatial_parts = 4 is not valid, as ranks 1, 2, 3 would be used by the spatial parts of both models.
+#  Model 1:
+#  _______________        ____        ____        ____        ____
+# |  0(0) |  1(1) |      |    |      |    |      |    |      |    |
+# |-------|-------|----->|4(4)|----->|5(5)|----->|6(6)|----->|7(7)|
+# |  2(2) |  3(3) |      |    |      |    |      |    |      |    |
+# |_______|_______|      |____|      |____|      |____|      |____|
+#
+# Model 2 (INVERSE GEMS):
+#  _______________        ____        ____        ____        ____
+# |  0(7) |  1(6) |      |    |      |    |      |    |      |    |
+# |-------|-------|----->|4(3)|----->|5(2)|----->|6(1)|----->|7(0)|
+# |  2(5) |  3(4) |      |    |      |    |      |    |      |    |
+# |_______|_______|      |____|      |____|      |____|      |____|
+#
+# Numbers inside the brackets () refer to World Rank
+# whereas outside numbers refer to local rank for each model
 
 parser_obj = parser.get_parser()
 args = parser_obj.parse_args()
 
+if args.verbose:
+    logging.basicConfig(level=logging.DEBUG)
+
 if args.halo_d2:
-    # from models import resnet
     from models import resnet_spatial_d2 as resnet_spatial
 else:
     from models import resnet_spatial
@@ -53,57 +93,29 @@ def init_processes(backend="tcp"):
     return size, rank
 
 
-def get_depth(version, n):
-    if version == 1:
-        return n * 6 + 2
-    elif version == 2:
-        return n * 9 + 2
-
-
 sys.stdout = Unbuffered(sys.stdout)
 
-# Example of GEMS + SPATIAL split_size = 2, spatial_size = 1, num_spatial_parts = 4
-#
-#  Model 1:
-#  _______________             ____
-# |   0(0)|  1(1) |           |    |
-# |-------|-------| --------->|4(4)|
-# |  2(2) |  3(3) |           |    |
-# |_______|_______|           |____|
-#
-# Model 2 (INVERSE GEMS):
-#  _______________             ____
-# |  0(4) |  1(3) |           |    |
-# |-------|-------| --------->|4(0)|
-# |  2(2) |  3(1) |           |    |
-# |_______|_______|           |____|
-#
-# Numbers inside the brackets () refer to World Rank
-# whereas outside numbers refer to local rank for each model
-
-# torch.set_num_threads(1)
 np.random.seed(seed=1405)
+
+ENABLE_ASYNC = True
 parts = args.parts
 batch_size = args.batch_size
-resnet_n = 12
-epoch = args.num_epochs
-ENABLE_ASYNC = True
-
-# APP
-# 1: Medical
-# 2: Cifar
-# 3: synthetic
-APP = 3
+epochs = args.num_epochs
 image_size = int(args.image_size)
-print("image size", image_size)
-steps = 100
-num_layers = args.num_layers
-num_filters = args.num_filters
 balance = args.balance
 split_size = args.split_size
 spatial_size = args.spatial_size
 slice_method = args.slice_method
+times = args.times
+datapath = args.datapath
+num_classes = args.num_classes
+LOCAL_DP_LP = args.local_DP
 ENABLE_MASTER_OPT = args.enable_master_comm_opt
+# APP
+# 1: Medical
+# 2: Cifar
+# 3: synthetic
+APP = args.app
 
 temp_num_spatial_parts = args.num_spatial_parts.split(",")
 
@@ -114,10 +126,25 @@ def get_depth(version, n):
     num_spatial_parts = [int(i) for i in temp_num_spatial_parts]
     num_spatial_parts_list = num_spatial_parts
 
-times = args.times
-num_classes = args.num_classes
-LOCAL_DP_LP = args.local_DP
+################## ResNet model specific parameters/functions ##################
 
+"""
+"image_size_seq" is required to determine the output shape after spatial partitioning of images.
+The shape of the output will be determined for each model partition based on the values in "image_size_seq."
+These values will then be used to calculate the output shape for a given input size and spatial partition.
+"""
+image_size_seq = 32
+resnet_n = 12
+
+
+def get_depth(version, n):
+    if version == 1:
+        return n * 6 + 2
+    elif version == 2:
+        return n * 9 + 2
+
+
+###############################################################################
 
 mpi_comm_first = gems_comm.MPIComm(
     split_size=split_size,
@@ -148,12 +175,6 @@ def get_depth(version, n):
 
 gems_comm.sync_comms_for_master(mpi_comm_first, mpi_comm_second)
 comm_size = mpi_comm_first.size
-# rank = mpi_comm.local_rank
-# comm_size = mpi_comm.size
-# local_rank = rank
-
-# split_rank = mpi_comm.split_rank
-
 
 if args.balance != None:
     balance = args.balance.split(",")
@@ -161,26 +182,29 @@ def get_depth(version, n):
 else:
     balance = None
 
-
-image_size_seq = 32
-
+# Initialize ResNet model
 model_seq = resnet.get_resnet_v2(
     (int(batch_size / parts), 3, image_size_seq, image_size_seq),
     depth=get_depth(2, resnet_n),
 )
-print("length", len(model_seq), balance)
+
 model_gen_seq = model_generator(
     model=model_seq,
     split_size=split_size,
     input_size=(int(batch_size / parts), 3, image_size_seq, image_size_seq),
     balance=balance,
 )
+
+# Get the shape of the model on each split rank for image_size_seq and move it to the device
+# Note: shapes are taken w.r.t. image_size_seq because the model built for image_size
+# may not fit in memory
 model_gen_seq.ready_model(
     split_rank=mpi_comm_second.split_rank, GET_SHAPES_ON_CUDA=True
 )
 
 image_size_times = int(image_size / image_size_seq)
 
+# Get the shape of model on each split rank for image_size and number of spatial parts
 resnet_shapes_list = get_shapes_spatial(
     shape_list=model_gen_seq.shape_list,
     slice_method=slice_method,
@@ -189,13 +213,11 @@ def get_depth(version, n):
     image_size_times=image_size_times,
 )
 
-print(model_gen_seq.shape_list, resnet_shapes_list)
-
 del model_seq
 del model_gen_seq
 torch.cuda.ipc_collect()
 
-
+# Initialize ResNet model with Spatial and Model Parallelism support
 if args.halo_d2:
     model1, balance = resnet_spatial.get_resnet_v2(
         input_shape=(batch_size / parts, 3, image_size, image_size),
@@ -257,9 +279,12 @@ def get_depth(version, n):
     balance=balance,
     shape_list=resnet_shapes_list,
 )
-model_gen1.ready_model(split_rank=mpi_comm_first.split_rank)
-# model_gen1.DDP_model(mpi_comm_first, num_spatial_parts, spatial_size, bucket_size=25, local_rank = mpi_comm_first.local_rank)
 
+# Move model to its respective devices
+model_gen1.ready_model(split_rank=mpi_comm_first.split_rank)
+logging.info(
+    f"Shape of model1 on local_rank {mpi_comm_first.local_rank } : {model_gen1.shape_list}"
+)
 
 model_gen2 = model_generator(
     model=model2,
@@ -268,12 +293,12 @@ def get_depth(version, n):
     balance=balance,
     shape_list=resnet_shapes_list,
 )
-model_gen2.ready_model(split_rank=mpi_comm_second.split_rank)
-# model_gen2.DDP_model(mpi_comm_second, num_spatial_parts, spatial_size, bucket_size=25, local_rank = mpi_comm_second.local_rank)
 
-
-# model_gen.mp_size = 5
-print("Shape list", resnet_shapes_list)
+# Move model to its respective devices
+model_gen2.ready_model(split_rank=mpi_comm_second.split_rank)
+logging.info(
+    f"Shape of model2 on local_rank {mpi_comm_first.local_rank } : {model_gen2.shape_list}"
+)
 
 t_s_master = train_spatial_model_master(
     model_gen1,
@@ -292,11 +317,7 @@ def get_depth(version, n):
     replications=int(args.times / 2),
 )
 
-x = torch.zeros(
-    (batch_size, 3, int(image_size / 2), int(image_size / 2)), device="cuda"
-)
-y = torch.zeros((batch_size,), dtype=torch.long, device="cuda")
-
+############################## Dataset Definition ##############################
 
 transform = transforms.Compose(
     [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
@@ -329,7 +350,7 @@ def get_depth(version, n):
         pin_memory=True,
     )
     size_dataset = 50000
-else:
+elif APP == 3:
     my_dataset = torchvision.datasets.FakeData(
         size=10 * batch_size * args.times,
         image_size=(3, image_size, image_size),
@@ -346,38 +367,52 @@ def get_depth(version, n):
         pin_memory=True,
     )
     size_dataset = 10 * batch_size
+else:
+    transform = transforms.Compose(
+        [
+            transforms.Resize((64, 64)),
+            transforms.ToTensor(),
+            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+        ]
+    )
+    trainset = torchvision.datasets.ImageFolder(
+        datapath,
+        transform=transform,
+        target_transform=None,
+    )
+    my_dataloader = torch.utils.data.DataLoader(
+        trainset,
+        batch_size=times * batch_size,
+        shuffle=True,
+        num_workers=0,
+        pin_memory=True,
+    )
+    size_dataset = len(my_dataloader.dataset)
 
-
-# sync_allreduce.sync_model_spatial(model_gen)
-perf = []
+################################################################################
 
 sync_comm = gems_comm.SyncAllreduce(mpi_comm_first)
 
+################################# Train Model ##################################
 
-MASTER = args.times
-
-print("ENABLE_MASTER_OPT", ENABLE_MASTER_OPT)
+perf = []
 
 
 def run_epoch():
-    for i_e in range(epoch):
+    for i_e in range(epochs):
         loss = 0
         correct = 0
         t = time.time()
-        for i, data in enumerate(my_dataloader, 0):
+        size = len(my_dataloader.dataset)
+        for batch, data in enumerate(my_dataloader, 0):
             start_event = torch.cuda.Event(enable_timing=True, blocking=True)
             end_event = torch.cuda.Event(enable_timing=True, blocking=True)
             start_event.record()
-            if i > math.floor(size_dataset / (times * batch_size)) - 1:
+            if batch > math.floor(size_dataset / (times * batch_size)) - 1:
                 break
-            # inputs=data_x
-            # labels = data_y
-            inputs, labels = data
 
-            # inputs = inputs.to(device)
-            # labels = labels.to(device)
+            inputs, labels = data
 
-            # t= time.time()
             if mpi_comm_first.local_rank < num_spatial_parts_list[0]:
                 x = split_input(
                     inputs=inputs,
@@ -398,14 +433,14 @@ def run_epoch():
                 x = inputs
 
             if ENABLE_MASTER_OPT:
-                temp_loss, temp_correct = t_s_master.run_step_allreduce(
-                    x, labels, i % 2 == 1
+                local_loss, local_correct = t_s_master.run_step_allreduce(
+                    x, labels, batch % 2 == 1
                 )
             else:
-                temp_loss, temp_correct = t_s_master.run_step(x, labels)
+                local_loss, local_correct = t_s_master.run_step(x, labels)
 
-            loss += temp_loss
-            correct += temp_correct
+            loss += local_loss
+            correct += local_correct
 
             start_event_allreduce = torch.cuda.Event(enable_timing=True, blocking=True)
             end_event_allreduce = torch.cuda.Event(enable_timing=True, blocking=True)
@@ -413,21 +448,13 @@ def run_epoch():
             t_allreduce_temp = time.time()
 
             if ENABLE_MASTER_OPT == False:
-                print("benchmark_resnet_gems+spatial : START ALL REDUCE OPERATION")
                 sync_comm.apply_allreduce_master_master(
                     model_gen1, model_gen2, mpi_comm_first, mpi_comm_second
                 )
-
-            """
-			if(local_rank < spatial_size * num_spatial_parts):
-				None
-				#No need for this as, DDP is now used
-				# sync_allreduce.apply_allreduce(model_gen,mpi_comm.spatial_allreduce_grp)
-			"""
             torch.cuda.synchronize()
 
             if ENABLE_MASTER_OPT:
-                if i % 2 == 1:
+                if batch % 2 == 1:
                     t_s_master.train_model1.update()
                 else:
                     t_s_master.train_model2.update()
@@ -441,8 +468,9 @@ def run_epoch():
             t_allreduce = time.time() - t_allreduce_temp
 
             if mpi_comm_second.local_rank == comm_size - 1:
-                None
-                # print("Step",i," LOSS",temp_loss, " Global loss:",loss/(i+1), " Acc:",temp_correct)
+                logging.info(
+                    f"Step :{batch}, LOSS: {local_loss}, Global loss: {loss/(batch+1)} Acc: {local_correct} [{batch * len(inputs):>5d}/{size:>5d}]"
+                )
 
             if ENABLE_MASTER_OPT:
                 torch.distributed.barrier()
@@ -451,24 +479,20 @@ def run_epoch():
             torch.cuda.synchronize()
             t = start_event.elapsed_time(end_event) / 1000
             if mpi_comm_second.local_rank == 0:
-                None
                 print(
-                    "images per sec:",
-                    batch_size / t,
-                    "Time:",
-                    t,
-                    " Time Allreduce:",
-                    t_allreduce,
+                    f"Epoch: {i_e} images per sec:{batch_size / t} Time:{t} Time Allreduce:{t_allreduce}"
                 )
                 perf.append(batch_size / t)
 
             t = time.time()
         if mpi_comm_second.local_rank == comm_size - 1:
-            print("epoch", i_e, " Global loss:", loss, " acc", correct / i)
+            print(f"Epoch {i_e} Global loss: {loss / batch} Acc {correct / batch}")
 
 
 run_epoch()
 
+################################################################################
+
 if mpi_comm_second.local_rank == 0:
     print("Mean {} Median {}".format(sum(perf) / len(perf), np.median(perf)))
 
diff --git a/benchmarks/spatial_parallelism/benchmark_resnet_sp.py b/benchmarks/spatial_parallelism/benchmark_resnet_sp.py
index e4459fc4..7029ade9 100644
--- a/benchmarks/spatial_parallelism/benchmark_resnet_sp.py
+++ b/benchmarks/spatial_parallelism/benchmark_resnet_sp.py
@@ -106,7 +106,7 @@ def init_processes(backend="mpi"):
 ################## ResNet model specific parameters/functions ##################
 
 """
-"image_size_seq" is required to determine the output shape after spatial partitioning of images. 
+"image_size_seq" is required to determine the output shape after spatial partitioning of images.
 The shape of the output will be determined for each model partition based on the values in "image_size_seq."
 These values will then be used to calculate the output shape for a given input size and spatial partition.
 """
@@ -129,7 +129,7 @@ def isPowerTwo(num):
 
 
 """
-For ResNet model, image size and image size after partitioning should be power of two. 
+For ResNet model, image size and image size after partitioning should be power of two.
 As, ResNet performs convolution operations at different layers, odd input size
 (i.e. image size which is not power of 2) will lead to truncation of input. Thus,
 other GPU devices will receive truncated input with unexpected input size.
diff --git a/src/torchgems/comm.py b/src/torchgems/comm.py
index 6cc9be46..9a5fbf88 100644
--- a/src/torchgems/comm.py
+++ b/src/torchgems/comm.py
@@ -65,7 +65,6 @@ def __init__(
                 - spatial_size
                 + (split_size - spatial_size) * (LOCAL_DP_LP - 1)
             )
-        print("MP_SIZE : ", self.mp_size)
 
         if DISABLE_INIT:
             self.rank = dist.get_rank()
@@ -214,28 +213,19 @@ def create_allreduce_comm_spatial(self):
 
             if self.ENABLE_MASTER:
                 for i in range(len(ranks)):
-                    # ranks[i] = self.mp_size - 1 - ranks[i]
                     ranks.append(self.mp_size - 1 - ranks[i])
-            print("RANKS:", ranks)
+
             temp_spatial_allreduce_grp = torch.distributed.new_group(ranks=ranks)
 
             if self.ENABLE_MASTER:
                 if self.spatial_size == 1 and first_local_rank < self.num_spatial_parts:
                     self.first_spatial_allreduce_grp = temp_spatial_allreduce_grp
-                    print(
-                        "first_spatial_allreduce_grp", self.rank, self.local_rank, ranks
-                    )
+
                 elif (
                     self.spatial_size == 1
                     and second_local_rank < self.num_spatial_parts
                 ):
                     self.second_spatial_allreduce_grp = temp_spatial_allreduce_grp
-                    print(
-                        "second_spatial_allreduce_grp",
-                        self.rank,
-                        self.local_rank,
-                        ranks,
-                    )
 
                 elif self.spatial_size > 1:
                     if first_local_rank < np.sum(
@@ -323,7 +313,6 @@ def sync_comms_for_master(comm1, comm2):
     # MASTER related communicators are in comm2
     first_local_rank = comm1.local_rank
     second_local_rank = comm2.local_rank
-    print("sync_comms_for_master", first_local_rank, second_local_rank)
 
     if first_local_rank < comm1.total_spatial_processes:
         comm1.spatial_allreduce_grp = comm2.first_spatial_allreduce_grp
diff --git a/src/torchgems/mp_pipeline.py b/src/torchgems/mp_pipeline.py
index b0b5479c..2cdb16ab 100644
--- a/src/torchgems/mp_pipeline.py
+++ b/src/torchgems/mp_pipeline.py
@@ -437,7 +437,6 @@ def forward_pass(self, data_x, data_y, part_number=0):
         # part_number: part number between 0 and self.parts-1 used to find right input recv buffer
 
         # Receive inputs if local is not 0
-        print("mp_pipeline:forward_pass: START", data_x.size(), data_y.size())
         if self.split_rank == 0:
             input_x = data_x
         else:
@@ -458,11 +457,6 @@ def forward_pass(self, data_x, data_y, part_number=0):
 
         torch.cuda.synchronize()
 
-        print(
-            "mp_pipeline:forward_pass: SEND_INPUT_OR_CAL_LOSS",
-            data_x.size(),
-            data_y.size(),
-        )
         if self.split_rank != self.split_size - 1:
             if self.ENABLE_ASYNC == True:
                 self.send_input_async(y)
@@ -471,7 +465,7 @@ def forward_pass(self, data_x, data_y, part_number=0):
 
         else:
             loss = self.criterion(y, data_y)
-        print("mp_pipeline:forward_pass: END", data_x.size(), data_y.size())
+
         if self.split_rank == self.split_size - 1:
             corrects = (data_y.eq(torch.argmax(y, dim=-1).long())).sum().float()
             return loss, corrects / self.batch_size
@@ -524,62 +518,19 @@ def run_step(self, data_x, data_y):
         for i in range(self.parts):
             start = i * parts_size
             end = (i + 1) * parts_size
-            print(
-                "mp_pipeline:train_model:run_step : START FORWARD PASS",
-                " rank :",
-                self.local_rank,
-                " inverse : ",
-                self.GEMS_INVERSE,
-                self.parts,
-                start,
-                end,
-            )
             temp_y, temp_correct = self.forward_pass(
                 data_x[start:end], data_y[start:end], part_number=i
             )
-
-            print(
-                "mp_pipeline:train_model:run_step : END FORWARD PASS",
-                " rank :",
-                self.local_rank,
-                " inverse : ",
-                self.GEMS_INVERSE,
-                self.parts,
-                start,
-                end,
-            )
-
             y_list.append(temp_y)
 
             if self.split_rank == self.split_size - 1:
                 loss += temp_y.item()
                 corrects += temp_correct.item()
 
-        print(
-            "mp_pipeline:train_model:run_step : START BACKWARD PASS",
-            " rank :",
-            self.local_rank,
-            " inverse : ",
-            self.GEMS_INVERSE,
-            self.parts,
-            start,
-            end,
-        )
         for i in range(self.parts):
             None
             self.backward_pass(y_list[i], part_number=i)
 
-        print(
-            "mp_pipeline:train_model:run_step : END BACKWARD PASS",
-            " rank :",
-            self.local_rank,
-            " inverse : ",
-            self.GEMS_INVERSE,
-            self.parts,
-            start,
-            end,
-        )
-
         return loss, corrects
 
     def update(self):
diff --git a/src/torchgems/train_spatial.py b/src/torchgems/train_spatial.py
index 9f6c0a1e..7f09d18b 100644
--- a/src/torchgems/train_spatial.py
+++ b/src/torchgems/train_spatial.py
@@ -685,16 +685,10 @@ def receive_input_async_joint(self, part_number, ranks):
         ranks = [
             self.local_rank - 1 - i for i in range(self.num_spatial_parts - 1, -1, -1)
         ]
-        print("receive_input_async_joint", " rank : ", self.local_rank, ranks)
+
         if self.GEMS_INVERSE:
             for i in range(len(ranks)):
                 ranks[i] = self.mp_size - 1 - ranks[i]
-            print(
-                "receive_input_async_joint if self.GEMS_INVERSE",
-                " rank : ",
-                self.local_rank,
-                ranks,
-            )
 
         reqs = []
 
@@ -1257,54 +1251,27 @@ def forward_pass(self, data_x, data_y, part_number=0):
         # data_x: input data
         # data_y: labels
         # part_number: part number between 0 and self.parts-1 used to find right input recv buffer
-        print(
-            "train_spatial:forward_pass: START",
-            data_x.size(),
-            data_y.size(),
-            self.local_rank,
-            self.GEMS_INVERSE,
-            "self.ENABLE_ASYNC",
-            self.ENABLE_ASYNC,
-            "self.split_rank",
-            self.split_rank,
-            "self.spatial_size",
-            self.spatial_size,
-            " self.total_spatial_processes",
-            self.total_spatial_processes,
-        )
+
         # Receive inputs if local is not 0
         if self.split_rank == 0:
             input_x = data_x
         else:
             if self.ENABLE_ASYNC == True:
-                print("self.ENABLE_ASYNC  == True")
                 if self.split_rank == self.spatial_size:
                     if self.ENABLE_LOCAL_DP_LP:
-                        print(
-                            "Calling recv_input_MP_joint_LP_DP",
-                            " rank : ",
-                            self.local_rank,
-                        )
                         self.recv_input_MP_joint_LP_DP(part_number)
                     else:
-                        print("Calling recv_inputs_joint", " rank : ", self.local_rank)
                         self.recv_inputs_joint(part_number)
                 elif self.SKEWED_RECV_SPATIAL:
-                    print("Calling recv_input_spatial", " rank : ", self.local_rank)
                     self.recv_input_spatial(part_number)
                 else:
-                    print("Calling receive_input_async", " rank : ", self.local_rank)
                     self.receive_input_async(part_number)
             else:
-                print("self.ENABLE_ASYNC  == False")
                 if self.local_rank == self.total_spatial_processes:
-                    print("Calling recv_inputs_joint", " rank : ", self.local_rank)
                     self.recv_inputs_joint(part_number)
                 elif self.SKEWED_RECV_SPATIAL:
-                    print("Calling recv_input_spatial", " rank : ", self.local_rank)
                     self.recv_input_spatial(part_number)
                 else:
-                    print("Calling receive_input_sync", " rank : ", self.local_rank)
                     self.receive_input_sync(part_number)
 
             # join spatial inputs
@@ -1318,13 +1285,6 @@ def forward_pass(self, data_x, data_y, part_number=0):
                 else:
                     input_x = self.input_x_list[part_number]
 
-        print(
-            "train_spatial:forward_pass: RECEIVED INPUTS",
-            " rank :",
-            self.local_rank,
-            self.GEMS_INVERSE,
-        )
-
         # Apply forward pass
 
         torch.cuda.synchronize()
@@ -1336,92 +1296,24 @@ def forward_pass(self, data_x, data_y, part_number=0):
             )
             and part_number != self.parts - 1
         ):
-            print(
-                "train_spatial:forward_pass: DP",
-                " rank :",
-                self.local_rank,
-                self.GEMS_INVERSE,
-                part_number,
-                self.parts,
-            )
             with self.models.no_sync():
-                # print("MODEL :", " rank ", self.local_rank, self.models)
                 y = self.models(input_x)
         else:
-            print(
-                "train_spatial:forward_pass: no_DP",
-                " rank :",
-                self.local_rank,
-                self.GEMS_INVERSE,
-                part_number,
-                self.parts,
-            )
-            # print("MODEL :", " rank ", self.local_rank, self.models)
-            num_gpus = torch.cuda.device_count()
-
-            # Print information about each GPU device
-            for i in range(num_gpus):
-                gpu_properties = torch.cuda.get_device_properties(i)
-                print(f"GPU {i}: {gpu_properties.name}")
-                print(f"  Total Memory: {gpu_properties.total_memory / (1024 ** 2)} MB")
-
             y = self.models(input_x)
-            # NVMN -?GPU MEMORY  UTILIZATIOON
 
-        print(
-            "train_spatial:forward_pass: DONE_MODEL_TRAIN",
-            " rank :",
-            self.local_rank,
-            self.GEMS_INVERSE,
-        )
         torch.cuda.synchronize()
-        print(
-            "train_spatial:forward_pass: CALCULATED_Y",
-            " rank :",
-            self.local_rank,
-            self.GEMS_INVERSE,
-        )
+
         if self.split_rank != self.split_size - 1:
             if self.ENABLE_ASYNC == True:
                 if self.split_rank == self.spatial_size - 1 and self.ENABLE_LOCAL_DP_LP:
-                    print(
-                        "train_spatial:forward_pass: calling self.ENABLE_ASYNC send_input_spatial_MP_joint_LP_DP",
-                        " rank :",
-                        self.local_rank,
-                        self.GEMS_INVERSE,
-                    )
                     self.send_input_spatial_MP_joint_LP_DP(y)
                 else:
-                    print(
-                        "train_spatial:forward_pass: calling self.ENABLE_ASYNC send_input_async",
-                        " rank :",
-                        self.local_rank,
-                        self.GEMS_INVERSE,
-                    )
                     self.send_input_async(y)
             else:
                 if self.split_rank == self.spatial_size - 1 and self.ENABLE_LOCAL_DP_LP:
-                    print(
-                        "train_spatial:forward_pass: calling send_input_spatial_MP_joint_LP_DP",
-                        " rank :",
-                        self.local_rank,
-                        self.GEMS_INVERSE,
-                    )
                     self.send_input_spatial_MP_joint_LP_DP(y)
                 else:
-                    print(
-                        "train_spatial:forward_pass: calling send_input_sync",
-                        " rank :",
-                        self.local_rank,
-                        self.GEMS_INVERSE,
-                    )
                     self.send_input_sync(y)
-            print(
-                "train_spatial:forward_pass: SENT_Y",
-                " rank :",
-                self.local_rank,
-                self.GEMS_INVERSE,
-            )
 
         else:
             pos = self.local_rank - (self.mp_size - self.LOCAL_DP_LP)
@@ -1441,20 +1333,6 @@ def forward_pass(self, data_x, data_y, part_number=0):
             else:
                 loss = self.criterion(y, data_y)
 
-            print(
-                "train_spatial:forward_pass: CALCULATED_LOSS",
-                " rank :",
-                self.local_rank,
-                self.GEMS_INVERSE,
-            )
-
-        print(
-            "train_spatial:forward_pass: END",
-            " rank :",
-            self.local_rank,
-            self.GEMS_INVERSE,
-        )
-
         if self.split_rank == self.split_size - 1:
             corrects = (data_y.eq(torch.argmax(y, dim=-1).long())).sum().float()
             return loss, corrects / self.batch_size
diff --git a/src/torchgems/train_spatial_master.py b/src/torchgems/train_spatial_master.py
index 6c5bad2d..f0b37135 100644
--- a/src/torchgems/train_spatial_master.py
+++ b/src/torchgems/train_spatial_master.py
@@ -27,8 +27,7 @@ def isPowerTwo(num):
 
 
 """
-TBD : Update comments
-For SP+MASTER, image size and image size after partitioning should be power of two.
+For SP, image size and image size after partitioning should be power of two.
 As, while performing convolution operations at different layers, odd input size
 (i.e. image size which is not power of 2) will lead to truncation of input. Thus,
 other GPU devices will receive truncated input with unexpected input size.
@@ -462,25 +461,21 @@ def run_step_allreduce(self, inputs, labels, odd_iteration):
     def run_step(self, inputs, labels):
         loss, correct = 0, 0
         # torch.cuda.empty_cache()
-        print("START RUN_STEP MODEL1", "rank ", self.local_rank)
 
         self.train_model1.models = self.train_model1.models.to("cuda")
         temp_loss, temp_correct = self.train_model1.run_step(
             inputs[: self.batch_size], labels[: self.batch_size]
         )
-        print("END RUN_STEP MODEL1", "rank ", self.local_rank)
         loss += temp_loss
         correct += temp_correct
 
         torch.cuda.empty_cache()
-        print("START RUN_STEP MODEL2", "rank ", self.local_rank)
         self.train_model1.models = self.train_model1.models.to("cpu")
         self.train_model2.models = self.train_model2.models.to("cuda")
         temp_loss, temp_correct = self.train_model2.run_step(
             inputs[self.batch_size : 2 * self.batch_size],
             labels[self.batch_size : 2 * self.batch_size],
         )
-        print("END RUN_STEP MODEL2", "rank ", self.local_rank)
         self.train_model2.models = self.train_model2.models.to("cpu")
 
         torch.cuda.empty_cache()
@@ -488,26 +483,20 @@ def run_step(self, inputs, labels):
         loss += temp_loss
         correct += temp_correct
 
-        print("Calculated loss and accuracy for MODEL1 AND MODEL2")
-
         torch.cuda.synchronize()
         for times in range(self.replications - 1):
             index = (2 * times) + 2
-            print("Times :", times)
-            print("START RUN_STEP MODEL1")
             temp_loss, temp_correct = self.train_model1.run_step(
                 inputs[index * self.batch_size : (index + 1) * self.batch_size],
                 labels[index * self.batch_size : (index + 1) * self.batch_size],
             )
-            print("END RUN_STEP MODEL1")
             loss += temp_loss
             correct += temp_correct
-            print("START RUN_STEP MODEL2")
+
             temp_loss, temp_correct = self.train_model2.run_step(
                 inputs[(index + 1) * self.batch_size : (index + 2) * self.batch_size],
                 labels[(index + 1) * self.batch_size : (index + 2) * self.batch_size],
             )
-            print("END RUN_STEP MODEL2")
 
             loss += temp_loss
             correct += temp_correct

From 824450d59586350b0eae5148c562eda8123156f0 Mon Sep 17 00:00:00 2001
From: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
Date: Mon, 6 Nov 2023 14:34:36 -0500
Subject: [PATCH 08/11] Code refactor

Signed-off-by: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
---
 .../README.md                                 | 70 +++++++++++++++++++
 ...enchmark_amoebanet_gems_master_with_sp.py} |  0
 .../benchmark_resnet_gems_master_with_sp.py}  |  8 +--
 .../benchmark_amoebanet_sp.py                 |  6 +-
 .../benchmark_resnet_sp.py                    |  6 +-
 src/torchgems/train_spatial.py                |  5 +-
 src/torchgems/train_spatial_master.py         | 12 ++--
 7 files changed, 94 insertions(+), 13 deletions(-)
 create mode 100644 benchmarks/gems_master_with_spatial_parallelism/README.md
 rename benchmarks/{gems_master_model/benchmark_amoebanet_gems+spatial.py => gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py} (100%)
 rename benchmarks/{gems_master_model/benchmark_resnet_gems+spatial.py => gems_master_with_spatial_parallelism/benchmark_resnet_gems_master_with_sp.py} (99%)

diff --git a/benchmarks/gems_master_with_spatial_parallelism/README.md b/benchmarks/gems_master_with_spatial_parallelism/README.md
new file mode 100644
index 00000000..79f3ffcd
--- /dev/null
+++ b/benchmarks/gems_master_with_spatial_parallelism/README.md
@@ -0,0 +1,70 @@
+# GEMS-MASTER + SP
+
+GEMS improves performance by utilizing memory efficiently, whereas SP makes it possible to train on high-resolution images. GEMS+SP combines the two: it enables training on high-resolution images and, by integrating GEMS, allows a model to be trained with a batch size larger than the maximum that would otherwise be feasible.
+
+
+## Run GEMS-MASTER + SP:
+
+#### Generic command:
+```bash
+$MV2_HOME/bin/mpirun_rsh --export-all -np ${np} --hostfile ${HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python ${gems_sp_model_script} --split-size ${split_size} --batch-size ${batch_size} --times ${times}
+
+```
+#### Examples
+
+- Example: run the AmoebaNet MASTER+SP model on 1024 * 1024 images with a model split size of 5 (i.e. # of partitions for MP), a model replication factor of η = 2, and a batch size of 1 for each model replica (i.e. effective batch size (EBS) = η × BS = 2).
+
+```bash
+$MV2_HOME/bin/mpirun_rsh --export-all -np ${np} --hostfile ${HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py --split-size 5 --batch-size 1 --image-size 1024 --times 2
+
+```
+- Similarly, we can run the benchmark for the ResNet MASTER+SP model.
+Below is an example that runs the ResNet MASTER+SP model on 2048 * 2048 images with a model split size of 5 (i.e. # of partitions for MP), a model replication factor of η = 4, and a batch size of 1 for each model replica (i.e. effective batch size (EBS) = η × BS = 4).
+```bash
+$MV2_HOME/bin/mpirun_rsh --export-all -np $np --hostfile ${HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python benchmarks/gems_master_with_spatial_parallelism/benchmark_resnet_gems_master_with_sp.py --split-size 5 --image-size 2048 --batch-size 1 --times 4
+
+```
+Below are the available configuration options :
+
+<pre>
+usage: benchmark_amoebanet_sp.py [-h] [-v] [--batch-size BATCH_SIZE] [--parts PARTS] [--split-size SPLIT_SIZE] [--num-spatial-parts NUM_SPATIAL_PARTS]
+                        [--spatial-size SPATIAL_SIZE] [--times TIMES] [--image-size IMAGE_SIZE] [--num-epochs NUM_EPOCHS] [--num-layers NUM_LAYERS]
+                        [--num-filters NUM_FILTERS] [--balance BALANCE] [--halo-D2] [--fused-layers FUSED_LAYERS] [--local-DP LOCAL_DP] [--slice-method SLICE_METHOD]
+                        [--app APP] [--datapath DATAPATH]
+
+SP-MP-DP Configuration Script
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -v, --verbose         Prints performance numbers or logs (default: False)
+  --batch-size BATCH_SIZE
+                        input batch size (default: 32)
+  --parts PARTS         Number of parts for MP (default: 1)
+  --split-size SPLIT_SIZE
+                        Number of processes for MP (default: 2)
+  --num-spatial-parts NUM_SPATIAL_PARTS
+                        Number of partitions in spatial parallelism (default: 4)
+  --spatial-size SPATIAL_SIZE
+                        Number of splits for spatial parallelism (default: 1)
+  --times TIMES         Number of times to repeat MASTER 1: 2 replications, 2: 4 replications (default: 1)
+  --image-size IMAGE_SIZE
+                        Image size for synthetic benchmark (default: 32)
+  --num-epochs NUM_EPOCHS
+                        Number of epochs (default: 1)
+  --num-layers NUM_LAYERS
+                        Number of layers in amoebanet (default: 18)
+  --num-filters NUM_FILTERS
+                        Number of filters in amoebanet (default: 416)
+  --balance BALANCE     length of list equals to number of partitions and sum should be equal to num layers (default: None)
+  --halo-D2             Enable design2 (do halo exchange on few convs) for spatial conv. (default: False)
+  --fused-layers FUSED_LAYERS
+                        When D2 design is enabled for halo exchange, number of blocks to fuse in ResNet model (default: 1)
+  --local-DP LOCAL_DP   LBANN integration of SP with MP. MP can apply data parallelism. 1: only one GPU for a given split, 2: two gpus for a given split (uses DP)
+                        (default: 1)
+  --slice-method SLICE_METHOD
+                        Slice method (square, vertical, and horizontal) in Spatial parallelism (default: square)
+  --app APP             Application type (1.medical, 2.cifar, and synthetic) in Spatial parallelism (default: 3)
+  --datapath DATAPATH   local Dataset path (default: ./train)
+  </pre>
+
+  *Note: "--times" is a GEMS-specific parameter, and certain parameters such as "--num-spatial-parts", "--slice-method", and "--halo-D2" are not required by GEMS.*
diff --git a/benchmarks/gems_master_model/benchmark_amoebanet_gems+spatial.py b/benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py
similarity index 100%
rename from benchmarks/gems_master_model/benchmark_amoebanet_gems+spatial.py
rename to benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py
diff --git a/benchmarks/gems_master_model/benchmark_resnet_gems+spatial.py b/benchmarks/gems_master_with_spatial_parallelism/benchmark_resnet_gems_master_with_sp.py
similarity index 99%
rename from benchmarks/gems_master_model/benchmark_resnet_gems+spatial.py
rename to benchmarks/gems_master_with_spatial_parallelism/benchmark_resnet_gems_master_with_sp.py
index 039a66be..781ec535 100644
--- a/benchmarks/gems_master_model/benchmark_resnet_gems+spatial.py
+++ b/benchmarks/gems_master_with_spatial_parallelism/benchmark_resnet_gems_master_with_sp.py
@@ -221,7 +221,7 @@ def get_depth(version, n):
 if args.halo_d2:
     model1, balance = resnet_spatial.get_resnet_v2(
         input_shape=(batch_size / parts, 3, image_size, image_size),
-        depth=get_depth(2, 12),
+        depth=get_depth(2, resnet_n),
         local_rank=mpi_comm_first.local_rank % mpi_comm_first.total_spatial_processes,
         mp_size=split_size,
         balance=balance,
@@ -234,7 +234,7 @@ def get_depth(version, n):
 
     model2, balance = resnet_spatial.get_resnet_v2(
         input_shape=(batch_size / parts, 3, image_size, image_size),
-        depth=get_depth(2, 12),
+        depth=get_depth(2, resnet_n),
         local_rank=mpi_comm_second.local_rank % mpi_comm_second.total_spatial_processes,
         mp_size=split_size,
         balance=balance,
@@ -247,7 +247,7 @@ def get_depth(version, n):
 else:
     model1 = resnet_spatial.get_resnet_v2(
         input_shape=(batch_size / parts, 3, image_size, image_size),
-        depth=get_depth(2, 12),
+        depth=get_depth(2, resnet_n),
         local_rank=mpi_comm_first.local_rank % mpi_comm_first.total_spatial_processes,
         mp_size=split_size,
         balance=balance,
@@ -260,7 +260,7 @@ def get_depth(version, n):
 
     model2 = resnet_spatial.get_resnet_v2(
         input_shape=(batch_size / parts, 3, image_size, image_size),
-        depth=get_depth(2, 12),
+        depth=get_depth(2, resnet_n),
         local_rank=mpi_comm_second.local_rank % mpi_comm_second.total_spatial_processes,
         mp_size=split_size,
         balance=balance,
diff --git a/benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py b/benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py
index cf99736a..45263d39 100644
--- a/benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py
+++ b/benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py
@@ -363,7 +363,11 @@ def run_epoch():
 
             if local_rank < spatial_part_size:
                 x = split_input(
-                    inputs, args.slice_method, image_size, spatial_part_size, local_rank
+                    inputs,
+                    image_size,
+                    args.slice_method,
+                    local_rank,
+                    num_spatial_parts_list,
                 )
             else:
                 x = inputs
diff --git a/benchmarks/spatial_parallelism/benchmark_resnet_sp.py b/benchmarks/spatial_parallelism/benchmark_resnet_sp.py
index b1a60734..7504a130 100644
--- a/benchmarks/spatial_parallelism/benchmark_resnet_sp.py
+++ b/benchmarks/spatial_parallelism/benchmark_resnet_sp.py
@@ -364,7 +364,11 @@ def run_epoch():
 
             if local_rank < spatial_part_size:
                 x = split_input(
-                    inputs, args.slice_method, image_size, spatial_part_size, local_rank
+                    inputs,
+                    image_size,
+                    args.slice_method,
+                    local_rank,
+                    num_spatial_parts_list,
                 )
             else:
                 x = inputs
diff --git a/src/torchgems/train_spatial.py b/src/torchgems/train_spatial.py
index d4834049..8ce4ae6a 100644
--- a/src/torchgems/train_spatial.py
+++ b/src/torchgems/train_spatial.py
@@ -242,7 +242,10 @@ def get_shapes_spatial(
     return spatial_shapes_list
 
 
-def split_input(inputs, slice_method, image_size, spatial_part_size, local_rank):
+def split_input(inputs, image_size, slice_method, local_rank, num_spatial_parts_list):
+    spatial_part_size = num_spatial_parts_list[
+        0
+    ]  # Partition size for spatial parallelism
     if slice_method == "square":
         image_height_local = int(image_size / math.sqrt(spatial_part_size))
         image_width_local = int(image_size / math.sqrt(spatial_part_size))
diff --git a/src/torchgems/train_spatial_master.py b/src/torchgems/train_spatial_master.py
index f0b37135..40672554 100644
--- a/src/torchgems/train_spatial_master.py
+++ b/src/torchgems/train_spatial_master.py
@@ -462,23 +462,23 @@ def run_step(self, inputs, labels):
         loss, correct = 0, 0
         # torch.cuda.empty_cache()
 
-        self.train_model1.models = self.train_model1.models.to("cuda")
+        # self.train_model1.models = self.train_model1.models.to("cuda")
         temp_loss, temp_correct = self.train_model1.run_step(
             inputs[: self.batch_size], labels[: self.batch_size]
         )
         loss += temp_loss
         correct += temp_correct
 
-        torch.cuda.empty_cache()
-        self.train_model1.models = self.train_model1.models.to("cpu")
-        self.train_model2.models = self.train_model2.models.to("cuda")
+        # torch.cuda.empty_cache()
+        # self.train_model1.models = self.train_model1.models.to("cpu")
+        # self.train_model2.models = self.train_model2.models.to("cuda")
         temp_loss, temp_correct = self.train_model2.run_step(
             inputs[self.batch_size : 2 * self.batch_size],
             labels[self.batch_size : 2 * self.batch_size],
         )
-        self.train_model2.models = self.train_model2.models.to("cpu")
+        # self.train_model2.models = self.train_model2.models.to("cpu")
 
-        torch.cuda.empty_cache()
+        # torch.cuda.empty_cache()
 
         loss += temp_loss
         correct += temp_correct

From da24acd5610acae8fc1fc0c997fd5ca058b4d698 Mon Sep 17 00:00:00 2001
From: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
Date: Wed, 8 Nov 2023 13:17:14 -0500
Subject: [PATCH 09/11] Change flake8 version

Signed-off-by: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
---
 .pre-commit-config.yaml                       |  4 ++--
 ...benchmark_amoebanet_gems_master_with_sp.py | 23 ++++++++++++++++++-
 src/torchgems/comm.py                         |  1 +
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0056c15c..5c5a15df 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,8 +6,8 @@ repos:
     name: black-format-test
 
 - repo: https://github.com/pycqa/flake8
-  rev: 4.0.1
+  rev: 5.0.4
   hooks:
   - id: flake8
     args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401']
-    name: flake8-test
\ No newline at end of file
+    name: flake8-test
diff --git a/benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py b/benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py
index 43817ccc..6a7ada81 100644
--- a/benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py
+++ b/benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py
@@ -358,7 +358,7 @@ def get_depth(version, n):
         pin_memory=True,
     )
     size_dataset = len(my_dataloader.dataset)
-else:
+elif APP == 3:
     my_dataset = torchvision.datasets.FakeData(
         size=10 * batch_size * args.times,
         image_size=(3, image_size, image_size),
@@ -375,6 +375,27 @@ def get_depth(version, n):
         pin_memory=True,
     )
     size_dataset = 10 * batch_size
+else:
+    transform = transforms.Compose(
+        [
+            transforms.Resize((512, 512)),
+            transforms.ToTensor(),
+            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+        ]
+    )
+    trainset = torchvision.datasets.ImageFolder(
+        datapath,
+        transform=transform,
+        target_transform=None,
+    )
+    my_dataloader = torch.utils.data.DataLoader(
+        trainset,
+        batch_size=times * batch_size,
+        shuffle=True,
+        num_workers=0,
+        pin_memory=True,
+    )
+    size_dataset = len(my_dataloader.dataset)
 
 ################################################################################
 
diff --git a/src/torchgems/comm.py b/src/torchgems/comm.py
index 9a5fbf88..a3edded0 100644
--- a/src/torchgems/comm.py
+++ b/src/torchgems/comm.py
@@ -213,6 +213,7 @@ def create_allreduce_comm_spatial(self):
 
             if self.ENABLE_MASTER:
                 for i in range(len(ranks)):
+                    # ranks[i] = (self.mp_size - 1 - ranks[i])
                     ranks.append(self.mp_size - 1 - ranks[i])
 
             temp_spatial_allreduce_grp = torch.distributed.new_group(ranks=ranks)

From 2aa0262f17e4f04fb28b721399605a05fd2148f6 Mon Sep 17 00:00:00 2001
From: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
Date: Wed, 8 Nov 2023 15:09:13 -0500
Subject: [PATCH 10/11] Refactor

Signed-off-by: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
---
 .../benchmark_amoebanet_gems_master.py        | 18 +++++++++++++
 .../benchmark_resnet_gems_master.py           | 18 +++++++++++++
 ...benchmark_amoebanet_gems_master_with_sp.py | 26 ++----------------
 .../benchmark_resnet_gems_master_with_sp.py   | 27 +++----------------
 src/torchgems/comm.py                         |  1 -
 src/torchgems/train_spatial_master.py         | 10 ++++---
 6 files changed, 47 insertions(+), 53 deletions(-)

diff --git a/benchmarks/gems_master_model/benchmark_amoebanet_gems_master.py b/benchmarks/gems_master_model/benchmark_amoebanet_gems_master.py
index 3d7a80f5..40212813 100644
--- a/benchmarks/gems_master_model/benchmark_amoebanet_gems_master.py
+++ b/benchmarks/gems_master_model/benchmark_amoebanet_gems_master.py
@@ -1,3 +1,21 @@
+# Copyright 2023, The Ohio State University. All rights reserved.
+# The MPI4DL software package is developed by the team members of
+# The Ohio State University's Network-Based Computing Laboratory (NBCL),
+# headed by Professor Dhabaleswar K. (DK) Panda.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 import torch.distributed as dist
 import torchvision.transforms as transforms
diff --git a/benchmarks/gems_master_model/benchmark_resnet_gems_master.py b/benchmarks/gems_master_model/benchmark_resnet_gems_master.py
index bacde2d2..b698a437 100644
--- a/benchmarks/gems_master_model/benchmark_resnet_gems_master.py
+++ b/benchmarks/gems_master_model/benchmark_resnet_gems_master.py
@@ -1,3 +1,21 @@
+# Copyright 2023, The Ohio State University. All rights reserved.
+# The MPI4DL software package is developed by the team members of
+# The Ohio State University's Network-Based Computing Laboratory (NBCL),
+# headed by Professor Dhabaleswar K. (DK) Panda.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 import torch.distributed as dist
 import torchvision.transforms as transforms
diff --git a/benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py b/benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py
index 6a7ada81..36453da4 100644
--- a/benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py
+++ b/benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py
@@ -337,7 +337,7 @@ def get_depth(version, n):
         num_workers=0,
         pin_memory=True,
     )
-    size_dataset = 1030
+    size_dataset = len(my_dataloader.dataset)
 elif APP == 2:
     transform = transforms.Compose(
         [
@@ -358,7 +358,7 @@ def get_depth(version, n):
         pin_memory=True,
     )
     size_dataset = len(my_dataloader.dataset)
-elif APP == 3:
+else:
     my_dataset = torchvision.datasets.FakeData(
         size=10 * batch_size * args.times,
         image_size=(3, image_size, image_size),
@@ -375,28 +375,6 @@ def get_depth(version, n):
         pin_memory=True,
     )
     size_dataset = 10 * batch_size
-else:
-    transform = transforms.Compose(
-        [
-            transforms.Resize((512, 512)),
-            transforms.ToTensor(),
-            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
-        ]
-    )
-    trainset = torchvision.datasets.ImageFolder(
-        datapath,
-        transform=transform,
-        target_transform=None,
-    )
-    my_dataloader = torch.utils.data.DataLoader(
-        trainset,
-        batch_size=times * batch_size,
-        shuffle=True,
-        num_workers=0,
-        pin_memory=True,
-    )
-    size_dataset = len(my_dataloader.dataset)
-
 ################################################################################
 
 sync_comm = gems_comm.SyncAllreduce(mpi_comm_first)
diff --git a/benchmarks/gems_master_with_spatial_parallelism/benchmark_resnet_gems_master_with_sp.py b/benchmarks/gems_master_with_spatial_parallelism/benchmark_resnet_gems_master_with_sp.py
index 781ec535..8ae34034 100644
--- a/benchmarks/gems_master_with_spatial_parallelism/benchmark_resnet_gems_master_with_sp.py
+++ b/benchmarks/gems_master_with_spatial_parallelism/benchmark_resnet_gems_master_with_sp.py
@@ -337,7 +337,7 @@ def get_depth(version, n):
         num_workers=0,
         pin_memory=True,
     )
-    size_dataset = 1030
+    size_dataset = len(my_dataloader.dataset)
 elif APP == 2:
     trainset = torchvision.datasets.CIFAR10(
         root="./data", train=True, download=True, transform=transform
@@ -349,8 +349,8 @@ def get_depth(version, n):
         num_workers=0,
         pin_memory=True,
     )
-    size_dataset = 50000
-elif APP == 3:
+    size_dataset = len(my_dataloader.dataset)
+else:
     my_dataset = torchvision.datasets.FakeData(
         size=10 * batch_size * args.times,
         image_size=(3, image_size, image_size),
@@ -367,27 +367,6 @@ def get_depth(version, n):
         pin_memory=True,
     )
     size_dataset = 10 * batch_size
-else:
-    transform = transforms.Compose(
-        [
-            transforms.Resize((64, 64)),
-            transforms.ToTensor(),
-            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
-        ]
-    )
-    trainset = torchvision.datasets.ImageFolder(
-        datapath,
-        transform=transform,
-        target_transform=None,
-    )
-    my_dataloader = torch.utils.data.DataLoader(
-        trainset,
-        batch_size=times * batch_size,
-        shuffle=True,
-        num_workers=0,
-        pin_memory=True,
-    )
-    size_dataset = len(my_dataloader.dataset)
 
 ################################################################################
 
diff --git a/src/torchgems/comm.py b/src/torchgems/comm.py
index a3edded0..9a5fbf88 100644
--- a/src/torchgems/comm.py
+++ b/src/torchgems/comm.py
@@ -213,7 +213,6 @@ def create_allreduce_comm_spatial(self):
 
             if self.ENABLE_MASTER:
                 for i in range(len(ranks)):
-                    # ranks[i] = (self.mp_size - 1 - ranks[i])
                     ranks.append(self.mp_size - 1 - ranks[i])
 
             temp_spatial_allreduce_grp = torch.distributed.new_group(ranks=ranks)
diff --git a/src/torchgems/train_spatial_master.py b/src/torchgems/train_spatial_master.py
index 40672554..993e9e71 100644
--- a/src/torchgems/train_spatial_master.py
+++ b/src/torchgems/train_spatial_master.py
@@ -462,7 +462,7 @@ def run_step(self, inputs, labels):
         loss, correct = 0, 0
         # torch.cuda.empty_cache()
 
-        # self.train_model1.models = self.train_model1.models.to("cuda")
+        # self.train_model1.models = self.train_model1.models.to('cuda')
         temp_loss, temp_correct = self.train_model1.run_step(
             inputs[: self.batch_size], labels[: self.batch_size]
         )
@@ -470,13 +470,15 @@ def run_step(self, inputs, labels):
         correct += temp_correct
 
         # torch.cuda.empty_cache()
-        # self.train_model1.models = self.train_model1.models.to("cpu")
-        # self.train_model2.models = self.train_model2.models.to("cuda")
+
+        # self.train_model1.models = self.train_model1.models.to('cpu')
+        # self.train_model2.models = self.train_model2.models.to('cuda')
         temp_loss, temp_correct = self.train_model2.run_step(
             inputs[self.batch_size : 2 * self.batch_size],
             labels[self.batch_size : 2 * self.batch_size],
         )
-        # self.train_model2.models = self.train_model2.models.to("cpu")
+
+        # self.train_model2.models = self.train_model2.models.to('cpu')
 
         # torch.cuda.empty_cache()
 

From 862485139e1894c75172b86d79cb2bae8cdcc5d1 Mon Sep 17 00:00:00 2001
From: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
Date: Wed, 8 Nov 2023 16:11:59 -0500
Subject: [PATCH 11/11] Address PR comments

Signed-off-by: Radha Guhane <gulhane.2@buckeyemail.osu.edu>
---
 .../benchmark_resnet_gems_master.py           |  9 +--
 ...benchmark_amoebanet_gems_master_with_sp.py |  9 +--
 .../benchmark_resnet_gems_master_with_sp.py   | 11 +--
 .../layer_parallelism/benchmark_resnet_lp.py  | 10 +--
 .../benchmark_amoebanet_sp.py                 | 59 +++-----------
 .../benchmark_resnet_sp.py                    | 76 +++++--------------
 src/torchgems/train_spatial.py                |  6 +-
 src/torchgems/train_spatial_master.py         |  4 -
 src/torchgems/utils.py                        | 30 ++++++++
 9 files changed, 66 insertions(+), 148 deletions(-)
 create mode 100644 src/torchgems/utils.py

diff --git a/benchmarks/gems_master_model/benchmark_resnet_gems_master.py b/benchmarks/gems_master_model/benchmark_resnet_gems_master.py
index b698a437..a814dd65 100644
--- a/benchmarks/gems_master_model/benchmark_resnet_gems_master.py
+++ b/benchmarks/gems_master_model/benchmark_resnet_gems_master.py
@@ -30,6 +30,7 @@
 from torchgems.mp_pipeline import model_generator
 from torchgems.gems_master import train_model_master
 import torchgems.comm as gems_comm
+from torchgems.utils import get_depth
 
 parser_obj = parser.get_parser()
 args = parser_obj.parse_args()
@@ -92,14 +93,6 @@ def init_processes(backend="mpi"):
 ENABLE_ASYNC = True
 resnet_n = 12
 
-
-def get_depth(version, n):
-    if version == 1:
-        return n * 6 + 2
-    elif version == 2:
-        return n * 9 + 2
-
-
 ###############################################################################
 mpi_comm = gems_comm.MPIComm(split_size=mp_size, ENABLE_MASTER=True)
 rank = mpi_comm.rank
diff --git a/benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py b/benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py
index 36453da4..93979f49 100644
--- a/benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py
+++ b/benchmarks/gems_master_with_spatial_parallelism/benchmark_amoebanet_gems_master_with_sp.py
@@ -66,7 +66,7 @@ def __getattr__(self, attr):
         return getattr(self.stream, attr)
 
 
-def init_processes(backend="tcp"):
+def init_processes(backend="mpi"):
     """Initialize the distributed environment."""
     dist.init_process_group(backend)
     size = dist.get_world_size()
@@ -74,13 +74,6 @@ def init_processes(backend="tcp"):
     return size, rank
 
 
-def get_depth(version, n):
-    if version == 1:
-        return n * 6 + 2
-    elif version == 2:
-        return n * 9 + 2
-
-
 sys.stdout = Unbuffered(sys.stdout)
 
 # Example of GEMS + SPATIAL split_size = 2, spatial_size = 1, num_spatial_parts = 4
diff --git a/benchmarks/gems_master_with_spatial_parallelism/benchmark_resnet_gems_master_with_sp.py b/benchmarks/gems_master_with_spatial_parallelism/benchmark_resnet_gems_master_with_sp.py
index 8ae34034..94dadb35 100644
--- a/benchmarks/gems_master_with_spatial_parallelism/benchmark_resnet_gems_master_with_sp.py
+++ b/benchmarks/gems_master_with_spatial_parallelism/benchmark_resnet_gems_master_with_sp.py
@@ -34,6 +34,7 @@
 )
 import torchgems.comm as gems_comm
 from models import resnet
+from torchgems.utils import get_depth
 
 # Example of GEMS + SPATIAL split_size = 2, spatial_size = 1, num_spatial_parts = 4
 #
@@ -85,7 +86,7 @@ def __getattr__(self, attr):
         return getattr(self.stream, attr)
 
 
-def init_processes(backend="tcp"):
+def init_processes(backend="mpi"):
     """Initialize the distributed environment."""
     dist.init_process_group(backend)
     size = dist.get_world_size()
@@ -136,14 +137,6 @@ def init_processes(backend="tcp"):
 image_size_seq = 32
 resnet_n = 12
 
-
-def get_depth(version, n):
-    if version == 1:
-        return n * 6 + 2
-    elif version == 2:
-        return n * 9 + 2
-
-
 ###############################################################################
 
 mpi_comm_first = gems_comm.MPIComm(
diff --git a/benchmarks/layer_parallelism/benchmark_resnet_lp.py b/benchmarks/layer_parallelism/benchmark_resnet_lp.py
index 9cf1d13c..abc4fd59 100644
--- a/benchmarks/layer_parallelism/benchmark_resnet_lp.py
+++ b/benchmarks/layer_parallelism/benchmark_resnet_lp.py
@@ -28,7 +28,7 @@
 from torchgems.mp_pipeline import model_generator, train_model
 from models import resnet
 import torchgems.comm as gems_comm
-
+from torchgems.utils import get_depth
 
 parser_obj = parser.get_parser()
 args = parser_obj.parse_args()
@@ -79,14 +79,6 @@ def __getattr__(self, attr):
 image_size_seq = 32
 resnet_n = 12
 
-
-def get_depth(version, n):
-    if version == 1:
-        return n * 6 + 2
-    elif version == 2:
-        return n * 9 + 2
-
-
 ###############################################################################
 
 mpi_comm = gems_comm.MPIComm(split_size=mp_size, ENABLE_MASTER=False)
diff --git a/benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py b/benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py
index 45263d39..b3fecf7f 100644
--- a/benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py
+++ b/benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py
@@ -27,7 +27,12 @@
 import logging
 from torchgems import parser
 from torchgems.mp_pipeline import model_generator
-from torchgems.train_spatial import train_model_spatial, split_input, get_shapes_spatial
+from torchgems.train_spatial import (
+    train_model_spatial,
+    split_input,
+    get_shapes_spatial,
+    verify_spatial_config,
+)
 import torchgems.comm as gems_comm
 
 parser_obj = parser.get_parser()
@@ -62,7 +67,7 @@ def __getattr__(self, attr):
         return getattr(self.stream, attr)
 
 
-def init_processes(backend="tcp"):
+def init_processes(backend="mpi"):
     """Initialize the distributed environment."""
     dist.init_process_group(backend)
     size = dist.get_world_size()
@@ -84,6 +89,7 @@ def init_processes(backend="tcp"):
 balance = args.balance
 split_size = args.split_size
 spatial_size = args.spatial_size
+slice_method = args.slice_method
 times = args.times
 datapath = args.datapath
 num_workers = args.num_workers
@@ -107,48 +113,7 @@ def init_processes(backend="tcp"):
 spatial_part_size = num_spatial_parts_list[0]  # Partition size for spatial parallelism
 
 
-def isPowerTwo(num):
-    return not (num & (num - 1))
-
-
-"""
-For Amoebanet model, image size and image size after partitioning should be power of two.
-As, Amoebanet performs summation of results of two convolution layers during training,
-odd input size(i.e. image size which is not power of 2) will give different output sizes
-for convolution operations present at same layer, thus it will throw error as addition
-operation can not be performed with diffent size outputs.
-"""
-
-
-def verify_config():
-    assert args.slice_method in [
-        "square",
-        "vertical",
-        "horizontal",
-    ], "Possible slice methods are ['square', 'vertical', 'horizontal']"
-
-    assert args.app in range(
-        1, 4
-    ), "Possible Application values should be 1, 2, or 3 i.e. 1.medical, 2.cifar, and 3.synthetic"
-
-    assert isPowerTwo(int(image_size)), "Image size should be power of Two"
-
-    if args.slice_method == "square":
-        assert isPowerTwo(
-            int(image_size / math.sqrt(spatial_part_size))
-        ), "Image size of each partition should be power of Two"
-    else:
-        assert isPowerTwo(
-            int(image_size / spatial_part_size)
-        ), "Image size of each partition should be power of Two"
-
-    for each_part_size in num_spatial_parts_list:
-        assert (
-            each_part_size == spatial_part_size
-        ), "Size of each SP partition should be same"
-
-
-verify_config()
+verify_spatial_config(slice_method, image_size, num_spatial_parts_list)
 
 ##################### AmoebaNet model specific parameters #####################
 
@@ -207,7 +172,7 @@ def verify_config():
 image_size_times = int(image_size / image_size_seq)
 amoebanet_shapes_list = get_shapes_spatial(
     model_gen_seq.shape_list,
-    args.slice_method,
+    slice_method,
     spatial_size,
     num_spatial_parts_list,
     image_size_times,
@@ -273,7 +238,7 @@ def verify_config():
     parts=parts,
     ASYNC=True,
     GEMS_INVERSE=False,
-    slice_method=args.slice_method,
+    slice_method=slice_method,
     LOCAL_DP_LP=LOCAL_DP_LP,
     mpi_comm=mpi_comm,
 )
@@ -365,7 +330,7 @@ def run_epoch():
                 x = split_input(
                     inputs,
                     image_size,
-                    args.slice_method,
+                    slice_method,
                     local_rank,
                     num_spatial_parts_list,
                 )
diff --git a/benchmarks/spatial_parallelism/benchmark_resnet_sp.py b/benchmarks/spatial_parallelism/benchmark_resnet_sp.py
index 7504a130..1037846d 100644
--- a/benchmarks/spatial_parallelism/benchmark_resnet_sp.py
+++ b/benchmarks/spatial_parallelism/benchmark_resnet_sp.py
@@ -28,8 +28,14 @@
 import logging
 from torchgems import parser
 from torchgems.mp_pipeline import model_generator
-from torchgems.train_spatial import train_model_spatial, split_input, get_shapes_spatial
+from torchgems.train_spatial import (
+    train_model_spatial,
+    split_input,
+    get_shapes_spatial,
+    verify_spatial_config,
+)
 import torchgems.comm as gems_comm
+from torchgems.utils import get_depth
 
 parser_obj = parser.get_parser()
 args = parser_obj.parse_args()
@@ -82,6 +88,7 @@ def init_processes(backend="mpi"):
 balance = args.balance
 split_size = args.split_size
 spatial_size = args.spatial_size
+slice_method = args.slice_method
 times = args.times
 datapath = args.datapath
 num_workers = args.num_workers
@@ -114,58 +121,10 @@ def init_processes(backend="mpi"):
 image_size_seq = 32
 resnet_n = 12
 
-
-def get_depth(version, n):
-    if version == 1:
-        return n * 6 + 2
-    elif version == 2:
-        return n * 9 + 2
-
-
 ###############################################################################
 
 
-def isPowerTwo(num):
-    return not (num & (num - 1))
-
-
-"""
-For ResNet model, image size and image size after partitioning should be power of two.
-As, ResNet performs convolution operations at different layers, odd input size
-(i.e. image size which is not power of 2) will lead to truncation of input. Thus,
-other GPU devices will receive truncated input with unexpected input size.
-"""
-
-
-def verify_config():
-    assert args.slice_method in [
-        "square",
-        "vertical",
-        "horizontal",
-    ], "Possible slice methods are ['square', 'vertical', 'horizontal']"
-
-    assert args.app in range(
-        1, 4
-    ), "Possible Application values should be 1, 2, or 3 i.e. 1.medical, 2.cifar, and 3.synthetic"
-
-    assert isPowerTwo(int(image_size)), "Image size should be power of Two"
-
-    if args.slice_method == "square":
-        assert isPowerTwo(
-            int(image_size / math.sqrt(spatial_part_size))
-        ), "Image size of each partition should be power of Two"
-    else:
-        assert isPowerTwo(
-            int(image_size / spatial_part_size)
-        ), "Image size of each partition should be power of Two"
-
-    for each_part_size in num_spatial_parts_list:
-        assert (
-            each_part_size == spatial_part_size
-        ), "Size of each SP partition should be same"
-
-
-verify_config()
+verify_spatial_config(slice_method, image_size, num_spatial_parts_list)
 
 mpi_comm = gems_comm.MPIComm(
     split_size=split_size,
@@ -189,7 +148,8 @@ def verify_config():
 
 # Initialize ResNet model
 model_seq = resnet.get_resnet_v2(
-    (int(batch_size / parts), 3, image_size_seq, image_size_seq), depth=get_depth(2, 12)
+    (int(batch_size / parts), 3, image_size_seq, image_size_seq),
+    depth=get_depth(2, resnet_n),
 )
 
 model_gen_seq = model_generator(
@@ -209,7 +169,7 @@ def verify_config():
 image_size_times = int(image_size / image_size_seq)
 resnet_shapes_list = get_shapes_spatial(
     model_gen_seq.shape_list,
-    args.slice_method,
+    slice_method,
     spatial_size,
     num_spatial_parts_list,
     image_size_times,
@@ -223,7 +183,7 @@ def verify_config():
 if args.halo_d2:
     model, balance = resnet_spatial.get_resnet_v2(
         input_shape=(batch_size / parts, 3, image_size, image_size),
-        depth=get_depth(2, 12),
+        depth=get_depth(2, resnet_n),
         local_rank=local_rank % spatial_part_size,
         mp_size=split_size,
         balance=balance,
@@ -231,12 +191,12 @@ def verify_config():
         num_spatial_parts=num_spatial_parts,
         num_classes=num_classes,
         fused_layers=args.fused_layers,
-        slice_method=args.slice_method,
+        slice_method=slice_method,
     )
 else:
     model = resnet_spatial.get_resnet_v2(
         input_shape=(batch_size / parts, 3, image_size, image_size),
-        depth=get_depth(2, 12),
+        depth=get_depth(2, resnet_n),
         local_rank=local_rank % spatial_part_size,
         mp_size=split_size,
         balance=balance,
@@ -244,7 +204,7 @@ def verify_config():
         num_spatial_parts=num_spatial_parts,
         num_classes=num_classes,
         fused_layers=args.fused_layers,
-        slice_method=args.slice_method,
+        slice_method=slice_method,
     )
 
 
@@ -275,7 +235,7 @@ def verify_config():
     parts=1,
     ASYNC=True,
     GEMS_INVERSE=False,
-    slice_method=args.slice_method,
+    slice_method=slice_method,
     mpi_comm=mpi_comm,
 )
 
@@ -366,7 +326,7 @@ def run_epoch():
                 x = split_input(
                     inputs,
                     image_size,
-                    args.slice_method,
+                    slice_method,
                     local_rank,
                     num_spatial_parts_list,
                 )
diff --git a/src/torchgems/train_spatial.py b/src/torchgems/train_spatial.py
index 8ce4ae6a..66ead67e 100644
--- a/src/torchgems/train_spatial.py
+++ b/src/torchgems/train_spatial.py
@@ -20,11 +20,7 @@
 import torch
 import math
 import torch.distributed as dist
-
-
-def isPowerTwo(num):
-    return not (num & (num - 1))
-
+from torchgems.utils import isPowerTwo  # package-qualified: utils.py lives in torchgems
 
 """
 For SP, image size and image size after partitioning should be power of two.
diff --git a/src/torchgems/train_spatial_master.py b/src/torchgems/train_spatial_master.py
index 993e9e71..508e6fe0 100644
--- a/src/torchgems/train_spatial_master.py
+++ b/src/torchgems/train_spatial_master.py
@@ -22,10 +22,6 @@
 import torch.distributed as dist
 
 
-def isPowerTwo(num):
-    return not (num & (num - 1))
-
-
 """
 For SP, image size and image size after partitioning should be power of two.
 As, while performing convolution operations at different layers, odd input size
diff --git a/src/torchgems/utils.py b/src/torchgems/utils.py
new file mode 100644
index 00000000..b6a5e99f
--- /dev/null
+++ b/src/torchgems/utils.py
@@ -0,0 +1,30 @@
+# Copyright 2023, The Ohio State University. All rights reserved.
+# The MPI4DL software package is developed by the team members of
+# The Ohio State University's Network-Based Computing Laboratory (NBCL),
+# headed by Professor Dhabaleswar K. (DK) Panda.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def isPowerTwo(num):
+    return num > 0 and not (num & (num - 1))  # num > 0: 0 is not a power of two
+
+
+# Get the depth of a ResNet model from its version (1 or 2) and the number of
+# ResNet blocks n: n * 6 + 2 for v1, n * 9 + 2 for v2. Raises ValueError otherwise.
+def get_depth(version, n):
+    if version not in (1, 2):
+        # Fail loudly instead of silently handing None to the model builder.
+        raise ValueError(f"Unsupported ResNet version: {version!r}")
+    return n * (6 if version == 1 else 9) + 2