
Resolve PR comments
- Make dataset path an argument variable
- Added benchmarking examples
- Update README.md
- Remove "ResNet" instances from AmoebaNet

Signed-off-by: Radha Guhane <[email protected]>
RadhaGulhane13 committed Jul 21, 2023
1 parent 256d9ad commit 7874a06
Showing 12 changed files with 112 additions and 615 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,3 +1,3 @@
models/__pycache__/
torch_gems.egg-info/
now-dl.egg-info/
torchgems/__pycache__/
4 changes: 3 additions & 1 deletion README.md
@@ -79,11 +79,13 @@ Python=3.9.16, cuda=11.6, gcc=10.3.0, cmake=3.22.2, PyTorch=1.12.0a0+git35202d2,
cd torch-gems
python setup.py install
```
Example to run Amoebanet model with partition size for model as two, spatial partition as four and spatial size (i.e. number of model partition which will use spatial partition) as 1
Example to run AmoebaNet model with partition size for model as two, spatial partition as four and spatial size (i.e. number of model partition which will use spatial partition) as 1
```bash
$MV2_HOME/bin/mpirun_rsh --export-all -np 5 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python benchmarks/spatial/model/amoebanet_run.py --image-size 512 --num-spatial-parts 4 --slice-method "vertical" --split-size 2 --spatial-size 1
```
Refer [Spatial Parallelism Benchmarks](https://github.com/OSU-Nowlab/now-dl/tree/main/benchmarks/spatial) for more details.
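
As a rough sanity check, the process count passed to `-np` in these examples is consistent with the relation sketched below. This is an inference from the examples in this commit, not a documented formula, and the helper function is hypothetical rather than part of the repository:

```python
# Hypothetical helper: estimates the MPI process count implied by the examples.
# Assumption: each of the `spatial_size` model partitions is spread over
# `num_spatial_parts` GPUs, and every remaining model partition uses one GPU.
def required_processes(split_size, num_spatial_parts, spatial_size):
    return spatial_size * num_spatial_parts + (split_size - spatial_size)

print(required_processes(split_size=2, num_spatial_parts=4, spatial_size=1))  # 5 -> matches -np 5 above
```
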
## Experimental Results:
#### Using Spatial, Model and Pipeline Parallelism, where the model is split into two parts and utilizes spatial parallelism by dividing the image into four parts
27 changes: 13 additions & 14 deletions benchmarks/layer_parallelism/amoebanet_lp.py
@@ -40,23 +40,22 @@ def __getattr__(self, attr):
sys.stdout = Unbuffered(sys.stdout)

np.random.seed(seed=1405)
ENABLE_ASYNC = True
ENABLE_APP = False
parts = args.parts
batch_size = args.batch_size
resnet_n = 18
epoch = args.num_epochs
ENABLE_ASYNC = True
ENABLE_APP = False
amoebanet_test = False
image_size = int(args.image_size)
print("image size", image_size)
steps = 100
num_layers = args.num_layers
num_filters = args.num_filters
balance = args.balance
mp_size = args.split_size
times = args.times
datapath = args.datapath

image_size_seq = 512
times = 1
num_classes = 1000
steps = 100

mpi_comm = gems_comm.MPIComm(split_size=mp_size, ENABLE_MASTER=False)
rank = mpi_comm.rank
@@ -80,7 +79,7 @@ def __getattr__(self, attr):


image_size_times = int(image_size / image_size_seq)
resnet_shapes_list = []
amoebanet_shapes_list = []
for output_shape in model_gen.shape_list:
if isinstance(output_shape, list):
temp_shape = []
@@ -92,20 +91,20 @@ def __getattr__(self, attr):
int(shape_tuple[3] * image_size_times),
)
temp_shape.append(x)
resnet_shapes_list.append(temp_shape)
amoebanet_shapes_list.append(temp_shape)
else:
if len(output_shape) == 2:
resnet_shapes_list.append(output_shape)
amoebanet_shapes_list.append(output_shape)
else:
x = (
output_shape[0],
output_shape[1],
int(output_shape[2] * image_size_times),
int(output_shape[3] * image_size_times),
)
resnet_shapes_list.append(x)
amoebanet_shapes_list.append(x)

model_gen.shape_list = resnet_shapes_list
model_gen.shape_list = amoebanet_shapes_list
print("local_ran:", local_rank, " Shapes:", model_gen.shape_list)


@@ -123,7 +122,7 @@ def __getattr__(self, attr):
split_size=mp_size,
input_size=(int(batch_size / parts), 3, image_size, image_size),
balance=balance,
shape_list=resnet_shapes_list,
shape_list=amoebanet_shapes_list,
)
model_gen.ready_model(split_rank=local_rank, GET_SHAPES_ON_CUDA=True)

@@ -145,7 +144,7 @@ def __getattr__(self, attr):
torch.manual_seed(0)
if ENABLE_APP == True:
trainset = torchvision.datasets.ImageFolder(
"/train", transform=transform, target_transform=None
datapath, transform=transform, target_transform=None
)
my_dataloader = torch.utils.data.DataLoader(
trainset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True
19 changes: 9 additions & 10 deletions benchmarks/layer_parallelism/resnet_lp.py
@@ -18,7 +18,6 @@
if args.verbose:
logging.basicConfig(level=logging.DEBUG)


gems_comm.initialize_cuda()


@@ -41,23 +40,23 @@ def __getattr__(self, attr):
sys.stdout = Unbuffered(sys.stdout)

np.random.seed(seed=1405)
ENABLE_ASYNC = True
ENABLE_APP = False
parts = args.parts
batch_size = args.batch_size
resnet_n = 12
epoch = args.num_epochs
ENABLE_ASYNC = True
ENABLE_APP = False
amoebanet_test = False
image_size = int(args.image_size) # 1024
print("image size", image_size)
steps = 100
image_size = int(args.image_size)
num_layers = args.num_layers
num_filters = args.num_filters
balance = args.balance
mp_size = args.split_size
times = args.times
datapath = args.datapath

image_size_seq = 32
times = 1
num_classes = 10
resnet_n = 12
steps = 100

mpi_comm = gems_comm.MPIComm(split_size=mp_size, ENABLE_MASTER=False)
rank = mpi_comm.rank
@@ -156,7 +155,7 @@ def get_depth(version, n):
torch.manual_seed(0)
if ENABLE_APP == True:
trainset = torchvision.datasets.ImageFolder(
"/usr/workspace/jain8/project/cancer/1024_1024_5/train",
datapath,
transform=transform,
target_transform=None,
)
63 changes: 34 additions & 29 deletions benchmarks/spatial/README.md
@@ -14,7 +14,7 @@ Spatial parallelism benchmarks include halo exchange and model benchmarks. These

- Load Required model:
```bash
cd torch-gems
cd now-dl
python setup.py install
```

@@ -52,38 +52,51 @@ optional arguments:

Model benchmarks for spatial parallelism also require performing model parallelism. To configure the number of model partitions and the number of model partitions that will use spatial parallelism, you can use the --split-size and --spatial-size arguments respectively.

1. Amoebanet benchmark
Run spatial parallelism:

Run spatial parallelism for Amoebanet model:
# Generic command:
```bash

$MV2_HOME/bin/mpirun_rsh --export-all -np $np --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so /home/gulhane.2/map_rank_to_gpu python ${model_type} --halo-D2 --num-spatial-parts ${num_spatial_parts} --image-size ${image_size} --batch-size ${batch_size} --slice-method ${partition}

```
# Examples

- With 5 GPUs [split size: 2, num_spatial_parts: 4, spatial_size: 1]

Example to run the AmoebaNet model with a model split size of 2 (i.e., # of partitions for MP), 4 spatial partitions (# of image partitions), and a spatial size of 1 (i.e., the number of model partitions that will use spatial parallelism). In this configuration, we split the model into two parts, where the first part uses spatial parallelism.

Example to run Amoebanet model with partition size for model as two, spatial partition as four and spatial size (i.e. number of model partition which will use spatial partition) as 1
```bash
$MV2_HOME/bin/mpirun_rsh --export-all -np 5 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python amoebanet_run.py --image-size 512 --num-spatial-parts 4 --slice-method "vertical" --split-size 2 --spatial-size 1
```
- With 9 GPUs [split size: 3, num_spatial_parts: 4, spatial_size: 2]
In this configuration, we split the model into three parts, where the first two parts will use spatial parallelism.

```bash
$MV2_HOME/bin/mpirun_rsh --export-all -np 9 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python amoebanet_run.py --image-size 512 --num-spatial-parts 4 --slice-method "vertical" --split-size 3 --spatial-size 2
```
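
For intuition about the `--slice-method "vertical"` option used in the commands above, here is a minimal, illustrative sketch (not the repository's partitioning code) of splitting an input batch into four vertical parts:

```python
import torch

# Illustrative only: split a (N, C, H, W) batch along the width into
# num_spatial_parts vertical strips, mimicking --slice-method "vertical"
# with --num-spatial-parts 4. The benchmark's real partitioning may differ.
num_spatial_parts = 4
images = torch.randn(2, 3, 512, 512)                    # --image-size 512
slices = torch.chunk(images, num_spatial_parts, dim=3)  # split along width
print([tuple(s.shape) for s in slices])                 # 4 x (2, 3, 512, 128)
```
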

- Similarly, we can run the benchmark for the ResNet model.
Below is an example to run ResNet with halo-D2 enabled to reduce communication operations. To learn more about halo-D2, refer to [Hy-Fi: Hybrid Five-Dimensional Parallel DNN Training on High-Performance GPU Clusters](https://dl.acm.org/doi/abs/10.1007/978-3-031-07312-0_6).
```bash
$MV2_HOME/bin/mpirun_rsh --export-all -np 5 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so resnet_model.py --halo-D2 --num-spatial-parts 4 --image-size 1024 --batch-size 2 --slice-method "square"
```
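
As background for the halo exchange mentioned above, the sketch below is a conceptual illustration only (it is not the repository's implementation and does not show the halo-D2 optimization itself): two adjacent vertical partitions exchange boundary columns so that a 3x3 convolution can be evaluated locally.

```python
import torch

# Conceptual halo exchange between two adjacent vertical partitions.
# In the benchmarks these columns would be sent over MPI; here we simply
# index the neighbouring tensor for illustration.
halo = 1                        # halo width needed by a 3x3 convolution
left = torch.randn(1, 3, 8, 4)  # partition 0: (N, C, H, W_local)
right = torch.randn(1, 3, 8, 4) # partition 1

left_padded = torch.cat([left, right[..., :halo]], dim=3)    # take right neighbour's first column
right_padded = torch.cat([left[..., -halo:], right], dim=3)  # take left neighbour's last column
print(left_padded.shape, right_padded.shape)                 # both (1, 3, 8, 5)
```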

Below are the available configuration options:

<pre>
usage: amoebanet_run.py [-h] [--fp16-allreduce] [--model MODEL] [--batch-size BATCH_SIZE] [--learning-rate LEARNING_RATE] [--num-gpus-mp NUM_GPUS_MP]
[--mem-per-process MEM_PER_PROCESS] [--parts PARTS] [--split-size SPLIT_SIZE] [--num-spatial-parts NUM_SPATIAL_PARTS]
[--spatial-size SPATIAL_SIZE] [--times TIMES] [--image-size IMAGE_SIZE] [--dp-per-node DP_PER_NODE] [--enable-dp] [--enable-master-comm-opt]
[--num-gpu-per-node NUM_GPU_PER_NODE] [--num-epochs NUM_EPOCHS] [--num-layers NUM_LAYERS] [--num-filters NUM_FILTERS] [--unet-b UNET_B]
[--unet-c UNET_C] [--balance BALANCE] [--halo-D2] [--fused-layers FUSED_LAYERS] [--local-DP LOCAL_DP] [--slice-method SLICE_METHOD]
usage: amoebanet_run.py [-h] [-v] [--batch-size BATCH_SIZE] [--parts PARTS] [--split-size SPLIT_SIZE] [--num-spatial-parts NUM_SPATIAL_PARTS]
[--spatial-size SPATIAL_SIZE] [--times TIMES] [--image-size IMAGE_SIZE] [--num-epochs NUM_EPOCHS] [--num-layers NUM_LAYERS]
[--num-filters NUM_FILTERS] [--balance BALANCE] [--halo-D2] [--fused-layers FUSED_LAYERS] [--local-DP LOCAL_DP] [--slice-method SLICE_METHOD]
[--app APP] [--datapath DATAPATH]

MP-DP ResNet Script
SP-MP-DP Configuration Script

optional arguments:
-h, --help show this help message and exit
--fp16-allreduce use fp16 compression during allreduce (default: False)
--model MODEL model to benchmark (default: resnet50)
-v, --verbose Prints performance numbers or logs (default: False)
--batch-size BATCH_SIZE
input batch size (default: 32)
--learning-rate LEARNING_RATE
learning rate for the optimizer (default: 0.001)
--num-gpus-mp NUM_GPUS_MP
number of GPUS per node for MP (default: 1)
--mem-per-process MEM_PER_PROCESS
TF GPU memory per GPU (default: 1)
--parts PARTS Number of parts for MP (default: 1)
--split-size SPLIT_SIZE
Number of process for MP (default: 2)
@@ -94,21 +107,12 @@ optional arguments:
--times TIMES Number of times to repeat MASTER 1: 2 replications, 2: 4 replications (default: 1)
--image-size IMAGE_SIZE
Image size for synthetic benchmark (default: 32)
--dp-per-node DP_PER_NODE
Number of DP modes per node (default: 1)
--enable-dp Enable DP for pytorch scripts (default: False)
--enable-master-comm-opt
Enable communication optimization for MASTER in Spatial (default: False)
--num-gpu-per-node NUM_GPU_PER_NODE
Number of GPUs per node (default: 4)
--num-epochs NUM_EPOCHS
Number of epochs (default: 1)
--num-layers NUM_LAYERS
Number of layers in amoebanet (default: 18)
--num-filters NUM_FILTERS
Number of filters in amoebanet (default: 416)
--unet-b UNET_B B hyperparamter in unet (default: 6)
--unet-c UNET_C C hyperparamter in unet (default: 72)
--balance BALANCE length of list equals to number of partitions and sum should be equal to num layers (default: None)
--halo-D2 Enable design2 (do halo exhange on few convs) for spatial conv. (default: False)
--fused-layers FUSED_LAYERS
@@ -117,5 +121,6 @@
(default: 1)
--slice-method SLICE_METHOD
Slice method (square, vertical, and horizontal) in Spatial parallelism (default: square)

</pre>
--app APP Application type (1.medical, 2.cifar, and synthetic) in Spatial parallelism (default: 3)
--datapath DATAPATH local Dataset path (default: ./train)
</pre>
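
Relating the new `--datapath` option to the benchmark changes in this commit, the following is a minimal sketch of loading a local dataset directory, mirroring the `torchvision.datasets.ImageFolder` / `DataLoader` calls visible in the diffs above; the resize size and transforms are placeholders, not the benchmarks' actual settings:

```python
import torch
import torchvision
import torchvision.transforms as transforms

# Sketch: load a local dataset from --datapath (default "./train").
# The transform here is a placeholder; the benchmark scripts define their own.
datapath = "./train"
transform = transforms.Compose([transforms.Resize((512, 512)), transforms.ToTensor()])
trainset = torchvision.datasets.ImageFolder(datapath, transform=transform, target_transform=None)
my_dataloader = torch.utils.data.DataLoader(
    trainset, batch_size=32, shuffle=True, num_workers=0, pin_memory=True
)
```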