
Resolve PR comments
- Make dataset path an argument variable
- Added benchmarking examples
- Update README.md
- Remove "ResNet" instances from AmoebaNet

Signed-off-by: Radha Guhane <[email protected]>
RadhaGulhane13 committed Jul 21, 2023
1 parent 256d9ad commit 7874a06
Showing 12 changed files with 112 additions and 615 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,3 +1,3 @@
models/__pycache__/
torch_gems.egg-info/
now-dl.egg-info/
torchgems/__pycache__/
4 changes: 3 additions & 1 deletion README.md
@@ -79,11 +79,13 @@ Python=3.9.16, cuda=11.6, gcc=10.3.0, cmake=3.22.2, PyTorch=1.12.0a0+git35202d2,
cd torch-gems
python setup.py install
```
Example to run Amoebanet model with partition size for model as two, spatial partition as four and spatial size (i.e. number of model partition which will use spatial partition) as 1
Example to run AmoebaNet model with partition size for model as two, spatial partition as four and spatial size (i.e. number of model partition which will use spatial partition) as 1
```bash
$MV2_HOME/bin/mpirun_rsh --export-all -np 5 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python benchmarks/spatial/model/amoebanet_run.py --image-size 512 --num-spatial-parts 4 --slice-method "vertical" --split-size 2 --spatial-size 1
```
Refer [Spatial Parallelism Benchmarks](https://github.com/OSU-Nowlab/now-dl/tree/main/benchmarks/spatial) for more details.
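
As a rough sanity check, the process count passed to `-np` in these examples is consistent with the relation sketched below. This is an inference from the examples in this commit, not a documented formula, and the helper function is hypothetical rather than part of the repository:

```python
# Hypothetical helper: estimates the MPI process count implied by the examples.
# Assumption: each of the `spatial_size` model partitions is spread over
# `num_spatial_parts` GPUs, and every remaining model partition uses one GPU.
def required_processes(split_size, num_spatial_parts, spatial_size):
    return spatial_size * num_spatial_parts + (split_size - spatial_size)

print(required_processes(split_size=2, num_spatial_parts=4, spatial_size=1))  # 5 -> matches -np 5 above
```
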
## Experimental Results:
#### Using Spatial, Model and Pipeline Parallelism, where the model is split into two parts and utilizes spatial parallelism by dividing the image into four parts
27 changes: 13 additions & 14 deletions benchmarks/layer_parallelism/amoebanet_lp.py
@@ -40,23 +40,22 @@ def __getattr__(self, attr):
sys.stdout = Unbuffered(sys.stdout)

np.random.seed(seed=1405)
ENABLE_ASYNC = True
ENABLE_APP = False
parts = args.parts
batch_size = args.batch_size
resnet_n = 18
epoch = args.num_epochs
ENABLE_ASYNC = True
ENABLE_APP = False
amoebanet_test = False
image_size = int(args.image_size)
print("image size", image_size)
steps = 100
num_layers = args.num_layers
num_filters = args.num_filters
balance = args.balance
mp_size = args.split_size
times = args.times
datapath = args.datapath

image_size_seq = 512
times = 1
num_classes = 1000
steps = 100

mpi_comm = gems_comm.MPIComm(split_size=mp_size, ENABLE_MASTER=False)
rank = mpi_comm.rank
@@ -80,7 +79,7 @@ def __getattr__(self, attr):


image_size_times = int(image_size / image_size_seq)
resnet_shapes_list = []
amoebanet_shapes_list = []
for output_shape in model_gen.shape_list:
if isinstance(output_shape, list):
temp_shape = []
@@ -92,20 +91,20 @@ def __getattr__(self, attr):
int(shape_tuple[3] * image_size_times),
)
temp_shape.append(x)
resnet_shapes_list.append(temp_shape)
amoebanet_shapes_list.append(temp_shape)
else:
if len(output_shape) == 2:
resnet_shapes_list.append(output_shape)
amoebanet_shapes_list.append(output_shape)
else:
x = (
output_shape[0],
output_shape[1],
int(output_shape[2] * image_size_times),
int(output_shape[3] * image_size_times),
)
resnet_shapes_list.append(x)
amoebanet_shapes_list.append(x)

model_gen.shape_list = resnet_shapes_list
model_gen.shape_list = amoebanet_shapes_list
print("local_ran:", local_rank, " Shapes:", model_gen.shape_list)


@@ -123,7 +122,7 @@ def __getattr__(self, attr):
split_size=mp_size,
input_size=(int(batch_size / parts), 3, image_size, image_size),
balance=balance,
shape_list=resnet_shapes_list,
shape_list=amoebanet_shapes_list,
)
model_gen.ready_model(split_rank=local_rank, GET_SHAPES_ON_CUDA=True)

@@ -145,7 +144,7 @@ def __getattr__(self, attr):
torch.manual_seed(0)
if ENABLE_APP == True:
trainset = torchvision.datasets.ImageFolder(
"/train", transform=transform, target_transform=None
datapath, transform=transform, target_transform=None
)
my_dataloader = torch.utils.data.DataLoader(
trainset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True
19 changes: 9 additions & 10 deletions benchmarks/layer_parallelism/resnet_lp.py
@@ -18,7 +18,6 @@
if args.verbose:
logging.basicConfig(level=logging.DEBUG)


gems_comm.initialize_cuda()


@@ -41,23 +40,23 @@ def __getattr__(self, attr):
sys.stdout = Unbuffered(sys.stdout)

np.random.seed(seed=1405)
ENABLE_ASYNC = True
ENABLE_APP = False
parts = args.parts
batch_size = args.batch_size
resnet_n = 12
epoch = args.num_epochs
ENABLE_ASYNC = True
ENABLE_APP = False
amoebanet_test = False
image_size = int(args.image_size) # 1024
print("image size", image_size)
steps = 100
image_size = int(args.image_size)
num_layers = args.num_layers
num_filters = args.num_filters
balance = args.balance
mp_size = args.split_size
times = args.times
datapath = args.datapath

image_size_seq = 32
times = 1
num_classes = 10
resnet_n = 12
steps = 100

mpi_comm = gems_comm.MPIComm(split_size=mp_size, ENABLE_MASTER=False)
rank = mpi_comm.rank
@@ -156,7 +155,7 @@ def get_depth(version, n):
torch.manual_seed(0)
if ENABLE_APP == True:
trainset = torchvision.datasets.ImageFolder(
"/usr/workspace/jain8/project/cancer/1024_1024_5/train",
datapath,
transform=transform,
target_transform=None,
)
63 changes: 34 additions & 29 deletions benchmarks/spatial/README.md
@@ -14,7 +14,7 @@ Spatial parallelism benchmarks include halo exchange and model benchmarks. These

- Load Required model:
```bash
cd torch-gems
cd now-dl
python setup.py install
```

@@ -52,38 +52,51 @@ optional arguments:

Model benchmarks for spatial parallelism also require performing model parallelism. To configure the number of model partitions and the number of model partitions that will use spatial parallelism, you can use the --split-size and --spatial-size arguments respectively.

1. Amoebanet benchmark
Run spatial parallelism:

Run spatial parallelism for Amoebanet model:
# Generic command:
```bash

$MV2_HOME/bin/mpirun_rsh --export-all -np $np --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so /home/gulhane.2/map_rank_to_gpu python ${model_type} --halo-D2 --num-spatial-parts ${num_spatial_parts} --image-size ${image_size} --batch-size ${batch_size} --slice-method ${partition}

```
# Examples

- With 5 GPUs [split size: 2, num_spatial_parts: 4, spatial_size: 1]

Example to run the AmoebaNet model with a model split size of 2 (i.e., # of partitions for MP), 4 spatial partitions (# of image partitions), and a spatial size of 1 (i.e., the number of model partitions that will use spatial parallelism). In this configuration, we split the model into two parts, where the first part uses spatial parallelism.

Example to run Amoebanet model with partition size for model as two, spatial partition as four and spatial size (i.e. number of model partition which will use spatial partition) as 1
```bash
$MV2_HOME/bin/mpirun_rsh --export-all -np 5 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python amoebanet_run.py --image-size 512 --num-spatial-parts 4 --slice-method "vertical" --split-size 2 --spatial-size 1
```
- With 9 GPUs [split size: 3, num_spatial_parts: 4, spatial_size: 2]
In this configuration, we split the model into three parts, where the first two parts will use spatial parallelism.

```bash
$MV2_HOME/bin/mpirun_rsh --export-all -np 9 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python amoebanet_run.py --image-size 512 --num-spatial-parts 4 --slice-method "vertical" --split-size 3 --spatial-size 2
```
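
For intuition about the `--slice-method "vertical"` option used in the commands above, here is a minimal, illustrative sketch (not the repository's partitioning code) of splitting an input batch into four vertical parts:

```python
import torch

# Illustrative only: split a (N, C, H, W) batch along the width into
# num_spatial_parts vertical strips, mimicking --slice-method "vertical"
# with --num-spatial-parts 4. The benchmark's real partitioning may differ.
num_spatial_parts = 4
images = torch.randn(2, 3, 512, 512)                    # --image-size 512
slices = torch.chunk(images, num_spatial_parts, dim=3)  # split along width
print([tuple(s.shape) for s in slices])                 # 4 x (2, 3, 512, 128)
```
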

- Similarly, we can run the benchmark for the ResNet model.
Below is an example to run ResNet with halo-D2 enabled to reduce communication operations. To learn more about halo-D2, refer to [Hy-Fi: Hybrid Five-Dimensional Parallel DNN Training on High-Performance GPU Clusters](https://dl.acm.org/doi/abs/10.1007/978-3-031-07312-0_6).
```bash
$MV2_HOME/bin/mpirun_rsh --export-all -np 5 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so resnet_model.py --halo-D2 --num-spatial-parts 4 --image-size 1024 --batch-size 2 --slice-method "square"
```
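
As background for the halo exchange mentioned above, the sketch below is a conceptual illustration only (it is not the repository's implementation and does not show the halo-D2 optimization itself): two adjacent vertical partitions exchange boundary columns so that a 3x3 convolution can be evaluated locally.

```python
import torch

# Conceptual halo exchange between two adjacent vertical partitions.
# In the benchmarks these columns would be sent over MPI; here we simply
# index the neighbouring tensor for illustration.
halo = 1                        # halo width needed by a 3x3 convolution
left = torch.randn(1, 3, 8, 4)  # partition 0: (N, C, H, W_local)
right = torch.randn(1, 3, 8, 4) # partition 1

left_padded = torch.cat([left, right[..., :halo]], dim=3)    # take right neighbour's first column
right_padded = torch.cat([left[..., -halo:], right], dim=3)  # take left neighbour's last column
print(left_padded.shape, right_padded.shape)                 # both (1, 3, 8, 5)
```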

Below are the available configuration options:

<pre>
usage: amoebanet_run.py [-h] [--fp16-allreduce] [--model MODEL] [--batch-size BATCH_SIZE] [--learning-rate LEARNING_RATE] [--num-gpus-mp NUM_GPUS_MP]
[--mem-per-process MEM_PER_PROCESS] [--parts PARTS] [--split-size SPLIT_SIZE] [--num-spatial-parts NUM_SPATIAL_PARTS]
[--spatial-size SPATIAL_SIZE] [--times TIMES] [--image-size IMAGE_SIZE] [--dp-per-node DP_PER_NODE] [--enable-dp] [--enable-master-comm-opt]
[--num-gpu-per-node NUM_GPU_PER_NODE] [--num-epochs NUM_EPOCHS] [--num-layers NUM_LAYERS] [--num-filters NUM_FILTERS] [--unet-b UNET_B]
[--unet-c UNET_C] [--balance BALANCE] [--halo-D2] [--fused-layers FUSED_LAYERS] [--local-DP LOCAL_DP] [--slice-method SLICE_METHOD]
usage: amoebanet_run.py [-h] [-v] [--batch-size BATCH_SIZE] [--parts PARTS] [--split-size SPLIT_SIZE] [--num-spatial-parts NUM_SPATIAL_PARTS]
[--spatial-size SPATIAL_SIZE] [--times TIMES] [--image-size IMAGE_SIZE] [--num-epochs NUM_EPOCHS] [--num-layers NUM_LAYERS]
[--num-filters NUM_FILTERS] [--balance BALANCE] [--halo-D2] [--fused-layers FUSED_LAYERS] [--local-DP LOCAL_DP] [--slice-method SLICE_METHOD]
[--app APP] [--datapath DATAPATH]

MP-DP ResNet Script
SP-MP-DP Configuration Script

optional arguments:
-h, --help show this help message and exit
--fp16-allreduce use fp16 compression during allreduce (default: False)
--model MODEL model to benchmark (default: resnet50)
-v, --verbose Prints performance numbers or logs (default: False)
--batch-size BATCH_SIZE
input batch size (default: 32)
--learning-rate LEARNING_RATE
learning rate for the optimizer (default: 0.001)
--num-gpus-mp NUM_GPUS_MP
number of GPUS per node for MP (default: 1)
--mem-per-process MEM_PER_PROCESS
TF GPU memory per GPU (default: 1)
--parts PARTS Number of parts for MP (default: 1)
--split-size SPLIT_SIZE
Number of process for MP (default: 2)
@@ -94,21 +107,12 @@ optional arguments:
--times TIMES Number of times to repeat MASTER 1: 2 replications, 2: 4 replications (default: 1)
--image-size IMAGE_SIZE
Image size for synthetic benchmark (default: 32)
--dp-per-node DP_PER_NODE
Number of DP modes per node (default: 1)
--enable-dp Enable DP for pytorch scripts (default: False)
--enable-master-comm-opt
Enable communication optimization for MASTER in Spatial (default: False)
--num-gpu-per-node NUM_GPU_PER_NODE
Number of GPUs per node (default: 4)
--num-epochs NUM_EPOCHS
Number of epochs (default: 1)
--num-layers NUM_LAYERS
Number of layers in amoebanet (default: 18)
--num-filters NUM_FILTERS
Number of filters in amoebanet (default: 416)
--unet-b UNET_B B hyperparamter in unet (default: 6)
--unet-c UNET_C C hyperparamter in unet (default: 72)
--balance BALANCE length of list equals to number of partitions and sum should be equal to num layers (default: None)
--halo-D2 Enable design2 (do halo exhange on few convs) for spatial conv. (default: False)
--fused-layers FUSED_LAYERS
@@ -117,5 +121,6 @@
(default: 1)
--slice-method SLICE_METHOD
Slice method (square, vertical, and horizontal) in Spatial parallelism (default: square)

</pre>
--app APP Application type (1.medical, 2.cifar, and synthetic) in Spatial parallelism (default: 3)
--datapath DATAPATH local Dataset path (default: ./train)
</pre>
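
Relating the new `--datapath` option to the benchmark changes in this commit, the following is a minimal sketch of loading a local dataset directory, mirroring the `torchvision.datasets.ImageFolder` / `DataLoader` calls visible in the diffs above; the resize size and transforms are placeholders, not the benchmarks' actual settings:

```python
import torch
import torchvision
import torchvision.transforms as transforms

# Sketch: load a local dataset from --datapath (default "./train").
# The transform here is a placeholder; the benchmark scripts define their own.
datapath = "./train"
transform = transforms.Compose([transforms.Resize((512, 512)), transforms.ToTensor()])
trainset = torchvision.datasets.ImageFolder(datapath, transform=transform, target_transform=None)
my_dataloader = torch.utils.data.DataLoader(
    trainset, batch_size=32, shuffle=True, num_workers=0, pin_memory=True
)
```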