Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Layer Parallelism and Bug fixes for ResNet Halo-D2 #8

Merged
merged 20 commits into from
Aug 15, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
models/__pycache__/
now-dl.egg-info/
torchgems/__pycache__/
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/psf/black
rev: stable
rev: 23.3.0
hooks:
- id: black
name: black-format-test
Expand Down
24 changes: 23 additions & 1 deletion README.md
Quentin-Anthony marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,32 @@ Python=3.9.16, cuda=11.6, gcc=10.3.0, cmake=3.22.2, PyTorch=1.12.0a0+git35202d2,
cd torch-gems
python setup.py install
```
Example to run Amoebanet model with partition size for model as two, spatial partition as four and spatial size (i.e. number of model partition which will use spatial partition) as 1
Example to run AmoebaNet model with partition size for model as two, spatial partition as four and spatial size (i.e. number of model partition which will use spatial partition) as 1
```bash
$MV2_HOME/bin/mpirun_rsh --export-all -np 5 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python benchmarks/spatial/model/amoebanet_run.py --image-size 512 --num-spatial-parts 4 --slice-method "vertical" --split-size 2 --spatial-size 1
```

Refer [Spatial Parallelism Benchmarks](https://github.com/OSU-Nowlab/now-dl/tree/main/benchmarks/spatial) for more details.

## Experimental Results:

#### Using Spatial, Model and Pipeline Parallelism, where the model is split into two parts and utilizes spatial parallelism by dividing the image into four parts

- AmeobaNet Model

<div align="center">
<img src="docs/assets/images/AmeobaNet_img_size_1024.png" width="400">
<img src="docs/assets/images/AmeobaNet_img_size_2048.png" width="400">
</div>

- ResNet Model

<div align="center">
<img src="docs/assets/images/ResNet_img_size_1024.png" width="400">
<img src="docs/assets/images/ResNet_img_size_2048.png" width="400">
</div>


## References:
1. Arpan Jain, Ammar Ahmad Awan, Asmaa M. Aljuhani, Jahanzeb Maqbool Hashmi, Quentin G. Anthony, Hari Subramoni, Dhableswar K. Panda, Raghu Machiraju, and Anil Parwani. 2020. GEMS: <u>G</u>PU-<u>e</u>nabled <u>m</u>emory-aware model-parallelism <u>s</u>ystem for distributed DNN training. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '20). IEEE Press, Article 45, 1–15.
2. Arpan Jain, Aamir Shafi, Quentin Anthony, Pouya Kousha, Hari Subramoni, and Dhableswar K. Panda. 2022. Hy-Fi: Hybrid Five-Dimensional Parallel DNN Training on High-Performance GPU Clusters. In High Performance Computing: 37th International Conference, ISC High Performance 2022, Hamburg, Germany, May 29 – June 2, 2022, Proceedings. Springer-Verlag, Berlin, Heidelberg, 109–130. https://doi.org/10.1007/978-3-031-07312-0_6
Expand Down
207 changes: 207 additions & 0 deletions benchmarks/layer_parallelism/amoebanet_lp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
import torch
import torchvision.transforms as transforms
import torchvision
import numpy as np
import sys
import math
import logging
from torchgems import parser
import time
from torchgems.mp_pipeline import model_generator, train_model
from models import amoebanet
import torchgems.comm as gems_comm


parser_obj = parser.get_parser()
args = parser_obj.parse_args()

if args.verbose:
logging.basicConfig(level=logging.DEBUG)

gems_comm.initialize_cuda()


class Unbuffered(object):
def __init__(self, stream):
self.stream = stream

def write(self, data):
self.stream.write(data)
self.stream.flush()

def writelines(self, datas):
self.stream.writelines(datas)
self.stream.flush()

def __getattr__(self, attr):
return getattr(self.stream, attr)


sys.stdout = Unbuffered(sys.stdout)

np.random.seed(seed=1405)
ENABLE_ASYNC = True
ENABLE_APP = False
parts = args.parts
batch_size = args.batch_size
epoch = args.num_epochs
image_size = int(args.image_size)
num_layers = args.num_layers
num_filters = args.num_filters
balance = args.balance
mp_size = args.split_size
times = args.times
datapath = args.datapath

image_size_seq = 512
num_classes = 1000
steps = 100

mpi_comm = gems_comm.MPIComm(split_size=mp_size, ENABLE_MASTER=False)
rank = mpi_comm.rank

local_rank = rank % mp_size

if balance is not None:
balance = [int(i) for i in balance.split(",")]

model = amoebanet.amoebanetd(
num_classes=1000, num_layers=args.num_layers, num_filters=args.num_filters
)

model_gen = model_generator(
model=model,
split_size=mp_size,
input_size=(int(batch_size / parts), 3, image_size_seq, image_size_seq),
balance=balance,
)
model_gen.ready_model(split_rank=local_rank, GET_SHAPES_ON_CUDA=True)


image_size_times = int(image_size / image_size_seq)
amoebanet_shapes_list = []
for output_shape in model_gen.shape_list:
if isinstance(output_shape, list):
temp_shape = []
for shape_tuple in output_shape:
x = (
shape_tuple[0],
shape_tuple[1],
int(shape_tuple[2] * image_size_times),
int(shape_tuple[3] * image_size_times),
)
temp_shape.append(x)
amoebanet_shapes_list.append(temp_shape)
else:
if len(output_shape) == 2:
amoebanet_shapes_list.append(output_shape)
else:
x = (
output_shape[0],
output_shape[1],
int(output_shape[2] * image_size_times),
int(output_shape[3] * image_size_times),
)
amoebanet_shapes_list.append(x)

model_gen.shape_list = amoebanet_shapes_list
print("local_ran:", local_rank, " Shapes:", model_gen.shape_list)


del model_gen
del model
torch.cuda.ipc_collect()

model = amoebanet.amoebanetd(
num_classes=1000, num_layers=args.num_layers, num_filters=args.num_filters
)


model_gen = model_generator(
model=model,
split_size=mp_size,
input_size=(int(batch_size / parts), 3, image_size, image_size),
balance=balance,
shape_list=amoebanet_shapes_list,
)
model_gen.ready_model(split_rank=local_rank, GET_SHAPES_ON_CUDA=True)

tm = train_model(
model_gen,
local_rank,
batch_size,
epoch,
criterion=None,
optimizer=None,
parts=parts,
ASYNC=ENABLE_ASYNC,
)

# Dataset
transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
torch.manual_seed(0)
if ENABLE_APP == True:
trainset = torchvision.datasets.ImageFolder(
datapath, transform=transform, target_transform=None
)
my_dataloader = torch.utils.data.DataLoader(
trainset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True
)
else:
my_dataset = torchvision.datasets.FakeData(
size=10 * batch_size,
image_size=(3, image_size, image_size),
num_classes=num_classes,
transform=transform,
target_transform=None,
random_offset=0,
)
my_dataloader = torch.utils.data.DataLoader(
my_dataset,
batch_size=batch_size * times,
shuffle=False,
num_workers=0,
pin_memory=True,
)
size_dataset = 10 * batch_size

perf = []


def run_epoch():
for i_e in range(epoch):
loss = 0
t = time.time()
for i, data in enumerate(my_dataloader, 0):
start_event = torch.cuda.Event(enable_timing=True, blocking=True)
end_event = torch.cuda.Event(enable_timing=True, blocking=True)
start_event.record()

if i > math.floor(size_dataset / (times * batch_size)) - 1:
break
inputs, labels = data

temp_loss = tm.run_step(inputs, labels)
loss += temp_loss
tm.update()

end_event.record()
torch.cuda.synchronize()
t = start_event.elapsed_time(end_event) / 1000

if local_rank == mp_size - 1:
logging.info(f"Step :{i}, LOSS: {temp_loss}, Global loss: {loss/(i+1)}")

if local_rank == 0:
print("Epoch: {} images per sec:{}".format(i_e, batch_size / t))
perf.append(batch_size / t)

t = time.time()


run_epoch()

if local_rank == 0:
print("Mean {} Median {}".format(sum(perf) / len(perf), np.median(perf)))
Loading