diff --git a/.gitignore b/.gitignore
index 56028830..c9be8d3b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,3 @@
 models/__pycache__/
-torch_gems.egg-info/
+now-dl.egg-info/
 torchgems/__pycache__/
\ No newline at end of file
diff --git a/README.md b/README.md
index 4721bca0..da448695 100644
--- a/README.md
+++ b/README.md
@@ -79,11 +79,13 @@ Python=3.9.16, cuda=11.6, gcc=10.3.0, cmake=3.22.2, PyTorch=1.12.0a0+git35202d2,
 cd torch-gems
 python setup.py install
 ```
-Example to run Amoebanet model with partition size for model as two, spatial partition as four and spatial size (i.e. number of model partition which will use spatial partition) as 1
+Example to run the AmoebaNet model with a model partition size of two, a spatial partition size of four, and a spatial size (i.e., the number of model partitions that will use spatial parallelism) of one:
 ```bash
 $MV2_HOME/bin/mpirun_rsh --export-all -np 5 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python benchmarks/spatial/model/amoebanet_run.py --image-size 512 --num-spatial-parts 4 --slice-method "vertical" --split-size 2 --spatial-size 1
 ```
+Refer to [Spatial Parallelism Benchmarks](https://github.com/OSU-Nowlab/now-dl/tree/main/benchmarks/spatial) for more details.
+
 ## Experimental Results:
 #### Using Spatial, Model and Pipeline Parallelism, where the model is split into two parts and utilizes spatial parallelism by dividing the image into four parts
diff --git a/benchmarks/layer_parallelism/amoebanet_lp.py b/benchmarks/layer_parallelism/amoebanet_lp.py
index f342610c..59e85539 100644
--- a/benchmarks/layer_parallelism/amoebanet_lp.py
+++ b/benchmarks/layer_parallelism/amoebanet_lp.py
@@ -40,23 +40,22 @@ def __getattr__(self, attr):
 sys.stdout = Unbuffered(sys.stdout)
 
 np.random.seed(seed=1405)
+ENABLE_ASYNC = True
+ENABLE_APP = False
 parts = args.parts
 batch_size = args.batch_size
-resnet_n = 18
 epoch = args.num_epochs
-ENABLE_ASYNC = True
-ENABLE_APP = False
-amoebanet_test = False
 image_size = int(args.image_size)
-print("image size", image_size)
-steps = 100
 num_layers = args.num_layers
 num_filters = args.num_filters
 balance = args.balance
 mp_size = args.split_size
+times = args.times
+datapath = args.datapath
+
 image_size_seq = 512
-times = 1
 num_classes = 1000
+steps = 100
 
 mpi_comm = gems_comm.MPIComm(split_size=mp_size, ENABLE_MASTER=False)
 rank = mpi_comm.rank
@@ -80,7 +79,7 @@ def __getattr__(self, attr):
 
 image_size_times = int(image_size / image_size_seq)
 
-resnet_shapes_list = []
+amoebanet_shapes_list = []
 for output_shape in model_gen.shape_list:
     if isinstance(output_shape, list):
         temp_shape = []
@@ -92,10 +91,10 @@ def __getattr__(self, attr):
                 int(shape_tuple[3] * image_size_times),
             )
             temp_shape.append(x)
-        resnet_shapes_list.append(temp_shape)
+        amoebanet_shapes_list.append(temp_shape)
     else:
         if len(output_shape) == 2:
-            resnet_shapes_list.append(output_shape)
+            amoebanet_shapes_list.append(output_shape)
         else:
             x = (
                 output_shape[0],
@@ -103,9 +102,9 @@ def __getattr__(self, attr):
                 int(output_shape[2] * image_size_times),
                 int(output_shape[3] * image_size_times),
             )
-            resnet_shapes_list.append(x)
+            amoebanet_shapes_list.append(x)
 
-model_gen.shape_list = resnet_shapes_list
+model_gen.shape_list = amoebanet_shapes_list
 
 print("local_ran:", local_rank, " Shapes:", model_gen.shape_list)
@@ -123,7 +122,7 @@ def __getattr__(self, attr):
     split_size=mp_size,
     input_size=(int(batch_size / parts), 3, image_size, image_size),
     balance=balance,
-    shape_list=resnet_shapes_list,
+    shape_list=amoebanet_shapes_list,
 )
 
 model_gen.ready_model(split_rank=local_rank, GET_SHAPES_ON_CUDA=True)
@@ -145,7 +144,7 @@ def __getattr__(self, attr):
 torch.manual_seed(0)
 if ENABLE_APP == True:
     trainset = torchvision.datasets.ImageFolder(
-        "/train", transform=transform, target_transform=None
+        datapath, transform=transform, target_transform=None
     )
     my_dataloader = torch.utils.data.DataLoader(
         trainset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True
diff --git a/benchmarks/layer_parallelism/resnet_lp.py b/benchmarks/layer_parallelism/resnet_lp.py
index cdcc2b90..224099d4 100644
--- a/benchmarks/layer_parallelism/resnet_lp.py
+++ b/benchmarks/layer_parallelism/resnet_lp.py
@@ -18,7 +18,6 @@
 if args.verbose:
     logging.basicConfig(level=logging.DEBUG)
 
-
 gems_comm.initialize_cuda()
 
 
@@ -41,23 +40,23 @@ def __getattr__(self, attr):
 sys.stdout = Unbuffered(sys.stdout)
 
 np.random.seed(seed=1405)
+ENABLE_ASYNC = True
+ENABLE_APP = False
 parts = args.parts
 batch_size = args.batch_size
-resnet_n = 12
 epoch = args.num_epochs
-ENABLE_ASYNC = True
-ENABLE_APP = False
-amoebanet_test = False
-image_size = int(args.image_size)  # 1024
-print("image size", image_size)
-steps = 100
+image_size = int(args.image_size)
 num_layers = args.num_layers
 num_filters = args.num_filters
 balance = args.balance
 mp_size = args.split_size
+times = args.times
+datapath = args.datapath
+
 image_size_seq = 32
-times = 1
 num_classes = 10
+resnet_n = 12
+steps = 100
 
 mpi_comm = gems_comm.MPIComm(split_size=mp_size, ENABLE_MASTER=False)
 rank = mpi_comm.rank
@@ -156,7 +155,7 @@ def get_depth(version, n):
 torch.manual_seed(0)
 if ENABLE_APP == True:
     trainset = torchvision.datasets.ImageFolder(
-        "/usr/workspace/jain8/project/cancer/1024_1024_5/train",
+        datapath,
         transform=transform,
         target_transform=None,
     )
diff --git a/benchmarks/spatial/README.md b/benchmarks/spatial/README.md
index ffcf78ec..d78488b7 100644
--- a/benchmarks/spatial/README.md
+++ b/benchmarks/spatial/README.md
@@ -14,7 +14,7 @@ Spatial parallelism benchmarks include halo exchange and model benchmarks. These
 - Load Required model:
 ```bash
-cd torch-gems
+cd now-dl
 python setup.py install
 ```
 
@@ -52,38 +52,51 @@ optional arguments:
 
 Model benchmarks for spatial parallelism also require performing model parallelism. To configure the number of model partitions and the number of model partitions that will use spatial parallelism, you can use the --split-size and --spatial-size arguments respectively.
 
-1. Amoebanet benchmark
+Run spatial parallelism:
 
-Run spatial parallelism for Amoebanet model:
+# Generic command:
+```bash
+
+$MV2_HOME/bin/mpirun_rsh --export-all -np $np --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so /home/gulhane.2/map_rank_to_gpu python ${model_type} --halo-D2 --num-spatial-parts ${num_spatial_parts} --image-size ${image_size} --batch-size ${batch_size} --slice-method ${partition}
+
+```
+# Examples
+
+- With 5 GPUs [split size: 2, num_spatial_parts: 4, spatial_size: 1]
+
+Example to run the AmoebaNet model with a model split size of 2 (i.e., the number of partitions for MP), a spatial partition count (the number of image partitions) of 4, and a spatial size of 1 (i.e., the number of model partitions that will use spatial parallelism). In this configuration, we split the model into two parts, where the first part will use spatial parallelism.
-Example to run Amoebanet model with partition size for model as two, spatial partition as four and spatial size (i.e. number of model partition which will use spatial partition) as 1
 ```bash
 $MV2_HOME/bin/mpirun_rsh --export-all -np 5 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python amoebanet_run.py --image-size 512 --num-spatial-parts 4 --slice-method "vertical" --split-size 2 --spatial-size 1
 ```
+- With 9 GPUs [split size: 3, num_spatial_parts: 4, spatial_size: 2]
+In this configuration, we split the model into three parts, where the first two parts will use spatial parallelism.
+
+```bash
+$MV2_HOME/bin/mpirun_rsh --export-all -np 9 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python amoebanet_run.py --image-size 512 --num-spatial-parts 4 --slice-method "vertical" --split-size 3 --spatial-size 2
+```
+
+- Similarly, we can run the benchmark for the ResNet model.
+Below is an example of running ResNet with halo-D2 enabled to reduce communication operations. To learn more about halo-D2, refer to [Hy-Fi: Hybrid Five-Dimensional Parallel DNN Training on High-Performance GPU Clusters](https://dl.acm.org/doi/abs/10.1007/978-3-031-07312-0_6)
+```bash
+$MV2_HOME/bin/mpirun_rsh --export-all -np 5 --hostfile {$HOSTFILE} MV2_USE_GDRCOPY=0 MV2_ENABLE_AFFINITY=0 MV2_USE_CUDA=1 LD_PRELOAD=$MV2_HOME/lib/libmpi.so python resnet_run.py --halo-D2 --num-spatial-parts 4 --image-size 1024 --batch-size 2 --slice-method "square"
+```
 
 Below are the available configuration options:
-usage: amoebanet_run.py [-h] [--fp16-allreduce] [--model MODEL] [--batch-size BATCH_SIZE] [--learning-rate LEARNING_RATE] [--num-gpus-mp NUM_GPUS_MP]
-                        [--mem-per-process MEM_PER_PROCESS] [--parts PARTS] [--split-size SPLIT_SIZE] [--num-spatial-parts NUM_SPATIAL_PARTS]
-                        [--spatial-size SPATIAL_SIZE] [--times TIMES] [--image-size IMAGE_SIZE] [--dp-per-node DP_PER_NODE] [--enable-dp] [--enable-master-comm-opt]
-                        [--num-gpu-per-node NUM_GPU_PER_NODE] [--num-epochs NUM_EPOCHS] [--num-layers NUM_LAYERS] [--num-filters NUM_FILTERS] [--unet-b UNET_B]
-                        [--unet-c UNET_C] [--balance BALANCE] [--halo-D2] [--fused-layers FUSED_LAYERS] [--local-DP LOCAL_DP] [--slice-method SLICE_METHOD]
+usage: amoebanet_run.py [-h] [-v] [--batch-size BATCH_SIZE] [--parts PARTS] [--split-size SPLIT_SIZE] [--num-spatial-parts NUM_SPATIAL_PARTS]
+                        [--spatial-size SPATIAL_SIZE] [--times TIMES] [--image-size IMAGE_SIZE] [--num-epochs NUM_EPOCHS] [--num-layers NUM_LAYERS]
+                        [--num-filters NUM_FILTERS] [--balance BALANCE] [--halo-D2] [--fused-layers FUSED_LAYERS] [--local-DP LOCAL_DP] [--slice-method SLICE_METHOD]
+                        [--app APP] [--datapath DATAPATH]
 
-MP-DP ResNet Script
+SP-MP-DP Configuration Script
 
 optional arguments:
   -h, --help            show this help message and exit
-  --fp16-allreduce      use fp16 compression during allreduce (default: False)
-  --model MODEL         model to benchmark (default: resnet50)
+  -v, --verbose         Prints performance numbers or logs (default: False)
   --batch-size BATCH_SIZE
                         input batch size (default: 32)
-  --learning-rate LEARNING_RATE
-                        learning rate for the optimizer (default: 0.001)
-  --num-gpus-mp NUM_GPUS_MP
-                        number of GPUS per node for MP (default: 1)
-  --mem-per-process MEM_PER_PROCESS
-                        TF GPU memory per GPU (default: 1)
   --parts PARTS         Number of parts for MP (default: 1)
   --split-size SPLIT_SIZE
                         Number of processes for MP (default: 2)
@@ -94,21 +107,12 @@ optional arguments:
   --times TIMES         Number of times to repeat MASTER 1: 2 replications, 2: 4 replications (default: 1)
   --image-size IMAGE_SIZE
                         Image size for synthetic benchmark (default: 32)
-  --dp-per-node DP_PER_NODE
-                        Number of DP modes per node (default: 1)
-  --enable-dp           Enable DP for pytorch scripts (default: False)
-  --enable-master-comm-opt
-                        Enable communication optimization for MASTER in Spatial (default: False)
-  --num-gpu-per-node NUM_GPU_PER_NODE
-                        Number of GPUs per node (default: 4)
   --num-epochs NUM_EPOCHS
                         Number of epochs (default: 1)
   --num-layers NUM_LAYERS
                         Number of layers in amoebanet (default: 18)
   --num-filters NUM_FILTERS
                         Number of filters in amoebanet (default: 416)
-  --unet-b UNET_B       B hyperparamter in unet (default: 6)
-  --unet-c UNET_C       C hyperparamter in unet (default: 72)
   --balance BALANCE     length of the list equals the number of partitions, and its sum should equal the number of layers (default: None)
   --halo-D2             Enable design2 (do halo exchange on few convs) for spatial conv. (default: False)
   --fused-layers FUSED_LAYERS
@@ -117,5 +121,6 @@ optional arguments:
                         (default: 1)
   --slice-method SLICE_METHOD
                         Slice method (square, vertical, and horizontal) in Spatial parallelism (default: square)
-
-
+  --app APP             Application type (1.medical, 2.cifar, and synthetic) in Spatial parallelism (default: 3)
+  --datapath DATAPATH   Local dataset path (default: ./train)
+
diff --git a/benchmarks/spatial/model/amoebanet_run.py b/benchmarks/spatial/model/amoebanet_run.py
index 11dbd750..363d45cd 100644
--- a/benchmarks/spatial/model/amoebanet_run.py
+++ b/benchmarks/spatial/model/amoebanet_run.py
@@ -61,27 +61,30 @@ def get_depth(version, n):
 
 sys.stdout = Unbuffered(sys.stdout)
 
+ENABLE_ASYNC = True
+
 np.random.seed(seed=1405)
 parts = args.parts
 batch_size = args.batch_size
-resnet_n = 12
 epoch = args.num_epochs
-ENABLE_ASYNC = True
+image_size = int(args.image_size)
+num_layers = args.num_layers
+num_filters = args.num_filters
+balance = args.balance
+split_size = args.split_size
+spatial_size = args.spatial_size
+times = args.times
+datapath = args.datapath
+LOCAL_DP_LP = args.local_DP
 
 # APP
 # 1: Medical
 # 2: Cifar
 # 3: synthetic
 APP = args.app
-amoebanet_test = False
-image_size = int(args.image_size)
-print("image size", image_size)
+
+num_classes = 1000
 steps = 100
-num_layers = args.num_layers
-num_filters = args.num_filters
-balance = args.balance
-split_size = args.split_size
-spatial_size = args.spatial_size
 
 temp_num_spatial_parts = args.num_spatial_parts.split(",")
 
@@ -94,12 +97,6 @@ def get_depth(version, n):
 
 spatial_part_size = num_spatial_parts_list[0]  # Partition size for spatial parallelism
 
-times = 1
-num_classes = 1000
-LOCAL_DP_LP = args.local_DP
-
-# DDP support
-
 
 def isPowerTwo(num):
     return not (num & (num - 1))
@@ -192,7 +189,7 @@ def verify_config():
 temp_count = 0
 
 if args.slice_method == "square":
-    resnet_shapes_list = []
+    amoebanet_shapes_list = []
     for output_shape in model_gen_seq.shape_list:
         if isinstance(output_shape, list):
             temp_shape = []
@@ -214,11 +211,11 @@ def verify_config():
                     int(shape_tuple[3] * image_size_times),
                 )
                 temp_shape.append(x)
-            resnet_shapes_list.append(temp_shape)
+            amoebanet_shapes_list.append(temp_shape)
         else:
             if len(output_shape) == 2:
                 x = (int(output_shape[0]), output_shape[1])
-                resnet_shapes_list.append(x)
+                amoebanet_shapes_list.append(x)
             else:
                 if temp_count < spatial_size:
                     x = (
@@ -227,7 +224,7 @@ def verify_config():
                         int(output_shape[2] * image_size_times / 2),
                         int(output_shape[3] * image_size_times / 2),
                     )
-                    resnet_shapes_list.append(x)
+                    amoebanet_shapes_list.append(x)
                 else:
                     x = (
                         int(output_shape[0]),
@@ -235,11 +232,11 @@ def verify_config():
                         int(output_shape[2] * image_size_times),
                         int(output_shape[3] * image_size_times),
                     )
-                    resnet_shapes_list.append(x)
+                    amoebanet_shapes_list.append(x)
             temp_count += 1
 
 elif args.slice_method == "vertical":
-    resnet_shapes_list = []
+    amoebanet_shapes_list = []
     for output_shape in model_gen_seq.shape_list:
         if isinstance(output_shape, list):
             temp_shape = []
@@ -264,11 +261,11 @@ def verify_config():
                     int(shape_tuple[3] * image_size_times),
                 )
                 temp_shape.append(x)
-            resnet_shapes_list.append(temp_shape)
+            amoebanet_shapes_list.append(temp_shape)
         else:
             if len(output_shape) == 2:
                 x = (int(output_shape[0]), output_shape[1])
-                resnet_shapes_list.append(x)
+                amoebanet_shapes_list.append(x)
             else:
                 if temp_count < spatial_size:
                     x = (
@@ -281,7 +278,7 @@ def verify_config():
                             / num_spatial_parts_list[temp_count]
                         ),
                     )
-                    resnet_shapes_list.append(x)
+                    amoebanet_shapes_list.append(x)
                 else:
                     x = (
                         int(output_shape[0]),
@@ -289,12 +286,12 @@ def verify_config():
                         int(output_shape[2] * image_size_times),
                         int(output_shape[3] * image_size_times),
                     )
-                    resnet_shapes_list.append(x)
+                    amoebanet_shapes_list.append(x)
             temp_count += 1
 
 elif args.slice_method == "horizontal":
-    resnet_shapes_list = []
+    amoebanet_shapes_list = []
     for output_shape in model_gen_seq.shape_list:
         if isinstance(output_shape, list):
             temp_shape = []
@@ -319,11 +316,11 @@ def verify_config():
                     int(shape_tuple[3] * image_size_times),
                 )
                 temp_shape.append(x)
-            resnet_shapes_list.append(temp_shape)
+            amoebanet_shapes_list.append(temp_shape)
         else:
             if len(output_shape) == 2:
                 x = (int(output_shape[0]), output_shape[1])
-                resnet_shapes_list.append(x)
+                amoebanet_shapes_list.append(x)
             else:
                 if temp_count < spatial_size:
                     x = (
@@ -336,7 +333,7 @@ def verify_config():
                         ),
                         int(output_shape[3] * image_size_times / 1),
                     )
-                    resnet_shapes_list.append(x)
+                    amoebanet_shapes_list.append(x)
                 else:
                     x = (
                         int(output_shape[0]),
@@ -344,11 +341,11 @@ def verify_config():
                         int(output_shape[2] * image_size_times),
                         int(output_shape[3] * image_size_times),
                     )
-                    resnet_shapes_list.append(x)
+                    amoebanet_shapes_list.append(x)
             temp_count += 1
 
-print(model_gen_seq.shape_list, resnet_shapes_list)
+print(model_gen_seq.shape_list, amoebanet_shapes_list)
 
 del model_seq
 del model_gen_seq
@@ -385,7 +382,7 @@ def verify_config():
     split_size=split_size,
     input_size=(int(batch_size / parts), 3, image_size, image_size),
     balance=balance,
-    shape_list=resnet_shapes_list,
+    shape_list=amoebanet_shapes_list,
 )
 
 
@@ -393,7 +390,7 @@ def verify_config():
 model_gen.DDP_model(mpi_comm, num_spatial_parts, spatial_size, bucket_size=0)
 
-print("Shape list", resnet_shapes_list)
+print("Shape list", amoebanet_shapes_list)
 
 
 t_s = train_model_spatial(
@@ -426,7 +423,7 @@ def verify_config():
 
 if APP == 1:
     trainset = torchvision.datasets.ImageFolder(
-        "./train",
+        datapath,
         transform=transform,
         target_transform=None,
     )
diff --git a/benchmarks/spatial/model/master_amoebanet_run.py b/benchmarks/spatial/model/master_amoebanet_run.py
deleted file mode 100644
index dc4af405..00000000
--- a/benchmarks/spatial/model/master_amoebanet_run.py
+++ /dev/null
@@ -1,459 +0,0 @@
-import torch
-import torch.distributed as dist
-import torchvision.transforms as transforms
-import torchvision
-
-# import matplotlib.pyplot as plt
-import numpy as np
-import time
-import sys
-import math
-
-sys.path.append("/usr/WS1/jain8/project/pytorch_mp/mp/torch-gems/")
-
-
-from torchgems import parser
-from torchgems.mp_pipeline import model_generator
-from torchgems.train_spatial import get_shapes_spatial, split_input
-from torchgems.train_spatial_master import train_spatial_model_master
-import torchgems.comm as gems_comm
-
-parser_obj = parser.get_parser()
-args = parser_obj.parse_args()
-
-if args.halo_d2:
-    from models import amoebanet
-    from models import amoebanet_d2
-
-else:
-    from models import amoebanet
-
-gems_comm.initialize_cuda()
-
-
-class Unbuffered(object):
-    def __init__(self, stream):
-        self.stream = stream
-
-    def write(self, data):
-        self.stream.write(data)
-        self.stream.flush()
-
-    def writelines(self, datas):
-        self.stream.writelines(datas)
-        self.stream.flush()
-
-    def __getattr__(self, attr):
-        return getattr(self.stream, attr)
-
-
-def init_processes(backend="tcp"):
-    """Initialize the distributed environment."""
-    dist.init_process_group(backend)
-    size = dist.get_world_size()
-    rank = dist.get_rank()
-    return size, rank
-
-
-def get_depth(version, n):
-    if version == 1:
-        return n * 6 + 2
-    elif version == 2:
-        return n * 9 + 2
-
-
-sys.stdout = Unbuffered(sys.stdout)
-
-# torch.set_num_threads(1)
-np.random.seed(seed=1405)
-parts = args.parts
-batch_size = args.batch_size
-resnet_n = 12
-epoch = args.num_epochs
-ENABLE_ASYNC = True
-
-# APP
-# 1: Medical
-# 2: Cifar
-# 3: synthetic
-APP = 3
-amoebanet_test = False
-image_size = int(args.image_size)
-print("image size", image_size)
-steps = 100
-num_layers = args.num_layers
-num_filters = args.num_filters
-balance = args.balance
-split_size = args.split_size
-spatial_size = args.spatial_size
-ENABLE_MASTER_OPT = args.enable_master_comm_opt
-
-temp_num_spatial_parts = args.num_spatial_parts.split(",")
-
-if len(temp_num_spatial_parts) == 1:
-    num_spatial_parts_list = [int(temp_num_spatial_parts[0])]
-    num_spatial_parts = int(temp_num_spatial_parts[0])
-else:
-    num_spatial_parts = [int(i) for i in temp_num_spatial_parts]
-    num_spatial_parts_list = num_spatial_parts
-
-times = 1
-num_classes = 1000
-LOCAL_DP_LP = args.local_DP
-
-
-mpi_comm_first = gems_comm.MPIComm(
-    split_size=split_size,
-    ENABLE_MASTER=False,
-    ENABLE_SPATIAL=True,
-    num_spatial_parts=num_spatial_parts,
-    spatial_size=spatial_size,
-    LOCAL_DP_LP=LOCAL_DP_LP,
-)
-mpi_comm_second = gems_comm.MPIComm(
-    split_size=split_size,
-    ENABLE_MASTER=True,
-    ENABLE_SPATIAL=True,
-    num_spatial_parts=num_spatial_parts,
-    spatial_size=spatial_size,
-    LOCAL_DP_LP=LOCAL_DP_LP,
-    DISABLE_INIT=True,
-)
-
-gems_comm.sync_comms_for_master(mpi_comm_first, mpi_comm_second)
-comm_size = mpi_comm_first.size
-# rank = mpi_comm.local_rank
-# comm_size = mpi_comm.size
-# local_rank = rank
-
-# split_rank = mpi_comm.split_rank
-
-
-if args.balance != None:
-    balance = args.balance.split(",")
-    balance = [int(j) for j in balance]
-else:
-    balance = None
-
-
-image_size_seq = 512
-
-model_seq = amoebanet.amoebanetd(
-    num_layers=num_layers, num_filters=num_filters, num_classes=num_classes
-)
-print("length", len(model_seq), balance)
-model_gen_seq = model_generator(
-    model=model_seq,
-    split_size=split_size,
-    input_size=(int(batch_size / parts), 3, image_size_seq, image_size_seq),
-    balance=balance,
-)
-model_gen_seq.ready_model(
-    split_rank=mpi_comm_second.split_rank, GET_SHAPES_ON_CUDA=True
-)
-
-image_size_times = int(image_size / image_size_seq)
-
-resnet_shapes_list = get_shapes_spatial(
-    shape_list=model_gen_seq.shape_list,
-    slice_method=args.slice_method,
-    spatial_size=spatial_size,
-    num_spatial_parts_list=num_spatial_parts_list,
-    image_size_times=image_size_times,
-)
-
-print(model_gen_seq.shape_list, resnet_shapes_list)
-
-del model_seq
-del model_gen_seq
-torch.cuda.ipc_collect()
-
-
-if args.halo_d2:
-    model1 = amoebanet_d2.amoebanetd_spatial(
-        local_rank=mpi_comm_first.local_rank % mpi_comm_first.total_spatial_processes,
-        spatial_size=spatial_size,
-        num_spatial_parts=num_spatial_parts,
-        mp_size=split_size,
-        balance=balance,
-        slice_method="square",
-        num_classes=num_classes,
-        num_layers=num_layers,
-        num_filters=num_filters,
-    )
-
-    model2 = amoebanet_d2.amoebanetd_spatial(
-        local_rank=mpi_comm_second.local_rank % mpi_comm_second.total_spatial_processes,
-        spatial_size=spatial_size,
-        num_spatial_parts=num_spatial_parts,
-        mp_size=split_size,
-        balance=balance,
-        slice_method="square",
-        num_classes=num_classes,
-        num_layers=num_layers,
-        num_filters=num_filters,
-    )
-else:
-    model1 = amoebanet.amoebanetd_spatial(
-        local_rank=mpi_comm_first.local_rank % mpi_comm_first.total_spatial_processes,
-        spatial_size=spatial_size,
-        num_spatial_parts=num_spatial_parts,
-        mp_size=split_size,
-        balance=balance,
-        slice_method="square",
-        num_classes=num_classes,
-        num_layers=num_layers,
-        num_filters=num_filters,
-    )
-
-    model2 = amoebanet.amoebanetd_spatial(
-        local_rank=mpi_comm_second.local_rank % mpi_comm_second.total_spatial_processes,
-        spatial_size=spatial_size,
-        num_spatial_parts=num_spatial_parts,
-        mp_size=split_size,
-        balance=balance,
-        slice_method="square",
-        num_classes=num_classes,
-        num_layers=num_layers,
-        num_filters=num_filters,
-    )
-
-
-model_gen1 = model_generator(
-    model=model1,
-    split_size=split_size,
-    input_size=(int(batch_size / parts), 3, image_size, image_size),
-    balance=balance,
-    shape_list=resnet_shapes_list,
-)
-model_gen1.ready_model(split_rank=mpi_comm_first.split_rank)
-# model_gen1.DDP_model(mpi_comm_first, num_spatial_parts, spatial_size, bucket_size=25, local_rank = mpi_comm_first.local_rank)
-
-
-model_gen2 = model_generator(
-    model=model2,
-    split_size=split_size,
-    input_size=(int(batch_size / parts), 3, image_size, image_size),
-    balance=balance,
-    shape_list=resnet_shapes_list,
-)
-model_gen2.ready_model(split_rank=mpi_comm_second.split_rank)
-# model_gen2.DDP_model(mpi_comm_second, num_spatial_parts, spatial_size, bucket_size=25, local_rank = mpi_comm_second.local_rank)
-
-
-# model_gen.mp_size = 5
-print("Shape list", resnet_shapes_list)
-
-
-# t_s1 = train_model_spatial(model_gen1, mpi_comm_first.local_rank,batch_size,epochs=1, spatial_size=spatial_size, num_spatial_parts=num_spatial_parts ,criterion=None,optimizer=None,parts=parts,ASYNC=True,GEMS_INVERSE=False, slice_method = args.slice_method,
-# LOCAL_DP_LP=LOCAL_DP_LP,
-# mpi_comm = mpi_comm_first)
-
-
-# t_s2 = train_model_spatial(model_gen2, mpi_comm_second.local_rank,batch_size,epochs=1, spatial_size=spatial_size, num_spatial_parts=num_spatial_parts ,criterion=None,optimizer=None,parts=parts,ASYNC=True,GEMS_INVERSE=True, slice_method = args.slice_method,
-# LOCAL_DP_LP=LOCAL_DP_LP,
-# mpi_comm = mpi_comm_second)
-
-t_s_master = train_spatial_model_master(
-    model_gen1,
-    model_gen2,
-    batch_size,
-    spatial_size,
-    num_spatial_parts,
-    args.slice_method,
-    mpi_comm_first,
-    mpi_comm_second,
-    LOCAL_DP_LP=LOCAL_DP_LP,
-    criterion=None,
-    optimizer=None,
-    parts=parts,
-    ASYNC=True,
-    replications=int(args.times / 2),
-)
-
-x = torch.zeros(
-    (batch_size, 3, int(image_size / 2), int(image_size / 2)), device="cuda"
-)
-y = torch.zeros((batch_size,), dtype=torch.long, device="cuda")
-
-
-transform = transforms.Compose(
-    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
-)
-torch.manual_seed(0)
-
-if APP == 1:
-    trainset = torchvision.datasets.ImageFolder(
-        "/usr/workspace/jain8/project/cancer/1024_1024_5/train",
-        transform=transform,
-        target_transform=None,
-    )
-    my_dataloader = torch.utils.data.DataLoader(
-        trainset,
-        batch_size=times * batch_size,
-        shuffle=True,
-        num_workers=0,
-        pin_memory=True,
-    )
-    size_dataset = 1030
-elif APP == 2:
-    trainset = torchvision.datasets.CIFAR10(
-        root="./data", train=True, download=True, transform=transform
-    )
-    my_dataloader = torch.utils.data.DataLoader(
-        trainset,
-        batch_size=times * batch_size,
-        shuffle=False,
-        num_workers=0,
-        pin_memory=True,
-    )
-    size_dataset = 50000
-else:
-    my_dataset = torchvision.datasets.FakeData(
-        size=10 * batch_size * args.times,
-        image_size=(3, image_size, image_size),
-        num_classes=num_classes,
-        transform=transform,
-        target_transform=None,
-        random_offset=0,
-    )
-    my_dataloader = torch.utils.data.DataLoader(
-        my_dataset,
-        batch_size=batch_size * args.times,
-        shuffle=False,
-        num_workers=0,
-        pin_memory=True,
-    )
-    size_dataset = 10 * batch_size
-
-
-# sync_allreduce.sync_model_spatial(model_gen)
-perf = []
-
-sync_comm = gems_comm.SyncAllreduce(mpi_comm_first)
-
-
-MASTER = args.times
-
-print("ENABLE_MASTER_OPT", ENABLE_MASTER_OPT)
-
-
-def run_epoch():
-    for i_e in range(epoch):
-        loss = 0
-        correct = 0
-        t = time.time()
-        for i, data in enumerate(my_dataloader, 0):
-            start_event = torch.cuda.Event(enable_timing=True, blocking=True)
-            end_event = torch.cuda.Event(enable_timing=True, blocking=True)
-            start_event.record()
-            if i > math.floor(size_dataset / (times * batch_size)) - 1:
-                break
-            # inputs=data_x
-            # labels = data_y
-            inputs, labels = data
-
-            # inputs = inputs.to(device)
-            # labels = labels.to(device)
-
-            # t= time.time()
-            if mpi_comm_first.local_rank < num_spatial_parts_list[0]:
-                x = split_input(
-                    inputs=inputs,
-                    image_size=image_size,
-                    slice_method=args.slice_method,
-                    local_rank=mpi_comm_first.local_rank,
-                    num_spatial_parts_list=num_spatial_parts_list,
-                )
-            elif mpi_comm_second.local_rank < num_spatial_parts_list[0]:
-                x = split_input(
-                    inputs=inputs,
-                    image_size=image_size,
-                    slice_method=args.slice_method,
-                    local_rank=mpi_comm_second.local_rank,
-                    num_spatial_parts_list=num_spatial_parts_list,
-                )
-            else:
-                x = inputs
-
-            # for j in range(MASTER):
-            # temp_loss,temp_correct = t_s1.run_step(x,labels)
-            # temp_loss,temp_correct = t_s2.run_step(x,labels)
-
-            if ENABLE_MASTER_OPT:
-                temp_loss, temp_correct = t_s_master.run_step_allreduce(
-                    x, labels, i % 2 == 1
-                )
-            else:
-                temp_loss, temp_correct = t_s_master.run_step(x, labels)
-
-            loss += temp_loss
-            correct += temp_correct
-
-            start_event_allreduce = torch.cuda.Event(enable_timing=True, blocking=True)
-            end_event_allreduce = torch.cuda.Event(enable_timing=True, blocking=True)
-            start_event_allreduce.record()
-            t_allreduce_temp = time.time()
-
-            if ENABLE_MASTER_OPT == False:
-                sync_comm.apply_allreduce_master_master(
-                    model_gen1, model_gen2, mpi_comm_first, mpi_comm_second
-                )
-
-            """
-            if(local_rank < spatial_size * num_spatial_parts):
-                None
-                #No need for this as, DDP is now used
-                # sync_allreduce.apply_allreduce(model_gen,mpi_comm.spatial_allreduce_grp)
-            """
-            torch.cuda.synchronize()
-
-            if ENABLE_MASTER_OPT:
-                if i % 2 == 1:
-                    t_s_master.train_model1.update()
-                else:
-                    t_s_master.train_model2.update()
-            else:
-                t_s_master.train_model1.update()
-                t_s_master.train_model2.update()
-
-            end_event_allreduce.record()
-            torch.cuda.synchronize()
-            t_allreduce = start_event_allreduce.elapsed_time(end_event_allreduce) / 1000
-            t_allreduce = time.time() - t_allreduce_temp
-
-            if mpi_comm_second.local_rank == comm_size - 1:
-                None
-                # print("Step",i," LOSS",temp_loss, " Global loss:",loss/(i+1), " Acc:",temp_correct)
-
-            if ENABLE_MASTER_OPT:
-                torch.distributed.barrier()
-
-            end_event.record()
-            torch.cuda.synchronize()
-            t = start_event.elapsed_time(end_event) / 1000
-            if mpi_comm_second.local_rank == 0:
-                None
-                print(
-                    "images per sec:",
-                    batch_size / t,
-                    "Time:",
-                    t,
-                    " Time Allreduce:",
-                    t_allreduce,
-                )
-                perf.append(batch_size / t)
-
-            t = time.time()
-        if mpi_comm_second.local_rank == comm_size - 1:
-            print("epoch", i_e, " Global loss:", loss, " acc", correct / i)
-
-
-run_epoch()
-
-if mpi_comm_second.local_rank == 0:
-    print("Mean {} Median {}".format(sum(perf) / len(perf), np.median(perf)))
-# y, _= t_s.forward_pass(x,y,part_number=0)
-# t_s.backward_pass(y,part_number=0)
-exit()
diff --git a/benchmarks/spatial/model/resnet_model.py b/benchmarks/spatial/model/resnet_run.py
similarity index 95%
rename from benchmarks/spatial/model/resnet_model.py
rename to benchmarks/spatial/model/resnet_run.py
index d61c3425..414b5992 100644
--- a/benchmarks/spatial/model/resnet_model.py
+++ b/benchmarks/spatial/model/resnet_run.py
@@ -61,24 +61,27 @@ def get_depth(version, n):
 
 sys.stdout = Unbuffered(sys.stdout)
 
 np.random.seed(seed=1405)
+
+ENABLE_ASYNC = True
 parts = args.parts
 batch_size = args.batch_size
-resnet_n = 12
 epoch = args.num_epochs
-ENABLE_ASYNC = True
+image_size = int(args.image_size)
+balance = args.balance
+split_size = args.split_size
+spatial_size = args.spatial_size
+times = args.times
+datapath = args.datapath
 
 # APP
 # 1: Medical
 # 2: Cifar
 # 3: synthetic
 APP = args.app
-amoebanet_test = False
-image_size = int(args.image_size)
-print("image size", image_size)
+
+resnet_n = 12
+num_classes = 10
 steps = 100
-balance = args.balance
-split_size = args.split_size
-spatial_size = args.spatial_size
 
 temp_num_spatial_parts = args.num_spatial_parts.split(",")
 
@@ -91,9 +94,6 @@ def get_depth(version, n):
 
 spatial_part_size = num_spatial_parts_list[0]  # Partition size for spatial parallelism
 
-times = 1
-num_classes = 10
-
 
 def isPowerTwo(num):
     return not (num & (num - 1))
@@ -297,7 +297,7 @@ def verify_config():
 
 if APP == 1:
     trainset = torchvision.datasets.ImageFolder(
-        "./train", transform=transform, target_transform=None
+        datapath, transform=transform, target_transform=None
     )
     my_dataloader = torch.utils.data.DataLoader(
         trainset,
diff --git a/docs/assets/images/AmeobaNet_img_size_1024.png b/docs/assets/images/AmeobaNet_img_size_1024.png
index 1f587c62..35574ff4 100644
Binary files a/docs/assets/images/AmeobaNet_img_size_1024.png and b/docs/assets/images/AmeobaNet_img_size_1024.png differ
diff --git a/docs/assets/images/AmeobaNet_img_size_2048.png b/docs/assets/images/AmeobaNet_img_size_2048.png
index 1f587c62..4b9a8244 100644
Binary files a/docs/assets/images/AmeobaNet_img_size_2048.png and b/docs/assets/images/AmeobaNet_img_size_2048.png differ
diff --git a/setup.py b/setup.py
index 07391670..9ca37f4a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,3 @@
 from setuptools import setup, find_packages
 
-setup(name="torch-gems", version="1.0", packages=find_packages())
+setup(name="now-dl", version="1.0", packages=find_packages())
diff --git a/torchgems/parser.py b/torchgems/parser.py
index 31e69787..58d2bb17 100644
--- a/torchgems/parser.py
+++ b/torchgems/parser.py
@@ -3,7 +3,7 @@
 def get_parser():
     parser = argparse.ArgumentParser(
-        description="MP-DP ResNet Script",
+        description="SP-MP-DP Configuration Script",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
 
@@ -14,31 +14,8 @@ def get_parser():
         action="store_true",
     )
 
-    parser.add_argument(
-        "--fp16-allreduce",
-        action="store_true",
-        default=False,
-        help="use fp16 compression during allreduce",
-    )
-
-    parser.add_argument(
-        "--model", type=str, default="resnet50", help="model to benchmark"
-    )
 
     parser.add_argument("--batch-size", type=int, default=32, help="input batch size")
 
-    parser.add_argument(
-        "--learning-rate",
-        type=float,
-        default=0.001,
-        help="learning rate for the optimizer",
-    )
-    parser.add_argument(
-        "--num-gpus-mp", type=int, default=1, help="number of GPUS per node for MP"
-    )
-    parser.add_argument(
-        "--mem-per-process", type=float, default=1, help="TF GPU memory per GPU"
-    )
-
     parser.add_argument("--parts", type=int, default=1, help="Number of parts for MP")
 
     parser.add_argument(
@@ -66,29 +43,6 @@ def get_parser():
         "--image-size", type=int, default=32, help="Image size for synthetic benchmark"
     )
 
-    parser.add_argument(
-        "--dp-per-node", type=int, default=1, help="Number of DP modes per node"
-    )
-
-    parser.add_argument(
-        "--enable-dp",
-        dest="enable_dp",
-        action="store_true",
-        help="Enable DP for pytorch scripts",
-    )
-
-    parser.add_argument(
-        "--enable-master-comm-opt",
-        dest="enable_master_comm_opt",
-        action="store_true",
-        default=False,
-        help="Enable communication optimization for MASTER in Spatial",
-    )
-
-    parser.add_argument(
-        "--num-gpu-per-node", type=int, default=4, help="Number of GPUs per node"
-    )
-
     parser.add_argument("--num-epochs", type=int, default=1, help="Number of epochs")
 
     parser.add_argument(
@@ -97,10 +51,6 @@ def get_parser():
     parser.add_argument(
         "--num-filters", type=int, default=416, help="Number of filters in amoebanet"
    )
-    parser.add_argument("--unet-b", type=int, default=6, help="B hyperparamter in unet")
-    parser.add_argument(
-        "--unet-c", type=int, default=72, help="C hyperparamter in unet"
-    )
     parser.add_argument(
         "--balance",
         type=str,
@@ -138,6 +88,10 @@ def get_parser():
         default=3,
         help="Application type (1.medical, 2.cifar, and synthetic) in Spatial parallelism",
     )
-
-    parser.set_defaults(enable_dp=False)
+    parser.add_argument(
+        "--datapath",
+        type=str,
+        default="./train",
+        help="Local dataset path",
+    )
 
     return parser
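
A note on the `-np` values used in the README examples above: each of the first `spatial_size` model partitions is replicated across `num_spatial_parts` GPUs, while every remaining model partition occupies one GPU. The sketch below is illustrative only and not part of this repository (the helper name `required_ranks` is ours); it simply reproduces the 5- and 9-GPU counts used in the examples.

```python
# Hypothetical helper (not in now-dl): sanity-check the -np value passed to
# mpirun_rsh for the spatial-parallelism benchmarks. Assumes the rank layout
# the README examples imply: the first `spatial_size` model partitions span
# `num_spatial_parts` ranks each; all remaining partitions use one rank each.
def required_ranks(split_size: int, spatial_size: int, num_spatial_parts: int) -> int:
    return spatial_size * num_spatial_parts + (split_size - spatial_size)

# 5 GPUs: --split-size 2 --spatial-size 1 --num-spatial-parts 4
assert required_ranks(split_size=2, spatial_size=1, num_spatial_parts=4) == 5
# 9 GPUs: --split-size 3 --spatial-size 2 --num-spatial-parts 4
assert required_ranks(split_size=3, spatial_size=2, num_spatial_parts=4) == 9
```

Note also that with `--app 1` the benchmark scripts pass `--datapath` directly to `torchvision.datasets.ImageFolder`, so the directory is expected to follow the standard ImageFolder layout (one subdirectory per class).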