diff --git a/benchmarks/gems_master_model/benchmark_amoebanet_gems_master.py b/benchmarks/gems_master_model/benchmark_amoebanet_gems_master.py index 426133c3..3d7a80f5 100644 --- a/benchmarks/gems_master_model/benchmark_amoebanet_gems_master.py +++ b/benchmarks/gems_master_model/benchmark_amoebanet_gems_master.py @@ -64,6 +64,7 @@ def init_processes(backend="mpi"): balance = args.balance mp_size = args.split_size datapath = args.datapath +num_workers = args.num_workers num_classes = args.num_classes ##################### AmoebaNet GEMS model specific parameters ##################### @@ -192,6 +193,7 @@ def init_processes(backend="mpi"): trainset, batch_size=times * batch_size, shuffle=True, + num_workers=num_workers, pin_memory=True, ) size_dataset = len(my_dataloader.dataset) @@ -211,6 +213,7 @@ def init_processes(backend="mpi"): trainset, batch_size=times * batch_size, shuffle=False, + num_workers=num_workers, pin_memory=True, ) size_dataset = len(my_dataloader.dataset) @@ -227,6 +230,7 @@ def init_processes(backend="mpi"): my_dataset, batch_size=batch_size * times, shuffle=False, + num_workers=num_workers, pin_memory=True, ) size_dataset = 10 * batch_size diff --git a/benchmarks/gems_master_model/benchmark_resnet_gems_master.py b/benchmarks/gems_master_model/benchmark_resnet_gems_master.py index 68707f43..bacde2d2 100644 --- a/benchmarks/gems_master_model/benchmark_resnet_gems_master.py +++ b/benchmarks/gems_master_model/benchmark_resnet_gems_master.py @@ -65,6 +65,7 @@ def init_processes(backend="mpi"): balance = args.balance mp_size = args.split_size datapath = args.datapath +num_workers = args.num_workers num_classes = args.num_classes ################## ResNet model specific parameters/functions ################## @@ -208,6 +209,7 @@ def get_depth(version, n): trainset, batch_size=times * batch_size, shuffle=True, + num_workers=num_workers, pin_memory=True, ) size_dataset = len(my_dataloader.dataset) @@ -219,6 +221,7 @@ def get_depth(version, n): trainset, 
batch_size=times * batch_size, shuffle=False, + num_workers=num_workers, pin_memory=True, ) size_dataset = len(my_dataloader.dataset) @@ -235,6 +238,7 @@ def get_depth(version, n): my_dataset, batch_size=batch_size * times, shuffle=False, + num_workers=num_workers, pin_memory=True, ) size_dataset = 10 * batch_size diff --git a/benchmarks/layer_parallelism/benchmark_amoebanet_lp.py b/benchmarks/layer_parallelism/benchmark_amoebanet_lp.py index ebad7692..3dc47968 100644 --- a/benchmarks/layer_parallelism/benchmark_amoebanet_lp.py +++ b/benchmarks/layer_parallelism/benchmark_amoebanet_lp.py @@ -70,6 +70,7 @@ def __getattr__(self, attr): mp_size = args.split_size times = args.times datapath = args.datapath +num_workers = args.num_workers # APP # 1: Medical # 2: Cifar @@ -186,7 +187,7 @@ def __getattr__(self, attr): trainset, batch_size=times * batch_size, shuffle=True, - num_workers=0, + num_workers=num_workers, pin_memory=True, ) size_dataset = len(my_dataloader.dataset) @@ -198,7 +199,7 @@ def __getattr__(self, attr): trainset, batch_size=times * batch_size, shuffle=False, - num_workers=0, + num_workers=num_workers, pin_memory=True, ) size_dataset = 50000 @@ -215,7 +216,7 @@ def __getattr__(self, attr): my_dataset, batch_size=batch_size * times, shuffle=False, - num_workers=0, + num_workers=num_workers, pin_memory=True, ) size_dataset = 10 * batch_size diff --git a/benchmarks/layer_parallelism/benchmark_resnet_lp.py b/benchmarks/layer_parallelism/benchmark_resnet_lp.py index 8d48f473..9cf1d13c 100644 --- a/benchmarks/layer_parallelism/benchmark_resnet_lp.py +++ b/benchmarks/layer_parallelism/benchmark_resnet_lp.py @@ -67,6 +67,7 @@ def __getattr__(self, attr): mp_size = args.split_size times = args.times datapath = args.datapath +num_workers = args.num_workers # APP # 1: Medical # 2: Cifar @@ -197,7 +198,7 @@ def get_depth(version, n): trainset, batch_size=times * batch_size, shuffle=True, - num_workers=0, + num_workers=num_workers, pin_memory=True, ) size_dataset = 
len(my_dataloader.dataset) @@ -209,7 +210,7 @@ def get_depth(version, n): trainset, batch_size=times * batch_size, shuffle=False, - num_workers=0, + num_workers=num_workers, pin_memory=True, ) size_dataset = 50000 @@ -226,7 +227,7 @@ def get_depth(version, n): my_dataset, batch_size=batch_size * times, shuffle=False, - num_workers=0, + num_workers=num_workers, pin_memory=True, ) size_dataset = 10 * batch_size diff --git a/benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py b/benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py index 1080d6d9..e7f54ad9 100644 --- a/benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py +++ b/benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py @@ -86,6 +86,7 @@ def init_processes(backend="tcp"): spatial_size = args.spatial_size times = args.times datapath = args.datapath +num_workers = args.num_workers LOCAL_DP_LP = args.local_DP # APP # 1: Medical @@ -111,11 +112,11 @@ def isPowerTwo(num): """ -For Amoebanet model, image size and image size after partitioning should be power of two. -As, Amoebanet performs summation of results of two convolution layers during training, -odd input size(i.e. image size which is not power of 2) will give different output sizes -for convolution operations present at same layer, thus it will throw error as addition -operation can not be performed with diffent size outputs. +For Amoebanet model, image size and image size after partitioning should be power of two. +As, Amoebanet performs summation of results of two convolution layers during training, +odd input size(i.e. image size which is not power of 2) will give different output sizes +for convolution operations present at same layer, thus it will throw error as addition +operation can not be performed with different size outputs. 
""" @@ -152,7 +153,7 @@ def verify_config(): ##################### AmoebaNet model specific parameters ##################### """ -"image_size_seq" is required to determine the output shape after spatial partitioning of images. +"image_size_seq" is required to determine the output shape after spatial partitioning of images. The shape of the output will be determined for each model partition based on the values in "image_size_seq." These values will then be used to calculate the output shape for a given input size and spatial partition. """ @@ -470,7 +471,7 @@ def verify_config(): trainset, batch_size=times * batch_size, shuffle=True, - num_workers=0, + num_workers=num_workers, pin_memory=True, ) size_dataset = len(my_dataloader.dataset) @@ -482,7 +483,7 @@ def verify_config(): trainset, batch_size=times * batch_size, shuffle=False, - num_workers=0, + num_workers=num_workers, pin_memory=True, ) size_dataset = 50000 @@ -499,7 +500,7 @@ def verify_config(): my_dataset, batch_size=batch_size * times, shuffle=False, - num_workers=0, + num_workers=num_workers, pin_memory=True, ) size_dataset = 10 * batch_size diff --git a/benchmarks/spatial_parallelism/benchmark_resnet_sp.py b/benchmarks/spatial_parallelism/benchmark_resnet_sp.py index e4459fc4..512bc7de 100644 --- a/benchmarks/spatial_parallelism/benchmark_resnet_sp.py +++ b/benchmarks/spatial_parallelism/benchmark_resnet_sp.py @@ -84,6 +84,7 @@ def init_processes(backend="mpi"): spatial_size = args.spatial_size times = args.times datapath = args.datapath +num_workers = args.num_workers # APP # 1: Medical @@ -106,7 +107,7 @@ def init_processes(backend="mpi"): ################## ResNet model specific parameters/functions ################## """ -"image_size_seq" is required to determine the output shape after spatial partitioning of images. +"image_size_seq" is required to determine the output shape after spatial partitioning of images. 
The shape of the output will be determined for each model partition based on the values in "image_size_seq." These values will then be used to calculate the output shape for a given input size and spatial partition. """ @@ -129,7 +130,7 @@ def isPowerTwo(num): """ -For ResNet model, image size and image size after partitioning should be power of two. +For ResNet model, image size and image size after partitioning should be power of two. As, ResNet performs convolution operations at different layers, odd input size (i.e. image size which is not power of 2) will lead to truncation of input. Thus, other GPU devices will receive truncated input with unexpected input size. @@ -470,7 +471,7 @@ def verify_config(): trainset, batch_size=times * batch_size, shuffle=True, - num_workers=0, + num_workers=num_workers, pin_memory=True, ) size_dataset = len(my_dataloader.dataset) @@ -482,7 +483,7 @@ def verify_config(): trainset, batch_size=times * batch_size, shuffle=False, - num_workers=0, + num_workers=num_workers, pin_memory=True, ) size_dataset = 50000 @@ -499,7 +500,7 @@ def verify_config(): my_dataset, batch_size=batch_size * times, shuffle=False, - num_workers=0, + num_workers=num_workers, pin_memory=True, ) size_dataset = 10 * batch_size diff --git a/src/torchgems/parser.py b/src/torchgems/parser.py index 4df14797..0b6f5678 100644 --- a/src/torchgems/parser.py +++ b/src/torchgems/parser.py @@ -126,4 +126,11 @@ def get_parser(): help="local Dataset path", ) + parser.add_argument( + "--num-workers", + type=int, + default=0, + help="Number of worker processes used by the DataLoader (default: 0, load in main process)", + ) + return parser