Make num_workers as arg
Signed-off-by: Radha Guhane <[email protected]>
RadhaGulhane13 committed Sep 24, 2023
1 parent 7427cbf commit 5473854
Showing 7 changed files with 39 additions and 20 deletions.
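Every script in this commit follows the same two-step pattern: read the new flag once from the parsed arguments, then forward it to each torch.utils.data.DataLoader in place of the previous hard-coded value (the GEMS scripts passed nothing at all, which falls back to the DataLoader default of 0). A minimal, self-contained sketch of the pattern (the --batch-size flag and the toy TensorDataset are illustrative stand-ins, not part of this commit):

```python
import argparse

import torch
from torch.utils.data import DataLoader, TensorDataset

parser = argparse.ArgumentParser()
parser.add_argument("--batch-size", type=int, default=32)
parser.add_argument(
    "--num-workers",
    type=int,
    default=0,  # 0 loads batches in the main process, matching the old behavior
)
args = parser.parse_args()

# Toy stand-in for the benchmarks' trainset.
trainset = TensorDataset(
    torch.randn(256, 3, 32, 32),
    torch.zeros(256, dtype=torch.long),
)

my_dataloader = DataLoader(
    trainset,
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=args.num_workers,  # previously hard-coded to 0 (or omitted)
    pin_memory=True,
)
```

With num_workers > 0, batches are assembled in background worker processes rather than in the training process itself, which is usually the point of exposing this knob on data-loading-bound benchmarks.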
4 changes: 4 additions & 0 deletions benchmarks/gems_master_model/benchmark_amoebanet_gems_master.py
@@ -64,6 +64,7 @@ def init_processes(backend="mpi"):
 balance = args.balance
 mp_size = args.split_size
 datapath = args.datapath
+num_workers = args.num_workers
 num_classes = args.num_classes
 
 ##################### AmoebaNet GEMS model specific parameters #####################
@@ -192,6 +193,7 @@ def init_processes(backend="mpi"):
     trainset,
     batch_size=times * batch_size,
     shuffle=True,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -211,6 +213,7 @@ def init_processes(backend="mpi"):
     trainset,
     batch_size=times * batch_size,
     shuffle=False,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -227,6 +230,7 @@ def init_processes(backend="mpi"):
     my_dataset,
     batch_size=batch_size * times,
     shuffle=False,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 10 * batch_size
4 changes: 4 additions & 0 deletions benchmarks/gems_master_model/benchmark_resnet_gems_master.py
@@ -65,6 +65,7 @@ def init_processes(backend="mpi"):
 balance = args.balance
 mp_size = args.split_size
 datapath = args.datapath
+num_workers = args.num_workers
 num_classes = args.num_classes
 
 ################## ResNet model specific parameters/functions ##################
@@ -208,6 +209,7 @@ def get_depth(version, n):
     trainset,
     batch_size=times * batch_size,
     shuffle=True,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -219,6 +221,7 @@ def get_depth(version, n):
     trainset,
     batch_size=times * batch_size,
     shuffle=False,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -235,6 +238,7 @@ def get_depth(version, n):
     my_dataset,
     batch_size=batch_size * times,
     shuffle=False,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 10 * batch_size
7 changes: 4 additions & 3 deletions benchmarks/layer_parallelism/benchmark_amoebanet_lp.py
@@ -70,6 +70,7 @@ def __getattr__(self, attr):
 mp_size = args.split_size
 times = args.times
 datapath = args.datapath
+num_workers = args.num_workers
 # APP
 # 1: Medical
 # 2: Cifar
@@ -186,7 +187,7 @@ def __getattr__(self, attr):
     trainset,
     batch_size=times * batch_size,
     shuffle=True,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -198,7 +199,7 @@ def __getattr__(self, attr):
     trainset,
     batch_size=times * batch_size,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 50000
@@ -215,7 +216,7 @@ def __getattr__(self, attr):
     my_dataset,
     batch_size=batch_size * times,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 10 * batch_size
7 changes: 4 additions & 3 deletions benchmarks/layer_parallelism/benchmark_resnet_lp.py
@@ -67,6 +67,7 @@ def __getattr__(self, attr):
 mp_size = args.split_size
 times = args.times
 datapath = args.datapath
+num_workers = args.num_workers
 # APP
 # 1: Medical
 # 2: Cifar
@@ -197,7 +198,7 @@ def get_depth(version, n):
     trainset,
     batch_size=times * batch_size,
     shuffle=True,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -209,7 +210,7 @@ def get_depth(version, n):
     trainset,
     batch_size=times * batch_size,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 50000
@@ -226,7 +227,7 @@ def get_depth(version, n):
     my_dataset,
     batch_size=batch_size * times,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 10 * batch_size
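A reasonable way to choose a value for the new flag is to bound it by the CPUs actually available to the process rather than the machine-wide count. A hedged helper along those lines (not part of this commit; sched_getaffinity is Linux-only, hence the fallback):

```python
import os

def default_num_workers(cap=8):
    # sched_getaffinity(0) honors cgroup and CPU-affinity limits,
    # which os.cpu_count() ignores.
    try:
        available = len(os.sched_getaffinity(0))
    except AttributeError:  # platforms without sched_getaffinity
        available = os.cpu_count() or 1
    return min(available, cap)
```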
19 changes: 10 additions & 9 deletions benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py
@@ -86,6 +86,7 @@ def init_processes(backend="tcp"):
 spatial_size = args.spatial_size
 times = args.times
 datapath = args.datapath
+num_workers = args.num_workers
 LOCAL_DP_LP = args.local_DP
 # APP
 # 1: Medical
@@ -111,11 +112,11 @@ def isPowerTwo(num):
 
 
 """
-For Amoebanet model, image size and image size after partitioning should be power of two. 
-As, Amoebanet performs summation of results of two convolution layers during training, 
-odd input size(i.e. image size which is not power of 2) will give different output sizes 
-for convolution operations present at same layer, thus it will throw error as addition 
-operation can not be performed with diffent size outputs. 
+For Amoebanet model, image size and image size after partitioning should be power of two.
+As, Amoebanet performs summation of results of two convolution layers during training,
+odd input size(i.e. image size which is not power of 2) will give different output sizes
+for convolution operations present at same layer, thus it will throw error as addition
+operation can not be performed with diffent size outputs.
 """


@@ -152,7 +153,7 @@ def verify_config():
 ##################### AmoebaNet model specific parameters #####################
 
 """
-"image_size_seq" is required to determine the output shape after spatial partitioning of images. 
+"image_size_seq" is required to determine the output shape after spatial partitioning of images.
 The shape of the output will be determined for each model partition based on the values in "image_size_seq."
 These values will then be used to calculate the output shape for a given input size and spatial partition.
 """
@@ -470,7 +471,7 @@ def verify_config():
     trainset,
     batch_size=times * batch_size,
     shuffle=True,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -482,7 +483,7 @@ def verify_config():
     trainset,
     batch_size=times * batch_size,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 50000
@@ -499,7 +500,7 @@ def verify_config():
     my_dataset,
     batch_size=batch_size * times,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 10 * batch_size
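Both spatial-parallelism docstrings above lean on isPowerTwo, whose body sits outside this diff. A standard bit-twiddling implementation consistent with how the check is described (a sketch, not necessarily the repository's exact code):

```python
def isPowerTwo(num):
    # A positive power of two has exactly one set bit,
    # so clearing the lowest set bit must leave zero.
    return num > 0 and (num & (num - 1)) == 0

assert isPowerTwo(1024) and not isPowerTwo(1000)
```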
11 changes: 6 additions & 5 deletions benchmarks/spatial_parallelism/benchmark_resnet_sp.py
@@ -84,6 +84,7 @@ def init_processes(backend="mpi"):
 spatial_size = args.spatial_size
 times = args.times
 datapath = args.datapath
+num_workers = args.num_workers
 
 # APP
 # 1: Medical
@@ -106,7 +107,7 @@ def init_processes(backend="mpi"):
 ################## ResNet model specific parameters/functions ##################
 
 """
-"image_size_seq" is required to determine the output shape after spatial partitioning of images. 
+"image_size_seq" is required to determine the output shape after spatial partitioning of images.
 The shape of the output will be determined for each model partition based on the values in "image_size_seq."
 These values will then be used to calculate the output shape for a given input size and spatial partition.
 """
"""
Expand All @@ -129,7 +130,7 @@ def isPowerTwo(num):


"""
For ResNet model, image size and image size after partitioning should be power of two.
For ResNet model, image size and image size after partitioning should be power of two.
As, ResNet performs convolution operations at different layers, odd input size
(i.e. image size which is not power of 2) will lead to truncation of input. Thus,
other GPU devices will receive truncated input with unexpected input size.
Expand Down Expand Up @@ -470,7 +471,7 @@ def verify_config():
trainset,
batch_size=times * batch_size,
shuffle=True,
num_workers=0,
num_workers=num_workers,
pin_memory=True,
)
size_dataset = len(my_dataloader.dataset)
@@ -482,7 +483,7 @@ def verify_config():
     trainset,
     batch_size=times * batch_size,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 50000
@@ -499,7 +500,7 @@ def verify_config():
     my_dataset,
     batch_size=batch_size * times,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 10 * batch_size
7 changes: 7 additions & 0 deletions src/torchgems/parser.py
@@ -126,4 +126,11 @@ def get_parser():
         help="local Dataset path",
     )
 
+    parser.add_argument(
+        "--num-workers",
+        type=int,
+        default=0,
+        help="number of worker processes used by the DataLoader",
+    )
+
     return parser
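Since argparse converts hyphens to underscores, the flag surfaces as args.num_workers, which is exactly the name every benchmark above reads. A quick sanity check (assuming src/ is on PYTHONPATH so that torchgems resolves):

```python
from torchgems.parser import get_parser

args = get_parser().parse_args(["--num-workers", "4"])
assert args.num_workers == 4  # hyphenated flag, underscored attribute
```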
