Make num_workers as arg
Signed-off-by: Radha Guhane <[email protected]>
RadhaGulhane13 committed Sep 24, 2023
1 parent 7427cbf commit 5473854
Showing 7 changed files with 39 additions and 20 deletions.
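Every script in this commit follows the same two-step pattern: read the new flag once from the parsed arguments, then forward it to each torch.utils.data.DataLoader in place of the previous hard-coded value (the GEMS scripts passed nothing at all, which falls back to the DataLoader default of 0). A minimal, self-contained sketch of the pattern (the --batch-size flag and the toy TensorDataset are illustrative stand-ins, not part of this commit):

```python
import argparse

import torch
from torch.utils.data import DataLoader, TensorDataset

parser = argparse.ArgumentParser()
parser.add_argument("--batch-size", type=int, default=32)
parser.add_argument(
    "--num-workers",
    type=int,
    default=0,  # 0 loads batches in the main process, matching the old behavior
)
args = parser.parse_args()

# Toy stand-in for the benchmarks' trainset.
trainset = TensorDataset(
    torch.randn(256, 3, 32, 32),
    torch.zeros(256, dtype=torch.long),
)

my_dataloader = DataLoader(
    trainset,
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=args.num_workers,  # previously hard-coded to 0 (or omitted)
    pin_memory=True,
)
```

With num_workers > 0, batches are assembled in background worker processes rather than in the training process itself, which is usually the point of exposing this knob on data-loading-bound benchmarks.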
4 changes: 4 additions & 0 deletions benchmarks/gems_master_model/benchmark_amoebanet_gems_master.py
@@ -64,6 +64,7 @@ def init_processes(backend="mpi"):
 balance = args.balance
 mp_size = args.split_size
 datapath = args.datapath
+num_workers = args.num_workers
 num_classes = args.num_classes
 
 ##################### AmoebaNet GEMS model specific parameters #####################
@@ -192,6 +193,7 @@ def init_processes(backend="mpi"):
     trainset,
     batch_size=times * batch_size,
     shuffle=True,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -211,6 +213,7 @@ def init_processes(backend="mpi"):
     trainset,
     batch_size=times * batch_size,
     shuffle=False,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -227,6 +230,7 @@ def init_processes(backend="mpi"):
     my_dataset,
     batch_size=batch_size * times,
     shuffle=False,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 10 * batch_size
4 changes: 4 additions & 0 deletions benchmarks/gems_master_model/benchmark_resnet_gems_master.py
@@ -65,6 +65,7 @@ def init_processes(backend="mpi"):
 balance = args.balance
 mp_size = args.split_size
 datapath = args.datapath
+num_workers = args.num_workers
 num_classes = args.num_classes
 
 ################## ResNet model specific parameters/functions ##################
@@ -208,6 +209,7 @@ def get_depth(version, n):
     trainset,
     batch_size=times * batch_size,
     shuffle=True,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -219,6 +221,7 @@ def get_depth(version, n):
     trainset,
     batch_size=times * batch_size,
     shuffle=False,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -235,6 +238,7 @@ def get_depth(version, n):
     my_dataset,
     batch_size=batch_size * times,
     shuffle=False,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 10 * batch_size
7 changes: 4 additions & 3 deletions benchmarks/layer_parallelism/benchmark_amoebanet_lp.py
@@ -70,6 +70,7 @@ def __getattr__(self, attr):
 mp_size = args.split_size
 times = args.times
 datapath = args.datapath
+num_workers = args.num_workers
 # APP
 # 1: Medical
 # 2: Cifar
@@ -186,7 +187,7 @@ def __getattr__(self, attr):
     trainset,
     batch_size=times * batch_size,
     shuffle=True,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -198,7 +199,7 @@ def __getattr__(self, attr):
     trainset,
     batch_size=times * batch_size,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 50000
@@ -215,7 +216,7 @@ def __getattr__(self, attr):
     my_dataset,
     batch_size=batch_size * times,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 10 * batch_size
7 changes: 4 additions & 3 deletions benchmarks/layer_parallelism/benchmark_resnet_lp.py
@@ -67,6 +67,7 @@ def __getattr__(self, attr):
 mp_size = args.split_size
 times = args.times
 datapath = args.datapath
+num_workers = args.num_workers
 # APP
 # 1: Medical
 # 2: Cifar
@@ -197,7 +198,7 @@ def get_depth(version, n):
     trainset,
     batch_size=times * batch_size,
     shuffle=True,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -209,7 +210,7 @@ def get_depth(version, n):
     trainset,
     batch_size=times * batch_size,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 50000
@@ -226,7 +227,7 @@ def get_depth(version, n):
     my_dataset,
     batch_size=batch_size * times,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 10 * batch_size
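A reasonable way to choose a value for the new flag is to bound it by the CPUs actually available to the process rather than the machine-wide count. A hedged helper along those lines (not part of this commit; sched_getaffinity is Linux-only, hence the fallback):

```python
import os

def default_num_workers(cap=8):
    # sched_getaffinity(0) honors cgroup and CPU-affinity limits,
    # which os.cpu_count() ignores.
    try:
        available = len(os.sched_getaffinity(0))
    except AttributeError:  # platforms without sched_getaffinity
        available = os.cpu_count() or 1
    return min(available, cap)
```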
19 changes: 10 additions & 9 deletions benchmarks/spatial_parallelism/benchmark_amoebanet_sp.py
@@ -86,6 +86,7 @@ def init_processes(backend="tcp"):
 spatial_size = args.spatial_size
 times = args.times
 datapath = args.datapath
+num_workers = args.num_workers
 LOCAL_DP_LP = args.local_DP
 # APP
 # 1: Medical
@@ -111,11 +112,11 @@ def isPowerTwo(num):
 
 
 """
-For Amoebanet model, image size and image size after partitioning should be power of two. 
-As, Amoebanet performs summation of results of two convolution layers during training, 
-odd input size(i.e. image size which is not power of 2) will give different output sizes 
-for convolution operations present at same layer, thus it will throw error as addition 
-operation can not be performed with diffent size outputs. 
+For Amoebanet model, image size and image size after partitioning should be power of two.
+As, Amoebanet performs summation of results of two convolution layers during training,
+odd input size(i.e. image size which is not power of 2) will give different output sizes
+for convolution operations present at same layer, thus it will throw error as addition
+operation can not be performed with diffent size outputs.
 """


@@ -152,7 +153,7 @@ def verify_config():
 ##################### AmoebaNet model specific parameters #####################
 
 """
-"image_size_seq" is required to determine the output shape after spatial partitioning of images. 
+"image_size_seq" is required to determine the output shape after spatial partitioning of images.
 The shape of the output will be determined for each model partition based on the values in "image_size_seq."
 These values will then be used to calculate the output shape for a given input size and spatial partition.
 """
@@ -470,7 +471,7 @@ def verify_config():
     trainset,
     batch_size=times * batch_size,
     shuffle=True,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = len(my_dataloader.dataset)
@@ -482,7 +483,7 @@ def verify_config():
     trainset,
     batch_size=times * batch_size,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 50000
@@ -499,7 +500,7 @@ def verify_config():
     my_dataset,
     batch_size=batch_size * times,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 10 * batch_size
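Both spatial-parallelism docstrings above lean on isPowerTwo, whose body sits outside this diff. A standard bit-twiddling implementation consistent with how the check is described (a sketch, not necessarily the repository's exact code):

```python
def isPowerTwo(num):
    # A positive power of two has exactly one set bit,
    # so clearing the lowest set bit must leave zero.
    return num > 0 and (num & (num - 1)) == 0

assert isPowerTwo(1024) and not isPowerTwo(1000)
```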
11 changes: 6 additions & 5 deletions benchmarks/spatial_parallelism/benchmark_resnet_sp.py
@@ -84,6 +84,7 @@ def init_processes(backend="mpi"):
 spatial_size = args.spatial_size
 times = args.times
 datapath = args.datapath
+num_workers = args.num_workers
 
 # APP
 # 1: Medical
@@ -106,7 +107,7 @@ def init_processes(backend="mpi"):
 ################## ResNet model specific parameters/functions ##################
 
 """
-"image_size_seq" is required to determine the output shape after spatial partitioning of images. 
+"image_size_seq" is required to determine the output shape after spatial partitioning of images.
 The shape of the output will be determined for each model partition based on the values in "image_size_seq."
 These values will then be used to calculate the output shape for a given input size and spatial partition.
 """
"""
Expand All @@ -129,7 +130,7 @@ def isPowerTwo(num):


"""
For ResNet model, image size and image size after partitioning should be power of two.
For ResNet model, image size and image size after partitioning should be power of two.
As, ResNet performs convolution operations at different layers, odd input size
(i.e. image size which is not power of 2) will lead to truncation of input. Thus,
other GPU devices will receive truncated input with unexpected input size.
Expand Down Expand Up @@ -470,7 +471,7 @@ def verify_config():
trainset,
batch_size=times * batch_size,
shuffle=True,
num_workers=0,
num_workers=num_workers,
pin_memory=True,
)
size_dataset = len(my_dataloader.dataset)
@@ -482,7 +483,7 @@ def verify_config():
     trainset,
     batch_size=times * batch_size,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 50000
@@ -499,7 +500,7 @@ def verify_config():
     my_dataset,
     batch_size=batch_size * times,
     shuffle=False,
-    num_workers=0,
+    num_workers=num_workers,
     pin_memory=True,
 )
 size_dataset = 10 * batch_size
7 changes: 7 additions & 0 deletions src/torchgems/parser.py
@@ -126,4 +126,11 @@ def get_parser():
         help="local Dataset path",
     )
 
+    parser.add_argument(
+        "--num-workers",
+        type=int,
+        default=0,
+        help="number of worker processes used by the DataLoader",
+    )
+
     return parser
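Since argparse converts hyphens to underscores, the flag surfaces as args.num_workers, which is exactly the name every benchmark above reads. A quick sanity check (assuming src/ is on PYTHONPATH so that torchgems resolves):

```python
from torchgems.parser import get_parser

args = get_parser().parse_args(["--num-workers", "4"])
assert args.num_workers == 4  # hyphenated flag, underscored attribute
```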
