Update distributed training for MetaTensor (Project-MONAI#777)

Nic-Ma · web-flow · commit b10584916aea · 2022-07-11T13:05:12.000+01:00
* [DLMED] update brats ddp

Signed-off-by: Nic Ma <nma@nvidia.com>

* [DLMED] restore dist for compatible

Signed-off-by: Nic Ma <nma@nvidia.com>

* [DLMED] update ddp examples

Signed-off-by: Nic Ma <nma@nvidia.com>

* [DLMED] update ignite workflows

Signed-off-by: Nic Ma <nma@nvidia.com>

* [DLMED] update smart cache ddp

Signed-off-by: Nic Ma <nma@nvidia.com>

* [DLMED] update horovod ddp

Signed-off-by: Nic Ma <nma@nvidia.com>
diff --git a/acceleration/distributed_training/brats_training_ddp.py b/acceleration/distributed_training/brats_training_ddp.py
@@ -57,7 +57,6 @@
 """
 
 import argparse
-import numpy as np
 import os
 import sys
 import time
@@ -89,8 +88,6 @@
     RandSpatialCropd,
     Spacingd,
     ToDeviced,
-    EnsureTyped,
-    EnsureType,
 )
 from monai.utils import set_determinism
 
@@ -110,16 +107,16 @@ def __call__(self, data):
         for key in self.keys:
             result = []
             # merge label 2 and label 3 to construct TC
-            result.append(np.logical_or(d[key] == 2, d[key] == 3))
+            result.append(torch.logical_or(d[key] == 2, d[key] == 3))
             # merge labels 1, 2 and 3 to construct WT
             result.append(
-                np.logical_or(
-                    np.logical_or(d[key] == 2, d[key] == 3), d[key] == 1
+                torch.logical_or(
+                    torch.logical_or(d[key] == 2, d[key] == 3), d[key] == 1
                 )
             )
             # label 2 is ET
             result.append(d[key] == 2)
-            d[key] = np.stack(result, axis=0).astype(np.float32)
+            d[key] = torch.stack(result, dim=0)
         return d
 
 
@@ -132,7 +129,7 @@ def __init__(
         self,
         root_dir,
         section,
-        transform=LoadImaged(["image", "label"]),
+        transform=None,
         cache_rate=1.0,
         num_workers=0,
         shuffle=False,
@@ -187,6 +184,7 @@ def main_worker(args):
         [
             # load 4 Nifti images and stack them together
             LoadImaged(keys=["image", "label"]),
+            ToDeviced(keys=["image", "label"], device=device),
             EnsureChannelFirstd(keys="image"),
             ConvertToMultiChannelBasedOnBratsClassesd(keys="label"),
             Orientationd(keys=["image", "label"], axcodes="RAS"),
@@ -195,8 +193,6 @@ def main_worker(args):
                 pixdim=(1.0, 1.0, 1.0),
                 mode=("bilinear", "nearest"),
             ),
-            EnsureTyped(keys=["image", "label"]),
-            ToDeviced(keys=["image", "label"], device=device),
             RandSpatialCropd(keys=["image", "label"], roi_size=[224, 224, 144], random_size=False),
             RandFlipd(keys=["image", "label"], prob=0.5, spatial_axis=0),
             RandFlipd(keys=["image", "label"], prob=0.5, spatial_axis=1),
@@ -223,6 +219,7 @@ def main_worker(args):
     val_transforms = Compose(
         [
             LoadImaged(keys=["image", "label"]),
+            ToDeviced(keys=["image", "label"], device=device),
             EnsureChannelFirstd(keys="image"),
             ConvertToMultiChannelBasedOnBratsClassesd(keys="label"),
             Orientationd(keys=["image", "label"], axcodes="RAS"),
@@ -232,8 +229,6 @@ def main_worker(args):
                 mode=("bilinear", "nearest"),
             ),
             NormalizeIntensityd(keys="image", nonzero=True, channel_wise=True),
-            EnsureTyped(keys=["image", "label"]),
-            ToDeviced(keys=["image", "label"], device=device),
         ]
     )
     val_ds = BratsCacheDataset(
@@ -283,7 +278,7 @@ def main_worker(args):
     dice_metric = DiceMetric(include_background=True, reduction="mean")
     dice_metric_batch = DiceMetric(include_background=True, reduction="mean_batch")
 
-    post_trans = Compose([EnsureType(), Activations(sigmoid=True), AsDiscrete(threshold=0.5)])
+    post_trans = Compose([Activations(sigmoid=True), AsDiscrete(threshold=0.5)])
 
     # start a typical PyTorch training
     best_metric = -1
diff --git a/acceleration/distributed_training/unet_evaluation_ddp.py b/acceleration/distributed_training/unet_evaluation_ddp.py
@@ -62,7 +62,7 @@
 from monai.data import DataLoader, Dataset, create_test_image_3d, DistributedSampler, decollate_batch
 from monai.inferers import sliding_window_inference
 from monai.metrics import DiceMetric
-from monai.transforms import Activations, AsChannelFirstd, AsDiscrete, Compose, LoadImaged, ScaleIntensityd, EnsureTyped, EnsureType
+from monai.transforms import Activations, AsChannelFirstd, AsDiscrete, Compose, LoadImaged, ScaleIntensityd
 
 
 def evaluate(args):
@@ -92,7 +92,6 @@ def evaluate(args):
             LoadImaged(keys=["img", "seg"]),
             AsChannelFirstd(keys=["img", "seg"], channel_dim=-1),
             ScaleIntensityd(keys="img"),
-            EnsureTyped(keys=["img", "seg"]),
         ]
     )
 
@@ -103,7 +102,7 @@ def evaluate(args):
     # sliding window inference need to input 1 image in every iteration
     val_loader = DataLoader(val_ds, batch_size=1, shuffle=False, num_workers=2, pin_memory=True, sampler=val_sampler)
     dice_metric = DiceMetric(include_background=True, reduction="mean", get_not_nans=False)
-    post_trans = Compose([EnsureType(), Activations(sigmoid=True), AsDiscrete(threshold=0.5)])
+    post_trans = Compose([Activations(sigmoid=True), AsDiscrete(threshold=0.5)])
     # create UNet, DiceLoss and Adam optimizer
     device = torch.device(f"cuda:{args.local_rank}")
     torch.cuda.set_device(device)
diff --git a/acceleration/distributed_training/unet_evaluation_horovod.py b/acceleration/distributed_training/unet_evaluation_horovod.py
@@ -35,7 +35,7 @@
     Example script to execute this program, only need to run on the master node:
     `horovodrun -np 16 -H server1:4,server2:4,server3:4,server4:4 python unet_evaluation_horovod.py -d "./testdata"`
 
-    This example was tested with [Ubuntu 16.04/20.04], [NCCL 2.6.3], [horovod 0.19.5].
+    This example was tested with [Ubuntu 16.04/20.04], [NCCL 2.6.3], [horovod 0.25.0].
 
 Referring to: https://github.com/horovod/horovod/blob/master/examples/pytorch_mnist.py
 
@@ -56,7 +56,7 @@
 from monai.data import DataLoader, Dataset, create_test_image_3d, decollate_batch
 from monai.inferers import sliding_window_inference
 from monai.metrics import DiceMetric
-from monai.transforms import Activations, AsChannelFirstd, AsDiscrete, Compose, LoadImaged, ScaleIntensityd, EnsureTyped, EnsureType
+from monai.transforms import Activations, AsChannelFirstd, AsDiscrete, Compose, LoadImaged, ScaleIntensityd, EnsureType
 
 
 def evaluate(args):
@@ -88,7 +88,6 @@ def evaluate(args):
             LoadImaged(keys=["img", "seg"]),
             AsChannelFirstd(keys=["img", "seg"], channel_dim=-1),
             ScaleIntensityd(keys="img"),
-            EnsureTyped(keys=["img", "seg"]),
         ]
     )
 
@@ -156,7 +155,7 @@ def main():
     evaluate(args=args)
 
 
-# Example script to execute this program only on the master node:
+# Example script to execute this program on 4 nodes (only need to run below command on the master node):
 # horovodrun -np 16 -H server1:4,server2:4,server3:4,server4:4 python unet_evaluation_horovod.py -d "./testdata"
 if __name__ == "__main__":
     main()
diff --git a/acceleration/distributed_training/unet_evaluation_workflows.py b/acceleration/distributed_training/unet_evaluation_workflows.py
@@ -79,7 +79,6 @@
     KeepLargestConnectedComponentd,
     LoadImaged,
     ScaleIntensityd,
-    EnsureTyped,
     SaveImaged,
 )
 
@@ -113,7 +112,6 @@ def evaluate(args):
             LoadImaged(keys=["image", "label"]),
             AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
             ScaleIntensityd(keys="image"),
-            EnsureTyped(keys=["image", "label"]),
         ]
     )
 
diff --git a/acceleration/distributed_training/unet_training_ddp.py b/acceleration/distributed_training/unet_training_ddp.py
@@ -67,7 +67,6 @@
     RandCropByPosNegLabeld,
     RandRotate90d,
     ScaleIntensityd,
-    EnsureTyped,
 )
 
 
@@ -106,7 +105,6 @@ def train(args):
                 keys=["img", "seg"], label_key="seg", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4
             ),
             RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]),
-            EnsureTyped(keys=["img", "seg"]),
         ]
     )
 
diff --git a/acceleration/distributed_training/unet_training_horovod.py b/acceleration/distributed_training/unet_training_horovod.py
@@ -39,7 +39,7 @@
     Example script to execute this program, only need to run on the master node:
     `horovodrun -np 16 -H server1:4,server2:4,server3:4,server4:4 python unet_training_horovod.py -d "./testdata"`
 
-    This example was tested with [Ubuntu 16.04/20.04], [NCCL 2.6.3], [horovod 0.19.5].
+    This example was tested with [Ubuntu 16.04/20.04], [NCCL 2.6.3], [horovod 0.25.0].
 
 Referring to: https://github.com/horovod/horovod/blob/master/examples/pytorch_mnist.py
 
@@ -66,7 +66,6 @@
     RandCropByPosNegLabeld,
     RandRotate90d,
     ScaleIntensityd,
-    EnsureTyped,
 )
 
 
@@ -106,7 +105,6 @@ def train(args):
                 keys=["img", "seg"], label_key="seg", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4
             ),
             RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]),
-            EnsureTyped(keys=["img", "seg"]),
         ]
     )
 
@@ -188,7 +186,7 @@ def main():
     train(args=args)
 
 
-# Example script to execute this program only on the master node:
+# Example script to execute this program on 4 nodes (only need to run below command on the master node):
 # horovodrun -np 16 -H server1:4,server2:4,server3:4,server4:4 python unet_training_horovod.py -d "./testdata"
 if __name__ == "__main__":
     main()
diff --git a/acceleration/distributed_training/unet_training_smartcache.py b/acceleration/distributed_training/unet_training_smartcache.py
@@ -73,7 +73,6 @@
     RandCropByPosNegLabeld,
     RandRotate90d,
     ScaleIntensityd,
-    EnsureTyped,
 )
 
 
@@ -112,7 +111,6 @@ def train(args):
                 keys=["img", "seg"], label_key="seg", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4
             ),
             RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]),
-            EnsureTyped(keys=["img", "seg"]),
         ]
     )
 
diff --git a/acceleration/distributed_training/unet_training_workflows.py b/acceleration/distributed_training/unet_training_workflows.py
@@ -81,7 +81,6 @@
     RandCropByPosNegLabeld,
     RandRotate90d,
     ScaleIntensityd,
-    EnsureTyped,
 )
 
 
@@ -118,7 +117,6 @@ def train(args):
                 keys=["image", "label"], label_key="label", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4
             ),
             RandRotate90d(keys=["image", "label"], prob=0.5, spatial_axes=[0, 2]),
-            EnsureTyped(keys=["image", "label"]),
         ]
     )
 
@@ -155,7 +153,6 @@ def train(args):
 
     train_post_transforms = Compose(
         [
-            EnsureTyped(keys="pred"),
             Activationsd(keys="pred", sigmoid=True),
             AsDiscreted(keys="pred", threshold=0.5),
             KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
@@ -198,7 +195,6 @@ def main():
     train(args=args)
 
 
-
 # python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE
 #        --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE
 #        --master_addr="192.168.1.1" --master_port=1234

Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,6 @@`
`79`	`79`	`KeepLargestConnectedComponentd,`
`80`	`80`	`LoadImaged,`
`81`	`81`	`ScaleIntensityd,`
`82`		`- EnsureTyped,`
`83`	`82`	`SaveImaged,`
`84`	`83`	`)`
`85`	`84`
`@@ -113,7 +112,6 @@ def evaluate(args):`
`113`	`112`	`LoadImaged(keys=["image", "label"]),`
`114`	`113`	`AsChannelFirstd(keys=["image", "label"], channel_dim=-1),`
`115`	`114`	`ScaleIntensityd(keys="image"),`
`116`		`- EnsureTyped(keys=["image", "label"]),`
`117`	`115`	`]`
`118`	`116`	`)`
`119`	`117`
Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,6 @@`
`67`	`67`	`RandCropByPosNegLabeld,`
`68`	`68`	`RandRotate90d,`
`69`	`69`	`ScaleIntensityd,`
`70`		`- EnsureTyped,`
`71`	`70`	`)`
`72`	`71`
`73`	`72`
`@@ -106,7 +105,6 @@ def train(args):`
`106`	`105`	`keys=["img", "seg"], label_key="seg", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4`
`107`	`106`	`),`
`108`	`107`	`RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]),`
`109`		`- EnsureTyped(keys=["img", "seg"]),`
`110`	`108`	`]`
`111`	`109`	`)`
`112`	`110`
Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,6 @@`
`73`	`73`	`RandCropByPosNegLabeld,`
`74`	`74`	`RandRotate90d,`
`75`	`75`	`ScaleIntensityd,`
`76`		`- EnsureTyped,`
`77`	`76`	`)`
`78`	`77`
`79`	`78`
`@@ -112,7 +111,6 @@ def train(args):`
`112`	`111`	`keys=["img", "seg"], label_key="seg", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4`
`113`	`112`	`),`
`114`	`113`	`RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]),`
`115`		`- EnsureTyped(keys=["img", "seg"]),`
`116`	`114`	`]`
`117`	`115`	`)`
`118`	`116`
Original file line number	Diff line number	Diff line change
`@@ -81,7 +81,6 @@`
`81`	`81`	`RandCropByPosNegLabeld,`
`82`	`82`	`RandRotate90d,`
`83`	`83`	`ScaleIntensityd,`
`84`		`- EnsureTyped,`
`85`	`84`	`)`
`86`	`85`
`87`	`86`
`@@ -118,7 +117,6 @@ def train(args):`
`118`	`117`	`keys=["image", "label"], label_key="label", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4`
`119`	`118`	`),`
`120`	`119`	`RandRotate90d(keys=["image", "label"], prob=0.5, spatial_axes=[0, 2]),`
`121`		`- EnsureTyped(keys=["image", "label"]),`
`122`	`120`	`]`
`123`	`121`	`)`
`124`	`122`
`@@ -155,7 +153,6 @@ def train(args):`
`155`	`153`
`156`	`154`	`train_post_transforms = Compose(`
`157`	`155`	`[`
`158`		`- EnsureTyped(keys="pred"),`
`159`	`156`	`Activationsd(keys="pred", sigmoid=True),`
`160`	`157`	`AsDiscreted(keys="pred", threshold=0.5),`
`161`	`158`	`KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),`
`@@ -198,7 +195,6 @@ def main():`
`198`	`195`	`train(args=args)`
`199`	`196`
`200`	`197`
`201`		`-`
`202`	`198`	`# python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_PER_NODE`
`203`	`199`	`# --nnodes=NUM_NODES --node_rank=INDEX_CURRENT_NODE`
`204`	`200`	`# --master_addr="192.168.1.1" --master_port=1234`