From 039c3d1f5b0e2c502a0dbc31665bc50211c599a9 Mon Sep 17 00:00:00 2001
From: rtavon <remi.tavon@nrcan-rncan.gc.ca>
Date: Thu, 19 Jan 2023 16:59:07 -0500
Subject: [PATCH 1/4] inference_segmentation.py: softcode download directory
 for checkpoint if url; default_multiclass.yaml: add checkpoint_dir param;
 default_binary.yaml: idem; model_choice.py: (1) remove load state dict path
 from define_model, (2) read_checkpoint: set default "update=False", not True;
 train_segmentation.py: adjust usage of define_model; test_models.py: idem

---
 config/inference/default_binary.yaml     |  2 ++
 config/inference/default_multiclass.yaml |  2 ++
 inference_segmentation.py                | 18 +++++++++++-------
 models/model_choice.py                   |  9 ++-------
 tests/model/test_models.py               |  6 +++---
 train_segmentation.py                    |  5 +++--
 6 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/config/inference/default_binary.yaml b/config/inference/default_binary.yaml
index 6c1efd69..90c24212 100644
--- a/config/inference/default_binary.yaml
+++ b/config/inference/default_binary.yaml
@@ -1,8 +1,10 @@
 # @package _global_
 inference:
   raw_data_csv: tests/inference/inference_segmentation_binary.csv
+  root_dir: data/inference
   input_stac_item:  # alternatively, use a path or url to stac item directly
   state_dict_path: ${general.save_weights_dir}/
+  checkpoint_dir: ${general.save_weights_dir}  # (string, optional): directory in which to save the object if url
   chunk_size:  # if empty, will be calculated automatically from max_pix_per_mb_gpu
   # Maximum number of pixels each Mb of GPU Ram to allow. E.g. if GPU has 1000 Mb of Ram and this parameter is set to
   # 10, chunk_size will be set to sqrt(1000 * 10) = 100.
diff --git a/config/inference/default_multiclass.yaml b/config/inference/default_multiclass.yaml
index c35725c6..eab05234 100644
--- a/config/inference/default_multiclass.yaml
+++ b/config/inference/default_multiclass.yaml
@@ -1,8 +1,10 @@
 # @package _global_
 inference:
   raw_data_csv: tests/inference/inference_segmentation_multiclass.csv
+  root_dir: data/inference
   input_stac_item:  # alternatively, use a path or url to stac item directly
   state_dict_path: ${general.save_weights_dir}/
+  checkpoint_dir: ${general.save_weights_dir}  # (string, optional): directory in which to save the object if url
   chunk_size:  # if empty, will be calculated automatically from max_pix_per_mb_gpu
   # Maximum number of pixels each Mb of GPU Ram to allow. E.g. if GPU has 1000 Mb of Ram and this parameter is set to
   # 10, chunk_size will be set to sqrt(1000 * 10) = 100.
diff --git a/inference_segmentation.py b/inference_segmentation.py
index 3960997f..e8ac4f28 100644
--- a/inference_segmentation.py
+++ b/inference_segmentation.py
@@ -329,13 +329,17 @@ def main(params: Union[DictConfig, dict]) -> None:
     -------
     :param params: (dict) Parameters inputted during execution.
     """
-    # SETTING OUTPUT DIRECTORY
+    # Main params
+    root = get_key_def('root_dir', params['inference'], default="inference", to_path=True)
+    root.mkdir(exist_ok=True)
     state_dict = get_key_def('state_dict_path', params['inference'], to_path=True,
                              validate_path_exists=True,
                              wildcard='*pth.tar')
+    models_dir = get_key_def('checkpoint_dir', params['inference'], default=root / 'checkpoints', to_path=True)
+    models_dir.mkdir(exist_ok=True)
 
     # Override params from checkpoint
-    checkpoint = read_checkpoint(state_dict)
+    checkpoint = read_checkpoint(state_dict, out_dir=models_dir)
     params = override_model_params_from_checkpoint(
         params=params,
         checkpoint_params=checkpoint['params']
@@ -348,9 +352,9 @@ def main(params: Union[DictConfig, dict]) -> None:
     num_classes = num_classes + 1 if num_classes > 1 else num_classes  # multiclass account for background
     num_bands = len(bands_requested)
 
-    working_folder = state_dict.parent.joinpath(f'inference_{num_bands}bands')
+    working_folder = root / Path(state_dict).name.split(".")[0]
     logging.info("\nThe state dict path directory used '{}'".format(working_folder))
-    Path.mkdir(working_folder, parents=True, exist_ok=True)
+    Path.mkdir(working_folder, exist_ok=True)
     logging.info(f'\nInferences will be saved to: {working_folder}\n\n')
     # Default input directory based on default output directory
     raw_data_csv = get_key_def('raw_data_csv', params['inference'], expected_type=str, to_path=True,
@@ -407,8 +411,8 @@ def main(params: Union[DictConfig, dict]) -> None:
         out_classes=num_classes,
         main_device=device,
         devices=[list(gpu_devices_dict.keys())],
-        state_dict_path=state_dict,
     )
+    model.load_state_dict(state_dict=checkpoint['model_state_dict'])
 
     # GET LIST OF INPUT IMAGES FOR INFERENCE
     list_aois = aois_from_csv(csv_path=raw_data_csv, bands_requested=bands_requested,
@@ -417,8 +421,8 @@ def main(params: Union[DictConfig, dict]) -> None:
     # LOOP THROUGH LIST OF INPUT IMAGES
     for aoi in tqdm(list_aois, desc='Inferring from images', position=0, leave=True):
         Path.mkdir(working_folder / aoi.raster_name.parent.name, parents=True, exist_ok=True)
-        inference_image = working_folder / aoi.raster_name.parent.name / f"{aoi.raster_name.stem}_inference.tif"
-        temp_file = working_folder / aoi.raster_name.parent.name / f"{aoi.raster_name.stem}.dat"
+        inference_image = working_folder / f"{aoi.raster_name.stem}_inference.tif"
+        temp_file = working_folder / f"{aoi.raster_name.stem}.dat"
         logging.info(f'\nReading image: {aoi.raster_name.stem}')
         inf_meta = aoi.raster.meta
 
diff --git a/models/model_choice.py b/models/model_choice.py
index 4c99e57d..b04cf486 100644
--- a/models/model_choice.py
+++ b/models/model_choice.py
@@ -28,7 +28,7 @@ def define_model_architecture(
     return instantiate(net_params, in_channels=in_channels, classes=out_classes)
 
 
-def read_checkpoint(filename, out_dir: str = 'checkpoints', update=True) -> DictConfig:
+def read_checkpoint(filename, out_dir: str = 'checkpoints', update=False) -> DictConfig:
     """
     Loads checkpoint from provided path to GDL's expected format,
     ie model's state dictionary should be under "model_state_dict" and
@@ -123,9 +123,7 @@ def define_model(
         in_channels: int,
         out_classes: int,
         main_device: str = 'cpu',
-        devices: List = [],
-        state_dict_path: str = None,
-        state_dict_strict_load: bool = True,
+        devices: List = []
 ):
     """
     Defines model's architecture with weights from provided checkpoint and pushes to device(s)
@@ -138,7 +136,4 @@ def define_model(
     )
     model = to_dp_model(model=model, devices=devices[1:]) if len(devices) > 1 else model
     model.to(main_device)
-    if state_dict_path:
-        checkpoint = read_checkpoint(state_dict_path)
-        model.load_state_dict(state_dict=checkpoint['model_state_dict'], strict=state_dict_strict_load)
     return model
diff --git a/tests/model/test_models.py b/tests/model/test_models.py
index 7b28e2ec..d3201005 100644
--- a/tests/model/test_models.py
+++ b/tests/model/test_models.py
@@ -102,12 +102,12 @@ class TestDefineModelMultigpu(object):
     if len(gpu_devices_dict.keys()) == 0:
         logging.critical(f"No GPUs available. Cannot perform multi-gpu testing.")
     else:
-        define_model(
+        model = define_model(
             net_params={'_target_': 'models.unet.UNet'},
             in_channels=4,
             out_classes=4,
             main_device=device,
             devices=list(gpu_devices_dict.keys()),
-            state_dict_path=filename,
-            state_dict_strict_load=True,
         )
+        checkpoint = read_checkpoint(filename)
+        model.load_state_dict(state_dict=checkpoint['model_state_dict'])
diff --git a/train_segmentation.py b/train_segmentation.py
index 62c898b2..429ea4c1 100644
--- a/train_segmentation.py
+++ b/train_segmentation.py
@@ -635,9 +635,10 @@ def train(cfg: DictConfig) -> None:
         out_classes=num_classes,
         main_device=device,
         devices=list(gpu_devices_dict.keys()),
-        state_dict_path=train_state_dict_path,
-        state_dict_strict_load=state_dict_strict,
     )
+    checkpoint = read_checkpoint(train_state_dict_path)
+    model.load_state_dict(state_dict=checkpoint['model_state_dict'], strict=state_dict_strict)
+
     criterion = define_loss(loss_params=cfg.loss, class_weights=class_weights)
     criterion = criterion.to(device)
     optimizer = instantiate(cfg.optimizer, params=model.parameters())

From ecf50521fbed8f5704e500efcce3467f5f703c5a Mon Sep 17 00:00:00 2001
From: rtavon <remi.tavon@nrcan-rncan.gc.ca>
Date: Mon, 23 Jan 2023 09:33:34 -0500
Subject: [PATCH 2/4] train_segmentation.py: fix bug if not fine tuning from
 pretrained weights

---
 train_segmentation.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/train_segmentation.py b/train_segmentation.py
index 429ea4c1..fef8a491 100644
--- a/train_segmentation.py
+++ b/train_segmentation.py
@@ -636,8 +636,9 @@ def train(cfg: DictConfig) -> None:
         main_device=device,
         devices=list(gpu_devices_dict.keys()),
     )
-    checkpoint = read_checkpoint(train_state_dict_path)
-    model.load_state_dict(state_dict=checkpoint['model_state_dict'], strict=state_dict_strict)
+    if train_state_dict_path:
+        checkpoint = read_checkpoint(train_state_dict_path)
+        model.load_state_dict(state_dict=checkpoint['model_state_dict'], strict=state_dict_strict)
 
     criterion = define_loss(loss_params=cfg.loss, class_weights=class_weights)
     criterion = criterion.to(device)

From 6520f608f460f57c9e89c2548c9b534179a3fd48 Mon Sep 17 00:00:00 2001
From: rtavon <remi.tavon@nrcan-rncan.gc.ca>
Date: Mon, 23 Jan 2023 11:28:41 -0500
Subject: [PATCH 3/4] model_choice.py: remove define_model();
 inference_segmentation.py: refer directly to define_model_architecture();
 train_segmentation.py: idem; test_models.py: idem

---
 inference_segmentation.py  | 23 +++++++++--------------
 models/model_choice.py     | 21 ---------------------
 tests/model/test_models.py |  9 +++++----
 train_segmentation.py      | 11 +++++++----
 4 files changed, 21 insertions(+), 43 deletions(-)

diff --git a/inference_segmentation.py b/inference_segmentation.py
index e8ac4f28..bdd31117 100644
--- a/inference_segmentation.py
+++ b/inference_segmentation.py
@@ -27,7 +27,7 @@
 from dataset.aoi import aois_from_csv
 from dataset.stacitem import SingleBandItemEO
 from utils.logger import get_logger, set_tracker
-from models.model_choice import define_model, read_checkpoint
+from models.model_choice import read_checkpoint, define_model_architecture
 from utils import augmentation
 from utils.utils import get_device_ids, get_key_def, \
     add_metadata_from_raster_to_sample, _window_2D, set_device
@@ -329,17 +329,13 @@ def main(params: Union[DictConfig, dict]) -> None:
     -------
     :param params: (dict) Parameters inputted during execution.
     """
-    # Main params
-    root = get_key_def('root_dir', params['inference'], default="inference", to_path=True)
-    root.mkdir(exist_ok=True)
+    # SETTING OUTPUT DIRECTORY
     state_dict = get_key_def('state_dict_path', params['inference'], to_path=True,
                              validate_path_exists=True,
                              wildcard='*pth.tar')
-    models_dir = get_key_def('checkpoint_dir', params['inference'], default=root / 'checkpoints', to_path=True)
-    models_dir.mkdir(exist_ok=True)
 
     # Override params from checkpoint
-    checkpoint = read_checkpoint(state_dict, out_dir=models_dir)
+    checkpoint = read_checkpoint(state_dict)
     params = override_model_params_from_checkpoint(
         params=params,
         checkpoint_params=checkpoint['params']
@@ -352,9 +348,9 @@ def main(params: Union[DictConfig, dict]) -> None:
     num_classes = num_classes + 1 if num_classes > 1 else num_classes  # multiclass account for background
     num_bands = len(bands_requested)
 
-    working_folder = root / Path(state_dict).name.split(".")[0]
+    working_folder = state_dict.parent.joinpath(f'inference_{num_bands}bands')
     logging.info("\nThe state dict path directory used '{}'".format(working_folder))
-    Path.mkdir(working_folder, exist_ok=True)
+    Path.mkdir(working_folder, parents=True, exist_ok=True)
     logging.info(f'\nInferences will be saved to: {working_folder}\n\n')
     # Default input directory based on default output directory
     raw_data_csv = get_key_def('raw_data_csv', params['inference'], expected_type=str, to_path=True,
@@ -405,13 +401,12 @@ def main(params: Union[DictConfig, dict]) -> None:
             bands_requested = [SingleBandItemEO.band_to_cname(band) for band in bands_requested]
             logging.warning(f"Will request: {bands_requested}")
 
-    model = define_model(
+    model = define_model_architecture(
         net_params=params.model,
         in_channels=num_bands,
         out_classes=num_classes,
-        main_device=device,
-        devices=[list(gpu_devices_dict.keys())],
     )
+    model.to(device)
     model.load_state_dict(state_dict=checkpoint['model_state_dict'])
 
     # GET LIST OF INPUT IMAGES FOR INFERENCE
@@ -421,8 +416,8 @@ def main(params: Union[DictConfig, dict]) -> None:
     # LOOP THROUGH LIST OF INPUT IMAGES
     for aoi in tqdm(list_aois, desc='Inferring from images', position=0, leave=True):
         Path.mkdir(working_folder / aoi.raster_name.parent.name, parents=True, exist_ok=True)
-        inference_image = working_folder / f"{aoi.raster_name.stem}_inference.tif"
-        temp_file = working_folder / f"{aoi.raster_name.stem}.dat"
+        inference_image = working_folder / aoi.raster_name.parent.name / f"{aoi.raster_name.stem}_inference.tif"
+        temp_file = working_folder / aoi.raster_name.parent.name / f"{aoi.raster_name.stem}.dat"
         logging.info(f'\nReading image: {aoi.raster_name.stem}')
         inf_meta = aoi.raster.meta
 
diff --git a/models/model_choice.py b/models/model_choice.py
index b04cf486..3bcb028f 100644
--- a/models/model_choice.py
+++ b/models/model_choice.py
@@ -116,24 +116,3 @@ def to_dp_model(model, devices: List):
                         f"Trying devices with ids {list(range(len(devices)))}")
         model = nn.DataParallel(model, device_ids=list(range(len(devices))))
     return model
-
-
-def define_model(
-        net_params: dict,
-        in_channels: int,
-        out_classes: int,
-        main_device: str = 'cpu',
-        devices: List = []
-):
-    """
-    Defines model's architecture with weights from provided checkpoint and pushes to device(s)
-    @return:
-    """
-    model = define_model_architecture(
-        net_params=net_params,
-        in_channels=in_channels,
-        out_classes=out_classes,
-    )
-    model = to_dp_model(model=model, devices=devices[1:]) if len(devices) > 1 else model
-    model.to(main_device)
-    return model
diff --git a/tests/model/test_models.py b/tests/model/test_models.py
index d3201005..fab36067 100644
--- a/tests/model/test_models.py
+++ b/tests/model/test_models.py
@@ -11,7 +11,7 @@
 
 import models.unet
 from models import unet
-from models.model_choice import read_checkpoint, adapt_checkpoint_to_dp_model, define_model, define_model_architecture
+from models.model_choice import read_checkpoint, adapt_checkpoint_to_dp_model, define_model_architecture, to_dp_model
 from utils.utils import get_device_ids, set_device
 
 
@@ -102,12 +102,13 @@ class TestDefineModelMultigpu(object):
     if len(gpu_devices_dict.keys()) == 0:
         logging.critical(f"No GPUs available. Cannot perform multi-gpu testing.")
     else:
-        model = define_model(
+        model = define_model_architecture(
             net_params={'_target_': 'models.unet.UNet'},
             in_channels=4,
             out_classes=4,
-            main_device=device,
-            devices=list(gpu_devices_dict.keys()),
         )
+        devices = list(gpu_devices_dict.keys())
+        model = to_dp_model(model=model, devices=devices[1:]) if len(devices) > 1 else model
+        model.to(device)
         checkpoint = read_checkpoint(filename)
         model.load_state_dict(state_dict=checkpoint['model_state_dict'])
diff --git a/train_segmentation.py b/train_segmentation.py
index fef8a491..9a04ecc9 100644
--- a/train_segmentation.py
+++ b/train_segmentation.py
@@ -16,7 +16,8 @@
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 
-from models.model_choice import read_checkpoint, define_model, adapt_checkpoint_to_dp_model
+from models.model_choice import read_checkpoint, adapt_checkpoint_to_dp_model, to_dp_model, \
+    define_model_architecture
 from tiling_segmentation import Tiler
 from utils import augmentation as aug
 from dataset import create_dataset
@@ -629,13 +630,15 @@ def train(cfg: DictConfig) -> None:
     device = set_device(gpu_devices_dict=gpu_devices_dict)
 
     # INSTANTIATE MODEL AND LOAD CHECKPOINT FROM PATH
-    model = define_model(
+    model = define_model_architecture(
         net_params=cfg.model,
         in_channels=num_bands,
         out_classes=num_classes,
-        main_device=device,
-        devices=list(gpu_devices_dict.keys()),
     )
+    devices = list(gpu_devices_dict.keys())
+    model = to_dp_model(model=model, devices=devices[1:]) if len(devices) > 1 else model
+    model.to(device)
+
     if train_state_dict_path:
         checkpoint = read_checkpoint(train_state_dict_path)
         model.load_state_dict(state_dict=checkpoint['model_state_dict'], strict=state_dict_strict)

From 65e515ba3fd8bddd35a68a0cf6cb76d689163d40 Mon Sep 17 00:00:00 2001
From: rtavon <remi.tavon@nrcan-rncan.gc.ca>
Date: Mon, 23 Jan 2023 11:30:03 -0500
Subject: [PATCH 4/4] default_binary.yaml: restore original config;
 default_multiclass.yaml: idem

---
 config/inference/default_binary.yaml     | 2 --
 config/inference/default_multiclass.yaml | 2 --
 2 files changed, 4 deletions(-)

diff --git a/config/inference/default_binary.yaml b/config/inference/default_binary.yaml
index 90c24212..6c1efd69 100644
--- a/config/inference/default_binary.yaml
+++ b/config/inference/default_binary.yaml
@@ -1,10 +1,8 @@
 # @package _global_
 inference:
   raw_data_csv: tests/inference/inference_segmentation_binary.csv
-  root_dir: data/inference
   input_stac_item:  # alternatively, use a path or url to stac item directly
   state_dict_path: ${general.save_weights_dir}/
-  checkpoint_dir: ${general.save_weights_dir}  # (string, optional): directory in which to save the object if url
   chunk_size:  # if empty, will be calculated automatically from max_pix_per_mb_gpu
   # Maximum number of pixels each Mb of GPU Ram to allow. E.g. if GPU has 1000 Mb of Ram and this parameter is set to
   # 10, chunk_size will be set to sqrt(1000 * 10) = 100.
diff --git a/config/inference/default_multiclass.yaml b/config/inference/default_multiclass.yaml
index eab05234..c35725c6 100644
--- a/config/inference/default_multiclass.yaml
+++ b/config/inference/default_multiclass.yaml
@@ -1,10 +1,8 @@
 # @package _global_
 inference:
   raw_data_csv: tests/inference/inference_segmentation_multiclass.csv
-  root_dir: data/inference
   input_stac_item:  # alternatively, use a path or url to stac item directly
   state_dict_path: ${general.save_weights_dir}/
-  checkpoint_dir: ${general.save_weights_dir}  # (string, optional): directory in which to save the object if url
   chunk_size:  # if empty, will be calculated automatically from max_pix_per_mb_gpu
   # Maximum number of pixels each Mb of GPU Ram to allow. E.g. if GPU has 1000 Mb of Ram and this parameter is set to
   # 10, chunk_size will be set to sqrt(1000 * 10) = 100.