From 039c3d1f5b0e2c502a0dbc31665bc50211c599a9 Mon Sep 17 00:00:00 2001 From: rtavon Date: Thu, 19 Jan 2023 16:59:07 -0500 Subject: [PATCH 1/4] inference_segmentation.py: softcode download directory for checkpoint if url default_multiclass.yaml: add checkpoint_dir param default_binary.yaml: idem model_choice.py: - remove load state dict path from define_model - read_checkpoint: set default "update=False", not True train_segmentation.py: adjust usage of define_model test_models.py: idem --- config/inference/default_binary.yaml | 2 ++ config/inference/default_multiclass.yaml | 2 ++ inference_segmentation.py | 18 +++++++++++------- models/model_choice.py | 9 ++------- tests/model/test_models.py | 6 +++--- train_segmentation.py | 5 +++-- 6 files changed, 23 insertions(+), 19 deletions(-) diff --git a/config/inference/default_binary.yaml b/config/inference/default_binary.yaml index 6c1efd69..90c24212 100644 --- a/config/inference/default_binary.yaml +++ b/config/inference/default_binary.yaml @@ -1,8 +1,10 @@ # @package _global_ inference: raw_data_csv: tests/inference/inference_segmentation_binary.csv + root_dir: data/inference input_stac_item: # alternatively, use a path or url to stac item directly state_dict_path: ${general.save_weights_dir}/ + checkpoint_dir: ${general.save_weights_dir} # (string, optional): directory in which to save the object if url chunk_size: # if empty, will be calculated automatically from max_pix_per_mb_gpu # Maximum number of pixels each Mb of GPU Ram to allow. E.g. if GPU has 1000 Mb of Ram and this parameter is set to # 10, chunk_size will be set to sqrt(1000 * 10) = 100. diff --git a/config/inference/default_multiclass.yaml b/config/inference/default_multiclass.yaml index c35725c6..eab05234 100644 --- a/config/inference/default_multiclass.yaml +++ b/config/inference/default_multiclass.yaml @@ -1,8 +1,10 @@ # @package _global_ inference: raw_data_csv: tests/inference/inference_segmentation_multiclass.csv + root_dir: data/inference input_stac_item: # alternatively, use a path or url to stac item directly state_dict_path: ${general.save_weights_dir}/ + checkpoint_dir: ${general.save_weights_dir} # (string, optional): directory in which to save the object if url chunk_size: # if empty, will be calculated automatically from max_pix_per_mb_gpu # Maximum number of pixels each Mb of GPU Ram to allow. E.g. if GPU has 1000 Mb of Ram and this parameter is set to # 10, chunk_size will be set to sqrt(1000 * 10) = 100. diff --git a/inference_segmentation.py b/inference_segmentation.py index 3960997f..e8ac4f28 100644 --- a/inference_segmentation.py +++ b/inference_segmentation.py @@ -329,13 +329,17 @@ def main(params: Union[DictConfig, dict]) -> None: ------- :param params: (dict) Parameters inputted during execution. """ - # SETTING OUTPUT DIRECTORY + # Main params + root = get_key_def('root_dir', params['inference'], default="inference", to_path=True) + root.mkdir(exist_ok=True) state_dict = get_key_def('state_dict_path', params['inference'], to_path=True, validate_path_exists=True, wildcard='*pth.tar') + models_dir = get_key_def('checkpoint_dir', params['inference'], default=root / 'checkpoints', to_path=True) + models_dir.mkdir(exist_ok=True) # Override params from checkpoint - checkpoint = read_checkpoint(state_dict) + checkpoint = read_checkpoint(state_dict, out_dir=models_dir) params = override_model_params_from_checkpoint( params=params, checkpoint_params=checkpoint['params'] @@ -348,9 +352,9 @@ def main(params: Union[DictConfig, dict]) -> None: num_classes = num_classes + 1 if num_classes > 1 else num_classes # multiclass account for background num_bands = len(bands_requested) - working_folder = state_dict.parent.joinpath(f'inference_{num_bands}bands') + working_folder = root / Path(state_dict).name.split(".")[0] logging.info("\nThe state dict path directory used '{}'".format(working_folder)) - Path.mkdir(working_folder, parents=True, exist_ok=True) + Path.mkdir(working_folder, exist_ok=True) logging.info(f'\nInferences will be saved to: {working_folder}\n\n') # Default input directory based on default output directory raw_data_csv = get_key_def('raw_data_csv', params['inference'], expected_type=str, to_path=True, @@ -407,8 +411,8 @@ def main(params: Union[DictConfig, dict]) -> None: out_classes=num_classes, main_device=device, devices=[list(gpu_devices_dict.keys())], - state_dict_path=state_dict, ) + model.load_state_dict(state_dict=checkpoint['model_state_dict']) # GET LIST OF INPUT IMAGES FOR INFERENCE list_aois = aois_from_csv(csv_path=raw_data_csv, bands_requested=bands_requested, @@ -417,8 +421,8 @@ def main(params: Union[DictConfig, dict]) -> None: # LOOP THROUGH LIST OF INPUT IMAGES for aoi in tqdm(list_aois, desc='Inferring from images', position=0, leave=True): Path.mkdir(working_folder / aoi.raster_name.parent.name, parents=True, exist_ok=True) - inference_image = working_folder / aoi.raster_name.parent.name / f"{aoi.raster_name.stem}_inference.tif" - temp_file = working_folder / aoi.raster_name.parent.name / f"{aoi.raster_name.stem}.dat" + inference_image = working_folder / f"{aoi.raster_name.stem}_inference.tif" + temp_file = working_folder / f"{aoi.raster_name.stem}.dat" logging.info(f'\nReading image: {aoi.raster_name.stem}') inf_meta = aoi.raster.meta diff --git a/models/model_choice.py b/models/model_choice.py index 4c99e57d..b04cf486 100644 --- a/models/model_choice.py +++ b/models/model_choice.py @@ -28,7 +28,7 @@ def define_model_architecture( return instantiate(net_params, in_channels=in_channels, classes=out_classes) -def read_checkpoint(filename, out_dir: str = 'checkpoints', update=True) -> DictConfig: +def read_checkpoint(filename, out_dir: str = 'checkpoints', update=False) -> DictConfig: """ Loads checkpoint from provided path to GDL's expected format, ie model's state dictionary should be under "model_state_dict" and @@ -123,9 +123,7 @@ def define_model( in_channels: int, out_classes: int, main_device: str = 'cpu', - devices: List = [], - state_dict_path: str = None, - state_dict_strict_load: bool = True, + devices: List = [] ): """ Defines model's architecture with weights from provided checkpoint and pushes to device(s) @@ -138,7 +136,4 @@ def define_model( ) model = to_dp_model(model=model, devices=devices[1:]) if len(devices) > 1 else model model.to(main_device) - if state_dict_path: - checkpoint = read_checkpoint(state_dict_path) - model.load_state_dict(state_dict=checkpoint['model_state_dict'], strict=state_dict_strict_load) return model diff --git a/tests/model/test_models.py b/tests/model/test_models.py index 7b28e2ec..d3201005 100644 --- a/tests/model/test_models.py +++ b/tests/model/test_models.py @@ -102,12 +102,12 @@ class TestDefineModelMultigpu(object): if len(gpu_devices_dict.keys()) == 0: logging.critical(f"No GPUs available. Cannot perform multi-gpu testing.") else: - define_model( + model = define_model( net_params={'_target_': 'models.unet.UNet'}, in_channels=4, out_classes=4, main_device=device, devices=list(gpu_devices_dict.keys()), - state_dict_path=filename, - state_dict_strict_load=True, ) + checkpoint = read_checkpoint(filename) + model.load_state_dict(state_dict=checkpoint['model_state_dict']) diff --git a/train_segmentation.py b/train_segmentation.py index 62c898b2..429ea4c1 100644 --- a/train_segmentation.py +++ b/train_segmentation.py @@ -635,9 +635,10 @@ def train(cfg: DictConfig) -> None: out_classes=num_classes, main_device=device, devices=list(gpu_devices_dict.keys()), - state_dict_path=train_state_dict_path, - state_dict_strict_load=state_dict_strict, ) + checkpoint = read_checkpoint(train_state_dict_path) + model.load_state_dict(state_dict=checkpoint['model_state_dict'], strict=state_dict_strict) + criterion = define_loss(loss_params=cfg.loss, class_weights=class_weights) criterion = criterion.to(device) optimizer = instantiate(cfg.optimizer, params=model.parameters()) From ecf50521fbed8f5704e500efcce3467f5f703c5a Mon Sep 17 00:00:00 2001 From: rtavon Date: Mon, 23 Jan 2023 09:33:34 -0500 Subject: [PATCH 2/4] train_segmentation.py: fix bug if not fine tuning from pretrained weights --- train_segmentation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/train_segmentation.py b/train_segmentation.py index 429ea4c1..fef8a491 100644 --- a/train_segmentation.py +++ b/train_segmentation.py @@ -636,8 +636,9 @@ def train(cfg: DictConfig) -> None: main_device=device, devices=list(gpu_devices_dict.keys()), ) - checkpoint = read_checkpoint(train_state_dict_path) - model.load_state_dict(state_dict=checkpoint['model_state_dict'], strict=state_dict_strict) + if train_state_dict_path: + checkpoint = read_checkpoint(train_state_dict_path) + model.load_state_dict(state_dict=checkpoint['model_state_dict'], strict=state_dict_strict) criterion = define_loss(loss_params=cfg.loss, class_weights=class_weights) criterion = criterion.to(device) From 6520f608f460f57c9e89c2548c9b534179a3fd48 Mon Sep 17 00:00:00 2001 From: rtavon Date: Mon, 23 Jan 2023 11:28:41 -0500 Subject: [PATCH 3/4] model_choice.py: remove define_model() inference_segmentation.py: refer directly to define_model_architecture() train_segmentation.py: idem test_models.py: idem --- inference_segmentation.py | 23 +++++++++-------------- models/model_choice.py | 21 --------------------- tests/model/test_models.py | 9 +++++---- train_segmentation.py | 11 +++++++---- 4 files changed, 21 insertions(+), 43 deletions(-) diff --git a/inference_segmentation.py b/inference_segmentation.py index e8ac4f28..bdd31117 100644 --- a/inference_segmentation.py +++ b/inference_segmentation.py @@ -27,7 +27,7 @@ from dataset.aoi import aois_from_csv from dataset.stacitem import SingleBandItemEO from utils.logger import get_logger, set_tracker -from models.model_choice import define_model, read_checkpoint +from models.model_choice import read_checkpoint, define_model_architecture from utils import augmentation from utils.utils import get_device_ids, get_key_def, \ add_metadata_from_raster_to_sample, _window_2D, set_device @@ -329,17 +329,13 @@ def main(params: Union[DictConfig, dict]) -> None: ------- :param params: (dict) Parameters inputted during execution. """ - # Main params - root = get_key_def('root_dir', params['inference'], default="inference", to_path=True) - root.mkdir(exist_ok=True) + # SETTING OUTPUT DIRECTORY state_dict = get_key_def('state_dict_path', params['inference'], to_path=True, validate_path_exists=True, wildcard='*pth.tar') - models_dir = get_key_def('checkpoint_dir', params['inference'], default=root / 'checkpoints', to_path=True) - models_dir.mkdir(exist_ok=True) # Override params from checkpoint - checkpoint = read_checkpoint(state_dict, out_dir=models_dir) + checkpoint = read_checkpoint(state_dict) params = override_model_params_from_checkpoint( params=params, checkpoint_params=checkpoint['params'] @@ -352,9 +348,9 @@ def main(params: Union[DictConfig, dict]) -> None: num_classes = num_classes + 1 if num_classes > 1 else num_classes # multiclass account for background num_bands = len(bands_requested) - working_folder = root / Path(state_dict).name.split(".")[0] + working_folder = state_dict.parent.joinpath(f'inference_{num_bands}bands') logging.info("\nThe state dict path directory used '{}'".format(working_folder)) - Path.mkdir(working_folder, exist_ok=True) + Path.mkdir(working_folder, parents=True, exist_ok=True) logging.info(f'\nInferences will be saved to: {working_folder}\n\n') # Default input directory based on default output directory raw_data_csv = get_key_def('raw_data_csv', params['inference'], expected_type=str, to_path=True, @@ -405,13 +401,12 @@ def main(params: Union[DictConfig, dict]) -> None: bands_requested = [SingleBandItemEO.band_to_cname(band) for band in bands_requested] logging.warning(f"Will request: {bands_requested}") - model = define_model( + model = define_model_architecture( net_params=params.model, in_channels=num_bands, out_classes=num_classes, - main_device=device, - devices=[list(gpu_devices_dict.keys())], ) + model.to(device) model.load_state_dict(state_dict=checkpoint['model_state_dict']) # GET LIST OF INPUT IMAGES FOR INFERENCE @@ -421,8 +416,8 @@ def main(params: Union[DictConfig, dict]) -> None: # LOOP THROUGH LIST OF INPUT IMAGES for aoi in tqdm(list_aois, desc='Inferring from images', position=0, leave=True): Path.mkdir(working_folder / aoi.raster_name.parent.name, parents=True, exist_ok=True) - inference_image = working_folder / f"{aoi.raster_name.stem}_inference.tif" - temp_file = working_folder / f"{aoi.raster_name.stem}.dat" + inference_image = working_folder / aoi.raster_name.parent.name / f"{aoi.raster_name.stem}_inference.tif" + temp_file = working_folder / aoi.raster_name.parent.name / f"{aoi.raster_name.stem}.dat" logging.info(f'\nReading image: {aoi.raster_name.stem}') inf_meta = aoi.raster.meta diff --git a/models/model_choice.py b/models/model_choice.py index b04cf486..3bcb028f 100644 --- a/models/model_choice.py +++ b/models/model_choice.py @@ -116,24 +116,3 @@ def to_dp_model(model, devices: List): f"Trying devices with ids {list(range(len(devices)))}") model = nn.DataParallel(model, device_ids=list(range(len(devices)))) return model - - -def define_model( - net_params: dict, - in_channels: int, - out_classes: int, - main_device: str = 'cpu', - devices: List = [] -): - """ - Defines model's architecture with weights from provided checkpoint and pushes to device(s) - @return: - """ - model = define_model_architecture( - net_params=net_params, - in_channels=in_channels, - out_classes=out_classes, - ) - model = to_dp_model(model=model, devices=devices[1:]) if len(devices) > 1 else model - model.to(main_device) - return model diff --git a/tests/model/test_models.py b/tests/model/test_models.py index d3201005..fab36067 100644 --- a/tests/model/test_models.py +++ b/tests/model/test_models.py @@ -11,7 +11,7 @@ import models.unet from models import unet -from models.model_choice import read_checkpoint, adapt_checkpoint_to_dp_model, define_model, define_model_architecture +from models.model_choice import read_checkpoint, adapt_checkpoint_to_dp_model, define_model_architecture, to_dp_model from utils.utils import get_device_ids, set_device @@ -102,12 +102,13 @@ class TestDefineModelMultigpu(object): if len(gpu_devices_dict.keys()) == 0: logging.critical(f"No GPUs available. Cannot perform multi-gpu testing.") else: - model = define_model( + model = define_model_architecture( net_params={'_target_': 'models.unet.UNet'}, in_channels=4, out_classes=4, - main_device=device, - devices=list(gpu_devices_dict.keys()), ) + devices = list(gpu_devices_dict.keys()) + model = to_dp_model(model=model, devices=devices[1:]) if len(devices) > 1 else model + model.to(device) checkpoint = read_checkpoint(filename) model.load_state_dict(state_dict=checkpoint['model_state_dict']) diff --git a/train_segmentation.py b/train_segmentation.py index fef8a491..9a04ecc9 100644 --- a/train_segmentation.py +++ b/train_segmentation.py @@ -16,7 +16,8 @@ from torch.utils.data import DataLoader from tqdm import tqdm -from models.model_choice import read_checkpoint, define_model, adapt_checkpoint_to_dp_model +from models.model_choice import read_checkpoint, adapt_checkpoint_to_dp_model, to_dp_model, \ + define_model_architecture from tiling_segmentation import Tiler from utils import augmentation as aug from dataset import create_dataset @@ -629,13 +630,15 @@ def train(cfg: DictConfig) -> None: device = set_device(gpu_devices_dict=gpu_devices_dict) # INSTANTIATE MODEL AND LOAD CHECKPOINT FROM PATH - model = define_model( + model = define_model_architecture( net_params=cfg.model, in_channels=num_bands, out_classes=num_classes, - main_device=device, - devices=list(gpu_devices_dict.keys()), ) + devices = list(gpu_devices_dict.keys()) + model = to_dp_model(model=model, devices=devices[1:]) if len(devices) > 1 else model + model.to(device) + if train_state_dict_path: checkpoint = read_checkpoint(train_state_dict_path) model.load_state_dict(state_dict=checkpoint['model_state_dict'], strict=state_dict_strict) From 65e515ba3fd8bddd35a68a0cf6cb76d689163d40 Mon Sep 17 00:00:00 2001 From: rtavon Date: Mon, 23 Jan 2023 11:30:03 -0500 Subject: [PATCH 4/4] default_binary.yaml: restore original config default_multiclass.yaml: idem --- config/inference/default_binary.yaml | 2 -- config/inference/default_multiclass.yaml | 2 -- 2 files changed, 4 deletions(-) diff --git a/config/inference/default_binary.yaml b/config/inference/default_binary.yaml index 90c24212..6c1efd69 100644 --- a/config/inference/default_binary.yaml +++ b/config/inference/default_binary.yaml @@ -1,10 +1,8 @@ # @package _global_ inference: raw_data_csv: tests/inference/inference_segmentation_binary.csv - root_dir: data/inference input_stac_item: # alternatively, use a path or url to stac item directly state_dict_path: ${general.save_weights_dir}/ - checkpoint_dir: ${general.save_weights_dir} # (string, optional): directory in which to save the object if url chunk_size: # if empty, will be calculated automatically from max_pix_per_mb_gpu # Maximum number of pixels each Mb of GPU Ram to allow. E.g. if GPU has 1000 Mb of Ram and this parameter is set to # 10, chunk_size will be set to sqrt(1000 * 10) = 100. diff --git a/config/inference/default_multiclass.yaml b/config/inference/default_multiclass.yaml index eab05234..c35725c6 100644 --- a/config/inference/default_multiclass.yaml +++ b/config/inference/default_multiclass.yaml @@ -1,10 +1,8 @@ # @package _global_ inference: raw_data_csv: tests/inference/inference_segmentation_multiclass.csv - root_dir: data/inference input_stac_item: # alternatively, use a path or url to stac item directly state_dict_path: ${general.save_weights_dir}/ - checkpoint_dir: ${general.save_weights_dir} # (string, optional): directory in which to save the object if url chunk_size: # if empty, will be calculated automatically from max_pix_per_mb_gpu # Maximum number of pixels each Mb of GPU Ram to allow. E.g. if GPU has 1000 Mb of Ram and this parameter is set to # 10, chunk_size will be set to sqrt(1000 * 10) = 100.