From d90d19e12eda7c504e10f8a9d6e8f012eb8d04c4 Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Thu, 9 May 2024 01:00:02 +0300
Subject: [PATCH 01/18] fix train metrics errors

1. return output of whole batch, not just one item
2. make ground truth & predictions arrays take `q_samples_per_volume` into account (the whole dataset size during 1 epoch is equal to len(data) * q_samples_per_volume; so if the dataset df contains 100 records, q_samples_per_volume = 10 (the default) and batch size is 4, there would be 250 batches of 4 elements each)
3. make ground truth take into account that train_dataloader is shuffled. So now ground truth is sorted in the same order as predictions and as train_dataloader.
---
 GANDLF/compute/step.py          |  9 +++------
 GANDLF/compute/training_loop.py | 16 +++++++---------
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/GANDLF/compute/step.py b/GANDLF/compute/step.py
index c36258c47..bdd997a50 100644
--- a/GANDLF/compute/step.py
+++ b/GANDLF/compute/step.py
@@ -7,7 +7,7 @@
 def step(
     model: torch.nn.Module,
     image: torch.Tensor,
-    label: torch.Tensor,
+    label: Optional[torch.Tensor],
     params: dict,
     train: Optional[bool] = True,
 ) -> Tuple[float, dict, torch.Tensor, torch.Tensor]:
@@ -62,7 +62,7 @@ def step(
                 if len(label.shape) > 1:
                     label = torch.squeeze(label, -1)
 
-    if not (train) and params["model"]["type"].lower() == "openvino":
+    if not train and params["model"]["type"].lower() == "openvino":
         output = torch.from_numpy(
             model(inputs={params["model"]["IO"][0][0]: image.cpu().numpy()})[
                 params["model"]["IO"][1][0]
@@ -86,12 +86,9 @@ def step(
     else:
         loss, metric_output = None, None
 
-    if len(output) > 1:
-        output = output[0]
-
     if params["model"]["dimension"] == 2:
-        output = torch.unsqueeze(output, -1)
         if "medcam_enabled" in params and params["medcam_enabled"]:
             attention_map = torch.unsqueeze(attention_map, -1)
 
+    assert len(output) == len(image), f"Error: output({len(output)}) and batch({len(image)}) have different lengths. Both should be equal to batch size!"
     return loss, metric_output, output, attention_map
diff --git a/GANDLF/compute/training_loop.py b/GANDLF/compute/training_loop.py
index 61e0e6b0f..287d78aea 100644
--- a/GANDLF/compute/training_loop.py
+++ b/GANDLF/compute/training_loop.py
@@ -79,10 +79,8 @@ def train_network(
 
     # get ground truths
     if calculate_overall_metrics:
-        (
-            ground_truth_array,
-            predictions_array,
-        ) = get_ground_truths_and_predictions_tensor(params, "training_data")
+        ground_truth_array = torch.zeros(len(train_dataloader.dataset), dtype=torch.int)
+        predictions_array = torch.zeros_like(ground_truth_array)
     # Set the model to train
     model.train()
     for batch_idx, (subject) in enumerate(
@@ -117,11 +115,11 @@ def train_network(
         loss, calculated_metrics, output, _ = step(model, image, label, params)
         # store predictions for classification
         if calculate_overall_metrics:
-            predictions_array[
-                batch_idx
-                * params["batch_size"] : (batch_idx + 1)
-                * params["batch_size"]
-            ] = (torch.argmax(output[0], 0).cpu().item())
+            batch_idx_slice = slice(batch_idx * params["batch_size"], (batch_idx + 1) * params["batch_size"])
+            ground_truth_array[batch_idx_slice] = label.detach().cpu().ravel()
+            batch_predictions = torch.argmax(output, 1).cpu()
+            assert len(batch_predictions) == len(label)
+            predictions_array[batch_idx_slice] = batch_predictions
 
         nan_loss = torch.isnan(loss)
         second_order = (

From ada957795dacf1d679d02d936b0c37bb5eb117a4 Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Tue, 14 May 2024 13:50:35 +0300
Subject: [PATCH 02/18] Output_metrics is filled only for the last weighted 
 avg_type

---
 GANDLF/metrics/classification.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/GANDLF/metrics/classification.py b/GANDLF/metrics/classification.py
index ba8fa589f..ccfa507f6 100644
--- a/GANDLF/metrics/classification.py
+++ b/GANDLF/metrics/classification.py
@@ -69,15 +69,16 @@ def overall_stats(prediction: torch.Tensor, target: torch.Tensor, params: dict)
             # ),
         }
         for metric_name, calculator in calculators.items():
+            avg_typed_metric_name = f"{metric_name}_{average_type_key}"
             if metric_name == "aucroc":
                 one_hot_preds = one_hot(
                     prediction.long(), num_classes=params["model"]["num_classes"]
                 )
-                output_metrics[metric_name] = get_output_from_calculator(
+                output_metrics[avg_typed_metric_name] = get_output_from_calculator(
                     one_hot_preds.float(), target, calculator
                 )
             else:
-                output_metrics[metric_name] = get_output_from_calculator(
+                output_metrics[avg_typed_metric_name] = get_output_from_calculator(
                     prediction, target, calculator
                 )
 

From 73174c732469d08336f4deade88e654ae1747392 Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Tue, 14 May 2024 22:37:51 +0300
Subject: [PATCH 03/18] Refactored logger

To ensure values in csv are always written in the same order as header
---
 GANDLF/compute/training_loop.py | 36 ++++++++---------
 GANDLF/logger.py                | 71 ++++++++++++++++-----------------
 2 files changed, 52 insertions(+), 55 deletions(-)

diff --git a/GANDLF/compute/training_loop.py b/GANDLF/compute/training_loop.py
index 287d78aea..c5cafd869 100644
--- a/GANDLF/compute/training_loop.py
+++ b/GANDLF/compute/training_loop.py
@@ -323,44 +323,44 @@ def training_loop(
     # datetime object containing current date and time
     print("Initializing training at :", get_date_time(), flush=True)
 
-    calculate_overall_metrics = (params["problem_type"] == "classification") or (
-        params["problem_type"] == "regression"
-    )
+    metrics_log = list(params["metrics"])
 
-    # get the overall metrics that are calculated automatically for classification/regression problems
-    if params["problem_type"] == "regression":
-        overall_metrics = overall_stats(torch.Tensor([1]), torch.Tensor([1]), params)
-    elif params["problem_type"] == "classification":
-        # this is just used to generate the headers for the overall stats
-        temp_tensor = torch.randint(0, params["model"]["num_classes"], (5,))
-        overall_metrics = overall_stats(
-            temp_tensor.to(dtype=torch.int32), temp_tensor.to(dtype=torch.int32), params
-        )
+    calculate_overall_metrics = params["problem_type"] in {"classification", "regression"}
 
-    metrics_log = params["metrics"].copy()
     if calculate_overall_metrics:
+        # get the overall metrics that are calculated automatically for classification/regression problems
+        if params["problem_type"] == "regression":
+            overall_metrics = overall_stats(torch.Tensor([1]), torch.Tensor([1]), params)
+        elif params["problem_type"] == "classification":
+            # this is just used to generate the headers for the overall stats
+            temp_tensor = torch.randint(0, params["model"]["num_classes"], (5,))
+            overall_metrics = overall_stats(
+                temp_tensor.to(dtype=torch.int32), temp_tensor.to(dtype=torch.int32), params
+            )
+        else:
+            raise NotImplementedError("Problem type not implemented for overall stats")
+
         for metric in overall_metrics:
             if metric not in metrics_log:
-                metrics_log[metric] = 0
+                metrics_log.append(metric)
 
     # Setup a few loggers for tracking
     train_logger = Logger(
         logger_csv_filename=os.path.join(output_dir, "logs_training.csv"),
         metrics=metrics_log,
+        mode="train",
     )
     valid_logger = Logger(
         logger_csv_filename=os.path.join(output_dir, "logs_validation.csv"),
         metrics=metrics_log,
+        mode="valid",
     )
     if testingDataDefined:
         test_logger = Logger(
             logger_csv_filename=os.path.join(output_dir, "logs_testing.csv"),
             metrics=metrics_log,
+            mode="test",
         )
-    train_logger.write_header(mode="train")
-    valid_logger.write_header(mode="valid")
-    if testingDataDefined:
-        test_logger.write_header(mode="test")
 
     if "medcam" in params:
         model = medcam.inject(
diff --git a/GANDLF/logger.py b/GANDLF/logger.py
index bb3168583..f7e15f044 100755
--- a/GANDLF/logger.py
+++ b/GANDLF/logger.py
@@ -7,39 +7,42 @@
 """
 
 import os
-from typing import Dict
+from typing import Dict, List, Union
 import torch
 
 
 class Logger:
-    def __init__(self, logger_csv_filename: str, metrics: Dict[str, float]) -> None:
+    def __init__(self, logger_csv_filename: str, metrics: List[str], mode: str) -> None:
         """
         Logger class to log the training and validation metrics to a csv file.
+            May append to an existing file if headers match; otherwise raises an error.
 
         Args:
             logger_csv_filename (str): Path to a filename where the csv has to be stored.
             metrics (Dict[str, float]): The metrics to be logged.
         """
         self.filename = logger_csv_filename
-        self.metrics = metrics
+        mode = mode.lower()
+        self.mode = mode.lower()
 
-    def write_header(self, mode="train"):
-        self.csv = open(self.filename, "a")
-        if os.stat(self.filename).st_size == 0:
-            mode_lower = mode.lower()
-            row = "epoch_no," + mode_lower + "_loss,"
-            row += (
-                ",".join([mode_lower + "_" + metric for metric in self.metrics]) + ","
-            )
-            row = row[:-1]
-            row += "\n"
-            self.csv.write(row)
-        # else:
-        #     print("Found a pre-existing file for logging, now appending logs to that file!")
-        self.csv.close()
+        new_header = ["epoch_no", f"{mode}_loss"] + [f"{mode}_{metric}" for metric in metrics]
+
+        # TODO: do we really need to support appending to existing files?
+        if os.path.exists(self.filename):
+            with open(self.filename, "r") as f:
+                existing_header = f.readline().strip().split(",")
+            if set(existing_header) != set(new_header):
+                raise ValueError(f"Logger file {self.filename} error: existing header does not match new header."
+                                 f" Existing header: {existing_header}. New header: {new_header}")
+            self.ordered_header = existing_header
+        else:
+            with open(self.filename, "w") as f:
+                f.write(",".join(new_header) + "\n")
+            self.ordered_header = new_header
 
     def write(
-        self, epoch_number: int, loss: float, epoch_metrics: Dict[str, float]
+            self, epoch_number: int, loss: Union[float, torch.Tensor],
+            epoch_metrics: Dict[str, Union[float, torch.Tensor]]
     ) -> None:
         """
         Write the epoch number, loss and metrics to the csv file.
@@ -49,25 +52,19 @@ def write(
             loss (float): The loss value.
             epoch_metrics (Dict[str, float]): The metrics to be logged.
         """
-        self.csv = open(self.filename, "a")
-        row = ""
-        row += str(epoch_number) + ","
+
         if torch.is_tensor(loss):
-            row += str(loss.cpu().item())
-        else:
-            row += str(loss)
-        row += ","
+            loss = loss.cpu().item()
+
+        row = {"epoch_no": epoch_number,
+               f"{self.mode}_loss": loss}
 
-        for metric in epoch_metrics:
-            if torch.is_tensor(epoch_metrics[metric]):
-                row += str(epoch_metrics[metric].cpu().item())
-            else:
-                row += str(epoch_metrics[metric])
-            row += ","
-        row = row[:-1]
-        self.csv.write(row)
-        self.csv.write("\n")
-        self.csv.close()
+        for metric, metric_val in epoch_metrics.items():
+            if torch.is_tensor(metric_val):
+                metric_val = metric_val.cpu().item()
+            row[f"{self.mode}_{metric}"] = metric_val
 
-    def close(self):
-        self.csv.close()
+        with open(self.filename, "a") as f:
+            line = [row[col] for col in self.ordered_header]
+            line = [str(x) for x in line]
+            f.write(",".join(line) + "\n")

From 73037366371932bcffbc758ba26305ad1657a139 Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Tue, 14 May 2024 22:46:27 +0300
Subject: [PATCH 04/18] general refactoring & typing

---
 GANDLF/compute/forward_pass.py    | 40 ++++++-------------
 GANDLF/compute/loss_and_metric.py | 66 ++++++++++++++++---------------
 GANDLF/compute/step.py            |  4 +-
 GANDLF/compute/training_loop.py   | 65 +++++++++++++++++-------------
 GANDLF/logger.py                  | 19 +++++----
 GANDLF/metrics/__init__.py        |  4 +-
 GANDLF/metrics/classification.py  |  6 ++-
 GANDLF/metrics/regression.py      |  5 ++-
 GANDLF/utils/generic.py           |  8 ++--
 GANDLF/utils/tensor.py            |  2 +-
 10 files changed, 116 insertions(+), 103 deletions(-)

diff --git a/GANDLF/compute/forward_pass.py b/GANDLF/compute/forward_pass.py
index 9da87b8ff..e910c98d0 100644
--- a/GANDLF/compute/forward_pass.py
+++ b/GANDLF/compute/forward_pass.py
@@ -1,6 +1,6 @@
 import os
 import pathlib
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -51,12 +51,14 @@ def validate_network(
     print("*" * 20)
     # Initialize a few things
     total_epoch_valid_loss = 0
-    total_epoch_valid_metric = {}
+    total_epoch_valid_metric: dict[str, Union[float, np.array]] = {}
     average_epoch_valid_metric = {}
 
     for metric in params["metrics"]:
         if "per_label" in metric:
-            total_epoch_valid_metric[metric] = []
+            total_epoch_valid_metric[metric] = np.zeros(
+                shape=params["model"]["num_classes"]
+            )
         else:
             total_epoch_valid_metric[metric] = 0
 
@@ -64,8 +66,7 @@ def validate_network(
     subject_id_list = []
     is_classification = params.get("problem_type") == "classification"
     calculate_overall_metrics = (
-        (params["problem_type"] == "classification")
-        or (params["problem_type"] == "regression")
+        params["problem_type"] in {"classification", "regression"}
     ) and mode == "validation"
     is_inference = mode == "inference"
 
@@ -193,6 +194,7 @@ def validate_network(
 
             if params["save_output"] or is_inference:
                 # we divide by scaling factor here because we multiply by it during loss/metric calculation
+                # TODO: regression-only, right?
                 outputToWrite += (
                     str(epoch)
                     + ","
@@ -206,23 +208,14 @@ def validate_network(
             )
 
             if calculate_overall_metrics:
+                # TODO: that's for classification only. What about regression?
                 predictions_array[batch_idx] = (
                     torch.argmax(pred_output[0], 0).cpu().item()
                 )
             # # Non network validation related
             total_epoch_valid_loss += final_loss.detach().cpu().item()
-            for metric in final_metric.keys():
-                if isinstance(total_epoch_valid_metric[metric], list):
-                    if len(total_epoch_valid_metric[metric]) == 0:
-                        total_epoch_valid_metric[metric] = np.array(
-                            final_metric[metric]
-                        )
-                    else:
-                        total_epoch_valid_metric[metric] += np.array(
-                            final_metric[metric]
-                        )
-                else:
-                    total_epoch_valid_metric[metric] += final_metric[metric]
+            for metric, metric_val in final_metric.items():
+                total_epoch_valid_metric[metric] += metric_val
 
         else:  # for segmentation problems OR regression/classification when no label is present
             grid_sampler = torchio.inference.GridSampler(
@@ -386,6 +379,7 @@ def validate_network(
                 # final regression output
                 output_prediction = output_prediction / len(patch_loader)
                 if calculate_overall_metrics:
+                    # TODO: what? regression and argmax?
                     predictions_array[batch_idx] = (
                         torch.argmax(output_prediction[0], 0).cpu().item()
                     )
@@ -440,17 +434,7 @@ def validate_network(
                 # loss.cpu().data.item()
                 total_epoch_valid_loss += final_loss.cpu().item()
                 for metric in final_metric.keys():
-                    if isinstance(total_epoch_valid_metric[metric], list):
-                        if len(total_epoch_valid_metric[metric]) == 0:
-                            total_epoch_valid_metric[metric] = np.array(
-                                final_metric[metric]
-                            )
-                        else:
-                            total_epoch_valid_metric[metric] += np.array(
-                                final_metric[metric]
-                            )
-                    else:
-                        total_epoch_valid_metric[metric] += final_metric[metric]
+                    total_epoch_valid_metric[metric] += final_metric[metric]
 
         if label_ground_truth is not None:
             if params["verbose"]:
diff --git a/GANDLF/compute/loss_and_metric.py b/GANDLF/compute/loss_and_metric.py
index e149c08db..36f78560e 100644
--- a/GANDLF/compute/loss_and_metric.py
+++ b/GANDLF/compute/loss_and_metric.py
@@ -1,5 +1,6 @@
 import sys
-from typing import Dict, Tuple
+import warnings
+from typing import Dict, Tuple, Union
 from GANDLF.losses import global_losses_dict
 from GANDLF.metrics import global_metrics_dict
 import torch
@@ -13,7 +14,7 @@ def get_metric_output(
     prediction: torch.Tensor,
     target: torch.Tensor,
     params: dict,
-) -> float:
+) -> Union[float, list]:
     """
     This function computes the metric output for a given metric function, prediction and target.
 
@@ -36,6 +37,12 @@ def get_metric_output(
         if len(temp) > 1:
             return temp
         else:
+            # TODO: this branch is an extreme edge case and is buggy.
+            #  Overall, the case when a metric returns a list of length 1 is very rare. The only case is when
+            #  the metric returns an Nx.. tensor (i.e. without aggregation by elements) and batch_size==N==1. This branch
+            #  would definitely fail for metrics such as
+            #  MulticlassAccuracy(num_classes=3, multidim_average="samplewise")
+            #  Maybe the best solution is to raise an error here if the metric is configured to return samplewise results?
             return metric_output.item()
 
 
@@ -115,41 +122,38 @@ def get_loss_and_metrics(
         loss_kld = global_losses_dict["kld"](prediction[2], prediction[3])
         loss_cycle = global_losses_dict["mse"](prediction[2], prediction[4], None)
         loss = 0.01 * loss_kld + loss_reco + 10 * loss_seg + loss_cycle
+    elif deep_supervision_model:
+        # this is for models that have deep-supervision
+        for i, _ in enumerate(prediction):
+            # loss is calculated based on resampled "soft" labels using a pre-defined weights array
+            loss += (
+                loss_function(prediction[i], ground_truth_resampled[i], params)
+                * loss_weights[i]
+            )
     else:
-        if deep_supervision_model:
-            # this is for models that have deep-supervision
-            for i, _ in enumerate(prediction):
-                # loss is calculated based on resampled "soft" labels using a pre-defined weights array
-                loss += (
-                    loss_function(prediction[i], ground_truth_resampled[i], params)
-                    * loss_weights[i]
-                )
-        else:
-            loss = loss_function(prediction, target, params)
+        loss = loss_function(prediction, target, params)
     metric_output = {}
 
     # Metrics should be a list
     for metric in params["metrics"]:
         metric_lower = metric.lower()
         metric_output[metric] = 0
-        if metric_lower in global_metrics_dict:
-            metric_function = global_metrics_dict[metric_lower]
-            if sdnet_check:
-                metric_output[metric] = get_metric_output(
-                    metric_function, prediction[0], target.squeeze(-1), params
+        if metric_lower not in global_metrics_dict:
+            warnings.warn("WARNING: Could not find the requested metric '" + metric)
+            continue
+
+        metric_function = global_metrics_dict[metric_lower]
+        if sdnet_check:
+            metric_output[metric] = get_metric_output(
+                metric_function, prediction[0], target.squeeze(-1), params
+            )
+        elif deep_supervision_model:
+            for i, _ in enumerate(prediction):
+                metric_output[metric] += get_metric_output(
+                    metric_function, prediction[i], ground_truth_resampled[i], params
                 )
-            else:
-                if deep_supervision_model:
-                    for i, _ in enumerate(prediction):
-                        metric_output[metric] += get_metric_output(
-                            metric_function,
-                            prediction[i],
-                            ground_truth_resampled[i],
-                            params,
-                        )
-
-                else:
-                    metric_output[metric] = get_metric_output(
-                        metric_function, prediction, target, params
-                    )
+        else:
+            metric_output[metric] = get_metric_output(
+                metric_function, prediction, target, params
+            )
     return loss, metric_output
diff --git a/GANDLF/compute/step.py b/GANDLF/compute/step.py
index bdd997a50..151ee0872 100644
--- a/GANDLF/compute/step.py
+++ b/GANDLF/compute/step.py
@@ -90,5 +90,7 @@ def step(
         if "medcam_enabled" in params and params["medcam_enabled"]:
             attention_map = torch.unsqueeze(attention_map, -1)
 
-    assert len(output) == len(image), f"Error: output({len(output)}) and batch({len(image)}) have different lengths. Both should be equal to batch size!"
+    assert len(output) == len(
+        image
+    ), f"Error: output({len(output)}) and batch({len(image)}) have different lengths. Both should be equal to batch size!"
     return loss, metric_output, output, attention_map
diff --git a/GANDLF/compute/training_loop.py b/GANDLF/compute/training_loop.py
index c5cafd869..8f183030d 100644
--- a/GANDLF/compute/training_loop.py
+++ b/GANDLF/compute/training_loop.py
@@ -1,5 +1,5 @@
 import os, time, psutil
-from typing import Tuple
+from typing import Tuple, Union
 import pandas as pd
 import torch
 from torch.utils.data import DataLoader
@@ -22,7 +22,6 @@
     version_check,
     write_training_patches,
     print_model_summary,
-    get_ground_truths_and_predictions_tensor,
     get_model_dict,
     print_and_format_metrics,
 )
@@ -59,15 +58,26 @@ def train_network(
     print("*" * 20)
     # Initialize a few things
     total_epoch_train_loss = 0
-    total_epoch_train_metric = {}
+    total_epoch_train_metric: dict[str, Union[float, np.array]] = {}
     average_epoch_train_metric = {}
-    calculate_overall_metrics = (params["problem_type"] == "classification") or (
-        params["problem_type"] == "regression"
-    )
+    # TODO: calculate metrics for segmentation and other problems. btw what are possible problem types?
+    calculate_overall_metrics = params["problem_type"] in {
+        "classification",
+        "regression",
+    }
+
+    # get ground truths
+    if calculate_overall_metrics:
+        # TODO: for regression / segmentation we need different dtypes + different shape
+        ground_truth_array = torch.zeros(len(train_dataloader.dataset), dtype=torch.int)
+        predictions_array = torch.zeros_like(ground_truth_array)
 
     for metric in params["metrics"]:
+        # TODO: can it be per-label for non-classif?
         if "per_label" in metric:
-            total_epoch_train_metric[metric] = []
+            total_epoch_train_metric[metric] = np.zeros(
+                shape=params["model"]["num_classes"]
+            )
         else:
             total_epoch_train_metric[metric] = 0
 
@@ -77,10 +87,6 @@ def train_network(
         if params["verbose"]:
             print("Using Automatic mixed precision", flush=True)
 
-    # get ground truths
-    if calculate_overall_metrics:
-        ground_truth_array = torch.zeros(len(train_dataloader.dataset), dtype=torch.int)
-        predictions_array = torch.zeros_like(ground_truth_array)
     # Set the model to train
     model.train()
     for batch_idx, (subject) in enumerate(
@@ -115,13 +121,18 @@ def train_network(
         loss, calculated_metrics, output, _ = step(model, image, label, params)
         # store predictions for classification
         if calculate_overall_metrics:
-            batch_idx_slice = slice(batch_idx * params["batch_size"], (batch_idx + 1) * params["batch_size"])
+            batch_idx_slice = slice(
+                batch_idx * params["batch_size"], (batch_idx + 1) * params["batch_size"]
+            )
+            # TODO: label = BATCH_SIZE x 1. What if not? Multiclass? classif - OHE?
             ground_truth_array[batch_idx_slice] = label.detach().cpu().ravel()
+            # TODO: output is BATCH_SIZE x N_CLASSES. What if not?
             batch_predictions = torch.argmax(output, 1).cpu()
             assert len(batch_predictions) == len(label)
             predictions_array[batch_idx_slice] = batch_predictions
 
         nan_loss = torch.isnan(loss)
+        # loss backward
         second_order = (
             hasattr(optimizer, "is_second_order") and optimizer.is_second_order
         )
@@ -155,18 +166,8 @@ def train_network(
         # Non network training related
         if not nan_loss:
             total_epoch_train_loss += loss.detach().cpu().item()
-        for metric in calculated_metrics.keys():
-            if isinstance(total_epoch_train_metric[metric], list):
-                if len(total_epoch_train_metric[metric]) == 0:
-                    total_epoch_train_metric[metric] = np.array(
-                        calculated_metrics[metric]
-                    )
-                else:
-                    total_epoch_train_metric[metric] += np.array(
-                        calculated_metrics[metric]
-                    )
-            else:
-                total_epoch_train_metric[metric] += calculated_metrics[metric]
+        for metric, metric_val in calculated_metrics.items():
+            total_epoch_train_metric[metric] += metric_val
 
         if params["verbose"]:
             # For printing information at halftime during an epoch
@@ -194,6 +195,9 @@ def train_network(
         average_epoch_train_metric = overall_stats(
             predictions_array, ground_truth_array, params
         )
+    # TODO: the following not just prints and formats, but updates the dict also. Clean this code
+    #  1. average_epoch_train_metric and total_epoch_train_metric are combined
+    #  2. list values in total_epoch_train_metric are converted to strings by some logic (but not in avg_ep_tr_metr)
     average_epoch_train_metric = print_and_format_metrics(
         average_epoch_train_metric,
         total_epoch_train_metric,
@@ -325,17 +329,24 @@ def training_loop(
 
     metrics_log = list(params["metrics"])
 
-    calculate_overall_metrics = params["problem_type"] in {"classification", "regression"}
+    calculate_overall_metrics = params["problem_type"] in {
+        "classification",
+        "regression",
+    }
 
     if calculate_overall_metrics:
         # get the overall metrics that are calculated automatically for classification/regression problems
         if params["problem_type"] == "regression":
-            overall_metrics = overall_stats(torch.Tensor([1]), torch.Tensor([1]), params)
+            overall_metrics = overall_stats(
+                torch.Tensor([1]), torch.Tensor([1]), params
+            )
         elif params["problem_type"] == "classification":
             # this is just used to generate the headers for the overall stats
             temp_tensor = torch.randint(0, params["model"]["num_classes"], (5,))
             overall_metrics = overall_stats(
-                temp_tensor.to(dtype=torch.int32), temp_tensor.to(dtype=torch.int32), params
+                temp_tensor.to(dtype=torch.int32),
+                temp_tensor.to(dtype=torch.int32),
+                params,
             )
         else:
             raise NotImplementedError("Problem type not implemented for overall stats")
diff --git a/GANDLF/logger.py b/GANDLF/logger.py
index f7e15f044..ef98d5505 100755
--- a/GANDLF/logger.py
+++ b/GANDLF/logger.py
@@ -25,15 +25,19 @@ def __init__(self, logger_csv_filename: str, metrics: List[str], mode: str) -> N
         mode = mode.lower()
         self.mode = mode.lower()
 
-        new_header = ["epoch_no", f"{mode}_loss"] + [f"{mode}_{metric}" for metric in metrics]
+        new_header = ["epoch_no", f"{mode}_loss"] + [
+            f"{mode}_{metric}" for metric in metrics
+        ]
 
         # TODO: do we really need to support appending to existing files?
         if os.path.exists(self.filename):
             with open(self.filename, "r") as f:
                 existing_header = f.readline().strip().split(",")
             if set(existing_header) != set(new_header):
-                raise ValueError(f"Logger file {self.filename} error: existing header does not match new header."
-                                 f" Existing header: {existing_header}. New header: {new_header}")
+                raise ValueError(
+                    f"Logger file {self.filename} error: existing header does not match new header."
+                    f" Existing header: {existing_header}. New header: {new_header}"
+                )
             self.ordered_header = existing_header
         else:
             with open(self.filename, "w") as f:
@@ -41,8 +45,10 @@ def __init__(self, logger_csv_filename: str, metrics: List[str], mode: str) -> N
             self.ordered_header = new_header
 
     def write(
-            self, epoch_number: int, loss: Union[float, torch.Tensor],
-            epoch_metrics: Dict[str, Union[float, torch.Tensor]]
+        self,
+        epoch_number: int,
+        loss: Union[float, torch.Tensor],
+        epoch_metrics: Dict[str, Union[float, torch.Tensor]],
     ) -> None:
         """
         Write the epoch number, loss and metrics to the csv file.
@@ -56,8 +62,7 @@ def write(
         if torch.is_tensor(loss):
             loss = loss.cpu().item()
 
-        row = {"epoch_no": epoch_number,
-               f"{self.mode}_loss": loss}
+        row = {"epoch_no": epoch_number, f"{self.mode}_loss": loss}
 
         for metric, metric_val in epoch_metrics.items():
             if torch.is_tensor(metric_val):
diff --git a/GANDLF/metrics/__init__.py b/GANDLF/metrics/__init__.py
index b8de47cf1..1fc21b3fb 100644
--- a/GANDLF/metrics/__init__.py
+++ b/GANDLF/metrics/__init__.py
@@ -1,6 +1,8 @@
 """
 All the metrics are to be called from here
 """
+from typing import Union
+
 from GANDLF.losses.regression import MSE_loss, CEL
 from .segmentation import (
     multi_class_dice,
@@ -100,7 +102,7 @@
 ]
 
 
-def overall_stats(predictions, ground_truth, params):
+def overall_stats(predictions, ground_truth, params) -> dict[str, Union[float, list]]:
     """
     Generates a dictionary of metrics calculated on the overall predictions and ground truths.
 
diff --git a/GANDLF/metrics/classification.py b/GANDLF/metrics/classification.py
index ccfa507f6..5ef113fda 100644
--- a/GANDLF/metrics/classification.py
+++ b/GANDLF/metrics/classification.py
@@ -1,3 +1,5 @@
+from typing import Union
+
 import torch
 import torchmetrics as tm
 from torch.nn.functional import one_hot
@@ -5,7 +7,9 @@
 from GANDLF.utils.generic import determine_classification_task_type
 
 
-def overall_stats(prediction: torch.Tensor, target: torch.Tensor, params: dict) -> dict:
+def overall_stats(
+    prediction: torch.Tensor, target: torch.Tensor, params: dict
+) -> dict[str, Union[float, list]]:
     """
     Generates a dictionary of metrics calculated on the overall prediction and ground truths.
 
diff --git a/GANDLF/metrics/regression.py b/GANDLF/metrics/regression.py
index eedbde027..0d84fa2a3 100644
--- a/GANDLF/metrics/regression.py
+++ b/GANDLF/metrics/regression.py
@@ -1,6 +1,7 @@
 """
 All the metrics are to be called from here
 """
+from typing import Union
 
 import torch
 from sklearn.metrics import balanced_accuracy_score
@@ -82,7 +83,9 @@ def per_label_accuracy(
         return balanced_acc_score(prediction, target, params)
 
 
-def overall_stats(prediction: torch.Tensor, target: torch.Tensor, params: dict) -> dict:
+def overall_stats(
+    prediction: torch.Tensor, target: torch.Tensor, params: dict
+) -> dict[str, Union[float, list]]:
     """
     Generates a dictionary of metrics calculated on the overall predictions and ground truths.
 
diff --git a/GANDLF/utils/generic.py b/GANDLF/utils/generic.py
index b2fff3cc3..9b94a9cd7 100644
--- a/GANDLF/utils/generic.py
+++ b/GANDLF/utils/generic.py
@@ -231,6 +231,7 @@ def print_and_format_metrics(
 
     Args:
         cohort_level_metrics (dict): The cohort level metrics calculated from the GANDLF.metrics.overall_stats function.
+            May be empty dict if not classification/regression.
         sample_level_metrics (dict): The sample level metrics calculated from separate samples from the dataloader(s).
         metrics_dict_from_parameters (dict): The metrics dictionary to populate.
         mode (str): The mode of the metrics (train, val, test).
@@ -270,11 +271,8 @@ def __update_metric_from_list_to_single_string(input_metrics_dict: dict) -> dict
         else:
             to_print = sample_level_metrics[metric] / length_of_dataloader
         output_metrics_dict[metric] = to_print
-    for metric in output_metrics_dict.keys():
-        print(
-            "     Epoch Final   " + mode + " " + metric + " : ",
-            output_metrics_dict[metric],
-        )
+    for metric, metric_val in output_metrics_dict.items():
+        print("     Epoch Final   " + mode + " " + metric + " : ", metric_val)
     output_metrics_dict = __update_metric_from_list_to_single_string(
         output_metrics_dict
     )
diff --git a/GANDLF/utils/tensor.py b/GANDLF/utils/tensor.py
index 9dfd3cb9c..651b7b35d 100644
--- a/GANDLF/utils/tensor.py
+++ b/GANDLF/utils/tensor.py
@@ -521,7 +521,7 @@ def get_ground_truths_and_predictions_tensor(
 
 def get_output_from_calculator(
     prediction: torch.Tensor, target: torch.tensor, calculator: torchmetrics.Metric
-) -> float:
+) -> Union[float, list]:
     """
     Helper function to get the output from a calculator.
 

From 36bbfa9a720825a9b1ffc3fdcb20d03dc67fe9aa Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Wed, 15 May 2024 02:08:54 +0300
Subject: [PATCH 05/18] Fix after changing step output shape

---
 GANDLF/compute/forward_pass.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/GANDLF/compute/forward_pass.py b/GANDLF/compute/forward_pass.py
index e910c98d0..c9afefb29 100644
--- a/GANDLF/compute/forward_pass.py
+++ b/GANDLF/compute/forward_pass.py
@@ -309,7 +309,6 @@ def validate_network(
             # save outputs
             if params["problem_type"] == "segmentation":
                 output_prediction = aggregator.get_output_tensor()
-                output_prediction = output_prediction.unsqueeze(0)
                 if params["save_output"]:
                     img_for_metadata = torchio.ScalarImage(
                         tensor=subject["1"]["data"].squeeze(0),
@@ -389,7 +388,7 @@ def validate_network(
                         + ","
                         + subject["subject_id"][0]
                         + ","
-                        + str(output_prediction)
+                        + str(output_prediction[0])
                         + "\n"
                     )
 
@@ -401,7 +400,6 @@ def validate_network(
                         n.squeeze(), raw_input=image[i].squeeze(-1)
                     )
 
-            output_prediction = output_prediction.squeeze(-1)
             if is_inference and is_classification:
                 logits_list.append(output_prediction)
                 subject_id_list.append(subject.get("subject_id")[0])
@@ -412,9 +410,8 @@ def validate_network(
                 if label_ground_truth.shape[0] == 3:
                     label_ground_truth = label_ground_truth[0, ...].unsqueeze(0)
                 # we always want the ground truth to be in the same format as the prediction
+                # add batch dim
                 label_ground_truth = label_ground_truth.unsqueeze(0)
-                if label_ground_truth.shape[-1] == 1:
-                    label_ground_truth = label_ground_truth.squeeze(-1)
                 final_loss, final_metric = get_loss_and_metrics(
                     image,
                     label_ground_truth,

From 62ffb146637b8716b4e903ff126cb76d0c68f7f8 Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Tue, 21 May 2024 19:43:47 +0300
Subject: [PATCH 06/18] dynamic lists instead of fixed size for handling
 dynamic batch_size

---
 GANDLF/compute/forward_pass.py  |  6 ++----
 GANDLF/compute/training_loop.py | 15 +++++----------
 2 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/GANDLF/compute/forward_pass.py b/GANDLF/compute/forward_pass.py
index c9afefb29..90b32458a 100644
--- a/GANDLF/compute/forward_pass.py
+++ b/GANDLF/compute/forward_pass.py
@@ -108,10 +108,8 @@ def validate_network(
 
     # get ground truths for classification problem, validation set
     if calculate_overall_metrics:
-        (
-            ground_truth_array,
-            predictions_array,
-        ) = get_ground_truths_and_predictions_tensor(params, "validation_data")
+        ground_truth_array = []
+        predictions_array = []
 
     for batch_idx, (subject) in enumerate(
         tqdm(valid_dataloader, desc="Looping over " + mode + " data")
diff --git a/GANDLF/compute/training_loop.py b/GANDLF/compute/training_loop.py
index 8f183030d..4880edae1 100644
--- a/GANDLF/compute/training_loop.py
+++ b/GANDLF/compute/training_loop.py
@@ -68,9 +68,8 @@ def train_network(
 
     # get ground truths
     if calculate_overall_metrics:
-        # TODO: for regression / segmentation we need different dtypes + different shape
-        ground_truth_array = torch.zeros(len(train_dataloader.dataset), dtype=torch.int)
-        predictions_array = torch.zeros_like(ground_truth_array)
+        ground_truth_array = []
+        predictions_array = []
 
     for metric in params["metrics"]:
         # TODO: can it be per-label for non-classif?
@@ -121,15 +120,11 @@ def train_network(
         loss, calculated_metrics, output, _ = step(model, image, label, params)
         # store predictions for classification
         if calculate_overall_metrics:
-            batch_idx_slice = slice(
-                batch_idx * params["batch_size"], (batch_idx + 1) * params["batch_size"]
-            )
-            # TODO: label = BATCH_SIZE x 1. What if not? Multiclass? classif - OHE?
-            ground_truth_array[batch_idx_slice] = label.detach().cpu().ravel()
+            ground_truth_array.extend(list(label.detach().cpu()))
             # TODO: output is BATCH_SIZE x N_CLASSES. What if not?
             batch_predictions = torch.argmax(output, 1).cpu()
             assert len(batch_predictions) == len(label)
-            predictions_array[batch_idx_slice] = batch_predictions
+            predictions_array.extend(batch_predictions.tolist())
 
         nan_loss = torch.isnan(loss)
         # loss backward
@@ -193,7 +188,7 @@ def train_network(
     # get overall stats for classification
     if calculate_overall_metrics:
         average_epoch_train_metric = overall_stats(
-            predictions_array, ground_truth_array, params
+            torch.Tensor(predictions_array), torch.Tensor(ground_truth_array), params
         )
     # TODO: the following not just prints and formats, but updates the dict also. Clean this code
     #  1. average_epoch_train_metric and total_epoch_train_metric are combined

From 3987439a2a59e1ff9aad364ac0c661cd9dc6414b Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Tue, 21 May 2024 23:45:02 +0300
Subject: [PATCH 07/18] Fix for segmentation

---
 GANDLF/compute/forward_pass.py  | 13 ++++----
 GANDLF/compute/step.py          | 53 ++++++++++++++++++++-------------
 GANDLF/compute/training_loop.py | 14 +++++----
 GANDLF/logger.py                |  2 +-
 4 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/GANDLF/compute/forward_pass.py b/GANDLF/compute/forward_pass.py
index 90b32458a..539a422bd 100644
--- a/GANDLF/compute/forward_pass.py
+++ b/GANDLF/compute/forward_pass.py
@@ -206,13 +206,12 @@ def validate_network(
             )
 
             if calculate_overall_metrics:
+                ground_truth_array.append(label_ground_truth.item())
                 # TODO: that's for classification only. What about regression?
-                predictions_array[batch_idx] = (
-                    torch.argmax(pred_output[0], 0).cpu().item()
-                )
+                predictions_array.append(torch.argmax(pred_output[0], 0).cpu().item())
             # # Non network validation related
             total_epoch_valid_loss += final_loss.detach().cpu().item()
-            for metric, metric_val in final_metric.keys():
+            for metric, metric_val in final_metric.items():
                 total_epoch_valid_metric[metric] += metric_val
 
         else:  # for segmentation problems OR regression/classification when no label is present
@@ -306,7 +305,7 @@ def validate_network(
 
             # save outputs
             if params["problem_type"] == "segmentation":
-                output_prediction = aggregator.get_output_tensor()
+                output_prediction = aggregator.get_output_tensor().unsqueeze(0)
                 if params["save_output"]:
                     img_for_metadata = torchio.ScalarImage(
                         tensor=subject["1"]["data"].squeeze(0),
@@ -465,7 +464,9 @@ def validate_network(
         # get overall stats for classification
         if calculate_overall_metrics:
             average_epoch_valid_metric = overall_stats(
-                predictions_array, ground_truth_array, params
+                torch.Tensor(predictions_array),
+                torch.Tensor(ground_truth_array),
+                params,
             )
         average_epoch_valid_metric = print_and_format_metrics(
             average_epoch_valid_metric,
diff --git a/GANDLF/compute/step.py b/GANDLF/compute/step.py
index 151ee0872..141ff5890 100644
--- a/GANDLF/compute/step.py
+++ b/GANDLF/compute/step.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Optional, Tuple
 import torch
 import psutil
@@ -16,8 +17,12 @@ def step(
 
     Args:
         model (torch.nn.Module): The model to process the input image with, it should support appropriate dimensions.
-        image (torch.Tensor): The input image stack according to requirements.
+        image (torch.Tensor): The input image stack according to requirements. (B, C, H, W, D)
         label (torch.Tensor): The input label for the corresponding image tensor.
+            If segmentation, (B, C, H, W, D);
+            if classification / regression (not multilabel), (B, 1)
+            if classif / reg (multilabel), (B, N_LABELS)
+
         params (dict): The parameters dictionary.
         train (Optional[bool], optional): Whether the step is for training or validation. Defaults to True.
 
@@ -44,23 +49,19 @@ def step(
         if params["problem_type"] == "segmentation":
             if label.shape[1] == 3:
                 label = label[:, 0, ...].unsqueeze(1)
-                # this warning should only come up once
-                if params["print_rgb_label_warning"]:
-                    print(
-                        "WARNING: The label image is an RGB image, only the first channel will be used.",
-                        flush=True,
-                    )
-                    params["print_rgb_label_warning"] = False
+                warnings.warn(
+                    "The label image is an RGB image, only the first channel will be used."
+                )
 
-            if params["model"]["dimension"] == 2:
-                label = torch.squeeze(label, -1)
+    assert len(label) == len(image)
 
     if params["model"]["dimension"] == 2:
-        image = torch.squeeze(image, -1)
-        if "value_keys" in params:
-            if label is not None:
-                if len(label.shape) > 1:
-                    label = torch.squeeze(label, -1)
+        image = image.squeeze(-1)  # removing depth
+
+    # for segmentation remove the depth dimension from the label.
+    # for classification / regression, flattens class / reg label from list (possible in multilabel) to scalar
+    if label is not None:
+        label = label.squeeze(-1)
 
     if not train and params["model"]["type"].lower() == "openvino":
         output = torch.from_numpy(
@@ -69,17 +70,25 @@ def step(
             ]
         )
         output = output.to(params["device"])
-    else:
-        if params["model"]["amp"]:
-            with torch.cuda.amp.autocast():
-                output = model(image)
-        else:
+    elif params["model"]["amp"]:
+        with torch.cuda.amp.autocast():
             output = model(image)
+    else:
+        output = model(image)
 
     attention_map = None
     if "medcam_enabled" in params and params["medcam_enabled"]:
         output, attention_map = output
 
+    if not isinstance(output, torch.Tensor):
+        warnings.warn(
+            f"Model output is not a Tensor: {type(output)}. Say, `deep_resunet` and `deep_unet` may return "
+            f"list of tensors on different scales instead of just one prediction Tensor. However due to "
+            f"GaNDLF architecture it is expected that models return only one tensor. For deep_* models "
+            f"only the biggeest scale is processed. Use these models with caution till fix is implemented."
+        )
+        output = output[0]
+
     # one-hot encoding of 'label' will probably be needed for segmentation
     if label is not None:
         loss, metric_output = get_loss_and_metrics(image, label, output, params)
@@ -90,6 +99,10 @@ def step(
         if "medcam_enabled" in params and params["medcam_enabled"]:
             attention_map = torch.unsqueeze(attention_map, -1)
 
+    if params["model"]["dimension"] == 2 and params["problem_type"] == "segmentation":
+        # for 2d images where the depth is removed, add it back
+        output = torch.unsqueeze(output, -1)
+
     assert len(output) == len(
         image
     ), f"Error: output({len(output)}) and batch({len(image)}) have different lengths. Both should be equal to batch size!"
diff --git a/GANDLF/compute/training_loop.py b/GANDLF/compute/training_loop.py
index 4880edae1..7276cec36 100644
--- a/GANDLF/compute/training_loop.py
+++ b/GANDLF/compute/training_loop.py
@@ -92,21 +92,25 @@ def train_network(
         tqdm(train_dataloader, desc="Looping over training data")
     ):
         optimizer.zero_grad()
-        image = (
+        image = (  # 5D tensor: (B, C, H, W, D)
             torch.cat(
                 [subject[key][torchio.DATA] for key in params["channel_keys"]], dim=1
             )
             .float()
             .to(params["device"])
         )
-        if "value_keys" in params:
+        if (
+            "value_keys" in params
+        ):  # classification / regression (when label is scalar) or multilabel classif/regression
             label = torch.cat([subject[key] for key in params["value_keys"]], dim=0)
             # min is needed because for certain cases, batch size becomes smaller than the total remaining labels
             label = label.reshape(
                 min(params["batch_size"], len(label)), len(params["value_keys"])
             )
         else:
-            label = subject["label"][torchio.DATA]
+            label = subject["label"][
+                torchio.DATA
+            ]  # segmentation; label is (B, C, H, W, D) image
         label = label.to(params["device"])
 
         if params["save_training"]:
@@ -120,11 +124,11 @@ def train_network(
         loss, calculated_metrics, output, _ = step(model, image, label, params)
         # store predictions for classification
         if calculate_overall_metrics:
-            ground_truth_array.extend(list(label.detach().cpu()))
+            ground_truth_array.extend(label.detach().cpu())
             # TODO: output is BATCH_SIZE x N_CLASSES. What if not?
             batch_predictions = torch.argmax(output, 1).cpu()
             assert len(batch_predictions) == len(label)
-            predictions_array.extend(batch_predictions.tolist())
+            predictions_array.extend(batch_predictions.detach().cpu())
 
         nan_loss = torch.isnan(loss)
         # loss backward
diff --git a/GANDLF/logger.py b/GANDLF/logger.py
index ef98d5505..2562eb17d 100755
--- a/GANDLF/logger.py
+++ b/GANDLF/logger.py
@@ -70,6 +70,6 @@ def write(
             row[f"{self.mode}_{metric}"] = metric_val
 
         with open(self.filename, "a") as f:
-            line = [row[col] for col in self.ordered_header]
+            line = [row.get(col, "") for col in self.ordered_header]
             line = [str(x) for x in line]
             f.write(",".join(line) + "\n")

From 26b33a90380056916bd11b410c732015b9bbb224 Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Wed, 22 May 2024 01:07:53 +0300
Subject: [PATCH 08/18] a crutch for deep_* and sdnet architectures (that
 return a list)

---
 GANDLF/compute/forward_pass.py  |  2 +-
 GANDLF/compute/step.py          | 31 ++++++++++++++++---------------
 GANDLF/compute/training_loop.py |  2 ++
 3 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/GANDLF/compute/forward_pass.py b/GANDLF/compute/forward_pass.py
index 539a422bd..3cddb9f61 100644
--- a/GANDLF/compute/forward_pass.py
+++ b/GANDLF/compute/forward_pass.py
@@ -376,7 +376,7 @@ def validate_network(
                 output_prediction = output_prediction / len(patch_loader)
                 if calculate_overall_metrics:
                     # TOD: what? regression and argmax?
-                    predictions_array[batch_idx] = (
+                    predictions_array.append(
                         torch.argmax(output_prediction[0], 0).cpu().item()
                     )
                 if params["save_output"]:
diff --git a/GANDLF/compute/step.py b/GANDLF/compute/step.py
index 141ff5890..483f89f1c 100644
--- a/GANDLF/compute/step.py
+++ b/GANDLF/compute/step.py
@@ -1,5 +1,5 @@
 import warnings
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Union
 import torch
 import psutil
 from .loss_and_metric import get_loss_and_metrics
@@ -11,14 +11,14 @@ def step(
     label: Optional[torch.Tensor],
     params: dict,
     train: Optional[bool] = True,
-) -> Tuple[float, dict, torch.Tensor, torch.Tensor]:
+) -> Tuple[float, dict, Union[torch.Tensor, list[torch.Tensor]], torch.Tensor]:
     """
     This function performs a single step of training or validation.
 
     Args:
         model (torch.nn.Module): The model to process the input image with, it should support appropriate dimensions.
         image (torch.Tensor): The input image stack according to requirements. (B, C, H, W, D)
-        label (torch.Tensor): The input label for the corresponding image tensor.
+        label Optional[torch.Tensor]: The input label for the corresponding image tensor.
             If segmentation, (B, C, H, W, D);
             if classification / regression (not multilabel), (B, 1)
             if classif / reg (multilabel), (B, N_LABELS)
@@ -27,7 +27,8 @@ def step(
         train (Optional[bool], optional): Whether the step is for training or validation. Defaults to True.
 
     Returns:
-        Tuple[float, dict, torch.Tensor, torch.Tensor]: The loss, metrics, output, and attention map.
+        Tuple[float, dict, Union[torch.Tensor, list[torch.Tensor]], torch.Tensor]: The loss, metrics, output,
+            and attention map.
     """
     if params["verbose"]:
         if torch.cuda.is_available():
@@ -80,15 +81,6 @@ def step(
     if "medcam_enabled" in params and params["medcam_enabled"]:
         output, attention_map = output
 
-    if not isinstance(output, torch.Tensor):
-        warnings.warn(
-            f"Model output is not a Tensor: {type(output)}. Say, `deep_resunet` and `deep_unet` may return "
-            f"list of tensors on different scales instead of just one prediction Tensor. However due to "
-            f"GaNDLF architecture it is expected that models return only one tensor. For deep_* models "
-            f"only the biggeest scale is processed. Use these models with caution till fix is implemented."
-        )
-        output = output[0]
-
     # one-hot encoding of 'label' will probably be needed for segmentation
     if label is not None:
         loss, metric_output = get_loss_and_metrics(image, label, output, params)
@@ -99,9 +91,18 @@ def step(
         if "medcam_enabled" in params and params["medcam_enabled"]:
             attention_map = torch.unsqueeze(attention_map, -1)
 
-    if params["model"]["dimension"] == 2 and params["problem_type"] == "segmentation":
+    if not isinstance(output, torch.Tensor):
+        warnings.warn(
+            f"Model output is not a Tensor: {type(output)}. Say, `deep_resunet` and `deep_unet` may return "
+            f"list of tensors on different scales instead of just one prediction Tensor. However due to "
+            f"GaNDLF architecture it is expected that models return only one tensor. For deep_* models "
+            f"only the biggeest scale is processed. Use these models with caution till fix is implemented."
+        )
+        output = output[0]
+
+    if params["model"]["dimension"] == 2:
         # for 2d images where the depth is removed, add it back
-        output = torch.unsqueeze(output, -1)
+        output = output.unsqueeze(-1)
 
     assert len(output) == len(
         image
diff --git a/GANDLF/compute/training_loop.py b/GANDLF/compute/training_loop.py
index 7276cec36..d4517755a 100644
--- a/GANDLF/compute/training_loop.py
+++ b/GANDLF/compute/training_loop.py
@@ -124,6 +124,8 @@ def train_network(
         loss, calculated_metrics, output, _ = step(model, image, label, params)
         # store predictions for classification
         if calculate_overall_metrics:
+            # TODO: smelly code. if segmentation, in some models output may be a list of tensors rather then a one
+            #  tensor. This is not handled here. However, `calculate_overall_metrics` is set to False for segmentation
             ground_truth_array.extend(label.detach().cpu())
             # TODO: output is BATCH_SIZE x N_CLASSES. What if not?
             batch_predictions = torch.argmax(output, 1).cpu()

From 71273cee582671ca05b39c969bf07488da6397ca Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Wed, 22 May 2024 12:58:22 +0300
Subject: [PATCH 09/18] turning training dataset shuffle on

It was turned off as a workaround in https://github.com/mlcommons/GaNDLF/pull/870
---
 GANDLF/data/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GANDLF/data/__init__.py b/GANDLF/data/__init__.py
index adba1f2c6..6427ccee9 100644
--- a/GANDLF/data/__init__.py
+++ b/GANDLF/data/__init__.py
@@ -24,7 +24,7 @@ def get_train_loader(params):
             loader_type="train",
         ),
         batch_size=params["batch_size"],
-        shuffle=False,
+        shuffle=True,
         pin_memory=False,  # params["pin_memory_dataloader"], # this is going OOM if True - needs investigation
     )
 

From d30cf20172d89d103e67bae671cae55c16c6d016 Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Thu, 23 May 2024 10:55:17 +0300
Subject: [PATCH 10/18] Test fix for the case when both label and value_to_pred
 exist

fixes test_train_inference_classification_histology_large_2d (35)
---
 GANDLF/compute/forward_pass.py | 1 +
 GANDLF/compute/step.py         | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/GANDLF/compute/forward_pass.py b/GANDLF/compute/forward_pass.py
index 3cddb9f61..73d68e4b1 100644
--- a/GANDLF/compute/forward_pass.py
+++ b/GANDLF/compute/forward_pass.py
@@ -379,6 +379,7 @@ def validate_network(
                     predictions_array.append(
                         torch.argmax(output_prediction[0], 0).cpu().item()
                     )
+                    ground_truth_array.append(label_ground_truth.item())
                 if params["save_output"]:
                     outputToWrite += (
                         str(epoch)
diff --git a/GANDLF/compute/step.py b/GANDLF/compute/step.py
index 483f89f1c..f588b66ea 100644
--- a/GANDLF/compute/step.py
+++ b/GANDLF/compute/step.py
@@ -61,7 +61,9 @@ def step(
 
     # for segmentation remove the depth dimension from the label.
     # for classification / regression, flattens class / reg label from list (possible in multilabel) to scalar
-    if label is not None:
+    # TODO: second condition is crutch - in some cases label is passed as 1-d Tensor (B,) and if Batch size is 1,
+    #  it is squeezed to scalar tensor (0-d) and the future logic fails
+    if label is not None and len(label.shape) != 1:
         label = label.squeeze(-1)
 
     if not train and params["model"]["type"].lower() == "openvino":
@@ -100,7 +102,7 @@ def step(
         )
         output = output[0]
 
-    if params["model"]["dimension"] == 2:
+    if params["model"]["dimension"] == 2 and params["problem_type"] == "segmentation":
         # for 2d images where the depth is removed, add it back
         output = output.unsqueeze(-1)
 

From 92c4387cde26689531f153d7177bee734a406e35 Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Thu, 23 May 2024 13:38:22 +0300
Subject: [PATCH 11/18] bugfix when label is not present

---
 GANDLF/compute/step.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GANDLF/compute/step.py b/GANDLF/compute/step.py
index f588b66ea..148d206cf 100644
--- a/GANDLF/compute/step.py
+++ b/GANDLF/compute/step.py
@@ -54,7 +54,7 @@ def step(
                     "The label image is an RGB image, only the first channel will be used."
                 )
 
-    assert len(label) == len(image)
+        assert len(label) == len(image)
 
     if params["model"]["dimension"] == 2:
         image = image.squeeze(-1)  # removing depth

From d0d25fbbc91d7f3ae3235a0b3e80ada65d1f8787 Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Thu, 6 Jun 2024 17:15:47 +0300
Subject: [PATCH 12/18] Do not assert metric shape; let's take the first evaluated
 instead

(for one of the classes per-label metrics may not be computed, so the metric shape may differ)
---
 GANDLF/compute/training_loop.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/GANDLF/compute/training_loop.py b/GANDLF/compute/training_loop.py
index d4517755a..bbf24a98d 100644
--- a/GANDLF/compute/training_loop.py
+++ b/GANDLF/compute/training_loop.py
@@ -74,9 +74,7 @@ def train_network(
     for metric in params["metrics"]:
         # TODO: can it be per-label for non-classif?
         if "per_label" in metric:
-            total_epoch_train_metric[metric] = np.zeros(
-                shape=params["model"]["num_classes"]
-            )
+            total_epoch_train_metric[metric] = np.zeros(1)  # real shape would be defined during execution
         else:
             total_epoch_train_metric[metric] = 0
 
@@ -168,7 +166,7 @@ def train_network(
         if not nan_loss:
             total_epoch_train_loss += loss.detach().cpu().item()
         for metric, metric_val in calculated_metrics.items():
-            total_epoch_train_metric[metric] += metric_val
+            total_epoch_train_metric[metric] = total_epoch_train_metric[metric] + metric_val
 
         if params["verbose"]:
             # For printing information at halftime during an epoch

From ca8a9040f1492c704c67a94d15bb32b51ce7e25a Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Thu, 6 Jun 2024 21:49:13 +0300
Subject: [PATCH 13/18] Blacked

---
 GANDLF/compute/training_loop.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/GANDLF/compute/training_loop.py b/GANDLF/compute/training_loop.py
index bbf24a98d..32b52f188 100644
--- a/GANDLF/compute/training_loop.py
+++ b/GANDLF/compute/training_loop.py
@@ -74,7 +74,9 @@ def train_network(
     for metric in params["metrics"]:
         # TODO: can it be per-label for non-classif?
         if "per_label" in metric:
-            total_epoch_train_metric[metric] = np.zeros(1)  # real shape would be defined during execution
+            total_epoch_train_metric[metric] = np.zeros(
+                1
+            )  # real shape would be defined during execution
         else:
             total_epoch_train_metric[metric] = 0
 
@@ -166,7 +168,9 @@ def train_network(
         if not nan_loss:
             total_epoch_train_loss += loss.detach().cpu().item()
         for metric, metric_val in calculated_metrics.items():
-            total_epoch_train_metric[metric] = total_epoch_train_metric[metric] + metric_val
+            total_epoch_train_metric[metric] = (
+                total_epoch_train_metric[metric] + metric_val
+            )
 
         if params["verbose"]:
             # For printing information at halftime during an epoch

From 53eb14545961075133a17d350f92c98d5beb10c7 Mon Sep 17 00:00:00 2001
From: hongbozheng <112345628+hongbozheng@users.noreply.github.com>
Date: Tue, 9 Jul 2024 14:24:49 -0500
Subject: [PATCH 14/18] correct forward operations order

---
 GANDLF/models/seg_modules/DownsamplingModule.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GANDLF/models/seg_modules/DownsamplingModule.py b/GANDLF/models/seg_modules/DownsamplingModule.py
index ab4a54ff3..2ccf3ba41 100644
--- a/GANDLF/models/seg_modules/DownsamplingModule.py
+++ b/GANDLF/models/seg_modules/DownsamplingModule.py
@@ -57,6 +57,6 @@ def forward(self, x):
         Returns:
             torch.Tensor: The output tensor, of shape (batch_size, output_channels, height // 2, width // 2).
         """
-        x = self.act(self.in_0(self.conv0(x)))
+        x = self.conv0(self.act(self.in_0(x)))
 
         return x

From 346e7c22885cbbc8b563b3d5af6da03806f47af6 Mon Sep 17 00:00:00 2001
From: Sarthak Pati <patis@iu.edu>
Date: Tue, 9 Jul 2024 13:31:22 -0600
Subject: [PATCH 15/18] fixed pip version for CI tests

---
 .devcontainer/onCreateCommand.sh      | 2 +-
 .github/workflows/black.yml           | 2 +-
 .github/workflows/main.yml            | 2 +-
 .github/workflows/mlcube-test.yml     | 2 +-
 .github/workflows/openfl-test.yml     | 2 +-
 .github/workflows/publish-nightly.yml | 2 +-
 .github/workflows/python-test.yml     | 2 +-
 Dockerfile-CPU                        | 2 +-
 Dockerfile-CUDA11.8                   | 2 +-
 Dockerfile-CUDA12.1                   | 2 +-
 Dockerfile-ROCm                       | 4 ++--
 11 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.devcontainer/onCreateCommand.sh b/.devcontainer/onCreateCommand.sh
index baf4c7aa0..2d8463aef 100755
--- a/.devcontainer/onCreateCommand.sh
+++ b/.devcontainer/onCreateCommand.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 
 python -m ensurepip # ensures pip is installed in the current environment
-pip install --upgrade pip
+pip install --upgrade pip==24.0
 pip install wheel
 pip install openvino-dev==2023.0.1 # [OPTIONAL] to generate optimized models for inference
 pip install mlcube_docker          # [OPTIONAL] to deploy GaNDLF models as MLCube-compliant Docker containers
diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
index 64421525b..3e7fd0864 100644
--- a/.github/workflows/black.yml
+++ b/.github/workflows/black.yml
@@ -27,7 +27,7 @@ jobs:
       
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
+          python -m pip install --upgrade pip==24.0
           python -m pip install black==${{ env.BLACK_VERSION }}
       
       - name: Run tests
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 03c911647..ba5e79b7a 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -38,7 +38,7 @@ jobs:
           ${{ runner.os }}-pip-
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip==24.0
         pip install scikit-build
         pip install -e .
         pip install build
diff --git a/.github/workflows/mlcube-test.yml b/.github/workflows/mlcube-test.yml
index 064f6a053..69a94fa31 100644
--- a/.github/workflows/mlcube-test.yml
+++ b/.github/workflows/mlcube-test.yml
@@ -70,7 +70,7 @@ jobs:
       run: |
         sudo apt-get update
         sudo apt-get install libvips libvips-tools -y
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip==24.0
         python -m pip install wheel
         python -m pip install openvino-dev==2023.0.1 mlcube_docker
         pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
diff --git a/.github/workflows/openfl-test.yml b/.github/workflows/openfl-test.yml
index 6caa31e97..727a13de0 100644
--- a/.github/workflows/openfl-test.yml
+++ b/.github/workflows/openfl-test.yml
@@ -71,7 +71,7 @@ jobs:
       run: |
         sudo apt-get update
         sudo apt-get install libvips libvips-tools -y
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip==24.0
         python -m pip install wheel
         pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
         pip install -e .
diff --git a/.github/workflows/publish-nightly.yml b/.github/workflows/publish-nightly.yml
index 21185fd86..d296bcff6 100644
--- a/.github/workflows/publish-nightly.yml
+++ b/.github/workflows/publish-nightly.yml
@@ -52,7 +52,7 @@ jobs:
     - name: Install dependencies
       if: env.publish_nightly
       run: |
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip==24.0
         pip install scikit-build
         pip install -e .
         pip install build
diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml
index 6513c5109..c220b7a19 100644
--- a/.github/workflows/python-test.yml
+++ b/.github/workflows/python-test.yml
@@ -71,7 +71,7 @@ jobs:
       run: |
         sudo apt-get update
         sudo apt-get install libvips libvips-tools -y
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip==24.0
         python -m pip install wheel
         python -m pip install openvino-dev==2023.0.1 mlcube_docker
         pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
diff --git a/Dockerfile-CPU b/Dockerfile-CPU
index df066d294..1cb510679 100644
--- a/Dockerfile-CPU
+++ b/Dockerfile-CPU
@@ -7,7 +7,7 @@ LABEL version=1.0
 RUN apt-get update && apt-get install -y software-properties-common
 RUN add-apt-repository ppa:deadsnakes/ppa
 RUN apt-get update &&  apt-get install -y  python3.9 python3-pip libjpeg8-dev zlib1g-dev python3-dev libpython3.9-dev libffi-dev libgl1
-RUN python3.9 -m pip install --upgrade pip
+RUN python3.9 -m pip install --upgrade pip==24.0
 # EXPLICITLY install cpu versions of torch/torchvision (not all versions have +cpu modes on PyPI...)
 RUN python3.9 -m pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
 RUN python3.9 -m pip install openvino-dev==2023.0.1 opencv-python-headless mlcube_docker
diff --git a/Dockerfile-CUDA11.8 b/Dockerfile-CUDA11.8
index bc4817868..9d5de251f 100644
--- a/Dockerfile-CUDA11.8
+++ b/Dockerfile-CUDA11.8
@@ -11,7 +11,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y software-properties-common
 RUN add-apt-repository ppa:deadsnakes/ppa
 RUN apt-get update &&  apt-get install -y  python3.9 python3-pip libjpeg8-dev zlib1g-dev python3-dev libpython3.9-dev libffi-dev libgl1
-RUN python3.9 -m pip install --upgrade pip
+RUN python3.9 -m pip install --upgrade pip==24.0
 RUN python3.9 -m pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118
 RUN python3.9 -m pip install openvino-dev==2023.0.1 opencv-python-headless mlcube_docker
 
diff --git a/Dockerfile-CUDA12.1 b/Dockerfile-CUDA12.1
index 5a520397d..8d4bc90f8 100644
--- a/Dockerfile-CUDA12.1
+++ b/Dockerfile-CUDA12.1
@@ -11,7 +11,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y software-properties-common
 RUN add-apt-repository ppa:deadsnakes/ppa
 RUN apt-get update &&  apt-get install -y  python3.9 python3-pip libjpeg8-dev zlib1g-dev python3-dev libpython3.9-dev libffi-dev libgl1
-RUN python3.9 -m pip install --upgrade pip
+RUN python3.9 -m pip install --upgrade pip==24.0
 RUN python3.9 -m pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121
 RUN python3.9 -m pip install openvino-dev==2023.0.1 opencv-python-headless mlcube_docker
 
diff --git a/Dockerfile-ROCm b/Dockerfile-ROCm
index ee5d7dff8..8c81089fe 100644
--- a/Dockerfile-ROCm
+++ b/Dockerfile-ROCm
@@ -9,9 +9,9 @@ LABEL version=1.0
 RUN apt-get update && apt-get install -y software-properties-common
 RUN add-apt-repository ppa:deadsnakes/ppa
 RUN apt-get update &&  apt-get install -y  python3.9 python3-pip libjpeg8-dev zlib1g-dev python3-dev libpython3.9-dev libffi-dev libgl1
-RUN python3.9 -m pip install --upgrade pip
+RUN python3.9 -m pip install --upgrade pip==24.0
 RUN python3.9 -m pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/rocm5.6
-RUN python3.9 -m pip install --upgrade pip && python3.9 -m pip install openvino-dev==2023.0.1 opencv-python-headless mlcube_docker
+RUN python3.9 -m pip install --upgrade pip==24.0 && python3.9 -m pip install openvino-dev==2023.0.1 opencv-python-headless mlcube_docker
 RUN apt-get update && apt-get install -y libgl1
 
 # Do some dependency installation separately here to make layer caching more efficient

From df8627113ebf4c60f646e73a63aea0ccb81c420e Mon Sep 17 00:00:00 2001
From: Sarthak Pati <patis@iu.edu>
Date: Tue, 9 Jul 2024 15:47:33 -0400
Subject: [PATCH 16/18] removed additional space

---
 GANDLF/models/seg_modules/DownsamplingModule.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GANDLF/models/seg_modules/DownsamplingModule.py b/GANDLF/models/seg_modules/DownsamplingModule.py
index 2ccf3ba41..24c92ce77 100644
--- a/GANDLF/models/seg_modules/DownsamplingModule.py
+++ b/GANDLF/models/seg_modules/DownsamplingModule.py
@@ -49,7 +49,7 @@ def forward(self, x):
         """
         Applies a downsampling operation to the input tensor.
 
-        [input -- > in --> lrelu --> ConvDS --> output]
+        [input --> in --> lrelu --> ConvDS --> output]
 
         Args:
         x (torch.Tensor): Input tensor of shape (batch_size, channels, height, width)

From 2cb0ac094267e45a2c5c76885969b998fb828ace Mon Sep 17 00:00:00 2001
From: hongbozheng <112345628+hongbozheng@users.noreply.github.com>
Date: Tue, 9 Jul 2024 20:19:24 -0500
Subject: [PATCH 17/18] Corrected forward operations order and norm layer
 parameter

---
 GANDLF/models/seg_modules/DownsamplingModule.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GANDLF/models/seg_modules/DownsamplingModule.py b/GANDLF/models/seg_modules/DownsamplingModule.py
index 24c92ce77..10eded967 100644
--- a/GANDLF/models/seg_modules/DownsamplingModule.py
+++ b/GANDLF/models/seg_modules/DownsamplingModule.py
@@ -39,7 +39,7 @@ def __init__(
         if act_kwargs is None:
             act_kwargs = {"negative_slope": 1e-2, "inplace": True}
 
-        self.in_0 = norm(output_channels, **norm_kwargs)
+        self.in_0 = norm(input_channels, **norm_kwargs)
 
         self.conv0 = conv(input_channels, output_channels, **conv_kwargs)
 

From 6b22745d20f87fc450199c38621ec7fdcb7b2d7d Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin <vy.kukushkin@gmail.com>
Date: Thu, 18 Jul 2024 14:52:45 +0300
Subject: [PATCH 18/18] Error correction in validation and testing loops

- the same metric error was occurring in the loops in forward_pass.py -
  now it is fixed
- entire epoch completes successfully
Implemented by Szymon Mazurek szymon.mazurek@cyfronet.pl
---
 GANDLF/compute/forward_pass.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/GANDLF/compute/forward_pass.py b/GANDLF/compute/forward_pass.py
index 73d68e4b1..be2bff034 100644
--- a/GANDLF/compute/forward_pass.py
+++ b/GANDLF/compute/forward_pass.py
@@ -56,9 +56,7 @@ def validate_network(
 
     for metric in params["metrics"]:
         if "per_label" in metric:
-            total_epoch_valid_metric[metric] = np.zeros(
-                shape=params["model"]["num_classes"]
-            )
+            total_epoch_valid_metric[metric] = np.zeros(1)
         else:
             total_epoch_valid_metric[metric] = 0
 
@@ -212,7 +210,9 @@ def validate_network(
             # # Non network validation related
             total_epoch_valid_loss += final_loss.detach().cpu().item()
             for metric, metric_val in final_metric.items():
-                total_epoch_valid_metric[metric] += metric_val
+                total_epoch_valid_metric[metric] = (
+                    total_epoch_valid_metric[metric] + metric_val
+                )
 
         else:  # for segmentation problems OR regression/classification when no label is present
             grid_sampler = torchio.inference.GridSampler(
@@ -429,7 +429,9 @@ def validate_network(
                 # loss.cpu().data.item()
                 total_epoch_valid_loss += final_loss.cpu().item()
                 for metric in final_metric.keys():
-                    total_epoch_valid_metric[metric] += final_metric[metric]
+                    total_epoch_valid_metric[metric] = (
+                        total_epoch_valid_metric[metric] + final_metric[metric]
+                    )
 
         if label_ground_truth is not None:
             if params["verbose"]: