From b271f3eeff289491e6155c55c17fb0b7d903d8e9 Mon Sep 17 00:00:00 2001
From: Adam Shephard
Date: Wed, 16 Apr 2025 16:35:40 +0100
Subject: [PATCH 1/5] FIX: Update for multi-GPU support in models_abc

---
 tiatoolbox/models/models_abc.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/tiatoolbox/models/models_abc.py b/tiatoolbox/models/models_abc.py
index a8a8f7262..be63c8d57 100644
--- a/tiatoolbox/models/models_abc.py
+++ b/tiatoolbox/models/models_abc.py
@@ -2,11 +2,14 @@

 from __future__ import annotations

+import os
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, Callable

 import torch
 import torch._dynamo
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel

 torch._dynamo.config.suppress_errors = True  # skipcq: PYL-W0212  # noqa: SLF001

@@ -51,12 +54,21 @@ def model_to(model: torch.nn.Module, device: str = "cpu") -> torch.nn.Module:
         The model after being moved to specified device.

     """
-    if device != "cpu":
-        # DataParallel work only for cuda
-        model = torch.nn.DataParallel(model)
-
     torch_device = torch.device(device)
-    return model.to(torch_device)
+
+    model = model.to(torch_device)
+
+    # Use DDP if multiple GPUs and not on CPU
+    if device == "cuda" and torch.cuda.device_count() > 1:
+        # This assumes a single-process DDP setup for inference
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "12355"
+        dist.init_process_group(
+            "gloo", rank=0, world_size=1
+        )  # You can use "nccl" for speed if CUDA
+        model = DistributedDataParallel(model, device_ids=[device.index])
+
+    return model


 class ModelABC(ABC, torch.nn.Module):

From cc5407a52773d03660f8a8ff20bb9a1e904be18e Mon Sep 17 00:00:00 2001
From: Adam Shephard
Date: Wed, 16 Apr 2025 18:04:13 +0000
Subject: [PATCH 2/5] UPD: Use the NCCL backend and destroy the process group
 after prediction

---
 tiatoolbox/models/engine/semantic_segmentor.py | 2 ++
 tiatoolbox/models/models_abc.py                | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tiatoolbox/models/engine/semantic_segmentor.py b/tiatoolbox/models/engine/semantic_segmentor.py
index fe0c3e02b..5dc0c0832 100644
--- a/tiatoolbox/models/engine/semantic_segmentor.py
+++ b/tiatoolbox/models/engine/semantic_segmentor.py
@@ -15,6 +15,7 @@
 import torch
 import torch.multiprocessing as torch_mp
 import torch.utils.data as torch_data
+import torch.distributed as dist
 import tqdm

 from tiatoolbox import logger, rcParam
@@ -1421,6 +1422,7 @@ def predict(  # noqa: PLR0913
                 logger.warning("Unable to remove %s", self._cache_dir)

         self._memory_cleanup()
+        dist.destroy_process_group()

         return self._outputs

diff --git a/tiatoolbox/models/models_abc.py b/tiatoolbox/models/models_abc.py
index be63c8d57..1a8982bf4 100644
--- a/tiatoolbox/models/models_abc.py
+++ b/tiatoolbox/models/models_abc.py
@@ -64,8 +64,8 @@ def model_to(model: torch.nn.Module, device: str = "cpu") -> torch.nn.Module:
         os.environ["MASTER_ADDR"] = "localhost"
         os.environ["MASTER_PORT"] = "12355"
         dist.init_process_group(
-            "gloo", rank=0, world_size=1
-        )  # You can use "nccl" for speed if CUDA
+            backend="nccl", rank=0, world_size=1
+        )
         model = DistributedDataParallel(model, device_ids=[device.index])

     return model

From e124cd81680466d91ce74dcf494aaf740861ea04 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 16 Apr 2025 18:04:57 +0000
Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tiatoolbox/models/engine/semantic_segmentor.py | 2 +-
 tiatoolbox/models/models_abc.py                | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/tiatoolbox/models/engine/semantic_segmentor.py b/tiatoolbox/models/engine/semantic_segmentor.py
index 5dc0c0832..43e10c152 100644
--- a/tiatoolbox/models/engine/semantic_segmentor.py
+++ b/tiatoolbox/models/engine/semantic_segmentor.py
@@ -13,9 +13,9 @@
 import joblib
 import numpy as np
 import torch
+import torch.distributed as dist
 import torch.multiprocessing as torch_mp
 import torch.utils.data as torch_data
-import torch.distributed as dist
 import tqdm

 from tiatoolbox import logger, rcParam

diff --git a/tiatoolbox/models/models_abc.py b/tiatoolbox/models/models_abc.py
index 1a8982bf4..4ae250531 100644
--- a/tiatoolbox/models/models_abc.py
+++ b/tiatoolbox/models/models_abc.py
@@ -63,9 +63,7 @@ def model_to(model: torch.nn.Module, device: str = "cpu") -> torch.nn.Module:
         # This assumes a single-process DDP setup for inference
         os.environ["MASTER_ADDR"] = "localhost"
         os.environ["MASTER_PORT"] = "12355"
-        dist.init_process_group(
-            backend="nccl", rank=0, world_size=1
-        )
+        dist.init_process_group(backend="nccl", rank=0, world_size=1)
         model = DistributedDataParallel(model, device_ids=[device.index])

     return model

From e7b0822c88142e26d53051648246c8335e66272b Mon Sep 17 00:00:00 2001
From: Adam Shephard
Date: Mon, 12 May 2025 18:41:18 +0100
Subject: [PATCH 4/5] FIX: Guard DDP setup and teardown behind GPU-count and
 compile-compatibility checks; add a multi-GPU feature extraction test

---
 tests/models/test_feature_extractor.py         | 45 +++++++++++++++++++
 .../models/engine/semantic_segmentor.py        |  9 +++-
 tiatoolbox/models/models_abc.py                | 25 ++++++++---
 3 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/tests/models/test_feature_extractor.py b/tests/models/test_feature_extractor.py
index cd33f0a5a..2b6122106 100644
--- a/tests/models/test_feature_extractor.py
+++ b/tests/models/test_feature_extractor.py
@@ -115,3 +115,48 @@ def test_full_inference(
     # ! else the output values will not exactly be the same (still < 1.0e-4
     # ! of epsilon though)
     assert np.mean(np.abs(features[:4] - _features)) < 1.0e-1
+
+
+@pytest.mark.skipif(
+    toolbox_env.running_on_ci() or not ON_GPU,
+    reason="Local test on machine with GPU.",
+)
+def test_multi_gpu_feature_extraction(remote_sample: Callable, tmp_path: Path) -> None:
+    """Local functionality test for feature extraction using multiple GPUs."""
+    save_dir = tmp_path / "output"
+    mini_wsi_svs = Path(remote_sample("wsi4_1k_1k_svs"))
+    shutil.rmtree(save_dir, ignore_errors=True)
+
+    # Use multiple GPUs
+    device = select_device(on_gpu=ON_GPU)
+
+    wsi_ioconfig = IOSegmentorConfig(
+        input_resolutions=[{"units": "mpp", "resolution": 0.5}],
+        patch_input_shape=[224, 224],
+        output_resolutions=[{"units": "mpp", "resolution": 0.5}],
+        patch_output_shape=[224, 224],
+        stride_shape=[224, 224],
+    )
+
+    model = TimmBackbone(backbone="UNI", pretrained=True)
+    extractor = DeepFeatureExtractor(
+        model=model,
+        auto_generate_mask=True,
+        batch_size=32,
+        num_loader_workers=4,
+        num_postproc_workers=4,
+    )
+
+    output_list = extractor.predict(
+        [mini_wsi_svs],
+        mode="wsi",
+        device=device,
+        ioconfig=wsi_ioconfig,
+        crash_on_exception=True,
+        save_dir=save_dir,
+    )
+    wsi_0_root_path = output_list[0][1]
+    positions = np.load(f"{wsi_0_root_path}.position.npy")
+    features = np.load(f"{wsi_0_root_path}.features.0.npy")
+    assert len(positions.shape) == 2
+    assert len(features.shape) == 4

diff --git a/tiatoolbox/models/engine/semantic_segmentor.py b/tiatoolbox/models/engine/semantic_segmentor.py
index 43e10c152..2befcf0b1 100644
--- a/tiatoolbox/models/engine/semantic_segmentor.py
+++ b/tiatoolbox/models/engine/semantic_segmentor.py
@@ -1422,7 +1422,14 @@ def predict(  # noqa: PLR0913
                 logger.warning("Unable to remove %s", self._cache_dir)

         self._memory_cleanup()
-        dist.destroy_process_group()
+        from tiatoolbox.models.architecture.utils import is_torch_compile_compatible
+
+        if (
+            device == "cuda"
+            and torch.cuda.device_count() > 1
+            and is_torch_compile_compatible()
+        ):
+            dist.destroy_process_group()

         return self._outputs

diff --git a/tiatoolbox/models/models_abc.py b/tiatoolbox/models/models_abc.py
index 4ae250531..cf640e96f 100644
--- a/tiatoolbox/models/models_abc.py
+++ b/tiatoolbox/models/models_abc.py
@@ -11,6 +11,8 @@
 import torch.distributed as dist
 from torch.nn.parallel import DistributedDataParallel

+from tiatoolbox.models.architecture.utils import is_torch_compile_compatible
+
 torch._dynamo.config.suppress_errors = True  # skipcq: PYL-W0212  # noqa: SLF001

 if TYPE_CHECKING:  # pragma: no cover
@@ -56,15 +58,28 @@ def model_to(model: torch.nn.Module, device: str = "cpu") -> torch.nn.Module:

     """
     torch_device = torch.device(device)
-    model = model.to(torch_device)

-    # Use DDP if multiple GPUs and not on CPU
-    if device == "cuda" and torch.cuda.device_count() > 1:
+    if (
+        device == "cuda"
+        and torch.cuda.device_count() > 1
+        and is_torch_compile_compatible()
+    ):
         # This assumes a single-process DDP setup for inference
+        model = model.to(torch_device)
         os.environ["MASTER_ADDR"] = "localhost"
         os.environ["MASTER_PORT"] = "12355"
-        dist.init_process_group(backend="nccl", rank=0, world_size=1)
-        model = DistributedDataParallel(model, device_ids=[device.index])
+        dist.init_process_group(
+            backend="nccl", rank=0, world_size=torch.cuda.device_count()
+        )
+        model = DistributedDataParallel(model, device_ids=[torch_device.index])
+
+    elif device != "cpu":
+        # DataParallel works only on CUDA devices
+        model = torch.nn.DataParallel(model)
+        model = model.to(torch_device)
+
+    else:
+        model = model.to(torch_device)

     return model

From 0615636baabfda3c9d6a4d1591acdbd7b2bb478 Mon Sep 17 00:00:00 2001
From: Adam Shephard
Date: Mon, 12 May 2025 18:42:32 +0100
Subject: [PATCH 5/5] FIX: Expect 2-D feature arrays in the multi-GPU feature
 extraction test

---
 tests/models/test_feature_extractor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_feature_extractor.py b/tests/models/test_feature_extractor.py
index 2b6122106..9ceb549be 100644
--- a/tests/models/test_feature_extractor.py
+++ b/tests/models/test_feature_extractor.py
@@ -159,4 +159,4 @@ def test_multi_gpu_feature_extraction(remote_sample: Callable, tmp_path: Path) -
     positions = np.load(f"{wsi_0_root_path}.position.npy")
     features = np.load(f"{wsi_0_root_path}.features.0.npy")
     assert len(positions.shape) == 2
-    assert len(features.shape) == 4
+    assert len(features.shape) == 2
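
Note: for reference, below is a minimal, self-contained sketch of the device-placement logic this series converges on, usable for trying the idea outside the toolbox. The helper names (move_model_to_device, teardown_process_group) are illustrative and not part of the tiatoolbox API. Two details deliberately differ from the hunks above and are assumptions of the sketch: world_size is set to 1, because torch.distributed.init_process_group expects world_size to equal the number of processes that join the group (here a single inference process, however many GPUs are visible), and an explicit GPU id is passed to DistributedDataParallel, since torch.device("cuda").index is None.

import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel


def move_model_to_device(
    model: torch.nn.Module, device: str = "cpu"
) -> torch.nn.Module:
    """Move `model` to `device`, wrapping it for multi-GPU inference (sketch)."""
    torch_device = torch.device(device)

    if device == "cuda" and torch.cuda.device_count() > 1:
        model = model.to(torch_device)
        if not dist.is_initialized():
            # rank/world_size describe processes, not GPUs: a group that a
            # single process joins must be created with world_size=1.
            os.environ.setdefault("MASTER_ADDR", "localhost")
            os.environ.setdefault("MASTER_PORT", "12355")
            dist.init_process_group(backend="nccl", rank=0, world_size=1)
        # torch.device("cuda").index is None, so pass an explicit device id.
        model = DistributedDataParallel(
            model, device_ids=[torch.cuda.current_device()]
        )
    elif device != "cpu":
        # Single-process replication across all visible GPUs.
        model = torch.nn.DataParallel(model)
        model = model.to(torch_device)
    else:
        model = model.to(torch_device)

    return model


def teardown_process_group() -> None:
    """Mirror the guarded cleanup in predict(): destroy the group if one exists."""
    if dist.is_initialized():
        dist.destroy_process_group()

A single-process DDP replica drives exactly one GPU per process; fanning one process out across every visible GPU is what the torch.nn.DataParallel fallback provides, while a true multi-process DDP run would launch one process per GPU (for example via torchrun) with matching rank and world_size.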