From b271f3eeff289491e6155c55c17fb0b7d903d8e9 Mon Sep 17 00:00:00 2001
From: Adam Shephard
Date: Wed, 16 Apr 2025 16:35:40 +0100
Subject: [PATCH 1/5] FIX: Update for multi-GPU support in models_abc

---
 tiatoolbox/models/models_abc.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/tiatoolbox/models/models_abc.py b/tiatoolbox/models/models_abc.py
index a8a8f7262..be63c8d57 100644
--- a/tiatoolbox/models/models_abc.py
+++ b/tiatoolbox/models/models_abc.py
@@ -2,11 +2,14 @@

 from __future__ import annotations

+import os
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, Callable

 import torch
 import torch._dynamo
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel

 torch._dynamo.config.suppress_errors = True  # skipcq: PYL-W0212  # noqa: SLF001

@@ -51,12 +54,21 @@ def model_to(model: torch.nn.Module, device: str = "cpu") -> torch.nn.Module:
         The model after being moved to specified device.

     """
-    if device != "cpu":
-        # DataParallel work only for cuda
-        model = torch.nn.DataParallel(model)
-
     torch_device = torch.device(device)
-    return model.to(torch_device)
+
+    model = model.to(torch_device)
+
+    # Use DDP if multiple GPUs and not on CPU
+    if device == "cuda" and torch.cuda.device_count() > 1:
+        # This assumes a single-process DDP setup for inference
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "12355"
+        dist.init_process_group(
+            "gloo", rank=0, world_size=1
+        )  # You can use "nccl" for speed if CUDA
+        model = DistributedDataParallel(model, device_ids=[device.index])
+
+    return model


 class ModelABC(ABC, torch.nn.Module):

From cc5407a52773d03660f8a8ff20bb9a1e904be18e Mon Sep 17 00:00:00 2001
From: Adam Shephard
Date: Wed, 16 Apr 2025 18:04:13 +0000
Subject: [PATCH 2/5] UPD: Use the NCCL backend and destroy the process group
 after prediction

---
 tiatoolbox/models/engine/semantic_segmentor.py | 2 ++
 tiatoolbox/models/models_abc.py                | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tiatoolbox/models/engine/semantic_segmentor.py b/tiatoolbox/models/engine/semantic_segmentor.py
index fe0c3e02b..5dc0c0832 100644
--- a/tiatoolbox/models/engine/semantic_segmentor.py
+++ b/tiatoolbox/models/engine/semantic_segmentor.py
@@ -15,6 +15,7 @@
 import torch
 import torch.multiprocessing as torch_mp
 import torch.utils.data as torch_data
+import torch.distributed as dist
 import tqdm

 from tiatoolbox import logger, rcParam
@@ -1421,6 +1422,7 @@ def predict(  # noqa: PLR0913
                 logger.warning("Unable to remove %s", self._cache_dir)

         self._memory_cleanup()
+        dist.destroy_process_group()

         return self._outputs

diff --git a/tiatoolbox/models/models_abc.py b/tiatoolbox/models/models_abc.py
index be63c8d57..1a8982bf4 100644
--- a/tiatoolbox/models/models_abc.py
+++ b/tiatoolbox/models/models_abc.py
@@ -64,8 +64,8 @@ def model_to(model: torch.nn.Module, device: str = "cpu") -> torch.nn.Module:
         os.environ["MASTER_ADDR"] = "localhost"
         os.environ["MASTER_PORT"] = "12355"
         dist.init_process_group(
-            "gloo", rank=0, world_size=1
-        )  # You can use "nccl" for speed if CUDA
+            backend="nccl", rank=0, world_size=1
+        )
         model = DistributedDataParallel(model, device_ids=[device.index])

     return model

From e124cd81680466d91ce74dcf494aaf740861ea04 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 16 Apr 2025 18:04:57 +0000
Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tiatoolbox/models/engine/semantic_segmentor.py | 2 +-
 tiatoolbox/models/models_abc.py                | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/tiatoolbox/models/engine/semantic_segmentor.py b/tiatoolbox/models/engine/semantic_segmentor.py
index 5dc0c0832..43e10c152 100644
--- a/tiatoolbox/models/engine/semantic_segmentor.py
+++ b/tiatoolbox/models/engine/semantic_segmentor.py
@@ -13,9 +13,9 @@
 import joblib
 import numpy as np
 import torch
+import torch.distributed as dist
 import torch.multiprocessing as torch_mp
 import torch.utils.data as torch_data
-import torch.distributed as dist
 import tqdm

 from tiatoolbox import logger, rcParam

diff --git a/tiatoolbox/models/models_abc.py b/tiatoolbox/models/models_abc.py
index 1a8982bf4..4ae250531 100644
--- a/tiatoolbox/models/models_abc.py
+++ b/tiatoolbox/models/models_abc.py
@@ -63,9 +63,7 @@ def model_to(model: torch.nn.Module, device: str = "cpu") -> torch.nn.Module:
         # This assumes a single-process DDP setup for inference
         os.environ["MASTER_ADDR"] = "localhost"
         os.environ["MASTER_PORT"] = "12355"
-        dist.init_process_group(
-            backend="nccl", rank=0, world_size=1
-        )
+        dist.init_process_group(backend="nccl", rank=0, world_size=1)
         model = DistributedDataParallel(model, device_ids=[device.index])

     return model

From e7b0822c88142e26d53051648246c8335e66272b Mon Sep 17 00:00:00 2001
From: Adam Shephard
Date: Mon, 12 May 2025 18:41:18 +0100
Subject: [PATCH 4/5] FIX: Guard DDP setup and teardown behind GPU-count and
 compile-compatibility checks; add a multi-GPU feature extraction test

---
 tests/models/test_feature_extractor.py         | 45 +++++++++++++++++++
 .../models/engine/semantic_segmentor.py        |  9 +++-
 tiatoolbox/models/models_abc.py                | 25 ++++++++---
 3 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/tests/models/test_feature_extractor.py b/tests/models/test_feature_extractor.py
index cd33f0a5a..2b6122106 100644
--- a/tests/models/test_feature_extractor.py
+++ b/tests/models/test_feature_extractor.py
@@ -115,3 +115,48 @@ def test_full_inference(
     # ! else the output values will not exactly be the same (still < 1.0e-4
     # ! of epsilon though)
     assert np.mean(np.abs(features[:4] - _features)) < 1.0e-1
+
+
+@pytest.mark.skipif(
+    toolbox_env.running_on_ci() or not ON_GPU,
+    reason="Local test on machine with GPU.",
+)
+def test_multi_gpu_feature_extraction(remote_sample: Callable, tmp_path: Path) -> None:
+    """Local functionality test for feature extraction using multiple GPUs."""
+    save_dir = tmp_path / "output"
+    mini_wsi_svs = Path(remote_sample("wsi4_1k_1k_svs"))
+    shutil.rmtree(save_dir, ignore_errors=True)
+
+    # Use multiple GPUs
+    device = select_device(on_gpu=ON_GPU)
+
+    wsi_ioconfig = IOSegmentorConfig(
+        input_resolutions=[{"units": "mpp", "resolution": 0.5}],
+        patch_input_shape=[224, 224],
+        output_resolutions=[{"units": "mpp", "resolution": 0.5}],
+        patch_output_shape=[224, 224],
+        stride_shape=[224, 224],
+    )
+
+    model = TimmBackbone(backbone="UNI", pretrained=True)
+    extractor = DeepFeatureExtractor(
+        model=model,
+        auto_generate_mask=True,
+        batch_size=32,
+        num_loader_workers=4,
+        num_postproc_workers=4,
+    )
+
+    output_list = extractor.predict(
+        [mini_wsi_svs],
+        mode="wsi",
+        device=device,
+        ioconfig=wsi_ioconfig,
+        crash_on_exception=True,
+        save_dir=save_dir,
+    )
+    wsi_0_root_path = output_list[0][1]
+    positions = np.load(f"{wsi_0_root_path}.position.npy")
+    features = np.load(f"{wsi_0_root_path}.features.0.npy")
+    assert len(positions.shape) == 2
+    assert len(features.shape) == 4

diff --git a/tiatoolbox/models/engine/semantic_segmentor.py b/tiatoolbox/models/engine/semantic_segmentor.py
index 43e10c152..2befcf0b1 100644
--- a/tiatoolbox/models/engine/semantic_segmentor.py
+++ b/tiatoolbox/models/engine/semantic_segmentor.py
@@ -1422,7 +1422,14 @@ def predict(  # noqa: PLR0913
                 logger.warning("Unable to remove %s", self._cache_dir)

         self._memory_cleanup()
-        dist.destroy_process_group()
+        from tiatoolbox.models.architecture.utils import is_torch_compile_compatible
+
+        if (
+            device == "cuda"
+            and torch.cuda.device_count() > 1
+            and is_torch_compile_compatible()
+        ):
+            dist.destroy_process_group()

         return self._outputs

diff --git a/tiatoolbox/models/models_abc.py b/tiatoolbox/models/models_abc.py
index 4ae250531..cf640e96f 100644
--- a/tiatoolbox/models/models_abc.py
+++ b/tiatoolbox/models/models_abc.py
@@ -11,6 +11,8 @@
 import torch.distributed as dist
 from torch.nn.parallel import DistributedDataParallel

+from tiatoolbox.models.architecture.utils import is_torch_compile_compatible
+
 torch._dynamo.config.suppress_errors = True  # skipcq: PYL-W0212  # noqa: SLF001

 if TYPE_CHECKING:  # pragma: no cover
@@ -56,15 +58,28 @@ def model_to(model: torch.nn.Module, device: str = "cpu") -> torch.nn.Module:

     """
     torch_device = torch.device(device)
-    model = model.to(torch_device)

-    # Use DDP if multiple GPUs and not on CPU
-    if device == "cuda" and torch.cuda.device_count() > 1:
+    if (
+        device == "cuda"
+        and torch.cuda.device_count() > 1
+        and is_torch_compile_compatible()
+    ):
         # This assumes a single-process DDP setup for inference
+        model = model.to(torch_device)
         os.environ["MASTER_ADDR"] = "localhost"
         os.environ["MASTER_PORT"] = "12355"
-        dist.init_process_group(backend="nccl", rank=0, world_size=1)
-        model = DistributedDataParallel(model, device_ids=[device.index])
+        dist.init_process_group(
+            backend="nccl", rank=0, world_size=torch.cuda.device_count()
+        )
+        model = DistributedDataParallel(model, device_ids=[torch_device.index])
+
+    elif device != "cpu":
+        # DataParallel works only on CUDA devices
+        model = torch.nn.DataParallel(model)
+        model = model.to(torch_device)
+
+    else:
+        model = model.to(torch_device)

     return model

From 0615636baabfda3c9d6a4d1591acdbd7b2bb478 Mon Sep 17 00:00:00 2001
From: Adam Shephard
Date: Mon, 12 May 2025 18:42:32 +0100
Subject: [PATCH 5/5] FIX: Expect 2-D feature arrays in the multi-GPU feature
 extraction test

---
 tests/models/test_feature_extractor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_feature_extractor.py b/tests/models/test_feature_extractor.py
index 2b6122106..9ceb549be 100644
--- a/tests/models/test_feature_extractor.py
+++ b/tests/models/test_feature_extractor.py
@@ -159,4 +159,4 @@ def test_multi_gpu_feature_extraction(remote_sample: Callable, tmp_path: Path) -
     positions = np.load(f"{wsi_0_root_path}.position.npy")
     features = np.load(f"{wsi_0_root_path}.features.0.npy")
     assert len(positions.shape) == 2
-    assert len(features.shape) == 4
+    assert len(features.shape) == 2
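
Note: for reference, below is a minimal, self-contained sketch of the device-placement logic this series converges on, usable for trying the idea outside the toolbox. The helper names (move_model_to_device, teardown_process_group) are illustrative and not part of the tiatoolbox API. Two details deliberately differ from the hunks above and are assumptions of the sketch: world_size is set to 1, because torch.distributed.init_process_group expects world_size to equal the number of processes that join the group (here a single inference process, however many GPUs are visible), and an explicit GPU id is passed to DistributedDataParallel, since torch.device("cuda").index is None.

import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel


def move_model_to_device(
    model: torch.nn.Module, device: str = "cpu"
) -> torch.nn.Module:
    """Move `model` to `device`, wrapping it for multi-GPU inference (sketch)."""
    torch_device = torch.device(device)

    if device == "cuda" and torch.cuda.device_count() > 1:
        model = model.to(torch_device)
        if not dist.is_initialized():
            # rank/world_size describe processes, not GPUs: a group that a
            # single process joins must be created with world_size=1.
            os.environ.setdefault("MASTER_ADDR", "localhost")
            os.environ.setdefault("MASTER_PORT", "12355")
            dist.init_process_group(backend="nccl", rank=0, world_size=1)
        # torch.device("cuda").index is None, so pass an explicit device id.
        model = DistributedDataParallel(
            model, device_ids=[torch.cuda.current_device()]
        )
    elif device != "cpu":
        # Single-process replication across all visible GPUs.
        model = torch.nn.DataParallel(model)
        model = model.to(torch_device)
    else:
        model = model.to(torch_device)

    return model


def teardown_process_group() -> None:
    """Mirror the guarded cleanup in predict(): destroy the group if one exists."""
    if dist.is_initialized():
        dist.destroy_process_group()

A single-process DDP replica drives exactly one GPU per process; fanning one process out across every visible GPU is what the torch.nn.DataParallel fallback provides, while a true multi-process DDP run would launch one process per GPU (for example via torchrun) with matching rank and world_size.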