
Running on multiple GPUs is slower than on a single GPU #9891

Open

HanChen-HUST opened this issue Dec 26, 2024 · 3 comments

@HanChen-HUST commented Dec 26, 2024

🐛 Describe the bug

import os
from math import ceil

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn.functional as F
from torch import Tensor
from torch.nn.parallel import DistributedDataParallel

from torch_geometric.datasets import Planetoid
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import SAGEConv


class SAGE(torch.nn.Module):
    def __init__(self, in_channels: int, hidden_channels: int,
                 out_channels: int, num_layers: int = 2):
        super().__init__()

        self.convs = torch.nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        for _ in range(num_layers - 2):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
        self.convs.append(SAGEConv(hidden_channels, out_channels))

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if i < len(self.convs) - 1:
                x = x.relu()
                x = F.dropout(x, p=0.5, training=self.training)
        return x


@torch.no_grad()
def test(loader, model, rank):
    model.eval()

    total_correct = total_examples = 0
    for i, batch in enumerate(loader):
        out = model(batch.x, batch.edge_index.to(rank))
        pred = out[:batch.batch_size].argmax(dim=-1)
        y = batch.y[:batch.batch_size].to(rank)
        total_correct += int((pred == y).sum())
        total_examples += batch.batch_size
    return torch.tensor(total_correct / total_examples, device=rank)


def run(rank, world_size, dataset):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group('nccl', rank=rank, world_size=world_size)

    data = dataset[0]
    data = data.to(rank, 'x', 'y')  # Move to device for faster feature fetch.

    # Split indices into `world_size` many chunks:
    train_idx = data.train_mask.nonzero(as_tuple=False).view(-1)
    train_idx = train_idx.split(ceil(train_idx.size(0) / world_size))[rank]
    val_idx = data.val_mask.nonzero(as_tuple=False).view(-1)
    val_idx = val_idx.split(ceil(val_idx.size(0) / world_size))[rank]
    test_idx = data.test_mask.nonzero(as_tuple=False).view(-1)
    test_idx = test_idx.split(ceil(test_idx.size(0) / world_size))[rank]

    kwargs = dict(
        data=data,
        batch_size=1024,
        num_neighbors=[25, 10],
        drop_last=True,
        num_workers=4,
        persistent_workers=True,
    )
    train_loader = NeighborLoader(
        input_nodes=train_idx,
        shuffle=True,
        **kwargs,
    )
    val_loader = NeighborLoader(
        input_nodes=val_idx,
        shuffle=False,
        **kwargs,
    )
    test_loader = NeighborLoader(
        input_nodes=test_idx,
        shuffle=False,
        **kwargs,
    )

    torch.manual_seed(12345)
    model = SAGE(dataset.num_features, 256, dataset.num_classes).to(rank)
    model = DistributedDataParallel(model, device_ids=[rank])
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(1, 21):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            out = model(batch.x, batch.edge_index.to(rank))[:batch.batch_size]
            loss = F.cross_entropy(out, batch.y[:batch.batch_size])
            loss.backward()
            optimizer.step()

        dist.barrier()

        if rank == 0:
            print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')

        if epoch % 5 == 0:
            train_acc = test(train_loader, model, rank)
            val_acc = test(val_loader, model, rank)
            test_acc = test(test_loader, model, rank)

            if world_size > 1:
                dist.all_reduce(train_acc, op=dist.ReduceOp.SUM)
                dist.all_reduce(val_acc, op=dist.ReduceOp.SUM)
                dist.all_reduce(test_acc, op=dist.ReduceOp.SUM)
                train_acc /= world_size
                val_acc /= world_size
                test_acc /= world_size

            if rank == 0:
                print(f'Train: {train_acc:.4f}, Val: {val_acc:.4f}, '
                      f'Test: {test_acc:.4f}')

        dist.barrier()

    dist.destroy_process_group()


if __name__ == '__main__':
    dataset = Planetoid(root='/tmp/Cora', name='Cora')
    world_size = torch.cuda.device_count()
    print("Let's use", world_size, "GPUs!")
    mp.spawn(run, args=(world_size, dataset), nprocs=world_size, join=True)

Versions

@akihironitta Hi, I tried to use this example: https://github.com/pyg-team/pytorch_geometric/blob/master/examples/multi_gpu/distributed_sampling.py with the Cora dataset, but the communication speed is extremely slow. Could you analyze the reason? The output of nvidia-smi topo -m is as follows:

GPU0    GPU1    GPU2    GPU3    GPU4    GPU5    CPU Affinity    NUMA Affinity   GPU NUMA ID
GPU0     X      PIX     PXB     PXB     PXB     PXB     0-27,56-83      0               N/A
GPU1    PIX      X      PXB     PXB     PXB     PXB     0-27,56-83      0               N/A
GPU2    PXB     PXB      X      PIX     NV4     PIX     0-27,56-83      0               N/A
GPU3    PXB     PXB     PIX      X      PIX     NV4     0-27,56-83      0               N/A
GPU4    PXB     PXB     NV4     PIX      X      PIX     0-27,56-83      0               N/A
GPU5    PXB     PXB     PIX     NV4     PIX      X      0-27,56-83      0               N/A

Legend:

  X    = Self
  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
  PIX  = Connection traversing at most a single PCIe bridge
  NV#  = Connection traversing a bonded set of # NVLinks

I just print the test dataset information in the code, but after two hours it is still running, much slower than the single-GPU running time. What is the potential reason? I only replaced the Reddit dataset with the Cora dataset.

@akihironitta
Member

As I replied to you in #1417 (comment), how did you measure the communication speed? Is it just a comparison of the script's wall time between the single-GPU and multi-GPU cases?

The interconnect configuration seems off to me. Is there a reason why some of the GPUs are interconnected with NVLink while others are connected over PCIe?
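
For reference, one way to compare wall times fairly between the single-GPU and multi-GPU runs is to time each epoch with explicit synchronization, so that asynchronous CUDA work and straggling ranks are included in the measurement. A minimal sketch (not from this thread), assuming the inner training loop of run() above has been factored into a hypothetical callable train_one_epoch:

import time

import torch
import torch.distributed as dist


def timed_epoch(train_one_epoch, rank):
    # `train_one_epoch` is a hypothetical callable wrapping the inner
    # training loop from `run()` above.
    dist.barrier()                    # start all ranks together
    torch.cuda.synchronize(rank)
    start = time.perf_counter()
    train_one_epoch()
    torch.cuda.synchronize(rank)      # flush pending CUDA kernels
    dist.barrier()                    # wait for the slowest rank
    return time.perf_counter() - start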

@akihironitta akihironitta added question and removed bug labels Dec 26, 2024
@akihironitta akihironitta changed the title question about multi gpu running Running on multiple GPUs is slower than on a single GPU Dec 26, 2024
@HanChen-HUST
Author

Yes, I just compared the wall time. I don't know what the correct interconnect configuration is. Are there any other ways to solve this?

@akihironitta
Member

Sorry for the confusion. Such configurations do exist, although high-performance servers like https://www.nvidia.com/en-gb/data-center/dgx-h200/ have all GPUs interconnected with NVLink.

Since you're not yet sure whether the slowdown comes from communication, you could try reducing the number of workers in NeighborLoader(num_workers=...) to avoid CPU oversubscription, depending on how many cores your system has. The best and easiest way is to profile the script with torch.profiler and see which part of the code gets slower than in the single-GPU case.
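
A minimal sketch of such a profiling run (the wait/warmup/active step counts are arbitrary choices, and model, train_loader, optimizer, and rank are assumed to be the objects created inside run() above):

import torch.nn.functional as F
from torch.profiler import ProfilerActivity, profile, schedule


def profile_few_steps(model, train_loader, optimizer, rank):
    # Profile a handful of steps: 1 idle, 1 warmup, 3 measured.
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        schedule=schedule(wait=1, warmup=1, active=3, repeat=1),
    ) as prof:
        for step, batch in enumerate(train_loader):
            if step >= 5:  # wait + warmup + active
                break
            optimizer.zero_grad()
            out = model(batch.x, batch.edge_index.to(rank))[:batch.batch_size]
            loss = F.cross_entropy(out, batch.y[:batch.batch_size])
            loss.backward()
            optimizer.step()
            prof.step()  # advance the profiler schedule

    if rank == 0:
        # If NCCL all-reduce or the data loader dominates here but not in the
        # single-GPU run, that is where the slowdown comes from.
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))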
