Skip to content

Commit

Permalink
add exception handling to see silent torchrun failures
Browse files Browse the repository at this point in the history
  • Loading branch information
divakar-amd committed Sep 26, 2024
1 parent ab92950 commit 2cb34c4
Showing 1 changed file with 9 additions and 3 deletions.
12 changes: 9 additions & 3 deletions benchmarks/kernels/benchmark_mixtral_moe_rocm.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@

def main(args):
    """Spawn one tuning worker process per GPU.

    Args:
        args: Parsed CLI namespace. Must provide ``numGPU`` (the number of
            worker processes to spawn); the remaining fields are forwarded
            to ``wrapper`` (which reads at least ``model`` and ``modelTP``).
    """
    import sys
    import traceback

    world_size = args.numGPU
    try:
        # NOTE(review): with join=False, spawn() returns a context
        # immediately, so this except likely only catches launch-time
        # errors, not exceptions raised later inside workers — confirm.
        mp.spawn(wrapper, args=(args, ), nprocs=world_size, join=False)
    except Exception as e:
        # Emit the message on stderr and include the full traceback:
        # a bare print(e) on stdout loses the stack and can be buried
        # in benchmark output, leaving the failure effectively silent.
        print(f"An error occurred during multiprocessing: {e}",
              file=sys.stderr)
        traceback.print_exc()


def wrapper(rank, args):
Expand All @@ -32,8 +35,11 @@ def wrapper(rank, args):
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, 2048,
3072, 4096
]
for i in range(device_id, len(batches), args.numGPU):
tune_batch(batches[i], model=args.model, TP=args.modelTP)
try:
for i in range(device_id, len(batches), args.numGPU):
tune_batch(batches[i], model=args.model, TP=args.modelTP)
except Exception as e:
print(f"An error occurred on device {device_id}: {e}")


def tune_batch(bs, model, TP):
Expand Down

0 comments on commit 2cb34c4

Please sign in to comment.