SSD multiGPU reference + quality change (21.2 to 23) (mlcommons#237)
* SSD multiGPU reference + quality change (21.2 to 23)
* fix for deadlock
Showing 12 changed files with 629 additions and 87 deletions.
@@ -1,20 +1,19 @@
FROM pytorch/pytorch:0.4_cuda9_cudnn7
FROM pytorch/pytorch:1.0.1-cuda10.0-cudnn7-runtime

# Set working directory
WORKDIR /mlperf

RUN apt-get update && \
    apt-get install -y python3-tk python-pip
    apt-get install -y python3-tk python-pip && \
    apt-get install -y numactl

# Necessary pip packages
RUN pip install --upgrade pip
RUN pip install Cython==0.28.4 \
    matplotlib==2.2.2
RUN python3 -m pip install pycocotools==2.0.0

# Copy SSD code
WORKDIR /mlperf
COPY . .
# Necessary pip packages
RUN pip install -r requirements.txt
RUN python3 -m pip install pycocotools==2.0.0

WORKDIR /mlperf/ssd
@@ -1,13 +1,13 @@
Cython==0.28.4
git+git://github.com/NVIDIA/apex.git@9041a868a1a253172d94b113a963375b9badd030#egg=apex
mlperf-compliance==0.0.10
cycler==0.10.0
kiwisolver==1.0.1
matplotlib==2.2.2
numpy==1.14.5
Pillow==5.2.0
pycocotools==2.0.0
pyparsing==2.2.0
python-dateutil==2.7.3
pytz==2018.5
six==1.11.0
torch==0.4.0
torchvision==0.2.1
mlperf_compliance==0.0.7
@@ -0,0 +1,121 @@
import sys
import subprocess
import os
import socket
from argparse import ArgumentParser, REMAINDER

import torch


def parse_args():
    """
    Helper function parsing the command line options
    @retval ArgumentParser
    """
    parser = ArgumentParser(description="PyTorch distributed training launch "
                                        "helper utility that will spawn up "
                                        "multiple distributed processes")

    # Optional arguments for the launch helper
    parser.add_argument("--nnodes", type=int, default=1,
                        help="The number of nodes to use for distributed "
                             "training")
    parser.add_argument("--node_rank", type=int, default=0,
                        help="The rank of the node for multi-node distributed "
                             "training")
    parser.add_argument("--nproc_per_node", type=int, default=1,
                        help="The number of processes to launch on each node, "
                             "for GPU training, this is recommended to be set "
                             "to the number of GPUs in your system so that "
                             "each process can be bound to a single GPU.")
    parser.add_argument("--master_addr", default="127.0.0.1", type=str,
                        help="Master node (rank 0)'s address, should be either "
                             "the IP address or the hostname of node 0, for "
                             "single node multi-proc training, the "
                             "--master_addr can simply be 127.0.0.1")
    parser.add_argument("--master_port", default=29500, type=int,
                        help="Master node (rank 0)'s free port that needs to "
                             "be used for communication during distributed "
                             "training")
    parser.add_argument('--no_hyperthreads', action='store_true',
                        help='Flag to disable binding to hyperthreads')
    parser.add_argument('--no_membind', action='store_true',
                        help='Flag to disable memory binding')

    # non-optional arguments for binding
    parser.add_argument("--nsockets_per_node", type=int, required=True,
                        help="Number of CPU sockets on a node")
    parser.add_argument("--ncores_per_socket", type=int, required=True,
                        help="Number of CPU cores per socket")

    # positional
    parser.add_argument("training_script", type=str,
                        help="The full path to the single GPU training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")

    # rest from the training program
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()


def main():
    args = parse_args()

    # variables for numactl binding
    NSOCKETS = args.nsockets_per_node
    NGPUS_PER_SOCKET = args.nproc_per_node // args.nsockets_per_node
    NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET

    # world size in terms of number of processes
    dist_world_size = args.nproc_per_node * args.nnodes

    # set PyTorch distributed related environmental variables
    current_env = os.environ.copy()
    current_env["MASTER_ADDR"] = args.master_addr
    current_env["MASTER_PORT"] = str(args.master_port)
    current_env["WORLD_SIZE"] = str(dist_world_size)

    processes = []

    for local_rank in range(0, args.nproc_per_node):
        # each process's rank
        dist_rank = args.nproc_per_node * args.node_rank + local_rank
        current_env["RANK"] = str(dist_rank)

        # form numactl binding command
        cpu_ranges = [local_rank * NCORES_PER_GPU,
                      (local_rank + 1) * NCORES_PER_GPU - 1,
                      local_rank * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS),
                      (local_rank + 1) * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS) - 1]

        numactlargs = []
        if args.no_hyperthreads:
            numactlargs += [ "--physcpubind={}-{}".format(*cpu_ranges[0:2]) ]
        else:
            numactlargs += [ "--physcpubind={}-{},{}-{}".format(*cpu_ranges) ]

        if not args.no_membind:
            memnode = local_rank // NGPUS_PER_SOCKET
            numactlargs += [ "--membind={}".format(memnode) ]

        # spawn the processes
        cmd = [ "/usr/bin/numactl" ] \
            + numactlargs \
            + [ sys.executable,
                "-u",
                args.training_script,
                "--local_rank={}".format(local_rank)
              ] \
            + args.training_script_args

        process = subprocess.Popen(cmd, env=current_env)
        processes.append(process)

    for process in processes:
        process.wait()


if __name__ == "__main__":
    main()
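For orientation, the following is a small standalone sketch (not part of the commit) of what the binding arithmetic above produces on a hypothetical node with 8 GPUs, 2 sockets, and 20 cores per socket, the same shape as the multi-node config later in this commit. All names and values are local to the sketch.

# Minimal sketch of the launcher's numactl binding arithmetic for an assumed
# 8-GPU, 2-socket, 20-cores-per-socket node; not part of the committed code.
NSOCKETS = 2
NPROC_PER_NODE = 8
NCORES_PER_SOCKET = 20

NGPUS_PER_SOCKET = NPROC_PER_NODE // NSOCKETS             # 4 GPUs per socket
NCORES_PER_GPU = NCORES_PER_SOCKET // NGPUS_PER_SOCKET    # 5 cores per GPU

for local_rank in range(NPROC_PER_NODE):
    # Physical cores assigned to this rank, plus their hyperthread siblings,
    # which start after all physical cores (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS).
    ht_offset = NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS
    lo = local_rank * NCORES_PER_GPU
    hi = (local_rank + 1) * NCORES_PER_GPU - 1
    membind = local_rank // NGPUS_PER_SOCKET
    print("local_rank {}: --physcpubind={}-{},{}-{} --membind={}".format(
        local_rank, lo, hi, lo + ht_offset, hi + ht_offset, membind))
# Rank 0 is pinned to cores 0-4 (siblings 40-44) on NUMA node 0;
# rank 4 moves to cores 20-24 (siblings 60-64) on NUMA node 1.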
@@ -0,0 +1,18 @@
#!/bin/bash

## DL params
EXTRA_PARAMS=(
           --batch-size "32"
)

## System run params
DGXNNODES=1
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=24:00:00

## System config params
DGXNGPU=1
DGXSOCKETCORES=4
DGXNSOCKET=1
DGXHT=1         # HT on is 2, HT off is 1
DGXIBDEVICES=''
@@ -0,0 +1,19 @@
#!/bin/bash

## DL params
EXTRA_PARAMS=(
           --batch-size "64"
           --warmup "300"
)

## System run params
DGXNNODES=2
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=12:00:00

## System config params
DGXNGPU=8
DGXSOCKETCORES=20
DGXNSOCKET=2
DGXHT=2         # HT on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband --device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/ucm0 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/uverbs0 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1 --device=/dev/infiniband/issm0 --device=/dev/infiniband/umad0'
@@ -0,0 +1,19 @@
#!/bin/bash

## DL params
EXTRA_PARAMS=(
           --batch-size "128"
           --warmup "300"
)

## System run params
DGXNNODES=1
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=12:00:00

## System config params
DGXNGPU=8
DGXSOCKETCORES=20
DGXNSOCKET=1
DGXHT=1         # HT on is 2, HT off is 1
DGXIBDEVICES=''
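These config files presumably feed the launch helper above, although the run script that consumes them is not part of this view. A purely hypothetical sketch of that mapping, using the values from the single-node 8-GPU config:

# Hypothetical mapping from a config file's DGX* variables to the launcher's
# flags; the actual run script that performs this is not shown in this commit view.
config = {"DGXNGPU": 8, "DGXSOCKETCORES": 20, "DGXNSOCKET": 1}  # assumed values

launcher_flags = [
    "--nproc_per_node={}".format(config["DGXNGPU"]),
    "--ncores_per_socket={}".format(config["DGXSOCKETCORES"]),
    "--nsockets_per_node={}".format(config["DGXNSOCKET"]),
]
print(" ".join(launcher_flags))
# --nproc_per_node=8 --ncores_per_socket=20 --nsockets_per_node=1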
@@ -0,0 +1,53 @@ | ||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import torch | ||
import numpy as np | ||
from mlperf_compliance import mlperf_log | ||
|
||
def ssd_print(*args, sync=True, **kwargs): | ||
if sync: | ||
barrier() | ||
if get_rank() == 0: | ||
kwargs['stack_offset'] = 2 | ||
mlperf_log.ssd_print(*args, **kwargs) | ||
|
||
|
||
def barrier(): | ||
""" | ||
Works as a temporary distributed barrier, currently pytorch | ||
doesn't implement barrier for NCCL backend. | ||
Calls all_reduce on dummy tensor and synchronizes with GPU. | ||
""" | ||
if torch.distributed.is_initialized(): | ||
torch.distributed.all_reduce(torch.cuda.FloatTensor(1)) | ||
torch.cuda.synchronize() | ||
|
||
|
||
def get_rank(): | ||
""" | ||
Gets distributed rank or returns zero if distributed is not initialized. | ||
""" | ||
if torch.distributed.is_initialized(): | ||
rank = torch.distributed.get_rank() | ||
else: | ||
rank = 0 | ||
return rank | ||
|
||
def broadcast_seeds(seed, device): | ||
if torch.distributed.is_initialized(): | ||
seeds_tensor = torch.LongTensor([seed]).to(device) | ||
torch.distributed.broadcast(seeds_tensor, 0) | ||
seed = seeds_tensor.item() | ||
return seed |
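For context, a hypothetical sketch of how a training script might use these helpers. Assumptions: the helpers above are already in scope (their module path is not shown in this view), and the init_process_group call and the RUN_SET_RANDOM_SEED log key are illustrative choices, not taken from this diff.

# Hypothetical usage of the helpers above; everything here is illustrative
# and not part of the committed code. Assumes barrier, get_rank, ssd_print and
# broadcast_seeds from the module above are importable in this scope.
import random

import torch
from mlperf_compliance import mlperf_log


def init_distributed(local_rank):
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)

    # Relies on MASTER_ADDR / MASTER_PORT / RANK / WORLD_SIZE, which the
    # launch helper above exports before spawning each process.
    torch.distributed.init_process_group(backend="nccl", init_method="env://")

    # Rank 0 draws a seed and broadcasts it so all ranks agree on the value.
    seed = broadcast_seeds(random.SystemRandom().randint(0, 2**32 - 1), device)
    torch.manual_seed(seed)

    # Only rank 0 emits the compliance log entry; sync=True makes the other
    # ranks wait at the barrier so logging stays ordered across processes.
    ssd_print(key=mlperf_log.RUN_SET_RANDOM_SEED, value=seed, sync=True)
    return device, seed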