Skip to content

Commit 551f9e9

Browse files
committed
Test userbenchmark
1 parent ff2a8bc commit 551f9e9

File tree

3 files changed

+8
-204
lines changed

3 files changed

+8
-204
lines changed

.github/workflows/userbenchmark-a100.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,6 @@ jobs:
27 27
- name: Install Conda
28 28
run: |
29 29
bash ./.ci/torchbench/install-conda.sh
30-
- name: Install TorchBench
31-
run: |
32-
bash ./.ci/torchbench/install.sh
33 30
- name: Run user benchmark
34 31
run: |
35 32
set -x

userbenchmark/release-test/run.py

Lines changed: 2 additions & 145 deletions
Original file line numberDiff line numberDiff line change
@@ -1,155 +1,12 @@
1-
import argparse
2-
import itertools
3 1
import os
4-
import shutil
5 2
import subprocess
6-
import time
7-
from datetime import datetime
8-
from pathlib import Path
9-
from typing import List
10-
11-
import yaml
12-
from git import Repo
13-
14-
from ..utils import dump_output, get_output_dir, get_output_json
15-
from .result_analyzer import analyze
16 3

17-
# Expected WORK_DIR structure
18-
# WORK_DIR/
19-
# |---examples/
20-
# |---pytorch-<ver1>-cuda<ver1>/
21-
# |---run.sh
22-
# |---mnist/
23-
# |---mnist-hogwild/
24-
# |---<other-benchmarks>
25-
# |---pytorch-<ver2>-cuda<ver2>/
26-
# |---summary.csv
4+
from typing import List
27 5

28 6
BM_NAME = "release-test"
29 7
EXAMPLE_URL = "https://github.com/pytorch/examples.git"
30 8
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
31-
DEFAULT_CONFIG_PATH = os.path.join(
32-
os.path.dirname(os.path.abspath(__file__)), "configs"
33-
)
34-
RUN_TEMPLATE = """
35-
# GENERATED BY userbenchmark/release-test/__init__.py. DO NOT EDIT!
36-
bash {RELEASE_TEST_ROOT}/setup_env.sh '{CUDA_VERSION}' '{MAGMA_VERSION}' '{PYTORCH_VERSION}' '{PYTORCH_CHANNEL}' '{WORK_DIR}'
37-
bash {RELEASE_TEST_ROOT}/run_release_test.sh '{CUDA_VERSION}' '{RESULT_DIR}'
38-
"""
39-
40-
41-
def get_timestamp():
42-
return datetime.fromtimestamp(time.time()).strftime("%Y%m%d%H%M%S")
43-
44-
45-
def get_work_dir(output_dir):
46-
work_dir = output_dir.joinpath(f"run-{get_timestamp()}")
47-
work_dir.mkdir(exist_ok=True, parents=True)
48-
return work_dir
49-
50-
51-
def generate_test_scripts(config, work_dir):
52-
assert "cuda" in config and isinstance(
53-
config["cuda"], list
54-
), f"Expected CUDA config list, but not found."
55-
assert "pytorch" in config and isinstance(
56-
config["pytorch"], list
57-
), f"Exptected pytorch version list, but not found."
58-
bm_matrix = [config["cuda"], config["pytorch"]]
59-
run_scripts = {}
60-
for cuda, pytorch in itertools.product(*bm_matrix):
61-
run_key = f"pytorch-{pytorch['version']}-cuda-{cuda['version']}"
62-
run_script = RUN_TEMPLATE.format(
63-
RELEASE_TEST_ROOT=CURRENT_DIR,
64-
CUDA_VERSION=cuda["version"],
65-
MAGMA_VERSION=cuda["magma_version"],
66-
PYTORCH_VERSION=pytorch["version"],
67-
PYTORCH_CHANNEL=pytorch["conda_channel"],
68-
WORK_DIR=work_dir,
69-
RESULT_DIR=work_dir.joinpath(run_key),
70-
)
71-
run_scripts[run_key] = run_script
72-
return run_scripts
73-
74-
75-
def dump_test_scripts(run_scripts, work_dir):
76-
for run_key, run_script in run_scripts.items():
77-
run_script_loc = work_dir.joinpath(run_key)
78-
run_script_loc.mkdir(exist_ok=True)
79-
with open(run_script_loc.joinpath("run.sh"), "w") as rs:
80-
rs.write(run_script)
81-
82-
83-
def dump_result_to_json(metrics):
84-
result = get_output_json(BM_NAME, metrics)
85-
dump_output(BM_NAME, result)
86-
87-
88-
def run_benchmark(run_scripts, work_dir):
89-
for run_key, _rscript in run_scripts.items():
90-
run_script_path = work_dir.joinpath(run_key, "run.sh")
91-
# run the benchmark
92-
print(f"Running benchmark {run_key} ...")
93-
subprocess.check_call(["bash", str(run_script_path)])
94-
95-
96-
def get_config(config_name: str):
97-
if os.path.exists(os.path.join(DEFAULT_CONFIG_PATH, config_name)):
98-
config_name = os.path.join(DEFAULT_CONFIG_PATH, config_name)
99-
elif os.path.exists(os.path.join(DEFAULT_CONFIG_PATH, f"{config_name}.yaml")):
100-
config_name = os.path.join(DEFAULT_CONFIG_PATH, f"{config_name}.yaml")
101-
else:
102-
raise ValueError(
103-
f"Can't find config name {config_name} in config path {DEFAULT_CONFIG_PATH}."
104-
)
105-
with open(config_name, "r") as yfile:
106-
config = yaml.safe_load(yfile)
107-
return config
108-
109-
110-
def parse_args(args):
111-
parser = argparse.ArgumentParser()
112-
parser.add_argument(
113-
"--config", "-c", default="1.12.1", type=str, help="Config for release testing"
114-
)
115-
parser.add_argument(
116-
"--dry-run",
117-
action="store_true",
118-
help="Only generate the test scripts. Do not run the benchmark.",
119-
)
120-
parser.add_argument(
121-
"--analyze",
122-
type=str,
123-
help="Only analyze the result of the specified work directory.",
124-
)
125-
args = parser.parse_args(args)
126-
return args
127-
128-
129-
def prepare_release_tests(args: argparse.Namespace, work_dir: Path):
130-
config = get_config(args.config)
131-
run_scripts = generate_test_scripts(config, work_dir)
132-
dump_test_scripts(run_scripts, work_dir)
133-
# clone the examples repo
134-
Repo.clone_from(EXAMPLE_URL, work_dir.joinpath("examples"))
135-
return run_scripts
136-
137-
138-
def cleanup_release_tests(work_dir):
139-
examples_path = work_dir.joinpath("examples")
140-
if examples_path.exists():
141-
shutil.rmtree(examples_path)
142 9

143 10

144 11
def run(args: List[str]):
145-
args = parse_args(args)
146-
if args.analyze:
147-
analyze(args.analyze)
148-
return
149-
work_dir = get_work_dir(get_output_dir(BM_NAME))
150-
run_scripts = prepare_release_tests(args=args, work_dir=work_dir)
151-
if not args.dry_run:
152-
run_benchmark(run_scripts, work_dir)
153-
metrics = analyze(work_dir)
154-
dump_result_to_json(metrics)
155-
cleanup_release_tests(work_dir)
12+
subprocess.check_call(["bash", f"{CURRENT_DIR}/run_release_test.sh"])
userbenchmark/release-test/run_release_test.sh

Lines changed: 6 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,60 +1,10 @@
1 1
#!/bin/bash
2 2

3-
set -xeuo pipefail
3+
set -euo pipefail
4 4

5-
CUDA_VERSION="$1"
6-
RESULT_DIR="$2"
7-
EXAMPLES_DIR="${RESULT_DIR}/../examples"
8-
# get the directory of the current script
9-
CURRENT_DIR=$(dirname -- "$0")
5+
python -c "import torch; import time; a = torch.randn([4096, 4096]).cuda(); time.sleep(60); print('done!')" > log.txt 2>&1 &
10 6

11-
PREFIX=""
12-
if [[ ${PLATFORM_NAME} == "aws_t4_metal" ]]; then
13-
PREFIX="taskset -c 24-47";
14-
export GOMP_CPU_AFFINITY="24-47"
15-
fi
16-
17-
. switch-cuda.sh "${CUDA_VERSION}"
18-
19-
20-
nvcc --version
21-
sudo apt update
22-
sudo apt-get install bc
23-
sudo apt-get install --reinstall time
24-
which time
25-
# run mnist
26-
mkdir -p "${RESULT_DIR}/mnist"
27-
pushd "${EXAMPLES_DIR}/mnist"
28-
export LOG_FILE=${RESULT_DIR}/mnist/result.log
29-
export MEM_FILE=${RESULT_DIR}/mnist/result_mem.log
30-
${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3
31-
# run mnist-hogwild
32-
mkdir -p ${RESULT_DIR}/mnist_hogwild
33-
pushd "${EXAMPLES_DIR}/mnist_hogwild"
34-
export LOG_FILE=${RESULT_DIR}/mnist_hogwild/result.log
35-
export MEM_FILE=${RESULT_DIR}/mnist_hogwild/result_mem.log
36-
${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3
37-
# run CPU WLM LSTM
38-
mkdir -p ${RESULT_DIR}/wlm_cpu_lstm
39-
pushd "${EXAMPLES_DIR}/word_language_model"
40-
export LOG_FILE=${RESULT_DIR}/wlm_cpu_lstm/result.log
41-
export MEM_FILE=${RESULT_DIR}/wlm_cpu_lstm/result_mem.log
42-
${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3 --model LSTM
43-
# run GPU WLM LSTM
44-
mkdir -p ${RESULT_DIR}/wlm_gpu_lstm
45-
pushd "${EXAMPLES_DIR}/word_language_model"
46-
export LOG_FILE=${RESULT_DIR}/wlm_gpu_lstm/result.log
47-
export MEM_FILE=${RESULT_DIR}/wlm_gpu_lstm/result_mem.log
48-
${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3 --model LSTM --cuda
49-
# run CPU WLM Transformer
50-
mkdir -p ${RESULT_DIR}/wlm_cpu_trans
51-
pushd "${EXAMPLES_DIR}/word_language_model"
52-
export LOG_FILE=${RESULT_DIR}/wlm_cpu_trans/result.log
53-
export MEM_FILE=${RESULT_DIR}/wlm_cpu_trans/result_mem.log
54-
${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3 --model Transformer
55-
# run GPU WLM Transformer
56-
mkdir -p ${RESULT_DIR}/wlm_gpu_trans
57-
pushd "${EXAMPLES_DIR}/word_language_model"
58-
export LOG_FILE=${RESULT_DIR}/wlm_gpu_trans/result.log
59-
export MEM_FILE=${RESULT_DIR}/wlm_gpu_trans/result_mem.log
60-
${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3 --model Transformer --cuda
7+
for i in {1..120}; do
8+
nvidia-smi pmon -s m -c 1 -o T
9+
sleep 0.5
10+
done

0 commit comments

Comments
 (0)