Fix a100 bisection workflow #2055

Closed · wants to merge 9 commits
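The patch makes four fixes, all visible in the diff below: the workflow's BISECT_WORKDIR now reads the userbenchmark name from github.event.inputs instead of github.env, the --dryrun metric files are renamed from metric-*.json to metrics-*.json so the sed stamping and regression_detector.py operate on the same paths (a HUGGING_FACE_HUB_TOKEN secret is also exported), torchdata is dropped from the bisection build targets, and userbenchmark/test_bench gains a bisection mode that rebuilds model configs from a regression YAML instead of from a command-line model list.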
24 changes: 13 additions & 11 deletions .github/workflows/userbenchmark-a100-bisection.yml
@@ -23,9 +23,10 @@ jobs:
       CONDA_ENV: "bisection-ci-a100"
       PLATFORM_NAME: "gcp_a100"
       SETUP_SCRIPT: "/workspace/setup_instance.sh"
-      BISECT_WORKDIR: ".userbenchmark/${{ github.env.userbenchmark }}/bisection"
+      BISECT_WORKDIR: ".userbenchmark/${{ github.event.inputs.userbenchmark }}/bisection"
       AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: [self-hosted, a100-runner]
     timeout-minutes: 2880 # 48 hours
@@ -64,22 +65,23 @@ jobs:
         run: |
           CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
           cd benchmark
-          python ./utils/python_utils.py --create-conda-env "${CONDA_ENV}"
-      - name: Setup bisection environment
-        run: |
-          . "${SETUP_SCRIPT}"; cd benchmark
-          python utils/cuda_utils.py --install-torch-build-deps
-          python utils/cuda_utils.py --install-torchbench-deps
           mkdir -p "${BISECT_WORKDIR}"
           python utils/cuda_utils.py --install-torch-nightly
           PYTORCH_GIT_HASH=$(python -c 'import torch; print(torch.version.git_version)')
           python run_benchmark.py ${{ github.event.inputs.userbenchmark }} ${{ github.event.inputs.userbenchmark_args }} --dryrun \
-            --output "${BISECT_WORKDIR}/metric-control.json"
+            --output "${BISECT_WORKDIR}/metrics-control.json"
+          sed -i "s/${PYTORCH_GIT_HASH}/${{ github.event.inputs.start_commit }}/g" "${BISECT_WORKDIR}/metrics-control.json"
           python run_benchmark.py ${{ github.event.inputs.userbenchmark }} ${{ github.event.inputs.userbenchmark_args }} --dryrun \
-            --output "${BISECT_WORKDIR}/metric-treatment.json"
+            --output "${BISECT_WORKDIR}/metrics-treatment.json"
+          sed -i "s/${PYTORCH_GIT_HASH}/${{ github.event.inputs.end_commit }}/g" "${BISECT_WORKDIR}/metrics-treatment.json"
           python regression_detector.py \
             --control "${BISECT_WORKDIR}/metrics-control.json" --treatment "${BISECT_WORKDIR}/metrics-treatment.json" \
             --output "${BISECT_WORKDIR}/regression-gh${GITHUB_RUN_ID}.yaml"
+          pip uninstall -y torch torchvision torchaudio torch_tensorrt
+          python ./utils/python_utils.py --create-conda-env "${CONDA_ENV}"
+      - name: Setup bisection environment
+        run: |
+          . "${SETUP_SCRIPT}"; cd benchmark
+          python utils/cuda_utils.py --install-torch-build-deps
+          python utils/cuda_utils.py --install-torchbench-deps
       - name: Bisection
         run: |
           . "${SETUP_SCRIPT}"; cd benchmark
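The sequence above stubs out the two endpoints of the bisection: run_benchmark.py --dryrun writes a metrics JSON stamped with the git hash of the installed PyTorch nightly, and sed then rewrites that hash to the requested start or end commit so regression_detector.py treats the two files as the control and treatment points. A minimal Python sketch of the stamping step, assuming only that the nightly hash appears verbatim in the JSON text (stamp_commit is an illustrative name, not a repo function):

import pathlib

import torch

def stamp_commit(metrics_path: str, endpoint_commit: str) -> None:
    # The --dryrun metrics file embeds the installed nightly's git hash;
    # rewrite it so the regression detector sees the bisection endpoint instead.
    nightly_hash = torch.version.git_version
    path = pathlib.Path(metrics_path)
    path.write_text(path.read_text().replace(nightly_hash, endpoint_commit))

# Equivalent of the two sed calls, with paths and inputs as in the workflow:
# stamp_commit(".userbenchmark/<userbenchmark>/bisection/metrics-control.json", start_commit)
# stamp_commit(".userbenchmark/<userbenchmark>/bisection/metrics-treatment.json", end_commit)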
5 changes: 0 additions & 5 deletions bisection.py
@@ -43,11 +43,6 @@
"url": "https://github.com/pytorch/pytorch.git",
"build_command": [sys.executable, "setup.py", "install"],
},
"torchdata": {
"name": "data",
"url": "https://github.com/pytorch/data.git",
"build_command": [sys.executable, "setup.py", "install"],
},
"torchvision": {
"name": "vision",
"url": "https://github.com/pytorch/vision.git",
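Each entry in this table pairs a repo URL with the command used to rebuild that package at every bisection step, so dropping torchdata means pytorch/data is no longer cloned or rebuilt during bisection. A hypothetical sketch of how one entry could drive a build (build_target and the checkout layout are illustrative assumptions, not the actual bisection.py API):

import subprocess
import sys

TORCHVISION_TARGET = {
    "name": "vision",
    "url": "https://github.com/pytorch/vision.git",
    "build_command": [sys.executable, "setup.py", "install"],
}

def build_target(target: dict, work_dir: str) -> None:
    # Clone the target once, then run its build command inside the checkout.
    repo_dir = f"{work_dir}/{target['name']}"
    subprocess.check_call(["git", "clone", target["url"], repo_dir])
    subprocess.check_call(target["build_command"], cwd=repo_dir)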
42 changes: 34 additions & 8 deletions userbenchmark/test_bench/run.py
@@ -7,6 +7,9 @@
 import json
 import os
 import shutil
+import yaml
+import re
+import ast
 import numpy

 from typing import List, Dict, Optional, Any, Union
@@ -25,6 +28,19 @@ def config_to_str(config: TorchBenchModelConfig) -> str:
f" bs={config.batch_size}, extra_args={config.extra_args}"
return metrics_base

def str_to_config(metric_name: str) -> TorchBenchModelConfig:
regex = "model=(.*), test=(.*), device=(.*), bs=(.*), extra_args=(.*), metric=(.*)"
model, test, device, batch_size, extra_args, _metric = re.match(regex, metric_name).groups()
extra_args = ast.literal_eval(extra_args)
batch_size = ast.literal_eval(batch_size)
return TorchBenchModelConfig(
name=model,
test=test,
device=device,
batch_size=batch_size,
extra_args=extra_args,
)

def generate_model_configs(devices: List[str], tests: List[str], batch_sizes: List[str], model_names: List[str], extra_args: List[str]) -> List[TorchBenchModelConfig]:
"""Use the default batch size and default mode."""
if not model_names:
@@ -40,6 +56,12 @@ def generate_model_configs(devices: List[str], tests: List[str], batch_sizes: Li
     ) for device, test, batch_size, model_name in cfgs]
     return result

+def generate_model_configs_from_bisect_yaml(bisect_yaml: str) -> List[TorchBenchModelConfig]:
+    with open(bisect_yaml, "r") as fp:
+        bisect = yaml.safe_load(fp)
+    result = list(map(lambda x: str_to_config(x), bisect["details"].keys()))
+    return result
+
 def init_output_dir(configs: List[TorchBenchModelConfig], output_dir: pathlib.Path) -> List[TorchBenchModelConfig]:
     result = []
     for config in configs:
@@ -118,6 +140,7 @@ def parse_known_args(args):
     default_device = "cuda" if "cuda" in list_devices() else "cpu"
     parser.add_argument(
         "models",
+        nargs="*",
         help="Name of models to run, split by comma.",
     )
     parser.add_argument("--device", "-d", default=default_device, help="Devices to run, splited by comma.")
@@ -132,15 +155,18 @@

 def run(args: List[str]):
     args, extra_args = parse_known_args(args)
-    # If not specified, use the entire model set
-    if not args.models:
-        args.models = list_models()
+    if args.run_bisect:
+        configs = generate_model_configs_from_bisect_yaml(args.run_bisect)
+    else:
+        # If not specified, use the entire model set
+        if not args.models:
+            args.models = list_models()
+        devices = validate(parse_str_to_list(args.device), list_devices())
+        tests = validate(parse_str_to_list(args.test), list_tests())
+        batch_sizes = parse_str_to_list(args.bs)
+        models = validate(parse_str_to_list(args.models), list_models())
+        configs = generate_model_configs(devices, tests, batch_sizes, model_names=models, extra_args=extra_args)
     debug_output_dir = get_default_debug_output_dir(args.output) if args.debug else None
-    devices = validate(parse_str_to_list(args.device), list_devices())
-    tests = validate(parse_str_to_list(args.test), list_tests())
-    batch_sizes = parse_str_to_list(args.bs)
-    models = validate(parse_str_to_list(args.models), list_models())
-    configs = generate_model_configs(devices, tests, batch_sizes, model_names=models, extra_args=extra_args)
     configs = init_output_dir(configs, debug_output_dir) if debug_output_dir else configs
     results = {}
     try:
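The new bisection path depends on str_to_config being an exact inverse of config_to_str for the metric names stored under the regression YAML's details keys. A standalone sketch of that parse, with an illustrative metric name (the model, extra_args, and metric values are made up):

import ast
import re

metric_name = ("model=BERT_pytorch, test=train, device=cuda, "
               "bs=None, extra_args=['--torchdynamo', 'inductor'], metric=latencies")
regex = "model=(.*), test=(.*), device=(.*), bs=(.*), extra_args=(.*), metric=(.*)"
model, test, device, bs, extra_args, metric = re.match(regex, metric_name).groups()
# ast.literal_eval rebuilds the Python values the f-string flattened:
# "None" -> None, "['--torchdynamo', 'inductor']" -> a list of strings.
print(model, test, device, ast.literal_eval(bs), ast.literal_eval(extra_args), metric)
# -> BERT_pytorch train cuda None ['--torchdynamo', 'inductor'] latencies

Because the capture groups are greedy, commas inside the extra_args list do not confuse the split; the match backtracks to the final ", metric=" delimiter.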