Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Autotuner] CI Smoke Test - Resume #2097

Merged
merged 13 commits into from
Jan 15, 2025
24 changes: 5 additions & 19 deletions flow/test/test_autotuner.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/usr/bin/env bash
DESIGN_NAME=${1:-gcd}
PLATFORM=${2:-nangate45}

# run the commands in ORFS root dir
echo "[INFO FLW-0029] Installing dependencies in virtual environment."
Expand All @@ -20,28 +22,12 @@ python3 -m unittest tools.AutoTuner.test.smoke_test_sweep.${PLATFORM}SweepSmokeT
echo "Running Autotuner smoke tests for --sample and --iteration."
python3 -m unittest tools.AutoTuner.test.smoke_test_sample_iteration.${PLATFORM}SampleIterationSmokeTest.test_sample_iteration

if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN" == "gcd" ]; then
if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN_NAME" == "gcd" ]; then
echo "Running Autotuner ref file test (only once)"
python3 -m unittest tools.AutoTuner.test.ref_file_check.RefFileCheck.test_files
fi

echo "Running Autotuner smoke algorithm & evaluation test"
python3 -m unittest tools.AutoTuner.test.smoke_test_algo_eval.${PLATFORM}AlgoEvalSmokeTest.test_algo_eval

# run this test last (because it modifies current path)
echo "Running Autotuner remote test"
if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN" == "gcd" ]; then
# Get the directory of the current script
script_dir="$(dirname "${BASH_SOURCE[0]}")"
cd "$script_dir"/../../
latest_image=$(./etc/DockerTag.sh -dev)
echo "ORFS_VERSION=$latest_image" > ./tools/AutoTuner/.env
cd ./tools/AutoTuner
docker compose up --wait
docker compose exec ray-worker bash -c "cd /OpenROAD-flow-scripts/tools/AutoTuner/src/autotuner && \
python3 distributed.py --design gcd --platform asap7 --server 127.0.0.1 --port 10001 \
--config ../../../../flow/designs/asap7/gcd/autotuner.json tune --samples 1"
docker compose down -v --remove-orphans
echo "Running AutoTuner resume test (only once)"
python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume
fi

exit $ret
2 changes: 1 addition & 1 deletion flow/test/test_helper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ fi
if [ "${RUN_AUTOTUNER}" == "true" ]; then
set +x
echo "Start AutoTuner test."
./test/test_autotuner.sh
./test/test_autotuner.sh $DESIGN_NAME $PLATFORM
set -x
fi

Expand Down
31 changes: 24 additions & 7 deletions tools/AutoTuner/src/autotuner/distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def setup(self, config):
"""
# We create the following directory structure:
# 1/ 2/ 3/ 4/ 5/ 6/
# <repo>/<logs>/<platform>/<design>/<experiment>-DATE/<id>/<cwd>
# <repo>/<logs>/<platform>/<design>/<experiment>/<id>/<cwd>
repo_dir = os.getcwd() + "/../" * 6
self.repo_dir = os.path.abspath(repo_dir)
self.parameters = parse_config(config, path=os.getcwd())
Expand All @@ -88,7 +88,8 @@ def step(self):
"""
Run step experiment and compute its score.
"""
metrics_file = openroad(self.repo_dir, self.parameters, self.variant)
self._variant = f"{self.variant}-{self.step_}"
metrics_file = openroad(self.repo_dir, self.parameters, self._variant)
self.step_ += 1
(score, effective_clk_period, num_drc) = self.evaluate(
self.read_metrics(metrics_file)
Expand Down Expand Up @@ -709,7 +710,10 @@ def parse_arguments():
help="Time limit (in hours) for each trial run. Default is no limit.",
)
tune_parser.add_argument(
"--resume", action="store_true", help="Resume previous run."
"--resume",
action="store_true",
help="Resume previous run. Note that you must also set a unique experiment\
name identifier via `--experiment NAME` to be able to resume.",
)

# Setup
Expand Down Expand Up @@ -797,8 +801,8 @@ def parse_arguments():
)
tune_parser.add_argument(
"--resources_per_trial",
type=int,
metavar="<int>",
type=float,
metavar="<float>",
default=1,
help="Number of CPUs to request for each tuning job.",
)
Expand Down Expand Up @@ -874,7 +878,20 @@ def parse_arguments():
)
sys.exit(7)

arguments.experiment += f"-{arguments.mode}-{DATE}"
# Check for experiment name and resume flag.
if arguments.resume and arguments.experiment == "test":
print(
'[ERROR TUN-0031] The flag "--resume"'
' requires that "--experiment NAME" is also given.'
)
sys.exit(1)

# If the experiment name is the default, add a UUID to the end.
if arguments.experiment == "test":
id = str(uuid())[:8]
arguments.experiment = f"{arguments.mode}-{id}"
else:
arguments.experiment += f"-{arguments.mode}"

if arguments.timeout is not None:
arguments.timeout = round(arguments.timeout * 3600)
Expand Down Expand Up @@ -1075,7 +1092,7 @@ def sweep():
local_dir=LOCAL_DIR,
resume=args.resume,
stop={"training_iteration": args.iterations},
resources_per_trial={"cpu": args.resources_per_trial},
resources_per_trial={"cpu": os.cpu_count() / args.jobs},
luarss marked this conversation as resolved.
Show resolved Hide resolved
log_to_file=["trail-out.log", "trail-err.log"],
trial_name_creator=lambda x: f"variant-{x.trainable_name}-{x.trial_id}-ray",
trial_dirname_creator=lambda x: f"variant-{x.trainable_name}-{x.trial_id}-ray",
Expand Down
89 changes: 89 additions & 0 deletions tools/AutoTuner/test/resume_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import unittest
import subprocess
import os
import time

from contextlib import contextmanager

# Directory containing this test file; the paths below are built relative to it.
cur_dir = os.path.dirname(os.path.abspath(__file__))
# AutoTuner sources: the test commands invoke `python3 distributed.py` by its
# bare file name, so the working directory has to be this directory.
src_dir = os.path.join(cur_dir, "../src/autotuner")
# Flow directory, used to locate designs/<platform>/<design>/autotuner.json.
orfs_dir = os.path.join(cur_dir, "../../../flow")
# NOTE: this changes the working directory of the whole process as soon as the
# module is imported.
os.chdir(src_dir)


@contextmanager
def managed_process(*args, **kwargs):
    """
    Run a subprocess for the duration of a ``with`` block.

    All positional and keyword arguments are forwarded unchanged to
    ``subprocess.Popen`` and the resulting ``Popen`` object is yielded.

    When the block is left (normally or through an exception) a process that
    is still running is killed and then waited on, so it is reaped and its
    ``returncode`` is populated once the context has exited.
    """
    proc = subprocess.Popen(*args, **kwargs)
    try:
        yield proc
    finally:
        if proc.poll() is None:  # If the process is still running
            proc.kill()  # Forcefully kill it
            # kill() only delivers the signal; wait() reaps the child so no
            # zombie is left behind and returncode is no longer None.
            proc.wait()


class ResumeCheck(unittest.TestCase):
# only test 1 platform/design.
platform = "asap7"
design = "gcd"
samples = 5
iterations = 2

def setUp(self):
self.config = os.path.join(
orfs_dir, "designs", self.platform, self.design, "autotuner.json"
)
self.jobs = self.samples
self.num_cpus = os.cpu_count()

# How it works: Say we have 5 samples and 5 iterations.
# If we want to limit to only 5 trials (and avoid any parallelism magic by Ray)
# We can set resources_per_trial = NUM_CORES/5 = 3.2 (fractional resources_per_trial are allowed!)

# Cast to 1 decimal place
res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples))
options = ["", "--resume"]
self.commands = [
f"python3 distributed.py"
f" --design {self.design}"
f" --platform {self.platform}"
f" --config {self.config}"
f" --jobs {self.jobs}"
f" --experiment test_resume"
f" tune --iterations {self.iterations} --samples {self.samples}"
f" --resources_per_trial {res_per_trial}"
f" {c}"
for c in options
]

def test_tune_resume(self):
# Goal is to first run the first config (without resume) and then run the second config (with resume)
# and check if the run is able to complete.

# Run the first config asynchronously.
print("Running the first config")
with managed_process(self.commands[0], shell=True) as proc:
time.sleep(120)

# Keep trying to stop the ray cluster until it is stopped
while 1:
proc = subprocess.run("ray status", shell=True)
no_nodes = proc.returncode != 0
proc = subprocess.run("ray stop", shell=True)
successful = proc.returncode == 0

if no_nodes and successful:
break
time.sleep(10)

# Run the second config to completion
print("Running the second config")
proc = subprocess.run(self.commands[1], shell=True)
successful = proc.returncode == 0
self.assertTrue(successful)


# Allow running this test file directly as a script.
if __name__ == "__main__":
    unittest.main()
Loading