[Autotuner] CI Smoke Test - Resume (#2097)
* Resume smoke test

Signed-off-by: Jack Luar <[email protected]>
luarss authored Jan 15, 2025
1 parent 9ab0084 commit 0a48f8f
Showing 4 changed files with 119 additions and 27 deletions.
24 changes: 5 additions & 19 deletions flow/test/test_autotuner.sh
@@ -1,4 +1,6 @@
#!/usr/bin/env bash
DESIGN_NAME=${1:-gcd}
PLATFORM=${2:-nangate45}

# run the commands in ORFS root dir
echo "[INFO FLW-0029] Installing dependencies in virtual environment."
@@ -20,28 +22,12 @@ python3 -m unittest tools.AutoTuner.test.smoke_test_sweep.${PLATFORM}SweepSmokeT
echo "Running Autotuner smoke tests for --sample and --iteration."
python3 -m unittest tools.AutoTuner.test.smoke_test_sample_iteration.${PLATFORM}SampleIterationSmokeTest.test_sample_iteration

if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN" == "gcd" ]; then
if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN_NAME" == "gcd" ]; then
echo "Running Autotuner ref file test (only once)"
python3 -m unittest tools.AutoTuner.test.ref_file_check.RefFileCheck.test_files
fi

echo "Running Autotuner smoke algorithm & evaluation test"
python3 -m unittest tools.AutoTuner.test.smoke_test_algo_eval.${PLATFORM}AlgoEvalSmokeTest.test_algo_eval

# run this test last (because it modifies current path)
echo "Running Autotuner remote test"
if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN" == "gcd" ]; then
# Get the directory of the current script
script_dir="$(dirname "${BASH_SOURCE[0]}")"
cd "$script_dir"/../../
latest_image=$(./etc/DockerTag.sh -dev)
echo "ORFS_VERSION=$latest_image" > ./tools/AutoTuner/.env
cd ./tools/AutoTuner
docker compose up --wait
docker compose exec ray-worker bash -c "cd /OpenROAD-flow-scripts/tools/AutoTuner/src/autotuner && \
python3 distributed.py --design gcd --platform asap7 --server 127.0.0.1 --port 10001 \
--config ../../../../flow/designs/asap7/gcd/autotuner.json tune --samples 1"
docker compose down -v --remove-orphans
echo "Running AutoTuner resume test (only once)"
python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume
fi

exit $ret
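The smoke-test script is now parameterized: the design and platform arrive as positional arguments and fall back to gcd and nangate45 when omitted, matching the defaults at the top of the file. As an illustrative invocation (not part of the commit), mirroring how test_helper.sh calls it from the flow directory: ./test/test_autotuner.sh gcd asap7, or with no arguments to exercise the defaults.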
2 changes: 1 addition & 1 deletion flow/test/test_helper.sh
@@ -108,7 +108,7 @@ fi
if [ "${RUN_AUTOTUNER}" == "true" ]; then
set +x
echo "Start AutoTuner test."
./test/test_autotuner.sh
./test/test_autotuner.sh $DESIGN_NAME $PLATFORM
set -x
fi

31 changes: 24 additions & 7 deletions tools/AutoTuner/src/autotuner/distributed.py
@@ -77,7 +77,7 @@ def setup(self, config):
"""
# We create the following directory structure:
# 1/ 2/ 3/ 4/ 5/ 6/
# <repo>/<logs>/<platform>/<design>/<experiment>-DATE/<id>/<cwd>
# <repo>/<logs>/<platform>/<design>/<experiment>/<id>/<cwd>
repo_dir = os.getcwd() + "/../" * 6
self.repo_dir = os.path.abspath(repo_dir)
self.parameters = parse_config(config, path=os.getcwd())
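Dropping the -DATE suffix here anticipates the naming change further down in parse_arguments: uniqueness now lives in the experiment name itself. A purely illustrative layout with a hypothetical experiment id, keeping the placeholders from the comment above:

    <repo>/<logs>/asap7/gcd/tune-1a2b3c4d/<id>/<cwd>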
@@ -88,7 +88,8 @@ def step(self):
"""
Run step experiment and compute its score.
"""
metrics_file = openroad(self.repo_dir, self.parameters, self.variant)
self._variant = f"{self.variant}-{self.step_}"
metrics_file = openroad(self.repo_dir, self.parameters, self._variant)
self.step_ += 1
(score, effective_clk_period, num_drc) = self.evaluate(
self.read_metrics(metrics_file)
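A minimal standalone sketch of the new per-step naming (the base name below is invented): each call to step() now hands openroad() a fresh variant name, so every iteration gets its own results rather than reusing a single path, which is what a resumed trial needs to continue cleanly.

    # Sketch only: mirrors self._variant = f"{self.variant}-{self.step_}" with an invented base name.
    base_variant = "variant-abc"
    for step_ in range(3):
        print(f"{base_variant}-{step_}")  # variant-abc-0, variant-abc-1, variant-abc-2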
@@ -709,7 +710,10 @@ def parse_arguments():
help="Time limit (in hours) for each trial run. Default is no limit.",
)
tune_parser.add_argument(
"--resume", action="store_true", help="Resume previous run."
"--resume",
action="store_true",
help="Resume previous run. Note that you must also set a unique experiment\
name identifier via `--experiment NAME` to be able to resume.",
)

# Setup
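As the new help text notes, --resume is only meaningful together with an explicit --experiment name, since that name is what identifies the earlier run to pick up (the default name now receives a random suffix, see the parse_arguments change below). An example mirroring the command that resume_check.py builds: python3 distributed.py --design gcd --platform asap7 --config <path to flow/designs/asap7/gcd/autotuner.json> --jobs 5 --experiment test_resume tune --samples 5 --iterations 2 --resume.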
@@ -797,8 +801,8 @@ def parse_arguments():
)
tune_parser.add_argument(
"--resources_per_trial",
type=int,
metavar="<int>",
type=float,
metavar="<float>",
default=1,
help="Number of CPUs to request for each tuning job.",
)
@@ -874,7 +878,20 @@ def parse_arguments():
)
sys.exit(7)

arguments.experiment += f"-{arguments.mode}-{DATE}"
# Check for experiment name and resume flag.
if arguments.resume and arguments.experiment == "test":
print(
'[ERROR TUN-0031] The flag "--resume"'
' requires that "--experiment NAME" is also given.'
)
sys.exit(1)

# If the experiment name is the default, add a UUID to the end.
if arguments.experiment == "test":
id = str(uuid())[:8]
arguments.experiment = f"{arguments.mode}-{id}"
else:
arguments.experiment += f"-{arguments.mode}"

if arguments.timeout is not None:
arguments.timeout = round(arguments.timeout * 3600)
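A standalone sketch of the new naming rules (uuid4 stands in for whichever UUID helper distributed.py imports, and the example ids are invented):

    # Sketch of the new experiment-name logic; not the module's own code.
    from uuid import uuid4

    def experiment_name(experiment, mode, resume):
        if resume and experiment == "test":      # default name plus --resume is rejected (TUN-0031)
            raise SystemExit(1)
        if experiment == "test":                 # default name gets a short random id
            return f"{mode}-{str(uuid4())[:8]}"  # e.g. "tune-1a2b3c4d"
        return f"{experiment}-{mode}"            # e.g. "test_resume-tune", stable across resumes

Keeping the user-supplied name free of a date stamp is what allows a second invocation with --resume to land in the same experiment directory as the first.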
@@ -1075,7 +1092,7 @@ def sweep():
local_dir=LOCAL_DIR,
resume=args.resume,
stop={"training_iteration": args.iterations},
resources_per_trial={"cpu": args.resources_per_trial},
resources_per_trial={"cpu": os.cpu_count() / args.jobs},
log_to_file=["trail-out.log", "trail-err.log"],
trial_name_creator=lambda x: f"variant-{x.trainable_name}-{x.trial_id}-ray",
trial_dirname_creator=lambda x: f"variant-{x.trainable_name}-{x.trial_id}-ray",
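Here the CPU share per trial is now derived from the host rather than from --resources_per_trial: each of the --jobs concurrent jobs gets os.cpu_count() / args.jobs CPUs, so, for example, 16 CPUs and 4 jobs give 4.0 CPUs per trial (figures illustrative; fractional shares are fine, as above).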
89 changes: 89 additions & 0 deletions tools/AutoTuner/test/resume_check.py
@@ -0,0 +1,89 @@
import unittest
import subprocess
import os
import time

from contextlib import contextmanager

cur_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(cur_dir, "../src/autotuner")
orfs_dir = os.path.join(cur_dir, "../../../flow")
os.chdir(src_dir)


@contextmanager
def managed_process(*args, **kwargs):
"""
Runs process and ensures it is killed when the context is exited.
"""
proc = subprocess.Popen(*args, **kwargs)
try:
yield proc
finally:
if proc.poll() is None: # If the process is still running
proc.kill() # Forcefully kill it


class ResumeCheck(unittest.TestCase):
# only test 1 platform/design.
platform = "asap7"
design = "gcd"
samples = 5
iterations = 2

def setUp(self):
self.config = os.path.join(
orfs_dir, "designs", self.platform, self.design, "autotuner.json"
)
self.jobs = self.samples
self.num_cpus = os.cpu_count()

# How it works: Say we have 5 samples and 5 iterations.
# If we want to limit to only 5 trials (and avoid any parallelism magic by Ray)
# We can set resources_per_trial = NUM_CORES/5 = 3.2 (fractional resources_per_trial are allowed!)

# Cast to 1 decimal place
res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples))
options = ["", "--resume"]
self.commands = [
f"python3 distributed.py"
f" --design {self.design}"
f" --platform {self.platform}"
f" --config {self.config}"
f" --jobs {self.jobs}"
f" --experiment test_resume"
f" tune --iterations {self.iterations} --samples {self.samples}"
f" --resources_per_trial {res_per_trial}"
f" {c}"
for c in options
]

def test_tune_resume(self):
# Goal is to first run the first config (without resume) and then run the second config (with resume)
# and check if the run is able to complete.

# Run the first config asynchronously.
print("Running the first config")
with managed_process(self.commands[0], shell=True) as proc:
time.sleep(120)

# Keep trying to stop the ray cluster until it is stopped
while 1:
proc = subprocess.run("ray status", shell=True)
no_nodes = proc.returncode != 0
proc = subprocess.run("ray stop", shell=True)
successful = proc.returncode == 0

if no_nodes and successful:
break
time.sleep(10)

# Run the second config to completion
print("Running the second config")
proc = subprocess.run(self.commands[1], shell=True)
successful = proc.returncode == 0
self.assertTrue(successful)


if __name__ == "__main__":
unittest.main()
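This is the test wired into test_autotuner.sh above; run on its own it would be invoked from the repository root as python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume. The sequence it exercises: start a tune run under the fixed experiment name test_resume, let it work for about two minutes, tear the Ray cluster down with ray stop (polling ray status until no cluster responds), then rerun the identical command with --resume and assert that it finishes successfully.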
