diff --git a/flow/test/test_autotuner.sh b/flow/test/test_autotuner.sh
index ae009fee1d..7f8c258753 100755
--- a/flow/test/test_autotuner.sh
+++ b/flow/test/test_autotuner.sh
@@ -1,4 +1,6 @@
 #!/usr/bin/env bash
+DESIGN_NAME=${1:-gcd}
+PLATFORM=${2:-nangate45}
 
 # run the commands in ORFS root dir
 echo "[INFO FLW-0029] Installing dependencies in virtual environment."
@@ -20,28 +22,12 @@ python3 -m unittest tools.AutoTuner.test.smoke_test_sweep.${PLATFORM}SweepSmokeT
 echo "Running Autotuner smoke tests for --sample and --iteration."
 python3 -m unittest tools.AutoTuner.test.smoke_test_sample_iteration.${PLATFORM}SampleIterationSmokeTest.test_sample_iteration
 
-if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN" == "gcd" ]; then
+if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN_NAME" == "gcd" ]; then
     echo "Running Autotuner ref file test (only once)"
     python3 -m unittest tools.AutoTuner.test.ref_file_check.RefFileCheck.test_files
-fi
-echo "Running Autotuner smoke algorithm & evaluation test"
-python3 -m unittest tools.AutoTuner.test.smoke_test_algo_eval.${PLATFORM}AlgoEvalSmokeTest.test_algo_eval
-
-# run this test last (because it modifies current path)
-echo "Running Autotuner remote test"
-if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN" == "gcd" ]; then
-    # Get the directory of the current script
-    script_dir="$(dirname "${BASH_SOURCE[0]}")"
-    cd "$script_dir"/../../
-    latest_image=$(./etc/DockerTag.sh -dev)
-    echo "ORFS_VERSION=$latest_image" > ./tools/AutoTuner/.env
-    cd ./tools/AutoTuner
-    docker compose up --wait
-    docker compose exec ray-worker bash -c "cd /OpenROAD-flow-scripts/tools/AutoTuner/src/autotuner && \
-        python3 distributed.py --design gcd --platform asap7 --server 127.0.0.1 --port 10001 \
-        --config ../../../../flow/designs/asap7/gcd/autotuner.json tune --samples 1"
-    docker compose down -v --remove-orphans
+    echo "Running AutoTuner resume test (only once)"
+    python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume
 fi
 
 exit $ret
diff --git a/flow/test/test_helper.sh b/flow/test/test_helper.sh
index 4507dba8f2..123119cddb 100755
--- a/flow/test/test_helper.sh
+++ b/flow/test/test_helper.sh
@@ -108,7 +108,7 @@ fi
 if [ "${RUN_AUTOTUNER}" == "true" ]; then
     set +x
     echo "Start AutoTuner test."
-    ./test/test_autotuner.sh
+    ./test/test_autotuner.sh $DESIGN_NAME $PLATFORM
     set -x
 fi
 
diff --git a/tools/AutoTuner/src/autotuner/distributed.py b/tools/AutoTuner/src/autotuner/distributed.py
index 5543f0002a..d9c8946891 100644
--- a/tools/AutoTuner/src/autotuner/distributed.py
+++ b/tools/AutoTuner/src/autotuner/distributed.py
@@ -77,7 +77,7 @@ def setup(self, config):
         """
         # We create the following directory structure:
         #      1/     2/       3/         4/          5/          6/
-        # <repo>/<logs>/<platform>/<design>/<experiment>-DATE/<variant>/
+        # <repo>/<logs>/<platform>/<design>/<experiment>/<variant>/
         repo_dir = os.getcwd() + "/../" * 6
         self.repo_dir = os.path.abspath(repo_dir)
         self.parameters = parse_config(config, path=os.getcwd())
@@ -88,7 +88,8 @@ def step(self):
         """
         Run step experiment and compute its score.
         """
-        metrics_file = openroad(self.repo_dir, self.parameters, self.variant)
+        self._variant = f"{self.variant}-{self.step_}"
+        metrics_file = openroad(self.repo_dir, self.parameters, self._variant)
         self.step_ += 1
         (score, effective_clk_period, num_drc) = self.evaluate(
             self.read_metrics(metrics_file)
@@ -709,7 +710,10 @@ def parse_arguments():
         help="Time limit (in hours) for each trial run. Default is no limit.",
     )
     tune_parser.add_argument(
-        "--resume", action="store_true", help="Resume previous run."
+        "--resume",
+        action="store_true",
+        help="Resume previous run. Note that you must also set a unique experiment"
+        " name identifier via `--experiment NAME` to be able to resume.",
     )
 
     # Setup
@@ -797,8 +801,8 @@ def parse_arguments():
     )
     tune_parser.add_argument(
         "--resources_per_trial",
-        type=int,
-        metavar="<int>",
+        type=float,
+        metavar="<float>",
         default=1,
         help="Number of CPUs to request for each tuning job.",
     )
@@ -874,7 +878,20 @@ def parse_arguments():
         )
         sys.exit(7)
 
-    arguments.experiment += f"-{arguments.mode}-{DATE}"
+    # Check for experiment name and resume flag.
+    if arguments.resume and arguments.experiment == "test":
+        print(
+            '[ERROR TUN-0031] The flag "--resume"'
+            ' requires that "--experiment NAME" is also given.'
+        )
+        sys.exit(1)
+
+    # If the experiment name is the default, add a UUID to the end.
+    if arguments.experiment == "test":
+        id = str(uuid())[:8]
+        arguments.experiment = f"{arguments.mode}-{id}"
+    else:
+        arguments.experiment += f"-{arguments.mode}"
 
     if arguments.timeout is not None:
         arguments.timeout = round(arguments.timeout * 3600)
@@ -1075,7 +1092,7 @@ def sweep():
         local_dir=LOCAL_DIR,
         resume=args.resume,
         stop={"training_iteration": args.iterations},
-        resources_per_trial={"cpu": args.resources_per_trial},
+        resources_per_trial={"cpu": os.cpu_count() / args.jobs},
         log_to_file=["trail-out.log", "trail-err.log"],
         trial_name_creator=lambda x: f"variant-{x.trainable_name}-{x.trial_id}-ray",
         trial_dirname_creator=lambda x: f"variant-{x.trainable_name}-{x.trial_id}-ray",
diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py
new file mode 100644
index 0000000000..92219eed22
--- /dev/null
+++ b/tools/AutoTuner/test/resume_check.py
@@ -0,0 +1,89 @@
+import unittest
+import subprocess
+import os
+import time
+
+from contextlib import contextmanager
+
+cur_dir = os.path.dirname(os.path.abspath(__file__))
+src_dir = os.path.join(cur_dir, "../src/autotuner")
+orfs_dir = os.path.join(cur_dir, "../../../flow")
+os.chdir(src_dir)
+
+
+@contextmanager
+def managed_process(*args, **kwargs):
+    """
+    Runs a process and ensures it is killed when the context is exited.
+    """
+    proc = subprocess.Popen(*args, **kwargs)
+    try:
+        yield proc
+    finally:
+        if proc.poll() is None:  # If the process is still running
+            proc.kill()  # Forcefully kill it
+
+
+class ResumeCheck(unittest.TestCase):
+    # Only test one platform/design combination.
+    platform = "asap7"
+    design = "gcd"
+    samples = 5
+    iterations = 2
+
+    def setUp(self):
+        self.config = os.path.join(
+            orfs_dir, "designs", self.platform, self.design, "autotuner.json"
+        )
+        self.jobs = self.samples
+        self.num_cpus = os.cpu_count()
+
+        # How it works: we want exactly `samples` trials running concurrently
+        # (and avoid any parallelism magic by Ray), so each trial gets an equal
+        # share of the CPUs, e.g. 16 cores / 5 samples = 3.2 (fractional values are allowed).
+
+        # Round to one decimal place.
+        res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples))
+        options = ["", "--resume"]
+        self.commands = [
+            f"python3 distributed.py"
+            f" --design {self.design}"
+            f" --platform {self.platform}"
+            f" --config {self.config}"
+            f" --jobs {self.jobs}"
+            f" --experiment test_resume"
+            f" tune --iterations {self.iterations} --samples {self.samples}"
+            f" --resources_per_trial {res_per_trial}"
+            f" {c}"
+            for c in options
+        ]
+
+    def test_tune_resume(self):
+        # Goal: run the first command (without --resume), interrupt it, then run
+        # the second command (with --resume) and check that it completes.
+
+        # Run the first config asynchronously.
+        print("Running the first config")
+        with managed_process(self.commands[0], shell=True) as proc:
+            time.sleep(120)
+
+        # Keep trying to stop the Ray cluster until it is stopped.
+        while True:
+            proc = subprocess.run("ray status", shell=True)
+            no_nodes = proc.returncode != 0
+            proc = subprocess.run("ray stop", shell=True)
+            successful = proc.returncode == 0
+
+            if no_nodes and successful:
+                break
+            time.sleep(10)
+
+        # Run the second config to completion.
+        print("Running the second config")
+        proc = subprocess.run(self.commands[1], shell=True)
+        successful = proc.returncode == 0
+        self.assertTrue(successful)
+
+
+if __name__ == "__main__":
+    unittest.main()
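Reviewer note: a minimal sketch of how the new --resume flow can be exercised by hand, mirroring the commands that resume_check.py builds above. It assumes the commands are run from tools/AutoTuner/src/autotuner; the design, platform, sample/iteration counts, and experiment name are simply the values the test happens to use.

    # First run: start tuning under a fixed experiment name, then interrupt it
    # (the test kills the process and runs `ray stop`).
    python3 distributed.py --design gcd --platform asap7 \
        --config ../../../../flow/designs/asap7/gcd/autotuner.json \
        --experiment test_resume tune --samples 5 --iterations 2

    # Second run: the same command with --resume picks the experiment back up.
    # Per the new check in parse_arguments(), --resume requires a non-default
    # --experiment name so both runs map to the same results directory.
    python3 distributed.py --design gcd --platform asap7 \
        --config ../../../../flow/designs/asap7/gcd/autotuner.json \
        --experiment test_resume tune --samples 5 --iterations 2 --resume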