diff --git a/tools/AutoTuner/.gitignore b/tools/AutoTuner/.gitignore index d7693dd9c4..e4cd3d6f9b 100644 --- a/tools/AutoTuner/.gitignore +++ b/tools/AutoTuner/.gitignore @@ -10,3 +10,4 @@ __pycache__/ # Autotuner env autotuner_env .env +.venv diff --git a/tools/AutoTuner/Makefile b/tools/AutoTuner/Makefile new file mode 100644 index 0000000000..2edaab4d5a --- /dev/null +++ b/tools/AutoTuner/Makefile @@ -0,0 +1,17 @@ +.PHONY: clearcache init reqs test + +clearcache: + @echo "Cleaning python cache" + @find . -type d -name __pycache__ -exec rm -r {} \+ + +init: + @echo "Installing python environment" + @./installer.sh + +reqs: + @echo "Compiling requirements" + @rm -f requirements.txt + @.venv/bin/pip-compile --output-file=requirements.txt requirements.in + +test: + @echo "Running tests" \ No newline at end of file diff --git a/tools/AutoTuner/installer.sh b/tools/AutoTuner/installer.sh index c694715511..f93c765113 100755 --- a/tools/AutoTuner/installer.sh +++ b/tools/AutoTuner/installer.sh @@ -7,5 +7,25 @@ script_dir="$(dirname "${BASH_SOURCE[0]}")" venv_name="autotuner_env" python3 -m venv "$script_dir/$venv_name" source "$script_dir/$venv_name/bin/activate" -pip3 install -U -r $script_dir/requirements.txt +retry_count=0 +max_retries=5 +success=false + +while [[ $retry_count -lt $max_retries ]]; do + if pip3 cache purge && pip3 install --no-cache-dir -U -r "$script_dir/requirements.txt"; then + success=true + break + else + retry_count=$((retry_count + 1)) + echo "Attempt $retry_count failed. Retrying in 1 minute..." + sleep 60 + fi +done + +if [ "$success" = false ]; then + echo "Failed to install requirements after $max_retries attempts." 
+ deactivate + exit 1 +fi + deactivate diff --git a/tools/AutoTuner/requirements.in b/tools/AutoTuner/requirements.in new file mode 100644 index 0000000000..c972ba901f --- /dev/null +++ b/tools/AutoTuner/requirements.in @@ -0,0 +1,12 @@ +ray[default,tune]==2.9.3 +ax-platform>=0.3.3,<=0.3.7 +hyperopt==0.2.7 +optuna==3.6.0 +pandas>=2.0,<=2.2.1 +bayesian-optimization==1.4.0 +colorama==0.4.6 +tensorboard>=2.14.0,<=2.16.2 +protobuf==3.20.3 +SQLAlchemy==1.4.17 +urllib3<=1.26.15 +pip-tools==7.4.1 diff --git a/tools/AutoTuner/requirements.txt b/tools/AutoTuner/requirements.txt index 5bf65305cc..f2b88aa5d1 100644 --- a/tools/AutoTuner/requirements.txt +++ b/tools/AutoTuner/requirements.txt @@ -1,11 +1,436 @@ -ray[default,tune]==2.9.3 -ax-platform>=0.3.3,<=0.3.7 -hyperopt==0.2.7 -optuna==3.6.0 -pandas>=2.0,<=2.2.1 +# +# This file is autogenerated by pip-compile with Python 3.8 +# by the following command: +# +# pip-compile --output-file=requirements.txt requirements.in +# +absl-py==2.1.0 + # via tensorboard +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.10 + # via + # aiohttp-cors + # ray +aiohttp-cors==0.7.0 + # via ray +aiosignal==1.3.1 + # via + # aiohttp + # ray +alembic==1.13.3 + # via optuna +annotated-types==0.7.0 + # via pydantic +asttokens==2.4.1 + # via stack-data +async-timeout==4.0.3 + # via aiohttp +attrs==24.2.0 + # via + # aiohttp + # jsonschema + # referencing +ax-platform==0.3.3 + # via -r requirements.in +backcall==0.2.0 + # via ipython bayesian-optimization==1.4.0 + # via -r requirements.in +blessed==1.20.0 + # via gpustat +botorch==0.8.5 + # via ax-platform +build==1.2.2.post1 + # via pip-tools +cachetools==5.5.0 + # via google-auth +certifi==2024.8.30 + # via requests +charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via + # pip-tools + # ray +cloudpickle==3.1.0 + # via hyperopt colorama==0.4.6 -tensorboard>=2.14.0,<=2.16.2 + # via -r requirements.in +colorful==0.5.6 + # via ray +colorlog==6.9.0 + # via optuna +comm==0.2.2 + # via 
ipywidgets +decorator==5.1.1 + # via ipython +distlib==0.3.9 + # via virtualenv +executing==2.1.0 + # via stack-data +filelock==3.16.1 + # via + # ray + # torch + # triton + # virtualenv +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal + # ray +fsspec==2024.10.0 + # via + # ray + # torch +future==1.0.0 + # via hyperopt +google-api-core==2.22.0 + # via opencensus +google-auth==2.35.0 + # via + # google-api-core + # google-auth-oauthlib + # tensorboard +google-auth-oauthlib==1.0.0 + # via tensorboard +googleapis-common-protos==1.65.0 + # via google-api-core +gpustat==1.1.1 + # via ray +gpytorch==1.10 + # via botorch +greenlet==3.1.1 + # via sqlalchemy +grpcio==1.67.1 + # via + # ray + # tensorboard +hyperopt==0.2.7 + # via -r requirements.in +idna==3.10 + # via + # requests + # yarl +importlib-metadata==8.5.0 + # via + # alembic + # build + # markdown +importlib-resources==6.4.5 + # via + # alembic + # jsonschema + # jsonschema-specifications +ipython==8.12.3 + # via ipywidgets +ipywidgets==8.1.5 + # via ax-platform +jedi==0.19.1 + # via ipython +jinja2==3.1.4 + # via + # ax-platform + # torch +joblib==1.4.2 + # via scikit-learn +jsonschema==4.23.0 + # via ray +jsonschema-specifications==2023.12.1 + # via jsonschema +jupyterlab-widgets==3.0.13 + # via ipywidgets +linear-operator==0.4.0 + # via + # botorch + # gpytorch +mako==1.3.6 + # via alembic +markdown==3.7 + # via tensorboard +markupsafe==2.1.5 + # via + # jinja2 + # mako + # werkzeug +matplotlib-inline==0.1.7 + # via ipython +mpmath==1.3.0 + # via sympy +msgpack==1.1.0 + # via ray +multidict==6.1.0 + # via + # aiohttp + # yarl +multipledispatch==1.0.0 + # via botorch +networkx==3.1 + # via + # hyperopt + # torch +numpy==1.24.4 + # via + # bayesian-optimization + # hyperopt + # optuna + # pandas + # pyarrow + # pyro-ppl + # scikit-learn + # scipy + # tensorboard + # tensorboardx +nvidia-cublas-cu12==12.1.3.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + 
# via torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via torch +nvidia-cuda-runtime-cu12==12.1.105 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.0.2.54 + # via torch +nvidia-curand-cu12==10.3.2.106 + # via torch +nvidia-cusolver-cu12==11.4.5.107 + # via torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via gpustat +nvidia-nccl-cu12==2.20.5 + # via torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via torch +oauthlib==3.2.2 + # via requests-oauthlib +opencensus==0.11.4 + # via ray +opencensus-context==0.1.3 + # via opencensus +opt-einsum==3.4.0 + # via pyro-ppl +optuna==3.6.0 + # via -r requirements.in +packaging==24.1 + # via + # build + # optuna + # plotly + # ray + # tensorboardx +pandas==2.0.3 + # via + # -r requirements.in + # ax-platform + # ray +parso==0.8.4 + # via jedi +pexpect==4.9.0 + # via ipython +pickleshare==0.7.5 + # via ipython +pip-tools==7.4.1 + # via -r requirements.in +pkgutil-resolve-name==1.3.10 + # via jsonschema +platformdirs==4.3.6 + # via virtualenv +plotly==5.24.1 + # via ax-platform +prometheus-client==0.21.0 + # via ray +prompt-toolkit==3.0.48 + # via ipython +propcache==0.2.0 + # via yarl +proto-plus==1.25.0 + # via google-api-core protobuf==3.20.3 -SQLAlchemy==1.4.17 -urllib3<=1.26.15 + # via + # -r requirements.in + # google-api-core + # googleapis-common-protos + # proto-plus + # ray + # tensorboard + # tensorboardx +psutil==6.1.0 + # via gpustat +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.3 + # via stack-data +py-spy==0.3.14 + # via ray +py4j==0.10.9.7 + # via hyperopt +pyarrow==17.0.0 + # via ray +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.1 + # via google-auth +pydantic==2.9.2 + # via ray +pydantic-core==2.23.4 + # via pydantic +pygments==2.18.0 + # via ipython +pyproject-hooks==1.2.0 + # via + # build + # pip-tools 
+pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.9.1 + # via botorch +python-dateutil==2.9.0.post0 + # via pandas +pytz==2024.2 + # via pandas +pyyaml==6.0.2 + # via + # optuna + # ray +ray[default,tune]==2.9.3 + # via -r requirements.in +referencing==0.35.1 + # via + # jsonschema + # jsonschema-specifications +requests==2.32.3 + # via + # google-api-core + # ray + # requests-oauthlib + # tensorboard +requests-oauthlib==2.0.0 + # via google-auth-oauthlib +rpds-py==0.20.1 + # via + # jsonschema + # referencing +rsa==4.9 + # via google-auth +scikit-learn==1.3.2 + # via + # ax-platform + # bayesian-optimization + # gpytorch +scipy==1.10.1 + # via + # ax-platform + # bayesian-optimization + # botorch + # hyperopt + # linear-operator + # scikit-learn +six==1.16.0 + # via + # asttokens + # blessed + # hyperopt + # opencensus + # python-dateutil +smart-open==7.0.5 + # via ray +sqlalchemy==1.4.17 + # via + # -r requirements.in + # alembic + # optuna +stack-data==0.6.3 + # via ipython +sympy==1.13.3 + # via torch +tenacity==9.0.0 + # via plotly +tensorboard==2.14.0 + # via -r requirements.in +tensorboard-data-server==0.7.2 + # via tensorboard +tensorboardx==2.6.2.2 + # via ray +threadpoolctl==3.5.0 + # via scikit-learn +tomli==2.0.2 + # via + # build + # pip-tools +torch==2.4.1 + # via + # botorch + # linear-operator + # pyro-ppl +tqdm==4.66.6 + # via + # hyperopt + # optuna + # pyro-ppl +traitlets==5.14.3 + # via + # comm + # ipython + # ipywidgets + # matplotlib-inline +triton==3.0.0 + # via torch +typeguard==2.13.3 + # via ax-platform +typing-extensions==4.12.2 + # via + # alembic + # annotated-types + # ipython + # multidict + # pydantic + # pydantic-core + # torch +tzdata==2024.2 + # via pandas +urllib3==1.26.15 + # via + # -r requirements.in + # requests +virtualenv==20.27.1 + # via ray +wcwidth==0.2.13 + # via + # blessed + # prompt-toolkit +werkzeug==3.0.6 + # via tensorboard +wheel==0.44.0 + # via + # pip-tools + # tensorboard +widgetsnbextension==4.0.13 + # via 
ipywidgets +wrapt==1.16.0 + # via smart-open +yarl==1.15.2 + # via aiohttp +zipp==3.20.2 + # via + # importlib-metadata + # importlib-resources + +# The following packages are considered to be unsafe in a requirements file: +# pip +# setuptools diff --git a/tools/AutoTuner/test/ref_file_check.py b/tools/AutoTuner/test/ref_file_check.py index e0401693c4..e9d144d1ab 100644 --- a/tools/AutoTuner/test/ref_file_check.py +++ b/tools/AutoTuner/test/ref_file_check.py @@ -26,6 +26,7 @@ def setUp(self): f" tune --samples 1" for c in configs ] + subprocess.run(["ray", "stop"], check=True) # Make this a test case def test_files(self): diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py index 92219eed22..64c0e6a481 100644 --- a/tools/AutoTuner/test/resume_check.py +++ b/tools/AutoTuner/test/resume_check.py @@ -57,6 +57,7 @@ def setUp(self): f" {c}" for c in options ] + subprocess.run(["ray", "stop"], check=True) def test_tune_resume(self): # Goal is to first run the first config (without resume) and then run the second config (with resume) diff --git a/tools/AutoTuner/test/smoke_test_algo_eval.py b/tools/AutoTuner/test/smoke_test_algo_eval.py index a695489b48..72a7aebf2a 100644 --- a/tools/AutoTuner/test/smoke_test_algo_eval.py +++ b/tools/AutoTuner/test/smoke_test_algo_eval.py @@ -32,6 +32,7 @@ def setUp(self): f" --reference {self.reference}" for a, e in self.matrix ] + subprocess.run(["ray", "stop"], check=True) def make_base(self): os.chdir(orfs_dir) diff --git a/tools/AutoTuner/test/smoke_test_sample_iteration.py b/tools/AutoTuner/test/smoke_test_sample_iteration.py index f49c22a088..013e1e224c 100644 --- a/tools/AutoTuner/test/smoke_test_sample_iteration.py +++ b/tools/AutoTuner/test/smoke_test_sample_iteration.py @@ -27,6 +27,7 @@ def setUp(self): f" tune --samples {s} --iterations {i}" for s, i in self.matrix ] + subprocess.run(["ray", "stop"], check=True) class 
ASAP7SampleIterationSmokeTest(BaseSampleIterationSmokeTest): diff --git a/tools/AutoTuner/test/smoke_test_sweep.py b/tools/AutoTuner/test/smoke_test_sweep.py index 7a1b013911..6540778b18 100644 --- a/tools/AutoTuner/test/smoke_test_sweep.py +++ b/tools/AutoTuner/test/smoke_test_sweep.py @@ -39,6 +39,7 @@ def setUp(self): f" --jobs {self.jobs}" f" sweep" ) + subprocess.run(["ray", "stop"], check=True) def test_sweep(self): raise NotImplementedError( diff --git a/tools/AutoTuner/test/smoke_test_tune.py b/tools/AutoTuner/test/smoke_test_tune.py index 9710416745..bf9bf172b1 100644 --- a/tools/AutoTuner/test/smoke_test_tune.py +++ b/tools/AutoTuner/test/smoke_test_tune.py @@ -25,6 +25,7 @@ def setUp(self): f" --config {self.config}" f" tune --samples 5" ) + subprocess.run(["ray", "stop"], check=True) def test_tune(self): raise NotImplementedError(