From 4fed2dc1ea5e175d67f1611869657796112ec1e7 Mon Sep 17 00:00:00 2001 From: hjjq Date: Tue, 19 Dec 2023 17:47:10 -0500 Subject: [PATCH 1/2] add org runner select;default shut down instance --- .github/scripts/start_instances.py | 3 ++- .github/workflows/regression.yaml | 14 ++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/scripts/start_instances.py b/.github/scripts/start_instances.py index 6d0b59202..01fefa0b4 100644 --- a/.github/scripts/start_instances.py +++ b/.github/scripts/start_instances.py @@ -18,6 +18,7 @@ def run_command(cmd): # e.g., ' 1, 2, ,3,,' -> ['1', '2', '3'] hw_config_ids = os.environ.get('HW_CONFIG').replace(' ', '') + repo_org = os.environ.get('REPO_NAME').split('/')[0] if hw_config_ids == 'all': query = ( 'SELECT id FROM hardware_config' @@ -34,7 +35,7 @@ def run_command(cmd): query = ( 'SELECT cloud_provider_id, instance_id, hardware_config.name as hw_config FROM cloud_instance ' 'JOIN hardware_config ON cloud_instance.hardware_config_id = hardware_config.id ' - f'WHERE hardware_config_id = {hw_config_id} LIMIT 1' + f'WHERE hardware_config_id = {hw_config_id} AND cloud_instance.org = \'{repo_org}\' LIMIT 1' ) cursor.execute(query) rows = cursor.fetchall() diff --git a/.github/workflows/regression.yaml b/.github/workflows/regression.yaml index 61434db52..2a8580854 100644 --- a/.github/workflows/regression.yaml +++ b/.github/workflows/regression.yaml @@ -7,7 +7,7 @@ on: description: 'Shut down GPU instances when finished.' required: true type: boolean - default: false + default: true issue_comment: types: [created] @@ -29,8 +29,8 @@ jobs: start_instances: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'issue_comment' && github.event.issue.pull_request && - contains(fromJSON('["MEMBER", "OWNER"]'), github.event.comment.author_association) && + github.event_name == 'issue_comment' && github.event.issue.pull_request != '' && + contains(fromJSON('["MEMBER", "OWNER", "COLLABORATOR"]'), github.event.comment.author_association) && contains(github.event.comment.body, '$hidet-ci launch') runs-on: ubuntu-latest outputs: @@ -48,6 +48,7 @@ jobs: run: timeout 900 python ./.github/scripts/start_instances.py env: HW_CONFIG: all + REPO_NAME: ${{ github.repository }} # TODO: Allow launching only specified GPU instances - name: Upload run configs @@ -59,6 +60,7 @@ jobs: run_tests: needs: start_instances + timeout-minutes: 2880 strategy: matrix: hw_configs: ${{ fromJSON(needs.start_instances.outputs.hw_configs) }} @@ -110,6 +112,7 @@ jobs: name: run_configs - name: Run tests + timeout-minutes: 2880 run: | python hidet/.github/scripts/run_tests.py env: @@ -165,7 +168,10 @@ jobs: HW_CONFIGS: ${{ needs.start_instances.outputs.hw_configs }} stop_instances: - if: ${{ inputs.shutdown_instances }} + if: | + github.event_name == 'workflow_dispatch' && inputs.shutdown_instances || + github.event_name == 'issue_comment' && github.event.issue.pull_request != '' && + !contains(github.event.comment.body, '--keep') runs-on: ubuntu-latest needs: [start_instances, run_tests] steps: From da748a70fe0a501d12d7f5149deeeb4403c91648 Mon Sep 17 00:00:00 2001 From: hjjq Date: Tue, 19 Dec 2023 18:09:29 -0500 Subject: [PATCH 2/2] fix deadlock --- .github/scripts/run_tests.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/.github/scripts/run_tests.py b/.github/scripts/run_tests.py index 49f253539..3e16c8ffe 100644 --- a/.github/scripts/run_tests.py +++ b/.github/scripts/run_tests.py @@ -11,20 +11,15 @@ def run_command(cmd): cmd = " ".join(cmd) print("Running command: " + cmd) - popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True) - outputs = [] - for line in popen.stdout: - print(line, end='') - outputs.append(line) - popen.stdout.close() - ret = popen.wait() + process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True) + stdout, stderr = process.communicate() + ret = process.returncode if ret: print('STDERR:') - for line in popen.stderr: + for line in stderr: print(line, end='') - print(f'Command {cmd} failed with return code {ret}.') - return None - return outputs + raise RuntimeError(f'Command {cmd} failed with return code {ret}.') + return stdout def get_bench_cmd(run_type, run_id, run_name, run_param_name, dtype): # Get the name of the benchmark script from DB