From 8a97fdac1fc9285cc8ee7dd644de35b00a086519 Mon Sep 17 00:00:00 2001
From: Andrei Kashchikhin
Date: Tue, 16 Apr 2024 15:52:12 +0100
Subject: [PATCH] [CI] [GHA] Introduce GHA pipeline rerunner (#23865)

### Details:
This PR introduces a GHA pipeline rerunner. It scans the logs of failed workflow runs and re-runs those that failed with known sporadic errors. The rerunner is a Python script driven by a dedicated workflow. The workflow will not run from this PR; it needs to be in `master` first. I have verified both the workflow and the script in the private repo.

### Tickets:
 - *136935*
---
 .gitattributes                                |   2 +
 .github/scripts/workflow_rerun/__init__.py    |   0
 .../scripts/workflow_rerun/argument_parser.py |  20 +++
 .github/scripts/workflow_rerun/constants.py   |  17 +++
 .../workflow_rerun/errors_to_look_for.json    |  42 ++++++
 .../scripts/workflow_rerun/log_analyzer.py    | 132 ++++++++++++++++++
 .../scripts/workflow_rerun/log_collector.py   |  21 +++
 .github/scripts/workflow_rerun/rerunner.py    |  53 +++++++
 .../scripts/workflow_rerun/tests/__init__.py  |   0
 .../tests/data/log_archive_with_error.zip     |   3 +
 .../tests/data/log_archive_wo_error.zip       |   3 +
 .../workflow_rerun/tests/integration_test.py  |  52 +++++++
 .../workflow_rerun/tests/log_analyzer_test.py | 101 ++++++++++++++
 .../tests/log_collector_test.py               |  38 +++++
 .github/workflows/workflow_rerunner.yml       |  72 ++++++++++
 15 files changed, 556 insertions(+)
 create mode 100644 .github/scripts/workflow_rerun/__init__.py
 create mode 100644 .github/scripts/workflow_rerun/argument_parser.py
 create mode 100644 .github/scripts/workflow_rerun/constants.py
 create mode 100644 .github/scripts/workflow_rerun/errors_to_look_for.json
 create mode 100644 .github/scripts/workflow_rerun/log_analyzer.py
 create mode 100644 .github/scripts/workflow_rerun/log_collector.py
 create mode 100644 .github/scripts/workflow_rerun/rerunner.py
 create mode 100644 .github/scripts/workflow_rerun/tests/__init__.py
 create mode 100644 .github/scripts/workflow_rerun/tests/data/log_archive_with_error.zip
 create mode 100644 .github/scripts/workflow_rerun/tests/data/log_archive_wo_error.zip
 create mode 100644 .github/scripts/workflow_rerun/tests/integration_test.py
 create mode 100644 .github/scripts/workflow_rerun/tests/log_analyzer_test.py
 create mode 100644 .github/scripts/workflow_rerun/tests/log_collector_test.py
 create mode 100644 .github/workflows/workflow_rerunner.yml

diff --git a/.gitattributes b/.gitattributes
index dfeac125fb1c3f..a0f976d8fb1fbe 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -65,3 +65,5 @@
 *.vsdx filter=lfs diff=lfs merge=lfs -text
 *.bmp filter=lfs diff=lfs merge=lfs -text
 *.svg filter=lfs diff=lfs merge=lfs -text
+.github/scripts/workflow_rerun/tests/data/log_archive_with_error.zip filter=lfs diff=lfs merge=lfs -text
+.github/scripts/workflow_rerun/tests/data/log_archive_wo_error.zip filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/scripts/workflow_rerun/__init__.py b/.github/scripts/workflow_rerun/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/.github/scripts/workflow_rerun/argument_parser.py b/.github/scripts/workflow_rerun/argument_parser.py
new file mode 100644
index 00000000000000..e73485dafd09c0
--- /dev/null
+++ b/.github/scripts/workflow_rerun/argument_parser.py
@@ -0,0 +1,20 @@
+import argparse
+from pathlib import Path
+
+
+def get_arguments() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', '--repository-name',
+                        type=str,
+                        required=True,
+                        help='Repository name in the OWNER/REPOSITORY format')
+    parser.add_argument('--run-id',
+                        type=int,
+                        required=True,
+                        help='Workflow Run ID')
+    parser.add_argument('--errors-to-look-for-file',
+                        type=str,
+                        required=False,
+                        help='.json file with the errors to look for in logs',
+                        default=Path(__file__).resolve().parent.joinpath('errors_to_look_for.json'))
+    return parser.parse_args()
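For reference, a minimal sketch (not part of the patch) of the namespace this parser produces; argparse derives the attribute names from the long flags, and the run id shown is hypothetical:

```python
# Hypothetical invocation: python3 rerunner.py -r openvinotoolkit/openvino --run-id 123456789
from workflow_rerun.argument_parser import get_arguments

args = get_arguments()
print(args.repository_name)          # 'openvinotoolkit/openvino'
print(args.run_id)                   # 123456789 (an int, thanks to type=int)
print(args.errors_to_look_for_file)  # defaults to errors_to_look_for.json next to the script
```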
diff --git a/.github/scripts/workflow_rerun/constants.py b/.github/scripts/workflow_rerun/constants.py
new file mode 100644
index 00000000000000..174ffd74e6371f
--- /dev/null
+++ b/.github/scripts/workflow_rerun/constants.py
@@ -0,0 +1,17 @@
+import logging
+import os
+
+
+GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')
+
+
+def init_logger():
+    LOGLEVEL = os.environ.get('LOGLEVEL', 'INFO').upper()
+    logging.basicConfig(level=LOGLEVEL,
+                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
+                        datefmt='%m-%d-%Y %H:%M:%S')
+
+
+init_logger()
+
+LOGGER = logging.getLogger('rerunner')
diff --git a/.github/scripts/workflow_rerun/errors_to_look_for.json b/.github/scripts/workflow_rerun/errors_to_look_for.json
new file mode 100644
index 00000000000000..836b21a0042bfd
--- /dev/null
+++ b/.github/scripts/workflow_rerun/errors_to_look_for.json
@@ -0,0 +1,42 @@
+[
+    {
+        "error_text": "This is a problem related to network connectivity",
+        "ticket": 135929
+    },
+    {
+        "error_text": "Unable to make request",
+        "ticket": 135715
+    },
+    {
+        "error_text": "GnuTLS recv error",
+        "ticket": 131918
+    },
+    {
+        "error_text": "Connection was reset",
+        "ticket": 131818
+    },
+    {
+        "error_text": "Failed to connect to github.com",
+        "ticket": 131657
+    },
+    {
+        "error_text": "Could not resolve host: github.com",
+        "ticket": 131546
+    },
+    {
+        "error_text": "retrieving gpg key timed out",
+        "ticket": 131538
+    },
+    {
+        "error_text": "Retry limit has been reached for chunk",
+        "ticket": 131537
+    },
+    {
+        "error_text": "fatal error: downloading",
+        "ticket": 131424
+    },
+    {
+        "error_text": "Failure when receiving data from the peer",
+        "ticket": 137121
+    }
+]
\ No newline at end of file
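A quick sanity check one could run against this file (a sketch, not part of the patch, assuming the schema stays flat): every entry must carry the two fields `LogAnalyzer` reads.

```python
import json
from pathlib import Path

errors_file = Path('.github/scripts/workflow_rerun/errors_to_look_for.json')
for entry in json.loads(errors_file.read_text(encoding='utf-8')):
    # LogAnalyzer expects a non-empty error string and an integer ticket number
    assert isinstance(entry['error_text'], str) and entry['error_text']
    assert isinstance(entry['ticket'], int)
```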
diff --git a/.github/scripts/workflow_rerun/log_analyzer.py b/.github/scripts/workflow_rerun/log_analyzer.py
new file mode 100644
index 00000000000000..73f73ee53e1efd
--- /dev/null
+++ b/.github/scripts/workflow_rerun/log_analyzer.py
@@ -0,0 +1,132 @@
+import json
+import re
+import tempfile
+from pathlib import Path
+from typing import TypedDict
+from zipfile import ZipFile
+
+from workflow_rerun.constants import LOGGER
+
+
+class LogFile(TypedDict):
+    file_name: str
+    path: Path
+
+
+class ErrorData(TypedDict):
+    error_text: str
+    ticket: int
+
+
+class LogAnalyzer:
+    def __init__(self,
+                 path_to_log_archive: Path,
+                 path_to_errors_file: Path) -> None:
+        self._path_to_log_archive = path_to_log_archive
+        self._path_to_errors_file = path_to_errors_file
+
+        self._errors_to_look_for: list[ErrorData] = []
+        self._collect_errors_to_look_for()
+
+        self._log_dir = tempfile.mkdtemp()  # mkdtemp persists; TemporaryDirectory().name would be removed once the object is garbage-collected
+
+        self._log_files: list[LogFile] = []
+        self._collect_log_files()
+
+        all_txt_log_files_pretty = '\n'.join(map(lambda item: str(item['path']), self._log_files))
+        LOGGER.info(f'ALL .txt LOG FILES: \n{all_txt_log_files_pretty}')
+
+        self.found_matching_error = False
+
+    def _collect_errors_to_look_for(self) -> None:
+        with open(file=self._path_to_errors_file,
+                  mode='r',
+                  encoding='utf-8') as errors_file:
+            errors_data = json.load(errors_file)
+            for error_data in errors_data:
+                self._errors_to_look_for.append(
+                    ErrorData(error_text=error_data['error_text'],
+                              ticket=error_data['ticket'])
+                )
+
+    def _collect_log_files(self) -> None:
+        """
+        Collects the .txt log files from the log archive
+
+        The GitHub Actions pipeline log archive has the following structure:
+            > Job_name_0
+                > step_name_0.txt
+                > step_name_1.txt
+                ...
+            > Job_name_1
+                > step_name_0.txt
+                > step_name_1.txt
+                ...
+            > Job_name_2
+                ...
+            ...
+
+        Only the `*.txt` files need to be analyzed
+        """
+        with ZipFile(file=self._path_to_log_archive,
+                     mode='r') as zip_file:
+            zip_file.extractall(self._log_dir)
+
+        for _file in Path(self._log_dir).iterdir():
+            if _file.is_dir():
+                for log_file in _file.iterdir():
+                    self._log_files.append(LogFile(file_name=log_file.name,
+                                                   path=log_file.resolve()))
+
+    def _is_error_in_log(self,
+                         error_to_look_for: str,
+                         log_file_path: Path) -> bool:
+        """
+        Searches for the error in the provided log file
+        """
+        error_to_look_for = self._clean_up_string(error_to_look_for)
+
+        with open(file=log_file_path,
+                  mode='r',
+                  encoding='utf-8') as log_file:
+            for line in log_file:
+                if error_to_look_for in self._clean_up_string(line):
+                    return True
+        return False
+
+    @staticmethod
+    def _clean_up_string(string: str) -> str:
+        """
+        Replaces special characters with spaces, strips leading and trailing spaces, and lower-cases the string
+
+        For example, "Could not resolve host: github.com" becomes "could not resolve host github com"
+
+        This cleanup is applied to both the errors to look for and the log lines so that they can be matched
+        """
+        return re.sub(r'[^A-Za-z0-9]+', ' ', string).lower().strip()
+
+    def analyze(self) -> None:
+        """
+        Iterates over the known errors and tries to find them in the collected log files
+        """
+        for error in self._errors_to_look_for:
+            LOGGER.info(f'LOOKING FOR "{error["error_text"]}" ERROR...')
+            for log_file in self._log_files:
+                if self._is_error_in_log(error_to_look_for=error['error_text'],
+                                         log_file_path=log_file['path']):
+                    LOGGER.info(f'FOUND "{error["error_text"]}" ERROR IN {log_file["path"]}. TICKET: {error["ticket"]}')
+                    self.found_matching_error = True
+                    return
+
+
+if __name__ == '__main__':
+    # Usage example
+    log_analyzer = LogAnalyzer(path_to_log_archive=Path('/tmp/logs/log.zip'),
+                               path_to_errors_file=Path('/tmp/errors_to_look_for.json'))
+    log_analyzer.analyze()
+    if log_analyzer.found_matching_error:
+        print('found matching error, see logs above')
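To illustrate why `_clean_up_string` is applied to both sides of the comparison, here is a small self-contained sketch of the matching (same regex as above; the log line is made up):

```python
import re

def clean(string: str) -> str:
    # same normalization as LogAnalyzer._clean_up_string
    return re.sub(r'[^A-Za-z0-9]+', ' ', string).lower().strip()

log_line = '2024-04-16T12:00:00 ##[error]curl: (6) Could not resolve host: github.com'
known_error = 'Could not resolve host: github.com'
assert clean(known_error) in clean(log_line)  # matches despite punctuation, case, and timestamps
```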
diff --git a/.github/scripts/workflow_rerun/log_collector.py b/.github/scripts/workflow_rerun/log_collector.py
new file mode 100644
index 00000000000000..6045a4750b824b
--- /dev/null
+++ b/.github/scripts/workflow_rerun/log_collector.py
@@ -0,0 +1,21 @@
+from pathlib import Path
+
+import requests
+from github.WorkflowRun import WorkflowRun
+from workflow_rerun.constants import GITHUB_TOKEN, LOGGER
+
+
+def collect_logs_for_run(run: WorkflowRun,
+                         log_archive_path: Path) -> Path:
+    """
+    Collects the log archive for a pipeline
+    """
+    with open(file=log_archive_path,
+              mode='wb') as log_archive:
+        LOGGER.info(f'STARTED LOG COLLECTION FOR {run.id} IN {log_archive_path}')
+        # PyGitHub does not expose the "/repos/{owner}/{repo}/actions/runs/{run_id}/logs" endpoint, so we have to use requests
+        log_archive.write(requests.get(url=run.logs_url,
+                                       headers={'Authorization': f'Bearer {GITHUB_TOKEN}'}).content)
+        LOGGER.info(f'COLLECTED LOGS FOR {run.id} IN {log_archive_path}')
+
+    return log_archive_path
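A usage sketch for the collector (assuming a valid `GITHUB_TOKEN` in the environment; the repository and run selection are illustrative, not part of the patch):

```python
import os
import tempfile
from pathlib import Path

from github import Auth, Github

from workflow_rerun.log_collector import collect_logs_for_run

github = Github(auth=Auth.Token(token=os.environ['GITHUB_TOKEN']))
repo = github.get_repo(full_name_or_id='openvinotoolkit/openvino')
run = repo.get_workflow_runs(status='failure')[0]  # most recent failed run
archive = collect_logs_for_run(run=run,
                               log_archive_path=Path(tempfile.mkdtemp()) / 'logs.zip')
print(archive)  # path to the downloaded zip
```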
diff --git a/.github/scripts/workflow_rerun/rerunner.py b/.github/scripts/workflow_rerun/rerunner.py
new file mode 100644
index 00000000000000..49ba2031a1e747
--- /dev/null
+++ b/.github/scripts/workflow_rerun/rerunner.py
@@ -0,0 +1,53 @@
+import sys
+import tempfile
+from pathlib import Path
+
+from github import Github, Auth
+from workflow_rerun.argument_parser import get_arguments
+from workflow_rerun.constants import GITHUB_TOKEN, LOGGER
+from workflow_rerun.log_analyzer import LogAnalyzer
+from workflow_rerun.log_collector import collect_logs_for_run
+
+if __name__ == '__main__':
+
+    args = get_arguments()
+    run_id = args.run_id
+    repository_name = args.repository_name
+
+    github = Github(auth=Auth.Token(token=GITHUB_TOKEN))
+    gh_repo = github.get_repo(full_name_or_id=repository_name)
+    run = gh_repo.get_workflow_run(id_=run_id)
+
+    LOGGER.info(f'CHECKING IF RERUN IS NEEDED FOR {run.html_url} RUN IN {repository_name}.')
+
+    # Check if the run has already been retriggered:
+    # we do not want to fall into a retrigger loop
+    if run.run_attempt > 1:
+        LOGGER.info(f'THERE ARE {run.run_attempt} ATTEMPTS ALREADY. NOT CHECKING LOGS AND NOT RETRIGGERING. EXITING')
+        sys.exit(0)
+
+    log_archive_path = Path(tempfile.NamedTemporaryFile(suffix='.zip').name)
+
+    collect_logs_for_run(
+        run=run,
+        log_archive_path=log_archive_path,
+    )
+
+    log_analyzer = LogAnalyzer(
+        path_to_log_archive=log_archive_path,
+        path_to_errors_file=args.errors_to_look_for_file,
+    )
+    log_analyzer.analyze()
+
+    if log_analyzer.found_matching_error:
+        LOGGER.info(f'FOUND MATCHING ERROR, RETRIGGERING {run.html_url}')
+        status = run.rerun()
+        if status:
+            LOGGER.info(f'RUN RETRIGGERED SUCCESSFULLY: {run.html_url}')
+        else:
+            LOGGER.info('RUN WAS NOT RETRIGGERED, SEE ABOVE')
+
+        # "status" is True if the rerun request succeeded; sys.exit(not status) maps that to exit code 0
+        sys.exit(not status)
+    else:
+        LOGGER.info('NO ERROR WAS FOUND, NOT RETRIGGERING')
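The exit-code logic at the bottom is worth spelling out: per the script's own comment, `run.rerun()` returns a boolean, and `sys.exit(not status)` maps it to shell conventions. A trivial sketch in isolation:

```python
import sys

status = True         # what run.rerun() returns when the rerun API call succeeds
sys.exit(not status)  # not True -> False -> exit code 0 (success); a failed call exits 1
```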
diff --git a/.github/scripts/workflow_rerun/tests/__init__.py b/.github/scripts/workflow_rerun/tests/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/.github/scripts/workflow_rerun/tests/data/log_archive_with_error.zip b/.github/scripts/workflow_rerun/tests/data/log_archive_with_error.zip
new file mode 100644
index 00000000000000..c02b478af0076a
--- /dev/null
+++ b/.github/scripts/workflow_rerun/tests/data/log_archive_with_error.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:894d636bcf156a7f3fae09f3c1d61df6b3db89117a917a3079995805c29115b3
+size 89247
diff --git a/.github/scripts/workflow_rerun/tests/data/log_archive_wo_error.zip b/.github/scripts/workflow_rerun/tests/data/log_archive_wo_error.zip
new file mode 100644
index 00000000000000..42be8d16787555
--- /dev/null
+++ b/.github/scripts/workflow_rerun/tests/data/log_archive_wo_error.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f094a737d7ea40dba8d3fb13493275cae243d08e5f1dabce90c316c951a6ac2
+size 52047
diff --git a/.github/scripts/workflow_rerun/tests/integration_test.py b/.github/scripts/workflow_rerun/tests/integration_test.py
new file mode 100644
index 00000000000000..6e950772d486f6
--- /dev/null
+++ b/.github/scripts/workflow_rerun/tests/integration_test.py
@@ -0,0 +1,52 @@
+"""
+Integration tests
+"""
+
+import os
+import tempfile
+import unittest
+from pathlib import Path
+
+from github import Github, Auth
+
+from workflow_rerun.log_analyzer import LogAnalyzer
+from workflow_rerun.log_collector import collect_logs_for_run
+
+
+class IntegrationTest(unittest.TestCase):
+    """
+    A class for testing the integration between LogAnalyzer and log collection
+    """
+
+    def setUp(self) -> None:
+        print(f'\nIn test: "{self._testMethodName}"', flush=True)
+        self._cwd = Path(__file__).parent
+        self.errors_to_look_for_file = self._cwd.parent.joinpath(
+            'errors_to_look_for.json'
+        )
+        self.github = Github(auth=Auth.Token(token=os.environ.get('GITHUB_TOKEN')))
+        self.gh_repo = self.github.get_repo(full_name_or_id='openvinotoolkit/openvino')
+
+        # Even if we used "failure" for the status, we could not guarantee that the logs contain any of the known errors,
+        # so these tests use the logs of the most recent successful pipeline
+        self.wf_run = self.gh_repo.get_workflow_runs(status='success')[0]
+        print(f'Workflow run for testing: {self.wf_run}', flush=True)
+
+    def test_log_collection_and_analysis(self) -> None:
+        """
+        Ensure logs collected by collect_logs_for_run are analyzed by LogAnalyzer
+        """
+        log_archive_path = Path(tempfile.NamedTemporaryFile(suffix='.zip').name)
+        collect_logs_for_run(run=self.wf_run,
+                             log_archive_path=log_archive_path)
+
+        analyzer = LogAnalyzer(
+            path_to_log_archive=log_archive_path,
+            path_to_errors_file=self.errors_to_look_for_file,
+        )
+        analyzer.analyze()
+        self.assertFalse(analyzer.found_matching_error)
+
+    def tearDown(self) -> None:
+        self.github.close()
diff --git a/.github/scripts/workflow_rerun/tests/log_analyzer_test.py b/.github/scripts/workflow_rerun/tests/log_analyzer_test.py
new file mode 100644
index 00000000000000..b10e4166bb038f
--- /dev/null
+++ b/.github/scripts/workflow_rerun/tests/log_analyzer_test.py
@@ -0,0 +1,101 @@
+"""
+LogAnalyzer tests
+"""
+
+import unittest
+from pathlib import Path
+
+from workflow_rerun.log_analyzer import LogAnalyzer
+
+
+class LogAnalyzerTest(unittest.TestCase):
+    """
+    A class for testing LogAnalyzer
+    """
+
+    def setUp(self) -> None:
+        print(f'\nIn test: "{self._testMethodName}"', flush=True)
+        self._cwd = Path(__file__).parent
+        self.log_archive_with_error = self._cwd.joinpath('data').joinpath(
+            'log_archive_with_error.zip'
+        )
+        self.log_archive_wo_error = self._cwd.joinpath('data').joinpath(
+            'log_archive_wo_error.zip'
+        )
+        self.errors_to_look_for_file = self._cwd.parent.joinpath(
+            'errors_to_look_for.json'
+        )
+
+    def test_log_analyzer_instantiation(self) -> None:
+        """
+        Ensure LogAnalyzer is instantiated correctly
+        """
+        analyzer = LogAnalyzer(
+            path_to_log_archive=self.log_archive_wo_error,
+            path_to_errors_file=self.errors_to_look_for_file,
+        )
+        self.assertTrue(
+            hasattr(analyzer, '_errors_to_look_for'),
+            'Analyzer should have _errors_to_look_for',
+        )
+        self.assertTrue(
+            hasattr(analyzer, '_log_files'), 'Analyzer should have _log_files'
+        )
+
+        for error_data in analyzer._errors_to_look_for:
+            self.assertTrue(
+                error_data['error_text'], 'Each error_data should have text'
+            )
+            self.assertTrue(error_data['ticket'], 'Each error_data should have ticket')
+
+        for log_file in analyzer._log_files:
+            self.assertTrue(
+                log_file['file_name'], 'Each log_file should have file_name'
+            )
+            self.assertTrue(log_file['path'], 'Each log_file should have path')
+
+    def test_string_cleanup(self) -> None:
+        """
+        Ensure the log cleanup function returns correct results
+        """
+        analyzer = LogAnalyzer(
+            path_to_log_archive=self.log_archive_wo_error,
+            path_to_errors_file=self.errors_to_look_for_file,
+        )
+
+        data = (
+            'Connection was reset',
+            'Failed to connect to github.com',
+            'Could not resolve host: github.com',
+        )
+        expected = (
+            'connection was reset',
+            'failed to connect to github com',
+            'could not resolve host github com',
+        )
+
+        for input_str, expected_str in zip(data, expected):
+            self.assertEqual(analyzer._clean_up_string(string=input_str), expected_str)
+
+    def test_analyzer_with_error(self) -> None:
+        """
+        Ensure LogAnalyzer can find an error
+        """
+        analyzer = LogAnalyzer(
+            path_to_log_archive=self.log_archive_with_error,
+            path_to_errors_file=self.errors_to_look_for_file,
+        )
+        analyzer.analyze()
+        self.assertTrue(analyzer.found_matching_error)
+
+    def test_analyzer_wo_error(self) -> None:
+        """
+        Ensure LogAnalyzer does not find an error in the log files w/o errors
+        """
+        analyzer = LogAnalyzer(
+            path_to_log_archive=self.log_archive_wo_error,
+            path_to_errors_file=self.errors_to_look_for_file,
+        )
+        analyzer.analyze()
+        self.assertFalse(analyzer.found_matching_error)
diff --git a/.github/scripts/workflow_rerun/tests/log_collector_test.py b/.github/scripts/workflow_rerun/tests/log_collector_test.py
new file mode 100644
index 00000000000000..f325576262c203
--- /dev/null
+++ b/.github/scripts/workflow_rerun/tests/log_collector_test.py
@@ -0,0 +1,38 @@
+"""
+Log collector tests
+"""
+
+import os
+import tempfile
+import unittest
+from pathlib import Path
+
+from github import Github, Auth
+
+from workflow_rerun.log_collector import collect_logs_for_run
+
+
+class LogCollectorTest(unittest.TestCase):
+    """
+    A class for testing log collection
+    """
+
+    def setUp(self) -> None:
+        print(f'\nIn test: "{self._testMethodName}"', flush=True)
+        self._cwd = Path(__file__).parent
+        self.github = Github(auth=Auth.Token(token=os.environ.get('GITHUB_TOKEN')))
+        self.gh_repo = self.github.get_repo(full_name_or_id='openvinotoolkit/openvino')
+        # Use the logs of the most recent successful pipeline
+        self.wf_run = self.gh_repo.get_workflow_runs(status='success')[0]
+        print(f'Workflow run for testing: {self.wf_run}', flush=True)
+
+    def test_log_collection(self) -> None:
+        """
+        Ensure log collection is working
+        """
+        log_archive_path = Path(tempfile.NamedTemporaryFile(suffix='.zip').name)
+        collect_logs_for_run(run=self.wf_run, log_archive_path=log_archive_path)
+        self.assertTrue(Path(log_archive_path).exists())
+
+    def tearDown(self) -> None:
+        self.github.close()
diff --git a/.github/workflows/workflow_rerunner.yml b/.github/workflows/workflow_rerunner.yml
new file mode 100644
index 00000000000000..77b8b68f25b359
--- /dev/null
+++ b/.github/workflows/workflow_rerunner.yml
@@ -0,0 +1,72 @@
+name: Rerun Workflow with Known Errors
+
+on:
+  workflow_run:
+    workflows:
+      - Linux (Ubuntu 20.04, Python 3.11)
+      - Linux ARM64 (Ubuntu 20.04, Python 3.11)
+      - Linux Static CC (Ubuntu 22.04, Python 3.11, Clang)
+      - Linux RISC-V with Conan (Ubuntu 22.04, Python 3.10)
+      - Windows (VS 2019, Python 3.11)
+      - Windows Conditional Compilation (VS 2022, Python 3.11)
+    types:
+      - completed
+  pull_request:
+    paths:
+      - '.github/workflows/workflow_rerunner.yml'
+      - '.github/scripts/workflow_rerun/**'
+
+jobs:
+  rerun:
+    name: Rerun Workflow
+    if: ${{ github.event.workflow_run.conclusion == 'failure' }} # Run only for failed workflow runs
+    runs-on: aks-linux-2-cores-8gb
+    permissions:
+      actions: write
+      contents: read
+      statuses: read
+      checks: read
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: '.github/scripts/workflow_rerun'
+
+      - name: Install deps
+        run: pip3 install PyGithub==2.2.0 requests==2.31.0
+
+      - name: Dump GitHub context
+        env:
+          GITHUB_CONTEXT: ${{ toJson(github) }}
+        run: echo "$GITHUB_CONTEXT"
+
+      - name: Rerun
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          export PYTHONPATH=${{ github.workspace }}/.github/scripts/workflow_rerun:${{ github.workspace }}/.github/scripts:$PYTHONPATH
+          python3 ${{ github.workspace }}/.github/scripts/workflow_rerun/rerunner.py \
+            --run-id ${{ github.event.workflow_run.id }} \
+            --repository-name ${GITHUB_REPOSITORY}
+
+  rerunner_tests:
+    name: Rerunner Tests
+    if: ${{ github.event_name == 'pull_request' }}
+    runs-on: aks-linux-2-cores-8gb
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: '.github/scripts/workflow_rerun'
+          lfs: true
+
+      - name: Install deps
+        run: pip3 install PyGithub==2.2.0 requests==2.31.0
+
+      - name: Test Rerunner
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        working-directory: ${{ github.workspace }}/.github/scripts/workflow_rerun
+        run: |
+          export PYTHONPATH=${{ github.workspace }}/.github/scripts/workflow_rerun:${{ github.workspace }}/.github/scripts:$PYTHONPATH
+          python3 -m unittest tests/*_test.py
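For completeness, the run id that the `Rerun` step passes via `${{ github.event.workflow_run.id }}` is also available to any step from the event payload on disk. A sketch of how the two views line up (the keys shown are standard `workflow_run` payload fields):

```python
import json
import os

# GITHUB_EVENT_PATH points at the webhook payload that triggered the workflow
with open(os.environ['GITHUB_EVENT_PATH'], encoding='utf-8') as event_file:
    event = json.load(event_file)

print(event['workflow_run']['id'])          # == ${{ github.event.workflow_run.id }}
print(event['workflow_run']['conclusion'])  # 'failure' is what gates the rerun job
```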