From ec52c6283f16a8f057d9d0691c6c1881b9c476dd Mon Sep 17 00:00:00 2001
From: Israel Fruchter
Date: Thu, 20 Jun 2024 13:55:48 +0300
Subject: [PATCH] feature(github): cache github issues status in S3

A GitHub Action that runs every 6 hours and caches the issues status from
multiple GitHub repositories into S3, plus client code that can read those
CSV files and cache them locally, so SkipPerIssues reads from the cache
first and falls back to direct GitHub API calls only when needed.

Ref: https://github.com/scylladb/qa-tasks/issues/1678
---
 .github/workflows/cache-issues.yaml | 27 +++++++++++++++++
 sdcm/utils/issues.py                | 45 ++++++++++++++++++++++++++++-
 2 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/cache-issues.yaml

diff --git a/.github/workflows/cache-issues.yaml b/.github/workflows/cache-issues.yaml
new file mode 100644
index 0000000000..5e0f90fb7d
--- /dev/null
+++ b/.github/workflows/cache-issues.yaml
@@ -0,0 +1,27 @@
+name: Cache issues status
+on:
+  schedule:
+    - cron: '0 */6 * * *'
+
+jobs:
+  collect_n_upload:
+    runs-on: ubuntu-latest
+    steps:
+      - run: |
+          mkdir -p issues
+          for repo in scylladb scylla-enterprise scylla-manager scylla-operator scylla-cluster-tests scylla-dtest qa-tasks scylla-tools-java ; do
+            gh issue list --state all --json number,state,labels --limit 30000 --template '{{range .}}{{.number}},{{.state}},{{range .labels}}{{.name}}|{{end}}{{println ""}}{{end}}' --repo scylladb/$repo > issues/scylladb_$repo.csv
+          done
+        env:
+          GH_TOKEN: ${{ secrets.ISSUE_ASSIGNMENT_TO_PROJECT_TOKEN }}
+      - name: Upload folder to bucket
+        uses: a-sync/s3-uploader@2.0.1
+        with:
+          args: --recursive
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}
+          AWS_REGION: 'us-east-1'
+          S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }}
+          S3_KEY: 'issues'
+          FILE: ./issues
diff --git a/sdcm/utils/issues.py b/sdcm/utils/issues.py
index d7301aa20f..6155d7e194 100644
--- a/sdcm/utils/issues.py
+++ b/sdcm/utils/issues.py
@@ -1,5 +1,6 @@
 import re
 import sys
+import csv
 import logging
 from functools import lru_cache
 from dataclasses import dataclass
@@ -7,8 +8,9 @@
 import github
 import github.Auth
 import github.Issue
+import github.Label
 from github.GithubException import UnknownObjectException, RateLimitExceededException
-
+from botocore.exceptions import ClientError
 from sdcm.keystore import KeyStore
 from sdcm.sct_config import SCTConfiguration
 from sdcm.sct_events.base import Severity
@@ -28,6 +30,33 @@ class Issue:
     issue_id: int | None
 
 
+@lru_cache(maxsize=1)
+class CachedGitHubIssues:
+    """
+    This class caches the issues from the S3 bucket and returns the issue details.
+    The cache is populated by the `.github/workflows/cache-issues.yaml` workflow
+    every 6 hours.
+
+    Its main goal is to make sure we don't reach the rate limit of the GitHub API.
+    """
+
+    def __init__(self):
+        self.storage = KeyStore()
+
+    @lru_cache()
+    def get_repo_data(self, owner, repo):
+        scsv = self.storage.get_file_contents(f'issues/{owner}_{repo}.csv')
+        issues = {issue['id']: issue for issue in csv.DictReader(
+            scsv.decode().splitlines(), fieldnames=("id", "state", "labels"))}
+        issues = {issue['id']: issue | dict(
+            labels=[dict(name=label) for label in issue['labels'].strip().rstrip('|').split('|')]) for issue in issues.values()}
+        return issues
+
+    def get_issue(self, owner: str, repo_id: str, issue_id: str | int):
+        repo_issues_mapping = self.get_repo_data(owner, repo_id)
+        return repo_issues_mapping.get(str(issue_id))
+
+
 class SkipPerIssues:
     """
     instance of this class would return true, if one of the issue on the list is open
@@ -51,6 +80,8 @@ def github(cls):
         return cls._github
 
     def __init__(self, issues: list[str] | str, params: SCTConfiguration | dict):
+        self.cache = CachedGitHubIssues()
+
         self.params = params
 
         issues = [issues] if isinstance(issues, str) else issues
@@ -81,6 +112,18 @@ def get_issue_details(self, issue):
                                severity=Severity.WARNING,
                                trace=sys._getframe().f_back).publish()  # pylint: disable=protected-access
             return None
+        try:
+            if issue_details := self.cache.get_issue(owner=issue_parsed.user_id, repo_id=issue_parsed.repo_id, issue_id=issue_parsed.issue_id):
+                return github.Issue.Issue(requester=None, headers={},
+                                          attributes=dict(state=issue_details['state'].lower(),
+                                                          labels=issue_details['labels']),
+                                          completed=True)
+        except ClientError as exc:
+            logging.warning("failed to get issue: %s from s3 cache", issue)
+            TestFrameworkEvent(source=self.__class__.__name__,
+                               message=f"failed to get issue {issue} from s3 cache",
+                               severity=Severity.ERROR,
+                               exception=exc).publish()
         try:
             return self.github.get_repo(f'{issue_parsed.user_id}/{issue_parsed.repo_id}', lazy=True).get_issue(issue_parsed.issue_id)
         except UnknownObjectException:
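Note (illustration, not part of the patch): a minimal sketch of how a CSV row emitted by
the workflow's `gh issue list` template is turned into the mapping that
CachedGitHubIssues.get_issue() returns, mirroring the get_repo_data() logic above.
The sample row values ("1234,CLOSED,bug|triaged|") are made up for illustration.

    import csv

    # Hypothetical sample of one line from e.g. issues/scylladb_scylla-cluster-tests.csv,
    # in the "<number>,<STATE>,<label>|<label>|" shape the gh template produces
    sample = "1234,CLOSED,bug|triaged|\n"

    # Same parsing as CachedGitHubIssues.get_repo_data(): index rows by issue id...
    rows = csv.DictReader(sample.splitlines(), fieldnames=("id", "state", "labels"))
    issues = {row["id"]: row for row in rows}

    # ...then expand the pipe-separated labels field into GitHub-like label dicts
    issues = {
        issue["id"]: issue | dict(
            labels=[dict(name=label) for label in issue["labels"].strip().rstrip("|").split("|")])
        for issue in issues.values()
    }

    print(issues["1234"])
    # {'id': '1234', 'state': 'CLOSED', 'labels': [{'name': 'bug'}, {'name': 'triaged'}]}

This is the shape SkipPerIssues then wraps into a github.Issue.Issue object when the
S3 cache has an entry, avoiding a direct GitHub API call.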