Add repo info helper script #667

Merged
merged 21 commits on Dec 7, 2022
21 changes: 21 additions & 0 deletions repo-info/README.md
@@ -0,0 +1,21 @@
# Repo Info Helper

This script scans a single column of a given .csv file (it works on both `pr-data.csv` and `py-data.csv`) and outputs another .csv file with 3 columns:

* Repo URL
* Months since latest commit to master/main
* Number of stars

... sorted first in ascending order of months since the last commit to master/main, then in descending order of star count.

The latter two values help in shortlisting a project to fix flaky tests in: the chances of your PR getting accepted are higher for a repository that is actively maintained and has a high number of stars.
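The combined sort can be sketched with pandas on a few hypothetical rows (the URLs and numbers below are made up for illustration):

```python
import pandas as pd

# Hypothetical rows mimicking the output schema of repo_info.csv
df = pd.DataFrame({
    'REPO_URL': ['https://github.com/a/a', 'https://github.com/b/b', 'https://github.com/c/c'],
    'MONTHS_SINCE_LAST_COMMIT': [3, 0, 0],
    'STARS': [900, 150, 4000],
})

# Most recently active repos first; ties broken by star count, descending
df = df.sort_values(by=['MONTHS_SINCE_LAST_COMMIT', 'STARS'], ascending=[True, False])
print(df['REPO_URL'].tolist())
```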

## To run:

* Required libraries to install: `pandas`, `tqdm`, `pygithub`

* Requires a GitHub access token if more than 60 requests will be made (i.e. the file contains more than 60 unique repositories), which is highly likely: both `pr-data.csv` and `py-data.csv` contained 300+ unique repositories at the time of writing (Nov 2022).

* To run: `python3 get_repo_info.py -t <GITHUB_ACCESS_TOKEN> -f '<CSV_FILEPATH>' -c '<COLUMN_NAME_CONTAINING_REPO_URL>'`

The new file will be saved with the name `repo_info.csv` in the same directory as the script.
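The token requirement above follows from GitHub's documented core API rate limits (60 requests/hour unauthenticated, 5,000/hour with a token). A minimal sketch of that decision (the helper name `token_needed` is made up for illustration):

```python
# GitHub's documented core API rate limits (requests per hour)
UNAUTHENTICATED_LIMIT = 60
AUTHENTICATED_LIMIT = 5000

def token_needed(num_unique_repos):
    # The script makes roughly one API request per unique repository,
    # so any file with more than 60 unique repos needs a token.
    return num_unique_repos > UNAUTHENTICATED_LIMIT

print(token_needed(300))  # roughly the scale of pr-data.csv
print(token_needed(40))
```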
55 changes: 55 additions & 0 deletions repo-info/get_repo_info.py
@@ -0,0 +1,55 @@
import argparse
import datetime
import pandas as pd
from tqdm import tqdm
from github import Github

tqdm.pandas()

parser = argparse.ArgumentParser()
parser.add_argument('-t', '--github_access_token', help='GitHub access token to overcome API rate limitations')
parser.add_argument('-f', '--filepath', help='Filepath of .csv file containing repo data')
parser.add_argument('-c', '--colname', help='Column name in CSV file pertaining to repo URL')
args = parser.parse_args()

GITHUB_API_RATE_LIMIT = 5000
FILEPATH, COLNAME, GITHUB_ACCESS_TOKEN = args.filepath, args.colname, args.github_access_token
REPO_URLS = pd.read_csv(FILEPATH)[COLNAME].dropna().unique()
NUM_REPOS = REPO_URLS.shape[0]

def check_number_repos():
    if NUM_REPOS > GITHUB_API_RATE_LIMIT:
        print(f'You can only make {GITHUB_API_RATE_LIMIT} requests per hour. Your file has {NUM_REPOS} unique repositories. Exiting.')
        raise SystemExit(1)

def get_diff_month(d1, d2):
    # Whole calendar months between d1 and d2, e.g. Nov 2022 - Sep 2021 -> 14
    return (d1.year - d2.year) * 12 + d1.month - d2.month

# Reuse a single authenticated client instead of creating one per repository
GITHUB = Github(GITHUB_ACCESS_TOKEN)

def get_repo_object(repo_url):
    try:
        repo_name = repo_url.split('github.com/')[1]
        return GITHUB.get_repo(repo_name)
    except Exception:  # malformed URL, deleted repo, or API error
        return None

def get_months_since_last_commit(repo):
    try:
        # Use the repository's default branch, which handles both 'master' and 'main'
        branch = repo.get_branch(repo.default_branch)
        latest_commit_date = branch.commit.commit.author.date
        return get_diff_month(datetime.datetime.now(), latest_commit_date)
    except Exception:  # repo is None or branch/commit lookup failed
        return None

def get_maintained_repos():
    check_number_repos()
    print(f'Analyzing {NUM_REPOS} repositories...')
    df = pd.DataFrame()
    df['REPO_URL'] = REPO_URLS
    df['REPO_OBJECT'] = df['REPO_URL'].progress_apply(get_repo_object)
    df['MONTHS_SINCE_LAST_COMMIT'] = df['REPO_OBJECT'].progress_apply(get_months_since_last_commit)
    df['STARS'] = df['REPO_OBJECT'].progress_apply(lambda repo: repo.stargazers_count if repo is not None else None)
    df = df.sort_values(by=['MONTHS_SINCE_LAST_COMMIT', 'STARS'], ascending=[True, False]).drop(columns=['REPO_OBJECT'])
    df.to_csv('repo_info.csv', index=False)

if __name__ == '__main__':
    get_maintained_repos()