diff --git a/repo-info/README.md b/repo-info/README.md new file mode 100644 index 00000000..8abe6527 --- /dev/null +++ b/repo-info/README.md @@ -0,0 +1,21 @@ +# Repo Info Helper + +This script scans a given .csv file (works on both `pr-data.csv` and `py-data.csv`), and outputs another .csv file, with 3 columns: + +* Repo URL +* Months since latest commit to master/main +* Number of stars + +... which are sorted first in ascending order of months since the last commit to master/main and then, for repositories with the same number of months, in descending order of number of stars. + +The latter 2 values will help us in shortlisting a project to fix flaky tests in. The chances of your PR getting accepted are higher for a repository that is actively maintained and has a high number of stars. This script will only scan URLs that have an empty `Status` column. + +## To run: + +* Requires a GitHub access token if more than 60 requests are made. The script makes two API requests per repository, so this means more than 30 unique repositories in the file, which is highly likely, since both `pr-data.csv` and `py-data.csv` each contain 300+ unique repositories at the time of writing this (Nov 2022). + +* Following are the commands to run the script from the root directory. Remember to use a GitHub access token to overcome the rate limit: + * For `pr-data.csv`: `python repo-info/get_repo_info.py -f pr-data.csv -c 'Project URL' -t <github_access_token>` + * For `py-data.csv`: `python repo-info/get_repo_info.py -f py-data.csv -c 'Project URL' -t <github_access_token>` + +The new file will be saved with the name `repo-info.csv` inside the `repo-info` directory. 
"""Rank the GitHub repositories listed in a CSV file.

The script reads a CSV file (for example ``pr-data.csv`` or ``py-data.csv``),
keeps the rows whose ``Status`` column is empty, looks up every unique
repository URL through the GitHub API and writes
``repo-info/repo-info.csv`` (relative to the current working directory) with
the columns ``REPO_URL``, ``MONTHS_SINCE_LAST_COMMIT`` and ``STARS``.  Rows are
sorted by months since the last commit (ascending), then by stars
(descending).

Usage::

    python repo-info/get_repo_info.py -f pr-data.csv -c 'Project URL' -t TOKEN

Third-party requirements (as pinned in ``repo-info/requirements.txt``):
pandas==1.5.2, PyGithub==1.57, tqdm==4.64.1.
"""
import argparse
import datetime
import os
import sys

import pandas as pd

# Hourly GitHub API quota with and without an access token.
GITHUB_API_RATE_LIMIT = 5000
GITHUB_API_UNAUTHENTICATED_RATE_LIMIT = 60
# One request fetches the repository, a second one fetches its default branch.
REQUESTS_PER_REPO = 2

STATUS_COLUMN = 'Status'
OUTPUT_DIRNAME = 'repo-info'
OUTPUT_FILENAME = 'repo-info.csv'


def parse_args(argv=None):
    """Parse the command-line options.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with ``github_access_token``, ``filepath``
        and ``colname``.
    """
    parser = argparse.ArgumentParser(
        description='Rank the repositories of a CSV file by activity and stars.')
    parser.add_argument('-t', '--github_access_token',
                        help='GitHub access token to overcome API rate limitations')
    parser.add_argument('-f', '--filepath', required=True,
                        help='Filepath of .csv file containing repo data')
    parser.add_argument('-c', '--colname', required=True,
                        help='Column name in CSV file pertaining to repo URL')
    return parser.parse_args(argv)


def load_repo_urls(filepath, colname):
    """Return the unique repository URLs of the rows that have no status yet.

    Args:
        filepath: Path of the CSV file to read.
        colname: Name of the column holding the repository URL.

    Returns:
        Array of unique, non-missing values of ``colname`` taken from the rows
        whose ``Status`` cell is empty.

    Raises:
        SystemExit: If the file lacks the ``Status`` or the URL column.
    """
    data = pd.read_csv(filepath)
    missing = [col for col in (STATUS_COLUMN, colname) if col not in data.columns]
    if missing:
        sys.exit(f'{filepath} has no column named: {", ".join(missing)}')
    pending = data[data[STATUS_COLUMN].isna()]
    # Rows without a URL cannot be looked up, so drop them up front.
    return pending[colname].dropna().unique()


def check_number_repos(num_repos, token=None):
    """Abort when analysing ``num_repos`` repositories would exceed the quota.

    Args:
        num_repos: Number of repositories that are going to be queried.
        token: GitHub access token, or ``None`` for unauthenticated access.

    Raises:
        SystemExit: With a non-zero status if the required number of requests
            is above the hourly limit that applies.
    """
    if token:
        limit, who = GITHUB_API_RATE_LIMIT, 'with an access token'
    else:
        limit, who = GITHUB_API_UNAUTHENTICATED_RATE_LIMIT, 'without an access token'
    needed = num_repos * REQUESTS_PER_REPO
    if needed > limit:
        sys.exit(
            f'You can only make {limit} requests per hour {who}. '
            f'Your file has {num_repos} unique repositories, '
            f'which need {needed} requests. Exiting.'
        )


def get_diff_month(d1, d2):
    """Return the number of calendar months from ``d2`` to ``d1``.

    Only the year and month fields are used; the day of the month is ignored.
    """
    return (d1.year - d2.year) * 12 + d1.month - d2.month


def get_repo_name(repo_url):
    """Return the ``owner/name`` part of a GitHub repository URL.

    A trailing slash, a ``.git`` suffix and any path below the repository
    (such as ``/tree/master``) are ignored.

    Raises:
        ValueError: If ``repo_url`` does not contain ``github.com/`` followed
            by an owner and a repository name.
    """
    _, marker, path = repo_url.partition('github.com/')
    parts = [part for part in path.split('/') if part]
    if not marker or len(parts) < 2:
        raise ValueError(f'Not a GitHub repository URL: {repo_url!r}')
    owner, name = parts[:2]
    if name.endswith('.git'):
        name = name[:-len('.git')]
    return f'{owner}/{name}'


def make_github_client(token=None):
    """Create a PyGithub client; ``token=None`` gives unauthenticated access."""
    # Imported here so the pure helpers of this module stay importable on a
    # machine that does not have PyGithub installed.
    from github import Github
    return Github(token)


def get_repo_object(repo_url, client=None):
    """Fetch the PyGithub repository object for ``repo_url``.

    Args:
        repo_url: URL of the repository on github.com.
        client: PyGithub client to use; an unauthenticated one is created when
            omitted.  Pass a shared client to avoid building one per call.

    Returns:
        The repository object, or ``None`` if the lookup failed.
    """
    if client is None:
        client = make_github_client()
    try:
        return client.get_repo(get_repo_name(repo_url))
    except Exception as e:
        # Best effort: one unreachable repository must not abort the whole run.
        print(f'{repo_url}: {e}')
        return None


def get_months_since_last_commit(repo, now=None):
    """Return the months elapsed since the last commit on the default branch.

    Args:
        repo: Repository object as returned by ``get_repo_object``, or
            ``None``.
        now: Reference time; defaults to the current time in UTC.

    Returns:
        The month difference, or ``None`` if ``repo`` is ``None`` or the
        branch could not be fetched.
    """
    if repo is None:
        return None
    try:
        branch = repo.get_branch(repo.default_branch)
        latest_commit_date = branch.commit.commit.author.date
    except Exception as e:
        # Best effort, same as in get_repo_object.
        print(e)
        return None
    utc = datetime.timezone.utc
    # NOTE(review): a naive date is assumed to already be in UTC, which is what
    # PyGithub 1.x appears to return -- confirm when upgrading PyGithub.
    if latest_commit_date.tzinfo is not None:
        latest_commit_date = latest_commit_date.astimezone(utc)
    if now is None:
        now = datetime.datetime.now(tz=utc)
    return get_diff_month(now, latest_commit_date)


def sort_repo_info(df):
    """Return ``df`` ordered for the report, without the helper columns.

    Rows are sorted by ``MONTHS_SINCE_LAST_COMMIT`` ascending and, within the
    same number of months, by ``STARS`` descending.
    """
    ordered = df.sort_values(by=['MONTHS_SINCE_LAST_COMMIT', 'STARS'],
                             ascending=[True, False])
    return ordered.drop(columns=['REPO_OBJECT', 'Unnamed: 0'], errors='ignore')


def get_maintained_repos(argv=None):
    """Run the whole analysis and write ``repo-info/repo-info.csv``.

    Args:
        argv: Command-line arguments; ``None`` means ``sys.argv[1:]``.
    """
    # Local import for the same reason as in make_github_client.
    from tqdm import tqdm

    args = parse_args(argv)
    repo_urls = load_repo_urls(args.filepath, args.colname)
    num_repos = len(repo_urls)
    check_number_repos(num_repos, args.github_access_token)
    print(f'Analyzing {num_repos} repositories...')

    tqdm.pandas()  # adds ``progress_apply`` to pandas objects
    client = make_github_client(args.github_access_token)

    df = pd.DataFrame({'REPO_URL': repo_urls})
    df['REPO_OBJECT'] = df['REPO_URL'].progress_apply(
        lambda url: get_repo_object(url, client))
    df['MONTHS_SINCE_LAST_COMMIT'] = df['REPO_OBJECT'].progress_apply(
        get_months_since_last_commit)
    df['STARS'] = df['REPO_OBJECT'].progress_apply(
        lambda repo: None if repo is None else repo.stargazers_count)

    output_dir = os.path.join(os.getcwd(), OUTPUT_DIRNAME)
    # Create the directory so a wrong working directory does not waste the
    # API calls that were just made.
    os.makedirs(output_dir, exist_ok=True)
    sort_repo_info(df).to_csv(os.path.join(output_dir, OUTPUT_FILENAME), index=False)


if __name__ == '__main__':
    get_maintained_repos()