From 84553b70a583db739bc84a47777925e05bf575b5 Mon Sep 17 00:00:00 2001
From: FeBalla
Date: Thu, 28 Apr 2022 01:50:38 -0400
Subject: [PATCH 1/3] feat: adds a first functional collector for repositories
 in a github organization via the github api

---
 .gitignore     |  3 +++
 api_handler.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 logger.py      | 43 +++++++++++++++++++++++++++++++++++++++++++
 main.py        | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 146 insertions(+)
 create mode 100644 api_handler.py
 create mode 100644 logger.py
 create mode 100644 main.py

diff --git a/.gitignore b/.gitignore
index b6e4761..9492099 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+.env
+logs
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/api_handler.py b/api_handler.py
new file mode 100644
index 0000000..03a5dbe
--- /dev/null
+++ b/api_handler.py
@@ -0,0 +1,50 @@
+from datetime import datetime, timedelta
+from dotenv import dotenv_values
+import requests
+
+config = dotenv_values(".env")
+
+BASE_URL = "https://github.com"
+API_URL = "https://api.github.com"
+
+# The last commit date returned by GitHub API is in UTC timezone, so you can adjust it to another
+# timezone using this parameter. For GMT-4 the difference in april is around 4 hours. This is used
+# just for logs and does not affect anything else.
+TIME_DIFF_TO_UTC = timedelta(hours=4)
+
+
+def get_assignment_repos_names(REPOSITORY_PREFIX="", PER_PAGE=100, TOTAL_PAGES=2):
+    '''Returns a set with the name of all private repositories from a organization that starts with a
+    specific prefix'''
+    assignment_repos = set()
+
+    for page in range(TOTAL_PAGES):
+        url = f"{API_URL}/orgs/{config['ORGA']}/repos?type=private&per_page={PER_PAGE}&page={page + 1}"
+        org_repos = requests.get(url, auth=(config["USER"], config["TOKEN"]))
+
+        for repo in org_repos.json():
+            if repo["name"].startswith(REPOSITORY_PREFIX):
+                assignment_repos.add(repo["name"])
+
+    return list(assignment_repos)
+
+
+def get_repository_information(repository_name, REPOSITORY_PREFIX=""):
+    '''Gets the repository information by name, including the last commit sha, the last commit date
+    and the standard url for cloning'''
+    url = f"{API_URL}/repos/{config['ORGA']}/{repository_name}/commits"
+    data = requests.get(url, auth=(config["USER"], config["TOKEN"])).json()[0]
+
+    commit_date = datetime.strptime(data["commit"]["author"]["date"], "%Y-%m-%dT%H:%M:%SZ")
+    commit_date -= TIME_DIFF_TO_UTC
+    commit_date = commit_date.strftime("%d/%m/%Y-%H:%M")
+
+    repository_info = {
+        "name": repository_name,
+        "clone_url": f"{BASE_URL}/{config['ORGA']}/{repository_name}.git",
+        "last_commit_sha": data["sha"],
+        "last_commit_author": repository_name.replace(f"{REPOSITORY_PREFIX}-", ""),
+        "last_commit_date": commit_date,
+    }
+
+    return repository_info
diff --git a/logger.py b/logger.py
new file mode 100644
index 0000000..9c05268
--- /dev/null
+++ b/logger.py
@@ -0,0 +1,43 @@
+from datetime import datetime
+import os
+
+
+class Logger:
+    def __init__(self, folder_name="logs", encoding="utf-8"):
+        self.folder_name = folder_name
+
+        if not os.path.exists(self.folder_name):
+            os.makedirs(self.folder_name)
+
+        self.repositories_data_file = open(f"{self.folder_name}/repos.txt", "w", encoding=encoding)
+        print("Repositories expected to clone:", file=self.repositories_data_file)
+
+        self.runtime_logs_file = open(f"{self.folder_name}/runtime.txt", "w", encoding=encoding)
+        print("Runtime logs:", file=self.runtime_logs_file)
+
+    def save_repositories_data(self, repositories_data):
+        '''Writes a log-file with all the information of the repositories that will be cloned'''
+        repositories_data = sorted(repositories_data, key=lambda repo: repo['last_commit_date'])
+
+        for repo in repositories_data:
+            print(repo["name"], end=" ", file=self.repositories_data_file)
+            print(repo["last_commit_sha"], end=" ", file=self.repositories_data_file)
+            print(repo["last_commit_author"], end=" ", file=self.repositories_data_file)
+            print(repo["last_commit_date"], file=self.repositories_data_file)
+
+        self.repositories_data_file.flush()
+        os.fsync(self.repositories_data_file.fileno())
+
+    def write_runtime_log(self, msg):
+        '''Writes a runtime-log (msg) in the log-file with the current time'''
+        now = datetime.now()
+        current_time = now.strftime("%H:%M:%S")
+
+        print(f"({current_time}){msg}", file=self.runtime_logs_file)
+        self.runtime_logs_file.flush()
+        os.fsync(self.runtime_logs_file.fileno())
+
+    def finalize(self):
+        '''Ends the logger handler, saving and closing the used files'''
+        self.runtime_logs_file.close()
+        self.repositories_data_file.close()
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..db78d41
--- /dev/null
+++ b/main.py
@@ -0,0 +1,50 @@
+from github_api_handler import get_assignment_repos_names, get_repository_information
+from dotenv import dotenv_values
+from git import Repo, rmtree
+from logger import Logger
+import os
+
+config = dotenv_values(".env")
+
+BASE_SAVE_DIR = f"../{config['PREFIX']}"
+PER_PAGE = 100
+PAGES = 3
+
+
+if __name__ == "__main__":
+    cloned = 0
+    failed = 0
+
+    print("(1/6) Initializing")
+    logger = Logger()
+
+    print(f"(2/6) Getting repositories names from {config['ORGA']} with prefix: {config['PREFIX']}")
+    repo_names = get_assignment_repos_names(config["PREFIX"], PER_PAGE, PAGES)
+
+    print("(3/6) Getting repositories data")
+    repositories_data = [get_repository_information(name, config["PREFIX"]) for name in repo_names]
+
+    print("(4/6) Saving repositories data")
+    logger.save_repositories_data(repositories_data)
+
+    if not os.path.exists(BASE_SAVE_DIR):
+        os.makedirs(BASE_SAVE_DIR)
+
+    print("(5/6) Cloning repositories")
+    for repo in repositories_data:
+        try:
+            cloned_repo = Repo.clone_from(repo["clone_url"], f"{BASE_SAVE_DIR}/{repo['name']}", no_checkout=True)
+            cloned_repo.git.checkout(repo["last_commit_sha"])
+            rmtree(f"{BASE_SAVE_DIR}/{repo['name']}/.git")
+
+            logger.write_runtime_log(f"Cloned succesfully: {repo['name']}")
+            cloned += 1
+        except:
+            logger.write_runtime_log(f"Couldn't clone: {repo['name']}")
+            failed += 1
+
+    logger.finalize()
+
+    print("(6/6) Process completed")
+    print(f"- Cloned: {cloned}")
+    print(f"- Failed: {failed}")
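
For context before the refactor in the next patch: `api_handler.py` above reads `USER`, `TOKEN` and `ORGA` from a `.env` file (and `main.py` additionally reads `PREFIX`), and it relies on two GitHub REST endpoints. Below is a minimal sketch of those calls and of the response fields the script actually uses; the credentials and organization name are placeholders mirroring the `example.env` added in the next patch, not values taken from this one.

```python
# Illustrative sketch only: placeholder credentials and organization name.
import requests

AUTH = ("MyUser", "my-personal-access-token")  # USER / TOKEN are read from .env in the patch

# List the organization's private repositories (the endpoint is paginated).
repos = requests.get(
    "https://api.github.com/orgs/My-Organization/repos",
    params={"type": "private", "per_page": 100, "page": 1},
    auth=AUTH,
).json()
names = [repo["name"] for repo in repos]  # only "name" is used by get_assignment_repos_names

# List the commits of one repository; the first entry is the most recent commit.
commits = requests.get(
    f"https://api.github.com/repos/My-Organization/{names[0]}/commits",
    auth=AUTH,
).json()
print(commits[0]["sha"], commits[0]["commit"]["author"]["date"])  # fields used for the logs
```
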
From 9f9b52e01a044be6ff10b951e83bb476d97cfe99 Mon Sep 17 00:00:00 2001
From: FeBalla
Date: Tue, 16 Aug 2022 15:54:13 -0400
Subject: [PATCH 2/3] feat: support for different OS; fix: last commit date
 according to local time; (general refactor)

---
 api_handler.py   | 50 ------------------------------------------------
 example.env      |  6 ++++++
 logger.py        | 18 ++++++++++--------
 main.py          | 36 ++++++++++++++--------------------
 requirements.txt |  4 ++++
 utilities.py     | 49 +++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 85 insertions(+), 78 deletions(-)
 delete mode 100644 api_handler.py
 create mode 100644 example.env
 create mode 100644 requirements.txt
 create mode 100644 utilities.py

diff --git a/api_handler.py b/api_handler.py
deleted file mode 100644
index 03a5dbe..0000000
--- a/api_handler.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from datetime import datetime, timedelta
-from dotenv import dotenv_values
-import requests
-
-config = dotenv_values(".env")
-
-BASE_URL = "https://github.com"
-API_URL = "https://api.github.com"
-
-# The last commit date returned by GitHub API is in UTC timezone, so you can adjust it to another
-# timezone using this parameter. For GMT-4 the difference in april is around 4 hours. This is used
-# just for logs and does not affect anything else.
-TIME_DIFF_TO_UTC = timedelta(hours=4)
-
-
-def get_assignment_repos_names(REPOSITORY_PREFIX="", PER_PAGE=100, TOTAL_PAGES=2):
-    '''Returns a set with the name of all private repositories from a organization that starts with a
-    specific prefix'''
-    assignment_repos = set()
-
-    for page in range(TOTAL_PAGES):
-        url = f"{API_URL}/orgs/{config['ORGA']}/repos?type=private&per_page={PER_PAGE}&page={page + 1}"
-        org_repos = requests.get(url, auth=(config["USER"], config["TOKEN"]))
-
-        for repo in org_repos.json():
-            if repo["name"].startswith(REPOSITORY_PREFIX):
-                assignment_repos.add(repo["name"])
-
-    return list(assignment_repos)
-
-
-def get_repository_information(repository_name, REPOSITORY_PREFIX=""):
-    '''Gets the repository information by name, including the last commit sha, the last commit date
-    and the standard url for cloning'''
-    url = f"{API_URL}/repos/{config['ORGA']}/{repository_name}/commits"
-    data = requests.get(url, auth=(config["USER"], config["TOKEN"])).json()[0]
-
-    commit_date = datetime.strptime(data["commit"]["author"]["date"], "%Y-%m-%dT%H:%M:%SZ")
-    commit_date -= TIME_DIFF_TO_UTC
-    commit_date = commit_date.strftime("%d/%m/%Y-%H:%M")
-
-    repository_info = {
-        "name": repository_name,
-        "clone_url": f"{BASE_URL}/{config['ORGA']}/{repository_name}.git",
-        "last_commit_sha": data["sha"],
-        "last_commit_author": repository_name.replace(f"{REPOSITORY_PREFIX}-", ""),
-        "last_commit_date": commit_date,
-    }
-
-    return repository_info
diff --git a/example.env b/example.env
new file mode 100644
index 0000000..344e3dd
--- /dev/null
+++ b/example.env
@@ -0,0 +1,6 @@
+PREFIX = "my-assignment"
+TOKEN = "my-personal-access-token"
+USER = "MyUser"
+ORGA = "My-Organization"
+PER_PAGE = 100
+PAGES = 1
\ No newline at end of file
diff --git a/logger.py b/logger.py
index 9c05268..eee973f 100644
--- a/logger.py
+++ b/logger.py
@@ -1,24 +1,26 @@
-from datetime import datetime
 import os
+from datetime import datetime
 
 
 class Logger:
+    '''Handle execution logs (repositories list, runtime messages, etc.)'''
+
     def __init__(self, folder_name="logs", encoding="utf-8"):
         self.folder_name = folder_name
 
         if not os.path.exists(self.folder_name):
             os.makedirs(self.folder_name)
 
-        self.repositories_data_file = open(f"{self.folder_name}/repos.txt", "w", encoding=encoding)
+        self.repositories_data_file = open(os.path.join(self.folder_name, "repos.txt"), "w", encoding=encoding)
         print("Repositories expected to clone:", file=self.repositories_data_file)
 
-        self.runtime_logs_file = open(f"{self.folder_name}/runtime.txt", "w", encoding=encoding)
+        self.runtime_logs_file = open(os.path.join(self.folder_name, "runtime.txt"), "w", encoding=encoding)
         print("Runtime logs:", file=self.runtime_logs_file)
 
     def save_repositories_data(self, repositories_data):
         '''Writes a log-file with all the information of the repositories that will be cloned'''
-        repositories_data = sorted(repositories_data, key=lambda repo: repo['last_commit_date'])
-
+        repositories_data = sorted(repositories_data, key=lambda repo: repo["last_commit_date"])
+
         for repo in repositories_data:
             print(repo["name"], end=" ", file=self.repositories_data_file)
             print(repo["last_commit_sha"], end=" ", file=self.repositories_data_file)
@@ -29,15 +31,15 @@ def save_repositories_data(self, repositories_data):
         os.fsync(self.repositories_data_file.fileno())
 
     def write_runtime_log(self, msg):
-        '''Writes a runtime-log (msg) in the log-file with the current time'''
+        '''Writes a runtime-log message in the log-file with the current time'''
         now = datetime.now()
         current_time = now.strftime("%H:%M:%S")
 
-        print(f"({current_time}){msg}", file=self.runtime_logs_file)
+        print(f"({current_time}) {msg}", file=self.runtime_logs_file)
         self.runtime_logs_file.flush()
         os.fsync(self.runtime_logs_file.fileno())
 
     def finalize(self):
-        '''Ends the logger handler, saving and closing the used files'''
+        '''Ends the logger handler instance, saving and closing the used files'''
         self.runtime_logs_file.close()
         self.repositories_data_file.close()
diff --git a/main.py b/main.py
index db78d41..276fed2 100644
--- a/main.py
+++ b/main.py
@@ -1,25 +1,21 @@
-from github_api_handler import get_assignment_repos_names, get_repository_information
+import os
 from dotenv import dotenv_values
 from git import Repo, rmtree
+from utilities import get_assignment_repos_names, get_repository_information
 from logger import Logger
-import os
 
 config = dotenv_values(".env")
-
-BASE_SAVE_DIR = f"../{config['PREFIX']}"
-PER_PAGE = 100
-PAGES = 3
-
+SAVE_DIR = os.path.join("..", config["PREFIX"])
 
 if __name__ == "__main__":
-    cloned = 0
-    failed = 0
+    CLONED = 0
+    FAILED = 0
 
     print("(1/6) Initializing")
     logger = Logger()
 
-    print(f"(2/6) Getting repositories names from {config['ORGA']} with prefix: {config['PREFIX']}")
-    repo_names = get_assignment_repos_names(config["PREFIX"], PER_PAGE, PAGES)
+    print(f"(2/6) Getting repositories list from {config['ORGA']} with prefix: {config['PREFIX']}")
+    repo_names = get_assignment_repos_names(config["PREFIX"], int(config["PER_PAGE"]), int(config["PAGES"]))
 
     print("(3/6) Getting repositories data")
     repositories_data = [get_repository_information(name, config["PREFIX"]) for name in repo_names]
@@ -27,24 +23,24 @@
     print("(4/6) Saving repositories data")
     logger.save_repositories_data(repositories_data)
 
-    if not os.path.exists(BASE_SAVE_DIR):
-        os.makedirs(BASE_SAVE_DIR)
+    if not os.path.exists(SAVE_DIR):
+        os.makedirs(SAVE_DIR)
 
     print("(5/6) Cloning repositories")
    for repo in repositories_data:
         try:
-            cloned_repo = Repo.clone_from(repo["clone_url"], f"{BASE_SAVE_DIR}/{repo['name']}", no_checkout=True)
+            cloned_repo = Repo.clone_from(repo["clone_url"], os.path.join(SAVE_DIR, repo["name"]), no_checkout=True)
             cloned_repo.git.checkout(repo["last_commit_sha"])
-            rmtree(f"{BASE_SAVE_DIR}/{repo['name']}/.git")
+            rmtree(os.path.join(SAVE_DIR, repo["name"], ".git"))
 
             logger.write_runtime_log(f"Cloned succesfully: {repo['name']}")
-            cloned += 1
-        except:
+            CLONED += 1
+        except Exception:
             logger.write_runtime_log(f"Couldn't clone: {repo['name']}")
-            failed += 1
+            FAILED += 1
 
     logger.finalize()
 
     print("(6/6) Process completed")
-    print(f"- Cloned: {cloned}")
-    print(f"- Failed: {failed}")
+    print(f"- Cloned: {CLONED}")
+    print(f"- Failed: {FAILED}")
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..fcf5d8a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+GitPython==3.1.27
+python-dotenv==0.20.0
+python_dateutil==2.8.2
+requests==2.26.0
diff --git a/utilities.py b/utilities.py
new file mode 100644
index 0000000..0352ecc
--- /dev/null
+++ b/utilities.py
@@ -0,0 +1,49 @@
+from datetime import datetime
+from dotenv import dotenv_values
+from dateutil import tz
+import requests
+
+config = dotenv_values(".env")
+BASE_URL = "https://github.com"
+API_URL = "https://api.github.com"
+
+
+def get_assignment_repos_names(repository_prefix="", per_page=100, total_pages=2):
+    '''Gets names of all private repositories from an organization starting with `repository_prefix`'''
+    assignment_repos = set()
+
+    for page in range(total_pages):
+        url = f"{API_URL}/orgs/{config['ORGA']}/repos?type=private&per_page={per_page}&page={page + 1}"
+        org_repos = requests.get(url, auth=(config["USER"], config["TOKEN"]))
+
+        for repo in org_repos.json():
+            if repo["name"].startswith(repository_prefix):
+                assignment_repos.add(repo["name"])
+
+    return list(assignment_repos)
+
+
+def get_last_commit_date(commit_data):
+    '''Gets the last commit date in local time (GitHub API returns it in UTC)'''
+    commit_date = datetime.strptime(commit_data["commit"]["author"]["date"], "%Y-%m-%dT%H:%M:%SZ")
+    commit_date = commit_date.replace(tzinfo=tz.tzutc())
+    commit_date = commit_date.astimezone(tz.tzlocal())
+    commit_date = commit_date.strftime("%d/%m/%Y-%H:%M")
+
+    return commit_date
+
+
+def get_repository_information(repository_name, repository_prefix=""):
+    '''Gets information for a specific repository'''
+    url = f"{API_URL}/repos/{config['ORGA']}/{repository_name}/commits"
+    data = requests.get(url, auth=(config["USER"], config["TOKEN"])).json()[0]
+
+    repository_info = {
+        "name": repository_name,
+        "clone_url": f"{BASE_URL}/{config['ORGA']}/{repository_name}.git",
+        "last_commit_sha": data["sha"],
+        "last_commit_author": repository_name.replace(f"{repository_prefix}-", ""),
+        "last_commit_date": get_last_commit_date(data),
+    }
+
+    return repository_info
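
The timezone fix in this patch swaps the hard-coded `TIME_DIFF_TO_UTC` offset for a real conversion through `dateutil`. A minimal sketch of what `get_last_commit_date` does with an API timestamp (the date value below is a made-up example):

```python
# Illustrative only: mirrors the steps of utilities.get_last_commit_date.
from datetime import datetime
from dateutil import tz

api_date = "2022-08-16T19:54:13Z"                     # GitHub reports commit dates in UTC
commit_date = datetime.strptime(api_date, "%Y-%m-%dT%H:%M:%SZ")
commit_date = commit_date.replace(tzinfo=tz.tzutc())  # tag the naive datetime as UTC
commit_date = commit_date.astimezone(tz.tzlocal())    # convert to the machine's local timezone
print(commit_date.strftime("%d/%m/%Y-%H:%M"))         # e.g. "16/08/2022-15:54" on a GMT-4 machine
```
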
From 69826576a97e564ff119ccb2bb52ff1305f35f2e Mon Sep 17 00:00:00 2001
From: Fernando Balladares C <48292394+FeBalla@users.noreply.github.com>
Date: Tue, 16 Aug 2022 17:01:10 -0400
Subject: [PATCH 3/3] Update README.md

Signed-off-by: Fernando Balladares C. <48292394+FeBalla@users.noreply.github.com>
---
 README.md | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index bfb789f..bb87af4 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,28 @@
-# github-repo-collecter
-Script to collect all repositories from a GitHub organization via GitHub API
+# GitHub Classroom Collector
+It's a simple Python script to collect all **private** repositories from a GitHub organization. It is intended to be used for downloading all the submissions of a GitHub Classroom assignment, along with information such as the last commit date.
+
+## How to use?
+1. First, install all the dependencies specified in `requirements.txt`. You can use:
+```bash
+pip install -r requirements.txt
+```
+
+2. Next, create a `.env` file like [example.env](./example.env):
+- `PREFIX`: Sets a prefix for the repository names. This is useful to download only the repositories from a specific assignment.
+- `TOKEN`: Here you need to use a [personal access token](https://docs.github.com/es/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) for your GitHub account.
+- `USER`: Sets the username of your GitHub account.
+- `ORGA`: Sets the name of the organization where the repositories are located.
+- `PER_PAGE`: Represents how many repositories are fetched per API call (the API is paginated). You can use 100 as a default.
+- `PAGES`: Represents how many pages of size _PER_PAGE_ will be requested from the GitHub API. For example, if you have 500 repositories and _PER_PAGE=100_, then _PAGES_ should be 5. However, if you don't want to overthink it, just set a higher value and the script will only take a little longer.
+
+3. Run the [main.py](./main.py) module and wait until all the repositories are downloaded.
+
+## Logs
+Once the execution starts, a `logs` directory with two files will be created:
+- `repos.txt`: Contains the information of the repositories that will be cloned. Each line has the following format:
+```txt
+repository-name last-commit-sha last-commit-author last-commit-date
+```
+
+- `runtime.txt`: Contains the runtime logs with the clone result for each repository.
+
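
Since `repos.txt` is plain space-separated text in the format shown above, it is easy to post-process. A small sketch, assuming the default `logs` folder and the header line that `Logger` writes:

```python
# Illustrative only: read logs/repos.txt back into dictionaries.
with open("logs/repos.txt", encoding="utf-8") as log_file:
    next(log_file)  # skip the "Repositories expected to clone:" header line
    repositories = [
        dict(zip(("name", "last_commit_sha", "last_commit_author", "last_commit_date"), line.split()))
        for line in log_file
        if line.strip()
    ]

print(f"{len(repositories)} repositories logged")
```
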