diff --git a/.gitignore b/.gitignore index b6e4761..9492099 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +.env +logs + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index bfb789f..bb87af4 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,28 @@ -# github-repo-collecter -Script to collect all repositories from a GitHub organization via GitHub API +# GitHub Classroom Collector +It's a simple Python script to collect all **private** repositories from a GitHub Organization. It is intended to be used for downloading all the submissions of a GitHub Classroom assignment, along with information such as the last commit date. + +## How to use? +1. First of all, you need to install all the dependencies specified in `requirements.txt`. You can use: +```bash +pip install -r requirements.txt +``` + +2. Now you need to create a `.env` file like [example.env](./example.env): +- `PREFIX`: Sets a prefix for the repository names. This is useful for downloading only the repositories of a specific assignment. +- `TOKEN`: Here you need to use a [personal access token](https://docs.github.com/es/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) for your GitHub account. +- `USER`: Sets the username of your GitHub account. +- `ORGA`: Sets the organization's name where the repositories are located. +- `PER_PAGE`: Represents how many repositories you want to get from a single API call (the API is paginated). You can use 100 as default. +- `PAGES`: Represents how many pages of size _PER_PAGE_ will be requested from the GitHub API. For example, if you have 500 repositories and _PER_PAGE=100_, then _PAGES_ should be 5. However, if you don't want to overthink it, just set a higher value and it will only take a little longer. + +3. Run the [main.py](./main.py) module and wait until all the repositories are downloaded. 
class Logger:
    '''Handle execution logs (repositories list, runtime messages, etc.)

    On creation the logger makes sure `folder_name` exists and opens two text
    files inside it, truncating any previous content:
    - `repos.txt`: one line per repository that is expected to be cloned.
    - `runtime.txt`: timestamped messages written while the program runs.

    Call `finalize()` when done, or use the instance as a context manager so
    both files are closed even if the caller raises.
    '''

    # Format of the `last_commit_date` strings (day first). Needed because
    # comparing those strings directly does not give chronological order.
    DATE_FORMAT = "%d/%m/%Y-%H:%M"

    def __init__(self, folder_name="logs", encoding="utf-8"):
        '''Creates the log folder (if missing) and opens both log files for writing'''
        self.folder_name = folder_name

        # exist_ok avoids the race between checking for the folder and creating it
        os.makedirs(self.folder_name, exist_ok=True)

        self.repositories_data_file = open(os.path.join(self.folder_name, "repos.txt"), "w", encoding=encoding)
        try:
            print("Repositories expected to clone:", file=self.repositories_data_file)
            self.runtime_logs_file = open(os.path.join(self.folder_name, "runtime.txt"), "w", encoding=encoding)
        except Exception:
            # Don't leak the first handle when the second file can't be opened
            self.repositories_data_file.close()
            raise

        print("Runtime logs:", file=self.runtime_logs_file)

    def __enter__(self):
        '''Allows `with Logger() as logger:` usage'''
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        '''Closes the log files when leaving the `with` block; exceptions are not suppressed'''
        self.finalize()
        return False

    @classmethod
    def _chronological_key(cls, repo):
        '''Builds a sort key that orders repositories by their real commit date

        Dates that don't match `DATE_FORMAT` are placed after the valid ones and
        ordered by their text, so an unexpected value never aborts the logging.
        '''
        raw_date = repo["last_commit_date"]

        try:
            return (0, datetime.strptime(raw_date, cls.DATE_FORMAT), "")
        except (TypeError, ValueError):
            return (1, datetime.min, str(raw_date))

    def save_repositories_data(self, repositories_data):
        '''Writes a log-file with all the information of the repositories that will be cloned

        Each item must provide the keys `name`, `last_commit_sha`,
        `last_commit_author` and `last_commit_date`. Lines are written from the
        oldest to the newest last commit, one repository per line:
        `name sha author date`.
        '''
        repositories_data = sorted(repositories_data, key=self._chronological_key)

        for repo in repositories_data:
            print(
                repo["name"],
                repo["last_commit_sha"],
                repo["last_commit_author"],
                repo["last_commit_date"],
                file=self.repositories_data_file,
            )

        # Push the data to disk right away so the list survives a later crash
        self.repositories_data_file.flush()
        os.fsync(self.repositories_data_file.fileno())

    def write_runtime_log(self, msg):
        '''Writes a runtime-log message in the log-file with the current time'''
        current_time = datetime.now().strftime("%H:%M:%S")

        print(f"({current_time}) {msg}", file=self.runtime_logs_file)

        # Flushed after every message so the log is usable while cloning is still running
        self.runtime_logs_file.flush()
        os.fsync(self.runtime_logs_file.fileno())

    def finalize(self):
        '''Ends the logger handler instance, saving and closing the used files

        Safe to call more than once: closing an already closed file is a no-op.
        '''
        self.runtime_logs_file.close()
        self.repositories_data_file.close()
def get_assignment_repos_names(repository_prefix="", per_page=100, total_pages=2, timeout=30):
    '''Gets names of all private repositories from a organization starting with `repository_prefix`

    Requests up to `total_pages` pages of `per_page` repositories each and stops
    early as soon as a page comes back empty. `timeout` is the number of seconds
    to wait for each request.

    Returns the matching names as an alphabetically sorted list, so the result
    is the same on every run.

    Raises `requests.HTTPError` when GitHub answers with an error status (for
    example bad credentials), instead of failing later on the error payload.
    '''
    assignment_repos = set()

    for page in range(1, total_pages + 1):
        url = f"{API_URL}/orgs/{config['ORGA']}/repos?type=private&per_page={per_page}&page={page}"
        response = requests.get(url, auth=(config["USER"], config["TOKEN"]), timeout=timeout)
        response.raise_for_status()

        org_repos = response.json()
        if not org_repos:
            # An empty page means there is nothing left to list
            break

        assignment_repos.update(
            repo["name"] for repo in org_repos if repo["name"].startswith(repository_prefix)
        )

    return sorted(assignment_repos)


def get_last_commit_date(commit_data):
    '''Gets the last commit date in local time (GitHub API returns it in UTC)

    `commit_data` is one commit object; the value read is
    `commit_data["commit"]["author"]["date"]`, expected as `YYYY-MM-DDTHH:MM:SSZ`.

    Returns the date converted to the system local time zone, formatted as
    `dd/mm/YYYY-HH:MM`.
    '''
    # Local import: only `datetime` itself is imported at the top of the module
    from datetime import timezone

    commit_date = datetime.strptime(commit_data["commit"]["author"]["date"], "%Y-%m-%dT%H:%M:%SZ")

    # The parsed value is naive, so tag it as UTC first; astimezone() without
    # arguments then converts it to the system local time zone
    local_date = commit_date.replace(tzinfo=timezone.utc).astimezone()

    return local_date.strftime("%d/%m/%Y-%H:%M")


def get_repository_information(repository_name, repository_prefix="", timeout=30):
    '''Gets information for a specific repository

    Asks GitHub for the newest commit of `repository_name` and returns a dict
    with the keys `name`, `clone_url`, `last_commit_sha`, `last_commit_author`
    and `last_commit_date`. The author is taken from the repository name by
    dropping the leading `<repository_prefix>-` part. `timeout` is the number
    of seconds to wait for the request.

    Raises `requests.HTTPError` when GitHub answers with an error status and
    `ValueError` when the repository has no commits.
    '''
    # Only the first commit of the list is used, so ask for a single one
    url = f"{API_URL}/repos/{config['ORGA']}/{repository_name}/commits?per_page=1"
    response = requests.get(url, auth=(config["USER"], config["TOKEN"]), timeout=timeout)
    response.raise_for_status()

    commits = response.json()
    if not commits:
        raise ValueError(f"Repository without commits: {repository_name}")
    data = commits[0]

    # Strip the prefix only from the start of the name; with an empty prefix
    # the name is kept untouched
    author = repository_name
    prefix_marker = f"{repository_prefix}-"
    if repository_prefix and repository_name.startswith(prefix_marker):
        author = repository_name[len(prefix_marker):]

    repository_info = {
        "name": repository_name,
        "clone_url": f"{BASE_URL}/{config['ORGA']}/{repository_name}.git",
        "last_commit_sha": data["sha"],
        "last_commit_author": author,
        "last_commit_date": get_last_commit_date(data),
    }

    return repository_info