Skip to content

Commit

Permalink
Merge pull request #1 from FeBalla/dev
Browse files Browse the repository at this point in the history
First Release
  • Loading branch information
FeBalla authored Aug 16, 2022
2 parents 2b8e951 + 6982657 commit a05c44b
Show file tree
Hide file tree
Showing 7 changed files with 181 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
.env
logs

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
30 changes: 28 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,28 @@
# github-repo-collecter
Script to collect all repositories from a GitHub organization via GitHub API
# GitHub Classroom Collector
It's a simple Python script to collect all **private** repositories from a GitHub Organization. It is intended to be used for downloading all the submissions of a GitHub Classroom assignment, getting information like the last commit date.

## How to use?
1. First of all, you need to install all the dependencies specified in `requirements.txt`. You can use:
```bash
pip install -r requirements.txt
```

2. Now you need to create a `.env` file like [example.env](./example.env):
- `PREFIX`: Sets a prefix for the repository names. This is useful to download only the repositories from a specific assignment.
- `TOKEN`: Here you need to use a [personal access token](https://docs.github.com/es/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) for your GitHub account.
- `USER`: Sets the username of your GitHub account.
- `ORGA`: Sets the organization's name where the repositories are located.
- `PER_PAGE`: Represents how many repositories you want to get from a single API call (the results are paginated). You can use 100 as default.
- `PAGES`: Represents how many pages of size _PER_PAGE_ will be requested from the GitHub API. For example, if you have 500 repositories and _PER_PAGE=100_, then _PAGES_ should be 5. However, if you don't want to overthink it, just set a higher value; it will only take a little longer.

3. Run the [main.py](./main.py) module and wait until all the repositories are downloaded.

## Logs
Once the execution starts, a `logs` directory will be created with 2 files:
- `repos.txt`: Has the information of the repositories that will be cloned. Each line has the following format:
```txt
repository-name last-commit-sha last-commit-author last-commit-date
```

- `runtime.txt`: Has the runtime logs with the clone results for each repository.

6 changes: 6 additions & 0 deletions example.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
PREFIX = "my-assignment"
TOKEN = "my-personal-access-token"
USER = "MyUser"
ORGA = "My-Organization"
PER_PAGE = 100
PAGES = 1
45 changes: 45 additions & 0 deletions logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os
from datetime import datetime


class Logger:
    '''Handle execution logs (repositories list, runtime messages, etc.)

    Two text files are created inside `folder_name` as soon as the instance is
    built: `repos.txt` (one line per repository expected to be cloned) and
    `runtime.txt` (timestamped runtime messages). Every write is flushed and
    synced to disk immediately, so the logs survive an abrupt termination.

    Call `finalize()` when done, or use the instance as a context manager.
    '''

    # Layout of the `last_commit_date` strings (day first). It is needed to
    # order repositories chronologically instead of alphabetically.
    DATE_FORMAT = "%d/%m/%Y-%H:%M"

    def __init__(self, folder_name="logs", encoding="utf-8"):
        '''Creates the log folder (if missing) and opens both log files for writing'''
        self.folder_name = folder_name
        os.makedirs(self.folder_name, exist_ok=True)

        self.repositories_data_file = open(os.path.join(self.folder_name, "repos.txt"), "w", encoding=encoding)
        try:
            print("Repositories expected to clone:", file=self.repositories_data_file)
            self.runtime_logs_file = open(os.path.join(self.folder_name, "runtime.txt"), "w", encoding=encoding)
        except Exception:
            # Don't leak the first handle when the second file can't be opened
            self.repositories_data_file.close()
            raise

        print("Runtime logs:", file=self.runtime_logs_file)

    def __enter__(self):
        '''Allows `with Logger() as logger:` usage'''
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        '''Closes the log files when the `with` block ends; exceptions are not suppressed'''
        self.finalize()
        return False

    def _chronological_key(self, repo):
        '''Builds the sort key used to order repositories by commit date'''
        raw_date = repo["last_commit_date"]

        try:
            return (0, datetime.strptime(raw_date, self.DATE_FORMAT))
        except (TypeError, ValueError):
            # Unknown layout: keep those entries after the parsed ones, ordered as text
            return (1, str(raw_date))

    def save_repositories_data(self, repositories_data):
        '''Writes a log-file with all the information of the repositories that will be cloned

        `repositories_data` is an iterable of dicts with the keys `name`,
        `last_commit_sha`, `last_commit_author` and `last_commit_date`. One line
        per repository is written, oldest commit first.
        '''
        # Sorting the raw strings would compare the day before the month and the
        # year, so the dates are parsed to get a real chronological order.
        ordered = sorted(repositories_data, key=self._chronological_key)

        for repo in ordered:
            print(
                repo["name"],
                repo["last_commit_sha"],
                repo["last_commit_author"],
                repo["last_commit_date"],
                file=self.repositories_data_file,
            )

        self.repositories_data_file.flush()
        os.fsync(self.repositories_data_file.fileno())

    def write_runtime_log(self, msg):
        '''Writes a runtime-log message in the log-file with the current time'''
        current_time = datetime.now().strftime("%H:%M:%S")

        print(f"({current_time}) {msg}", file=self.runtime_logs_file)
        self.runtime_logs_file.flush()
        os.fsync(self.runtime_logs_file.fileno())

    def finalize(self):
        '''Ends the logger handler instance, saving and closing the used files'''
        try:
            self.runtime_logs_file.close()
        finally:
            # Closed even if closing the runtime log failed
            self.repositories_data_file.close()
46 changes: 46 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
from dotenv import dotenv_values
from git import Repo, rmtree
from utilities import get_assignment_repos_names, get_repository_information
from logger import Logger

# Settings read from the `.env` file
config = dotenv_values(".env")
# Repositories are cloned into a folder named after the prefix, one directory level up
SAVE_DIR = os.path.join("..", config["PREFIX"])

if __name__ == "__main__":
    CLONED = 0
    FAILED = 0

    print("(1/6) Initializing")
    logger = Logger()

    # The log files are closed even when one of the steps below raises
    try:
        print(f"(2/6) Getting repositories list from {config['ORGA']} with prefix: {config['PREFIX']}")
        repo_names = get_assignment_repos_names(config["PREFIX"], int(config["PER_PAGE"]), int(config["PAGES"]))

        print("(3/6) Getting repositories data")
        repositories_data = [get_repository_information(name, config["PREFIX"]) for name in repo_names]

        print("(4/6) Saving repositories data")
        logger.save_repositories_data(repositories_data)

        os.makedirs(SAVE_DIR, exist_ok=True)

        print("(5/6) Cloning repositories")
        for repo in repositories_data:
            target_dir = os.path.join(SAVE_DIR, repo["name"])

            try:
                # Check out exactly the commit that was written to the repositories log,
                # then drop the `.git` folder so only the submitted files remain
                cloned_repo = Repo.clone_from(repo["clone_url"], target_dir, no_checkout=True)
                cloned_repo.git.checkout(repo["last_commit_sha"])
                rmtree(os.path.join(target_dir, ".git"))
            except Exception as error:
                # Best effort: a failing repository must not stop the remaining clones,
                # but the kind of failure is kept in the log to ease troubleshooting
                logger.write_runtime_log(f"Couldn't clone: {repo['name']} ({type(error).__name__})")
                FAILED += 1
            else:
                logger.write_runtime_log(f"Cloned succesfully: {repo['name']}")
                CLONED += 1
    finally:
        logger.finalize()

    print("(6/6) Process completed")
    print(f"- Cloned: {CLONED}")
    print(f"- Failed: {FAILED}")
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
GitPython==3.1.27
python-dotenv==0.20.0
python_dateutil==2.8.2
requests==2.26.0
49 changes: 49 additions & 0 deletions utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from datetime import datetime
from dotenv import dotenv_values
from dateutil import tz
import requests

# Settings read from the `.env` file (the functions below use ORGA, USER and TOKEN)
config = dotenv_values(".env")
# Base of the URLs used to clone the repositories
BASE_URL = "https://github.com"
# Base of the GitHub REST API endpoints requested by this module
API_URL = "https://api.github.com"


def get_assignment_repos_names(repository_prefix="", per_page=100, total_pages=2):
    '''Gets names of all private repositories from an organization starting with `repository_prefix`

    Args:
        repository_prefix: Only repositories whose name starts with this text are kept.
        per_page: Amount of repositories requested on each API call.
        total_pages: Maximum amount of pages to request.

    Returns:
        A sorted list with the unique names of the matching repositories.

    Raises:
        requests.HTTPError: If the API answers with an error status (bad token,
            unknown organization, rate limit, etc.).
        requests.RequestException: If the request can't be completed (e.g. timeout).
    '''
    url = f"{API_URL}/orgs/{config['ORGA']}/repos"
    credentials = (config["USER"], config["TOKEN"])
    assignment_repos = set()

    for page in range(1, total_pages + 1):
        response = requests.get(
            url,
            params={"type": "private", "per_page": per_page, "page": page},
            auth=credentials,
            timeout=30,  # without it, a stalled connection would hang the script forever
        )
        # An error answer is a JSON object instead of a list, so fail here with the
        # HTTP error rather than later with an unrelated TypeError
        response.raise_for_status()

        org_repos = response.json()
        if not org_repos:
            # An empty page means every repository was already listed
            break

        assignment_repos.update(
            repo["name"] for repo in org_repos if repo["name"].startswith(repository_prefix)
        )

    # Sorted to return the names in a deterministic order
    return sorted(assignment_repos)


def get_last_commit_date(commit_data):
    '''Returns the author date of a commit as local time text (`dd/mm/YYYY-HH:MM`)

    `commit_data` is a commit object as returned by the GitHub API, whose
    `commit.author.date` field holds a UTC timestamp like `2022-08-16T12:34:56Z`.
    '''
    utc_text = commit_data["commit"]["author"]["date"]
    naive_moment = datetime.strptime(utc_text, "%Y-%m-%dT%H:%M:%SZ")

    # The parsed value carries no timezone: tag it as UTC before moving it to local time
    local_moment = naive_moment.replace(tzinfo=tz.tzutc()).astimezone(tz.tzlocal())

    return local_moment.strftime("%d/%m/%Y-%H:%M")


def _author_from_repository_name(repository_name, repository_prefix):
    '''Returns the part of `repository_name` that follows a leading `<repository_prefix>-`'''
    marker = f"{repository_prefix}-"

    # Only a leading marker is removed. With an empty prefix the name is kept as is
    # (otherwise every hyphen of the name would be considered part of the marker).
    if repository_prefix and repository_name.startswith(marker):
        return repository_name[len(marker):]

    return repository_name


def get_repository_information(repository_name, repository_prefix=""):
    '''Gets information for a specific repository

    Args:
        repository_name: Name of the repository inside the configured organization.
        repository_prefix: Assignment prefix; the text following `<prefix>-` in the
            repository name is reported as the author.

    Returns:
        A dict with the keys `name`, `clone_url`, `last_commit_sha`,
        `last_commit_author` and `last_commit_date`.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: If the request can't be completed (e.g. timeout).
        IndexError: If the API returns no commits for the repository.
    '''
    url = f"{API_URL}/repos/{config['ORGA']}/{repository_name}/commits"
    response = requests.get(
        url,
        params={"per_page": 1},  # only the first listed commit is used
        auth=(config["USER"], config["TOKEN"]),
        timeout=30,  # without it, a stalled connection would hang the script forever
    )
    # An error answer is a JSON object, so indexing it with [0] would raise an
    # unrelated KeyError; fail with the HTTP error instead
    response.raise_for_status()

    commits = response.json()
    if not commits:
        raise IndexError(f"No commits found for repository: {repository_name}")

    # The first listed commit is taken as the last commit of the repository
    data = commits[0]

    repository_info = {
        "name": repository_name,
        "clone_url": f"{BASE_URL}/{config['ORGA']}/{repository_name}.git",
        "last_commit_sha": data["sha"],
        "last_commit_author": _author_from_repository_name(repository_name, repository_prefix),
        "last_commit_date": get_last_commit_date(data),
    }

    return repository_info

0 comments on commit a05c44b

Please sign in to comment.