From 45704e39c9c518575258ca05489bebc627cbcb01 Mon Sep 17 00:00:00 2001
From: Crystal Kwok
Date: Fri, 30 Aug 2024 14:11:41 +0200
Subject: [PATCH] add function to scrape all repos and issues with while loop

1. while loop to get the next page, if it exists
2. refactor functions
3. logger to track debug, info, and warning messages
---
 scrapers/scrape_ghissues.py | 168 ++++++++++++++++++++----------------
 scrapers/secrets_gh.json    |   3 +-
 2 files changed, 96 insertions(+), 75 deletions(-)

diff --git a/scrapers/scrape_ghissues.py b/scrapers/scrape_ghissues.py
index 58908e9..5c43fc6 100644
--- a/scrapers/scrape_ghissues.py
+++ b/scrapers/scrape_ghissues.py
@@ -13,7 +13,7 @@
 # - attributes to extract from comments are configured in 'config_ghissues_comments.txt'.
 # - scraped texts are stored in '../data_ghissues/X-issue-Y.txt',
 #   where X is the repository and Y is the issue ID
-# - summary file '../data_ghissues/scraped_repo_and_issues.txt' contains stats of scraped issues,
+# - summary file '../data_ghissues/00_summary_scraped_repo_and_issues.txt' contains stats of scraped issues,
 #   also used to compare if issue is updated
 # --------------------------------------------------------------------------------------------------
 
@@ -25,7 +25,15 @@
 import os
 import re
 import textwrap
+import logging
 
+# --- logging ---
+
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
 
 # --- configuration ---
 
@@ -34,18 +42,22 @@
     gh_param = json.load(conf)
     conf.close()
 except FileNotFoundError:
-    print("ERROR: scrape_ghissues: cannot read credentials file.")
+    logger.error("scrape_ghissues: cannot read credentials file")
     sys.exit(1)
 
 token = gh_param.get("github_token")
+project_owner = gh_param.get("project_owner")
+
+base_url = "https://api.github.com"
+query_params = "?per_page=100" # per_page for number of entries
+dev = False # param for development to break while loop
+
 config_ghissues = "config_ghissues.txt"
 config_ghissues_comments = "config_ghissues_comments.txt"
 format_attributes = ["Body"]
-base_url = "https://api.github.com"
-project_owner = "noi-techpark"
 out_dir = "../data_ghissues"
-summary_file_name = f"{out_dir}/scraped_repo_and_issues.txt"
-test_param = "" # eg. "?per_page=1&page=1", per_page for number of entries, page for number of pages
+summary_file_name = f"{out_dir}/00_summary_scraped_repo_and_issues.txt"
+request_count = 0
 
 # --- functions ---
 
@@ -53,50 +65,63 @@ def load_config(config_file):
     with open(config_file, 'r') as file:
         return [line.strip() for line in file]
 
-def make_github_request(url):
-    req = urllib.request.Request(f"{url}{test_param}")
+def make_github_request(url, function_name):
+    global request_count
+    req = urllib.request.Request(url)
     if token:
         req.add_header('Authorization', f'Bearer {token}')
-    return req
-
-def get_repositories():
-    req = make_github_request(f"{base_url}/orgs/{project_owner}/repos")
     with urllib.request.urlopen(req) as response:
         if response.getcode() != 200:
-            print("WARNING: get_repository() got response %s." % response.getcode())
+            logger.warning("%s got response %s.", function_name, response.getcode())
             return []
         data = response.read()
+        header_link = response.headers.get("Link")
+    request_count += 1
+    if request_count % 1 == 0:
+        logger.debug("Executing: %s", url)
     json_data = json.loads(data.decode('utf-8'))
-    return json_data
+    return json_data, header_link
+
+def extract_next_link(header):
+    links = header.split(',')
+    for link in links:
+        parts = link.split('; ')
+        if 'rel="next"' in parts[1]:
+            link = parts[0].strip('<> ')
+            break
+    return link
+
+def get_repositories():
+    json_data, header = make_github_request(f"{base_url}/orgs/{project_owner}/repos{query_params}", "get_repositories()")
+    list_repos = [item['name'] for item in json_data]
+    while header and "next" in header:
+        link = extract_next_link(header)
+        json_data, header = make_github_request(link, "get_repositories()")
+        list_repos.extend([item['name'] for item in json_data])
+        if dev: break
+    return list_repos
 
 def get_repo_issues(repo):
-    req = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues")
-    with urllib.request.urlopen(req) as response:
-        if response.getcode() != 200:
-            print("WARNING: get_repo_issues() got response %s." % response.getcode())
-            return []
-        data = response.read()
-        json_data = json.loads(data.decode('utf-8'))
-    return json_data
+    json_data, header = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues{query_params}", "get_repo_issues()")
+    list_issue_numbers = [item['number'] for item in json_data]
+    while header and "next" in header:
+        link = extract_next_link(header)
+        json_data, header = make_github_request(link, "get_repo_issues()")
+        list_issue_numbers.extend([item['number'] for item in json_data])
+        if dev: break
+    return list_issue_numbers
 
 def get_issue(repo, issue_number):
-    req = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues/{issue_number}")
-    with urllib.request.urlopen(req) as response:
-        if response.getcode() != 200:
-            print("WARNING: get_issue() got response %s." % response.getcode())
-            return []
-        data = response.read()
-        json_data = json.loads(data.decode('utf-8'))
+    json_data, header = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues/{issue_number}", "get_issue()")
     return json_data
 
 def get_comments(repo, issue_number):
-    req = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues/{issue_number}/comments")
-    with urllib.request.urlopen(req) as response:
-        if response.getcode() != 200:
-            print("WARNING: get_comments() got response %s." % response.getcode())
-            return []
-        data = response.read()
-        json_data = json.loads(data.decode('utf-8'))
+    json_data, header = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues/{issue_number}/comments{query_params}", "get_comments()")
+    while header and "next" in header:
+        link = extract_next_link(header)
+        more_json_data, header = make_github_request(link, "get_comments()")
+        json_data.extend(more_json_data)
+        if dev: break
     return json_data
 
 def extract_and_format(attributes, extracted_data, data):
@@ -106,7 +131,7 @@ def extract_and_format(attributes, extracted_data, data):
                 extracted_data[attr] = extract_nested(data, attr)
             else:
                 extracted_data[attr] = data[attr]
-        except (KeyError, TypeError):
+        except (KeyError, TypeError) as err:
             extracted_data[attr] = None
     return format_issue(extracted_data)
 
@@ -179,50 +204,45 @@ def main():
         previous_content = summary_file.read()
         summary_file.close()
     except FileNotFoundError:
-        print("INFO: Previous summary file not found")
+        logger.info("previous summary file not found")
         previous_content = ""
 
     records, last_update = read_previous_content(previous_content)
 
-    temp_file_name = f"{out_dir}/temp_summary_file.txt"
+    temp_file_name = f"{out_dir}/01_temp_summary_file.txt"
     temp_summary_file = open(temp_file_name, 'w')
     temp_summary_file.write(f"Repository, Issue number, Comment count\n")
 
-    try:
-        json_repo = get_repositories()
-        list_repos = [item['name'] for item in json_repo]
+    list_repos = get_repositories()
 
-        for repo in list_repos:
-            json_repo_issues = get_repo_issues(repo)
-            list_issue_numbers = [item['number'] for item in json_repo_issues]
-
-            for issue_number in list_issue_numbers:
-                json_issues = get_issue(repo, issue_number)
-                updated = check_updated(records, repo, json_issues, last_update)
-
-                comment_count = json_issues["comments"]
-
-                if not records or not updated:
-                    text = extract_and_format(issue_attributes, {}, json_issues)
-
-                    json_comments = get_comments(repo, issue_number)
-                    comment_count = len(json_comments)
-                    extracted_data = {}
-                    comment_count = 0
-                    for comment in json_comments:
-                        comment_count += 1
-                        extracted_data["comment"] = comment_count
-                        text += extract_and_format(comment_attributes, extracted_data, comment)
-
-                    file_name = "%s/%s-issue-%d.txt" % (out_dir, repo, int(issue_number))
-                    file = open(file_name, 'w')
-                    file.write(text)
-                    file.close()
-                    updated_issues += 1
-
-                total_issues += 1
-                temp_summary_file.write(f"{repo}, {issue_number}, {comment_count}\n")
-    except Exception as e:
-        print(f"ERROR: {e}")
+    for repo in list_repos:
+        list_issue_numbers = get_repo_issues(repo)
+        for issue_number in list_issue_numbers:
+            json_issues = get_issue(repo, issue_number)
+            updated = check_updated(records, repo, json_issues, last_update)
+
+            comment_count = json_issues["comments"]
+
+            if not records or not updated:
+                text = extract_and_format(issue_attributes, {}, json_issues)
+
+                json_comments = get_comments(repo, issue_number)
+                comment_count = len(json_comments)
+                extracted_data = {}
+                comment_count = 0
+                for comment in json_comments:
+                    comment_count += 1
+                    extracted_data["comment"] = comment_count
+                    text += extract_and_format(comment_attributes, extracted_data, comment)
+
+                file_name = "%s/%s-issue-%d.txt" % (out_dir, repo, int(issue_number))
+                file = open(file_name, 'w')
+                file.write(text)
+                file.close()
+                updated_issues += 1
+
+            total_issues += 1
+            temp_summary_file.write(f"{repo}, {issue_number}, {comment_count}\n")
+            temp_summary_file.flush()
 
     t1 = time.time()
     temp_summary_file.write(f"Total number of issues: {total_issues}\n")
{total_issues}\n") diff --git a/scrapers/secrets_gh.json b/scrapers/secrets_gh.json index 5921266..3f39612 100644 --- a/scrapers/secrets_gh.json +++ b/scrapers/secrets_gh.json @@ -1,3 +1,4 @@ { - "github_token" : "YOUR_GITHUB_TOKEN_to_make_more_requests_per_hour_Otherwise_leave_blank" + "github_token" : "YOUR_GITHUB_TOKEN_to_make_more_requests_per_hour_Otherwise_leave_blank", + "project_owner" : "YOUR_GITHUB_ACCOUNT" }