add function to scrape all repos and issues with while loop
1. while loop to get the next page, if it exists (see the pagination sketch below)
2. refactor functions
3. logger to track debug, info, and warning messages
kytcrystal committed Aug 30, 2024
1 parent b3bdb28 commit 45704e3
Showing 2 changed files with 96 additions and 75 deletions.
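
For context: the GitHub REST API paginates list endpoints and advertises the following page in the Link response header, which is what the new while loops walk. A minimal standalone sketch of that pattern — not the committed code; fetch_all_pages and the OWNER placeholder are illustrative only, error handling is omitted, and list endpoints are assumed to return JSON arrays:

import json
import urllib.request

def fetch_all_pages(url):
    # Follow the Link header until no rel="next" entry remains,
    # collecting the JSON items from every page.
    items = []
    while url:
        with urllib.request.urlopen(url) as response:
            items.extend(json.loads(response.read().decode('utf-8')))
            url = None
            for part in (response.headers.get("Link") or "").split(','):
                if 'rel="next"' in part:
                    url = part.split(';')[0].strip('<> ')
    return items

# e.g. fetch_all_pages("https://api.github.com/orgs/OWNER/repos?per_page=100")

The committed version routes every call through make_github_request() instead, so it can also log each URL and count requests.
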
168 changes: 94 additions & 74 deletions scrapers/scrape_ghissues.py
@@ -13,7 +13,7 @@
# - attributes to extract from comments are configured in 'config_ghissues_comments.txt'.
# - scraped texts are stored in '../data_ghissues/X-issue-Y.txt',
# where X is the repository and Y is the issue ID
-# - summary file '../data_ghissues/scraped_repo_and_issues.txt' contains stats of scraped issues,
+# - summary file '../data_ghissues/00_summary_scraped_repo_and_issues.txt' contains stats of scraped issues,
# also used to compare if issue is updated
# --------------------------------------------------------------------------------------------------

@@ -25,7 +25,15 @@
import os
import re
import textwrap
+import logging

+# --- logging ---
+
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)

# --- configuration ---

@@ -34,69 +42,86 @@
    gh_param = json.load(conf)
    conf.close()
except FileNotFoundError:
-    print("ERROR: scrape_ghissues: cannot read credentials file.")
+    logger.error("scrape_ghissues: cannot read credentials file")
    sys.exit(1)

token = gh_param.get("github_token")
+project_owner = gh_param.get("project_owner")
+
+base_url = "https://api.github.com"
+query_params = "?per_page=100"  # per_page for number of entries
+dev = False  # param for development to break while loop

config_ghissues = "config_ghissues.txt"
config_ghissues_comments = "config_ghissues_comments.txt"
format_attributes = ["Body"]
-base_url = "https://api.github.com"
-project_owner = "noi-techpark"
out_dir = "../data_ghissues"
-summary_file_name = f"{out_dir}/scraped_repo_and_issues.txt"
-test_param = ""  # eg. "?per_page=1&page=1", per_page for number of entries, page for number of pages
+summary_file_name = f"{out_dir}/00_summary_scraped_repo_and_issues.txt"
+request_count = 0

# --- functions ---

def load_config(config_file):
    with open(config_file, 'r') as file:
        return [line.strip() for line in file]

-def make_github_request(url):
-    req = urllib.request.Request(f"{url}{test_param}")
+def make_github_request(url, function_name):
+    global request_count
+    req = urllib.request.Request(url)
    if token:
        req.add_header('Authorization', f'Bearer {token}')
-    return req
-
-def get_repositories():
-    req = make_github_request(f"{base_url}/orgs/{project_owner}/repos")
    with urllib.request.urlopen(req) as response:
        if response.getcode() != 200:
-            print("WARNING: get_repository() got response %s." % response.getcode())
-            return []
+            logger.warning("%s got response %s.", function_name, response.getcode())
+            return [], None  # match the (json_data, header_link) return shape
        data = response.read()
+        header_link = response.headers.get("Link")
+        request_count += 1
+        logger.debug("Executing: %s", url)
        json_data = json.loads(data.decode('utf-8'))
-    return json_data
+    return json_data, header_link

+def extract_next_link(header):
+    links = header.split(',')
+    for link in links:
+        parts = link.split('; ')
+        if 'rel="next"' in parts[1]:
+            link = parts[0].strip('<> ')
+            break
+    return link

+def get_repositories():
+    json_data, header = make_github_request(f"{base_url}/orgs/{project_owner}/repos{query_params}", "get_repositories()")
+    list_repos = [item['name'] for item in json_data]
+    while header and "next" in header:
+        link = extract_next_link(header)
+        json_data, header = make_github_request(link, "get_repositories()")
+        list_repos.extend([item['name'] for item in json_data])
+        if dev: break
+    return list_repos

def get_repo_issues(repo):
-    req = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues")
-    with urllib.request.urlopen(req) as response:
-        if response.getcode() != 200:
-            print("WARNING: get_repo_issues() got response %s." % response.getcode())
-            return []
-        data = response.read()
-        json_data = json.loads(data.decode('utf-8'))
-    return json_data
+    json_data, header = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues{query_params}", "get_repo_issues()")
+    list_issue_numbers = [item['number'] for item in json_data]
+    while header and "next" in header:
+        link = extract_next_link(header)
+        json_data, header = make_github_request(link, "get_repo_issues()")
+        list_issue_numbers.extend([item['number'] for item in json_data])
+        if dev: break
+    return list_issue_numbers

def get_issue(repo, issue_number):
-    req = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues/{issue_number}")
-    with urllib.request.urlopen(req) as response:
-        if response.getcode() != 200:
-            print("WARNING: get_issue() got response %s." % response.getcode())
-            return []
-        data = response.read()
-        json_data = json.loads(data.decode('utf-8'))
+    json_data, header = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues/{issue_number}", "get_issue()")
    return json_data

def get_comments(repo, issue_number):
-    req = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues/{issue_number}/comments")
-    with urllib.request.urlopen(req) as response:
-        if response.getcode() != 200:
-            print("WARNING: get_comments() got response %s." % response.getcode())
-            return []
-        data = response.read()
-        json_data = json.loads(data.decode('utf-8'))
+    json_data, header = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues/{issue_number}/comments{query_params}", "get_comments()")
+    while header and "next" in header:
+        link = extract_next_link(header)
+        more_json_data, header = make_github_request(link, "get_comments()")
+        json_data.extend(more_json_data)  # extend, not append: keep a flat list of comment dicts
+        if dev: break
    return json_data

def extract_and_format(attributes, extracted_data, data):
@@ -106,7 +131,7 @@ def extract_and_format(attributes, extracted_data, data):
                extracted_data[attr] = extract_nested(data, attr)
            else:
                extracted_data[attr] = data[attr]
-        except (KeyError, TypeError):
+        except (KeyError, TypeError) as err:
            extracted_data[attr] = None
    return format_issue(extracted_data)

@@ -179,50 +204,45 @@ def main():
        previous_content = summary_file.read()
        summary_file.close()
    except FileNotFoundError:
-        print("INFO: Previous summary file not found")
+        logger.info("previous summary file not found")
        previous_content = ""
    records, last_update = read_previous_content(previous_content)
-    temp_file_name = f"{out_dir}/temp_summary_file.txt"
+    temp_file_name = f"{out_dir}/01_temp_summary_file.txt"
    temp_summary_file = open(temp_file_name, 'w')
    temp_summary_file.write(f"Repository, Issue number, Comment count\n")

-    try:
-        json_repo = get_repositories()
-        list_repos = [item['name'] for item in json_repo]
+    list_repos = get_repositories()

-        for repo in list_repos:
-            json_repo_issues = get_repo_issues(repo)
-            list_issue_numbers = [item['number'] for item in json_repo_issues]
-
-            for issue_number in list_issue_numbers:
-                json_issues = get_issue(repo, issue_number)
-                updated = check_updated(records, repo, json_issues, last_update)
-
-                comment_count = json_issues["comments"]
-
-                if not records or not updated:
-                    text = extract_and_format(issue_attributes, {}, json_issues)
-
-                    json_comments = get_comments(repo, issue_number)
-                    comment_count = len(json_comments)
-                    extracted_data = {}
-                    comment_count = 0
-                    for comment in json_comments:
-                        comment_count += 1
-                        extracted_data["comment"] = comment_count
-                        text += extract_and_format(comment_attributes, extracted_data, comment)
-
-                    file_name = "%s/%s-issue-%d.txt" % (out_dir, repo, int(issue_number))
-                    file = open(file_name, 'w')
-                    file.write(text)
-                    file.close()
-                    updated_issues += 1
-
-                total_issues += 1
-                temp_summary_file.write(f"{repo}, {issue_number}, {comment_count}\n")
-    except Exception as e:
-        print(f"ERROR: {e}")
+    for repo in list_repos:
+        list_issue_numbers = get_repo_issues(repo)
+
+        for issue_number in list_issue_numbers:
+            json_issues = get_issue(repo, issue_number)
+            updated = check_updated(records, repo, json_issues, last_update)
+
+            comment_count = json_issues["comments"]
+
+            if not records or not updated:
+                text = extract_and_format(issue_attributes, {}, json_issues)
+
+                json_comments = get_comments(repo, issue_number)
+                comment_count = len(json_comments)
+                extracted_data = {}
+                comment_count = 0
+                for comment in json_comments:
+                    comment_count += 1
+                    extracted_data["comment"] = comment_count
+                    text += extract_and_format(comment_attributes, extracted_data, comment)
+
+                file_name = "%s/%s-issue-%d.txt" % (out_dir, repo, int(issue_number))
+                file = open(file_name, 'w')
+                file.write(text)
+                file.close()
+                updated_issues += 1
+
+            total_issues += 1
+            temp_summary_file.write(f"{repo}, {issue_number}, {comment_count}\n")
+        temp_summary_file.flush()

    t1 = time.time()
    temp_summary_file.write(f"Total number of issues: {total_issues}\n")
3 changes: 2 additions & 1 deletion scrapers/secrets_gh.json
@@ -1,3 +1,4 @@
{
"github_token" : "YOUR_GITHUB_TOKEN_to_make_more_requests_per_hour_Otherwise_leave_blank"
"github_token" : "YOUR_GITHUB_TOKEN_to_make_more_requests_per_hour_Otherwise_leave_blank",
"project_owner" : "YOUR_GITHUB_ACCOUNT"
}
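
A filled-in secrets_gh.json might look like the sketch below — the token value is a made-up placeholder, and "noi-techpark" is the owner the script previously hardcoded. The token is optional but raises GitHub's rate limit from 60 unauthenticated requests per hour to 5,000:

{
    "github_token" : "ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
    "project_owner" : "noi-techpark"
}
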
