add function to scrape all repos and issues with while loop
1. while loop to get the next page, if it exists (see the pagination sketch below)
2. refactor functions
3. logger to track debug, info, and warning messages
kytcrystal committed Aug 30, 2024
1 parent b3bdb28 commit 45704e3
Showing 2 changed files with 96 additions and 75 deletions.
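
For context: the GitHub REST API paginates list endpoints and advertises the following page in the Link response header, which is what the new while loops walk. A minimal standalone sketch of that pattern — not the committed code; fetch_all_pages and the OWNER placeholder are illustrative only, error handling is omitted, and list endpoints are assumed to return JSON arrays:

import json
import urllib.request

def fetch_all_pages(url):
    # Follow the Link header until no rel="next" entry remains,
    # collecting the JSON items from every page.
    items = []
    while url:
        with urllib.request.urlopen(url) as response:
            items.extend(json.loads(response.read().decode('utf-8')))
            url = None
            for part in (response.headers.get("Link") or "").split(','):
                if 'rel="next"' in part:
                    url = part.split(';')[0].strip('<> ')
    return items

# e.g. fetch_all_pages("https://api.github.com/orgs/OWNER/repos?per_page=100")

The committed version routes every call through make_github_request() instead, so it can also log each URL and count requests.
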
168 changes: 94 additions & 74 deletions scrapers/scrape_ghissues.py
@@ -13,7 +13,7 @@
# - attributes to extract from comments are configured in 'config_ghissues_comments.txt'.
# - scraped texts are stored in '../data_ghissues/X-issue-Y.txt',
# where X is the repository and Y is the issue ID
-# - summary file '../data_ghissues/scraped_repo_and_issues.txt' contains stats of scraped issues,
+# - summary file '../data_ghissues/00_summary_scraped_repo_and_issues.txt' contains stats of scraped issues,
# also used to compare if issue is updated
# --------------------------------------------------------------------------------------------------

@@ -25,7 +25,15 @@
import os
import re
import textwrap
+import logging

+# --- logging ---
+
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)

# --- configuration ---

@@ -34,69 +42,86 @@
    gh_param = json.load(conf)
    conf.close()
except FileNotFoundError:
-    print("ERROR: scrape_ghissues: cannot read credentials file.")
+    logger.error("scrape_ghissues: cannot read credentials file")
    sys.exit(1)

token = gh_param.get("github_token")
+project_owner = gh_param.get("project_owner")
+
+base_url = "https://api.github.com"
+query_params = "?per_page=100"  # per_page for number of entries
+dev = False  # param for development to break while loop

config_ghissues = "config_ghissues.txt"
config_ghissues_comments = "config_ghissues_comments.txt"
format_attributes = ["Body"]
-base_url = "https://api.github.com"
-project_owner = "noi-techpark"
out_dir = "../data_ghissues"
-summary_file_name = f"{out_dir}/scraped_repo_and_issues.txt"
-test_param = ""  # eg. "?per_page=1&page=1", per_page for number of entries, page for number of pages
+summary_file_name = f"{out_dir}/00_summary_scraped_repo_and_issues.txt"
+request_count = 0

# --- functions ---

def load_config(config_file):
    with open(config_file, 'r') as file:
        return [line.strip() for line in file]

-def make_github_request(url):
-    req = urllib.request.Request(f"{url}{test_param}")
+def make_github_request(url, function_name):
+    global request_count
+    req = urllib.request.Request(url)
    if token:
        req.add_header('Authorization', f'Bearer {token}')
-    return req
-
-def get_repositories():
-    req = make_github_request(f"{base_url}/orgs/{project_owner}/repos")
    with urllib.request.urlopen(req) as response:
        if response.getcode() != 200:
-            print("WARNING: get_repository() got response %s." % response.getcode())
-            return []
+            logger.warning("%s got response %s.", function_name, response.getcode())
+            return [], None  # match the (json_data, header_link) return shape
        data = response.read()
+        header_link = response.headers.get("Link")
+        request_count += 1
+        logger.debug("Executing: %s", url)
        json_data = json.loads(data.decode('utf-8'))
-    return json_data
+    return json_data, header_link

+def extract_next_link(header):
+    links = header.split(',')
+    for link in links:
+        parts = link.split('; ')
+        if 'rel="next"' in parts[1]:
+            link = parts[0].strip('<> ')
+            break
+    return link

+def get_repositories():
+    json_data, header = make_github_request(f"{base_url}/orgs/{project_owner}/repos{query_params}", "get_repositories()")
+    list_repos = [item['name'] for item in json_data]
+    while header and "next" in header:
+        link = extract_next_link(header)
+        json_data, header = make_github_request(link, "get_repositories()")
+        list_repos.extend([item['name'] for item in json_data])
+        if dev: break
+    return list_repos

def get_repo_issues(repo):
-    req = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues")
-    with urllib.request.urlopen(req) as response:
-        if response.getcode() != 200:
-            print("WARNING: get_repo_issues() got response %s." % response.getcode())
-            return []
-        data = response.read()
-        json_data = json.loads(data.decode('utf-8'))
-    return json_data
+    json_data, header = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues{query_params}", "get_repo_issues()")
+    list_issue_numbers = [item['number'] for item in json_data]
+    while header and "next" in header:
+        link = extract_next_link(header)
+        json_data, header = make_github_request(link, "get_repo_issues()")
+        list_issue_numbers.extend([item['number'] for item in json_data])
+        if dev: break
+    return list_issue_numbers

def get_issue(repo, issue_number):
-    req = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues/{issue_number}")
-    with urllib.request.urlopen(req) as response:
-        if response.getcode() != 200:
-            print("WARNING: get_issue() got response %s." % response.getcode())
-            return []
-        data = response.read()
-        json_data = json.loads(data.decode('utf-8'))
+    json_data, header = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues/{issue_number}", "get_issue()")
    return json_data

def get_comments(repo, issue_number):
-    req = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues/{issue_number}/comments")
-    with urllib.request.urlopen(req) as response:
-        if response.getcode() != 200:
-            print("WARNING: get_comments() got response %s." % response.getcode())
-            return []
-        data = response.read()
-        json_data = json.loads(data.decode('utf-8'))
+    json_data, header = make_github_request(f"{base_url}/repos/{project_owner}/{repo}/issues/{issue_number}/comments{query_params}", "get_comments()")
+    while header and "next" in header:
+        link = extract_next_link(header)
+        more_json_data, header = make_github_request(link, "get_comments()")
+        json_data.extend(more_json_data)  # extend, not append: keep a flat list of comment dicts
+        if dev: break
    return json_data

def extract_and_format(attributes, extracted_data, data):
@@ -106,7 +131,7 @@ def extract_and_format(attributes, extracted_data, data):
                extracted_data[attr] = extract_nested(data, attr)
            else:
                extracted_data[attr] = data[attr]
-        except (KeyError, TypeError):
+        except (KeyError, TypeError) as err:
            extracted_data[attr] = None
    return format_issue(extracted_data)

@@ -179,50 +204,45 @@ def main():
        previous_content = summary_file.read()
        summary_file.close()
    except FileNotFoundError:
-        print("INFO: Previous summary file not found")
+        logger.info("previous summary file not found")
        previous_content = ""
    records, last_update = read_previous_content(previous_content)
-    temp_file_name = f"{out_dir}/temp_summary_file.txt"
+    temp_file_name = f"{out_dir}/01_temp_summary_file.txt"
    temp_summary_file = open(temp_file_name, 'w')
    temp_summary_file.write(f"Repository, Issue number, Comment count\n")

-    try:
-        json_repo = get_repositories()
-        list_repos = [item['name'] for item in json_repo]
+    list_repos = get_repositories()

-        for repo in list_repos:
-            json_repo_issues = get_repo_issues(repo)
-            list_issue_numbers = [item['number'] for item in json_repo_issues]
-
-            for issue_number in list_issue_numbers:
-                json_issues = get_issue(repo, issue_number)
-                updated = check_updated(records, repo, json_issues, last_update)
-
-                comment_count = json_issues["comments"]
-
-                if not records or not updated:
-                    text = extract_and_format(issue_attributes, {}, json_issues)
-
-                    json_comments = get_comments(repo, issue_number)
-                    comment_count = len(json_comments)
-                    extracted_data = {}
-                    comment_count = 0
-                    for comment in json_comments:
-                        comment_count += 1
-                        extracted_data["comment"] = comment_count
-                        text += extract_and_format(comment_attributes, extracted_data, comment)
-
-                    file_name = "%s/%s-issue-%d.txt" % (out_dir, repo, int(issue_number))
-                    file = open(file_name, 'w')
-                    file.write(text)
-                    file.close()
-                    updated_issues += 1
-
-                total_issues += 1
-                temp_summary_file.write(f"{repo}, {issue_number}, {comment_count}\n")
-    except Exception as e:
-        print(f"ERROR: {e}")
+    for repo in list_repos:
+        list_issue_numbers = get_repo_issues(repo)
+
+        for issue_number in list_issue_numbers:
+            json_issues = get_issue(repo, issue_number)
+            updated = check_updated(records, repo, json_issues, last_update)
+
+            comment_count = json_issues["comments"]
+
+            if not records or not updated:
+                text = extract_and_format(issue_attributes, {}, json_issues)
+
+                json_comments = get_comments(repo, issue_number)
+                comment_count = len(json_comments)
+                extracted_data = {}
+                comment_count = 0
+                for comment in json_comments:
+                    comment_count += 1
+                    extracted_data["comment"] = comment_count
+                    text += extract_and_format(comment_attributes, extracted_data, comment)
+
+                file_name = "%s/%s-issue-%d.txt" % (out_dir, repo, int(issue_number))
+                file = open(file_name, 'w')
+                file.write(text)
+                file.close()
+                updated_issues += 1
+
+            total_issues += 1
+            temp_summary_file.write(f"{repo}, {issue_number}, {comment_count}\n")
+        temp_summary_file.flush()

    t1 = time.time()
    temp_summary_file.write(f"Total number of issues: {total_issues}\n")
3 changes: 2 additions & 1 deletion scrapers/secrets_gh.json
@@ -1,3 +1,4 @@
{
"github_token" : "YOUR_GITHUB_TOKEN_to_make_more_requests_per_hour_Otherwise_leave_blank"
"github_token" : "YOUR_GITHUB_TOKEN_to_make_more_requests_per_hour_Otherwise_leave_blank",
"project_owner" : "YOUR_GITHUB_ACCOUNT"
}
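
A filled-in secrets_gh.json might look like the sketch below — the token value is a made-up placeholder, and "noi-techpark" is the owner the script previously hardcoded. The token is optional but raises GitHub's rate limit from 60 unauthenticated requests per hour to 5,000:

{
    "github_token" : "ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
    "project_owner" : "noi-techpark"
}
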
