From a2c97773fafd4a55de50cfab96aebcd7e1edb545 Mon Sep 17 00:00:00 2001 From: Jeremiah Petersen <118206017+JeremiahPetersen@users.noreply.github.com> Date: Mon, 15 Apr 2024 15:34:00 -0700 Subject: [PATCH] Update RepoToText.py - clean code removed unnecessary error handling --- RepoToText.py | 67 +++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/RepoToText.py b/RepoToText.py index 200427cc..e11dc700 100644 --- a/RepoToText.py +++ b/RepoToText.py @@ -1,5 +1,5 @@ """ -This module handles the back end Flask server for RepoToText +This module handles the back end Flask server for RepoToText """ # pylint: disable=line-too-long @@ -8,7 +8,7 @@ import os from datetime import datetime import re -from github import Github, RateLimitExceededException, GithubException +from github import Github, RateLimitExceededException from bs4 import BeautifulSoup import requests from flask import Flask, request, jsonify @@ -25,8 +25,6 @@ def __init__(self, repo_name, doc_link=None, selected_file_types=None): if selected_file_types is None: selected_file_types = [] self.github_api_key = os.getenv("GITHUB_API_KEY") - if not self.github_api_key: - raise ValueError("GitHub API key not set in environment variables") self.repo_name = repo_name self.doc_link = doc_link self.selected_file_types = selected_file_types @@ -38,38 +36,34 @@ def recursive_fetch_files(repo, contents): files_data = [] for content_file in contents: if content_file.type == "dir": - try: - new_contents = repo.get_contents(content_file.path) - except GithubException as e: - print(f"Error accessing directory {content_file.path}: {e}") - continue - files_data += recursive_fetch_files(repo, new_contents) + files_data += recursive_fetch_files(repo, repo.get_contents(content_file.path)) else: + # Check if file type is in selected file types if any(content_file.name.endswith(file_type) for file_type in self.selected_file_types): - file_content = f"\n'''--- 
{content_file.path} ---\n" - try: - if content_file.encoding == "base64": + file_content = "" + file_content += f"\n'''--- {content_file.path} ---\n" + + if content_file.encoding == "base64": + try: file_content += content_file.decoded_content.decode("utf-8") - else: - print(f"Warning: Skipping {content_file.path} due to unexpected encoding '{content_file.encoding}'.") - continue - except Exception as e: - file_content += f"[Content not decodable: {e}]" + except UnicodeDecodeError: # catch decoding errors + file_content += "[Content not decodable]" + elif content_file.encoding == "none": + # Handle files with encoding "none" here + print(f"Warning: Skipping {content_file.path} due to unsupported encoding 'none'.") + continue + else: + # Handle other unexpected encodings here + print(f"Warning: Skipping {content_file.path} due to unexpected encoding '{content_file.encoding}'.") + continue + file_content += "\n'''" files_data.append(file_content) return files_data github_instance = Github(self.github_api_key) - try: - repo = github_instance.get_repo(self.repo_name) - except GithubException as e: - raise ValueError(f"Error accessing GitHub repository {self.repo_name}: {e}") - - try: - contents = repo.get_contents("") - except GithubException as e: - raise ValueError(f"Error fetching repository contents: {e}") - + repo = github_instance.get_repo(self.repo_name) + contents = repo.get_contents("") files_data = recursive_fetch_files(repo, contents) return files_data @@ -86,7 +80,7 @@ def scrape_doc(self): return "" def write_to_file(self, files_data): - """Build a .txt file with all of the repo's files""" + """Build a .txt file with all of the repo's files""" timestamp = datetime.now().strftime("%Y%m%d%H%M%S") filename = f"/app/data/{self.repo_name.replace('/', '_')}_{timestamp}.txt" with open(filename, "w", encoding='utf-8') as f: @@ -100,7 +94,7 @@ def write_to_file(self, files_data): return filename def clean_up_text(self, filename): - """Remove excessive line breaks.""" 
+ """Collapse runs of three or more line breaks into two.""" with open(filename, 'r', encoding='utf-8') as f: text = f.read() cleaned_text = re.sub('\n{3,}', '\n\n', text) @@ -108,7 +102,7 @@ def clean_up_text(self, filename): f.write(cleaned_text) def run(self): - """Run the scraping process.""" + """Run RepoToText.""" print("Fetching all files...") files_data = self.fetch_all_files() @@ -123,8 +117,9 @@ def run(self): @app.route('/scrape', methods=['POST']) def scrape(): - """Endpoint to initiate scraping of GitHub repositories.""" + """Scrape GitHub repositories.""" data = request.get_json() + repo_url = data.get('repoUrl') doc_url = data.get('docUrl') selected_file_types = data.get('selectedFileTypes', []) @@ -132,12 +127,10 @@ def scrape(): if not repo_url: return jsonify({"error": "Repo URL not provided."}), 400 - repo_name = repo_url.split('github.com/')[-1] + repo_name = repo_url.split('github.com/')[-1] # Extract repo name from URL + scraper = GithubRepoScraper(repo_name, doc_url, selected_file_types) - try: - filename = scraper.run() - except ValueError as e: - return jsonify({"error": str(e)}), 500 + filename = scraper.run() with open(filename, 'r', encoding='utf-8') as file: file_content = file.read()