From a2c97773fafd4a55de50cfab96aebcd7e1edb545 Mon Sep 17 00:00:00 2001 From: Jeremiah Petersen <118206017+JeremiahPetersen@users.noreply.github.com> Date: Mon, 15 Apr 2024 15:34:00 -0700 Subject: [PATCH] Update RepoToText.py - clean code removed unnecessary error handling --- RepoToText.py | 67 +++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/RepoToText.py b/RepoToText.py index 200427cc..e11dc700 100644 --- a/RepoToText.py +++ b/RepoToText.py @@ -1,5 +1,5 @@ """ -This module handles the back end Flask server for RepoToText +This module handles the back end Flask server for RepoToText """ # pylint: disable=line-too-long @@ -8,7 +8,7 @@ import os from datetime import datetime import re -from github import Github, RateLimitExceededException, GithubException +from github import Github, RateLimitExceededException from bs4 import BeautifulSoup import requests from flask import Flask, request, jsonify @@ -25,8 +25,6 @@ def __init__(self, repo_name, doc_link=None, selected_file_types=None): if selected_file_types is None: selected_file_types = [] self.github_api_key = os.getenv("GITHUB_API_KEY") - if not self.github_api_key: - raise ValueError("GitHub API key not set in environment variables") self.repo_name = repo_name self.doc_link = doc_link self.selected_file_types = selected_file_types @@ -38,38 +36,34 @@ def recursive_fetch_files(repo, contents): files_data = [] for content_file in contents: if content_file.type == "dir": - try: - new_contents = repo.get_contents(content_file.path) - except GithubException as e: - print(f"Error accessing directory {content_file.path}: {e}") - continue - files_data += recursive_fetch_files(repo, new_contents) + files_data += recursive_fetch_files(repo, repo.get_contents(content_file.path)) else: + # Check if file type is in selected file types if any(content_file.name.endswith(file_type) for file_type in self.selected_file_types): - file_content = f"\n'''--- 
{content_file.path} ---\n" - try: - if content_file.encoding == "base64": + file_content = "" + file_content += f"\n'''--- {content_file.path} ---\n" + + if content_file.encoding == "base64": + try: file_content += content_file.decoded_content.decode("utf-8") - else: - print(f"Warning: Skipping {content_file.path} due to unexpected encoding '{content_file.encoding}'.") - continue - except Exception as e: - file_content += f"[Content not decodable: {e}]" + except UnicodeDecodeError: # catch decoding errors + file_content += "[Content not decodable]" + elif content_file.encoding == "none": + # Handle files with encoding "none" here + print(f"Warning: Skipping {content_file.path} due to unsupported encoding 'none'.") + continue + else: + # Handle other unexpected encodings here + print(f"Warning: Skipping {content_file.path} due to unexpected encoding '{content_file.encoding}'.") + continue + file_content += "\n'''" files_data.append(file_content) return files_data github_instance = Github(self.github_api_key) - try: - repo = github_instance.get_repo(self.repo_name) - except GithubException as e: - raise ValueError(f"Error accessing GitHub repository {self.repo_name}: {e}") - - try: - contents = repo.get_contents("") - except GithubException as e: - raise ValueError(f"Error fetching repository contents: {e}") - + repo = github_instance.get_repo(self.repo_name) + contents = repo.get_contents("") files_data = recursive_fetch_files(repo, contents) return files_data @@ -86,7 +80,7 @@ def scrape_doc(self): return "" def write_to_file(self, files_data): - """Build a .txt file with all of the repo's files""" + """Build a .txt file with all of the repo's files""" timestamp = datetime.now().strftime("%Y%m%d%H%M%S") filename = f"/app/data/{self.repo_name.replace('/', '_')}_{timestamp}.txt" with open(filename, "w", encoding='utf-8') as f: @@ -100,7 +94,7 @@ def write_to_file(self, files_data): return filename def clean_up_text(self, filename): - """Remove excessive line breaks.""" 
+ """Collapse runs of three or more line breaks into two.""" with open(filename, 'r', encoding='utf-8') as f: text = f.read() cleaned_text = re.sub('\n{3,}', '\n\n', text) @@ -108,7 +102,7 @@ def clean_up_text(self, filename): f.write(cleaned_text) def run(self): - """Run the scraping process.""" + """Run RepoToText.""" print("Fetching all files...") files_data = self.fetch_all_files() @@ -123,8 +117,9 @@ def run(self): @app.route('/scrape', methods=['POST']) def scrape(): - """Endpoint to initiate scraping of GitHub repositories.""" + """Scrape GitHub repositories.""" data = request.get_json() + repo_url = data.get('repoUrl') doc_url = data.get('docUrl') selected_file_types = data.get('selectedFileTypes', []) @@ -132,12 +127,10 @@ def scrape(): if not repo_url: return jsonify({"error": "Repo URL not provided."}), 400 - repo_name = repo_url.split('github.com/')[-1] + repo_name = repo_url.split('github.com/')[-1] # Extract repo name from URL + scraper = GithubRepoScraper(repo_name, doc_url, selected_file_types) - try: - filename = scraper.run() - except ValueError as e: - return jsonify({"error": str(e)}), 500 + filename = scraper.run() with open(filename, 'r', encoding='utf-8') as file: file_content = file.read()