Skip to content

Commit

Permalink
Update RepoToText.py - clean code
Browse files Browse the repository at this point in the history
removed unnecessary error handling
  • Loading branch information
JeremiahPetersen authored Apr 15, 2024
1 parent 3e8612a commit a2c9777
Showing 1 changed file with 30 additions and 37 deletions.
67 changes: 30 additions & 37 deletions RepoToText.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
This module handles the back end Flask server for RepoToText
This module handles the back-end Flask server for RepoToText
"""

# pylint: disable=line-too-long
Expand All @@ -8,7 +8,7 @@
import os
from datetime import datetime
import re
from github import Github, RateLimitExceededException, GithubException
from github import Github, RateLimitExceededException
from bs4 import BeautifulSoup
import requests
from flask import Flask, request, jsonify
Expand All @@ -25,8 +25,6 @@ def __init__(self, repo_name, doc_link=None, selected_file_types=None):
if selected_file_types is None:
selected_file_types = []
self.github_api_key = os.getenv("GITHUB_API_KEY")
if not self.github_api_key:
raise ValueError("GitHub API key not set in environment variables")
self.repo_name = repo_name
self.doc_link = doc_link
self.selected_file_types = selected_file_types
Expand All @@ -38,38 +36,34 @@ def recursive_fetch_files(repo, contents):
files_data = []
for content_file in contents:
if content_file.type == "dir":
try:
new_contents = repo.get_contents(content_file.path)
except GithubException as e:
print(f"Error accessing directory {content_file.path}: {e}")
continue
files_data += recursive_fetch_files(repo, new_contents)
files_data += recursive_fetch_files(repo, repo.get_contents(content_file.path))
else:
# Check if file type is in selected file types
if any(content_file.name.endswith(file_type) for file_type in self.selected_file_types):
file_content = f"\n'''--- {content_file.path} ---\n"
try:
if content_file.encoding == "base64":
file_content = ""
file_content += f"\n'''--- {content_file.path} ---\n"

if content_file.encoding == "base64":
try:
file_content += content_file.decoded_content.decode("utf-8")
else:
print(f"Warning: Skipping {content_file.path} due to unexpected encoding '{content_file.encoding}'.")
continue
except Exception as e:
file_content += f"[Content not decodable: {e}]"
except UnicodeDecodeError: # catch decoding errors
file_content += "[Content not decodable]"
elif content_file.encoding == "none":
# Handle files with encoding "none" here
print(f"Warning: Skipping {content_file.path} due to unsupported encoding 'none'.")
continue
else:
# Handle other unexpected encodings here
print(f"Warning: Skipping {content_file.path} due to unexpected encoding '{content_file.encoding}'.")
continue

file_content += "\n'''"
files_data.append(file_content)
return files_data

github_instance = Github(self.github_api_key)
try:
repo = github_instance.get_repo(self.repo_name)
except GithubException as e:
raise ValueError(f"Error accessing GitHub repository {self.repo_name}: {e}")

try:
contents = repo.get_contents("")
except GithubException as e:
raise ValueError(f"Error fetching repository contents: {e}")

repo = github_instance.get_repo(self.repo_name)
contents = repo.get_contents("")
files_data = recursive_fetch_files(repo, contents)
return files_data

Expand All @@ -86,7 +80,7 @@ def scrape_doc(self):
return ""

def write_to_file(self, files_data):
"""Build a .txt file with all of the repo's files"""
"""Built .txt file with all of the repo's files"""
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
filename = f"/app/data/{self.repo_name.replace('/', '_')}_{timestamp}.txt"
with open(filename, "w", encoding='utf-8') as f:
Expand All @@ -100,15 +94,15 @@ def write_to_file(self, files_data):
return filename

def clean_up_text(self, filename):
    """Collapse runs of three or more consecutive newlines down to exactly two.

    Rewrites the file in place so the generated .txt output never contains
    more than one blank line in a row.

    Args:
        filename: Path of the UTF-8 text file to clean up.
    """
    # Read the whole file into memory; these generated files are small
    # enough that a single read/rewrite pass is fine.
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()
    # Raw string for the regex pattern: 3+ newlines become exactly 2.
    cleaned_text = re.sub(r'\n{3,}', '\n\n', text)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)

def run(self):
"""Run the scraping process."""
"""Run RepoToText."""
print("Fetching all files...")
files_data = self.fetch_all_files()

Expand All @@ -123,21 +117,20 @@ def run(self):

@app.route('/scrape', methods=['POST'])
def scrape():
"""Endpoint to initiate scraping of GitHub repositories."""
"""Scrape GitHub repositories."""
data = request.get_json()

repo_url = data.get('repoUrl')
doc_url = data.get('docUrl')
selected_file_types = data.get('selectedFileTypes', [])

if not repo_url:
return jsonify({"error": "Repo URL not provided."}), 400

repo_name = repo_url.split('github.com/')[-1]
repo_name = repo_url.split('github.com/')[-1] # Extract repo name from URL

scraper = GithubRepoScraper(repo_name, doc_url, selected_file_types)
try:
filename = scraper.run()
except ValueError as e:
return jsonify({"error": str(e)}), 500
filename = scraper.run()

with open(filename, 'r', encoding='utf-8') as file:
file_content = file.read()
Expand Down

0 comments on commit a2c9777

Please sign in to comment.