-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
🤖 Quick attempt at a file to help us slice the project up
- Loading branch information
1 parent
087823c
commit 5a897c9
Showing
1 changed file
with
337 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,337 @@ | ||
#!/usr/bin/env -S uv --quiet run --script | ||
# /// script | ||
# requires-python = ">=3.12" | ||
# dependencies = [ | ||
# "bs4", | ||
# "httpx", | ||
# "pydantic", | ||
# "python-dateutil", | ||
# "python-frontmatter", | ||
# "python-slugify", | ||
# "pytz", | ||
# "rich", | ||
# "typer", | ||
# "markdown-it-py", | ||
# ] | ||
# /// | ||
import os | ||
import re | ||
from pathlib import Path | ||
from typing import Any | ||
from urllib.parse import urlparse | ||
|
||
import frontmatter | ||
import httpx | ||
import typer | ||
from bs4 import BeautifulSoup | ||
from bs4 import Tag | ||
from markdown_it import MarkdownIt | ||
from pydantic import BaseModel | ||
from pydantic import ConfigDict | ||
from pydantic import Field | ||
from rich import print | ||
from rich.progress import track | ||
from slugify import slugify | ||
|
||
|
||
# Single Typer application; the commands defined below register on it.
app = typer.Typer(
    rich_markup_mode="rich",
    no_args_is_help=True,
    add_help_option=False,
)
|
||
|
||
class Project(BaseModel):
    """One entry of the awesome list, as stored in a project file's frontmatter.

    Keys that are not declared below are accepted and kept on the instance
    (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    # Values parsed out of the README entry.
    name: str
    description: str
    url: str
    category: str

    # Filled in from ``name`` by ``__init__`` when left empty.
    slug: str = Field(default="")
    tags: list[str] = Field(default_factory=list)

    # GitHub metrics; ``None`` until they have been fetched.
    github_stars: int | None = None
    github_forks: int | None = None
    github_last_update: str | None = None

    # URLs this project was previously reachable under.
    previous_urls: list[str] = Field(default_factory=list)

    def __init__(self, **data):
        """Validate ``data`` and derive the slug from the name if none was given."""
        super().__init__(**data)
        if self.slug:
            return
        self.slug = slugify(self.name)
|
||
|
||
def parse_project_line(line: Tag, category: str) -> Project | None:
    """Build a Project from one README list item.

    Args:
        line: The list-item element holding the project link and description.
        category: Text of the heading the item was found under.

    Returns:
        The parsed Project, or None when the item has no link, when the name,
        URL or description is empty, or when parsing raises.
    """
    try:
        link = line.find("a")
        if not link:
            return None

        name = link.text.strip()
        url = link.get("href", "").strip()

        # Remove only the first occurrence of the name (the link text itself);
        # replacing every occurrence would also eat the name out of
        # descriptions that mention the project, e.g. "foo - foo is a ...".
        description = line.text.replace(name, "", 1).strip()
        # Drop the "-" separator (and surrounding whitespace) that sits
        # between the link and the description.
        description = re.sub(r"^\s*-\s*", "", description)

        if not all([name, url, description]):
            return None

        return Project(name=name, description=description, url=url, category=category)
    except Exception as e:
        # Best effort: one malformed entry must not abort the whole README run.
        print(f"[red]Error parsing project line: {e}[/red]")
        return None
|
||
|
||
def read_readme(file_path: Path) -> str:
    """Read a Markdown README from disk and return it rendered as HTML.

    Args:
        file_path: Location of the Markdown file.

    Returns:
        The HTML produced by MarkdownIt for the file's content.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which is not UTF-8 everywhere and breaks on non-ASCII README content.
    markdown_content = file_path.read_text(encoding="utf-8")
    return MarkdownIt().render(markdown_content)
|
||
|
||
def parse_readme(content: str) -> list[Project]:
    """Collect every project entry found in the rendered README HTML.

    Headings (``h2``/``h3``) set the category for the list items that follow.
    Items seen before the first heading, and items under a heading named
    "Contents", are ignored.
    """
    found = []
    category = ""
    document = BeautifulSoup(content, "html.parser")

    for node in document.find_all(["h2", "h3", "li"]):
        if node.name in ["h2", "h3"]:
            category = node.text.strip()
            continue

        # From here on only list items are of interest.
        if node.name != "li" or not category:
            continue
        if category == "Contents":
            continue

        parsed = parse_project_line(node, category)
        if parsed:
            found.append(parsed)

    return found
|
||
|
||
def merge_project_data(existing: dict[str, Any], new: dict[str, Any]) -> dict[str, Any]:
    """Merge freshly parsed project data into previously saved metadata.

    Keys only present in ``existing`` are preserved. ``name``, ``url`` and
    ``category`` are always taken from ``new`` when present. When the URL
    changes, the old URL is recorded in ``previous_urls``. GitHub metrics are
    overwritten only by non-None values.

    Args:
        existing: Metadata loaded from the project file. Not modified.
        new: Metadata produced from the current README/metrics run.

    Returns:
        A new dict holding the merged metadata.
    """
    merged = existing.copy()

    old_url = existing.get("url")
    if "url" in new and new["url"] != old_url:
        # ``existing.copy()`` is shallow, so build a fresh list here; appending
        # to the original list would silently mutate the caller's ``existing``.
        history = list(merged.get("previous_urls") or [])
        if old_url and old_url not in history:
            history.append(old_url)
        merged["previous_urls"] = history

    # Core fields always follow the README.
    for field in ("name", "url", "category"):
        if field in new:
            merged[field] = new[field]

    # The description follows the README whenever it differs from the saved one.
    if "description" in new and new["description"] != existing.get("description", ""):
        merged["description"] = new["description"]

    # Metrics: a None value means "not fetched", so it never overwrites.
    for field in ("github_stars", "github_forks", "github_last_update"):
        if new.get(field) is not None:
            merged[field] = new[field]

    return merged
|
||
|
||
def save_project(project: Project, output_dir: Path):
    """Write a project to ``<output_dir>/<slug>.md`` as Markdown with frontmatter.

    When the file already exists, its body is kept and its metadata is merged
    with the project's data via ``merge_project_data``. If the existing file
    cannot be loaded or merged, a warning is printed and the file is rewritten
    from the project alone.

    Args:
        project: The project to persist.
        output_dir: Directory that holds the project files.
    """
    output_file = output_dir / f"{project.slug}.md"
    project_data = project.model_dump(exclude_none=True)

    if output_file.exists():
        try:
            existing_post = frontmatter.load(output_file)
            existing_data = dict(existing_post.metadata)

            # Merge metadata but keep the existing body untouched.
            merged_data = merge_project_data(existing_data, project_data)
            post = frontmatter.Post(existing_post.content, **merged_data)
        except Exception as e:
            # Best effort: an unreadable file is replaced rather than
            # aborting the whole run.
            print(
                f"[yellow]Warning: Could not load existing file {output_file}, creating new: {e}[/yellow]"
            )
            post = frontmatter.Post(project.description, **project_data)
    else:
        # New file: the description doubles as the initial body.
        post = frontmatter.Post(project.description, **project_data)

    # Write UTF-8 explicitly so the file round-trips with frontmatter.load,
    # which reads UTF-8 by default regardless of the platform encoding.
    output_file.write_text(frontmatter.dumps(post), encoding="utf-8")
|
||
|
||
def extract_github_info(url: str) -> dict[str, str] | None: | ||
"""Extract owner and repo from a GitHub URL.""" | ||
parsed = urlparse(url) | ||
if parsed.netloc != "github.com": | ||
return None | ||
|
||
parts = parsed.path.strip("/").split("/") | ||
if len(parts) >= 2: | ||
return {"owner": parts[0], "repo": parts[1]} | ||
return None | ||
|
||
|
||
def get_github_metrics(
    owner: str, repo: str, client: httpx.Client
) -> tuple[dict, str | None]:
    """Fetch star, fork and last-update metrics for a GitHub repository.

    Sends an ``Authorization`` header when the ``GITHUB_TOKEN`` environment
    variable is set.

    Args:
        owner: Repository owner.
        repo: Repository name.
        client: HTTP client used for the request.

    Returns:
        ``(metrics, new_url)``. ``metrics`` holds ``github_stars``,
        ``github_forks`` and ``github_last_update``. ``new_url`` is the
        ``html_url`` from the response when a 301 redirect was followed,
        otherwise None. On any request or payload error the result is
        ``({}, None)`` and the error is printed.
    """
    headers = {}
    if github_token := os.environ.get("GITHUB_TOKEN"):
        headers["Authorization"] = f"token {github_token}"

    api_url = f"https://api.github.com/repos/{owner}/{repo}"
    try:
        response = client.get(
            api_url,
            headers=headers,
            timeout=10.0,
            follow_redirects=True,
        )
        # Check the status before touching the body: error responses are not
        # guaranteed to carry the repository payload.
        response.raise_for_status()
        data = response.json()
        if not isinstance(data, dict):
            raise ValueError("unexpected response payload")

        # A 301 in the redirect chain means the repository has moved; the
        # final payload then describes it at its new location.
        new_url = None
        if any(r.status_code == 301 for r in response.history):
            new_url = data.get("html_url")
            if new_url:
                print(
                    f"[yellow]Repository moved: {owner}/{repo} -> {new_url}[/yellow]"
                )

        return {
            "github_stars": data["stargazers_count"],
            "github_forks": data["forks_count"],
            "github_last_update": data["updated_at"],
        }, new_url

    except (httpx.HTTPError, ValueError, KeyError) as e:
        # ValueError covers undecodable JSON, KeyError a payload without the
        # expected fields; neither should abort the caller's batch run.
        print(f"[red]Error fetching GitHub metrics for {owner}/{repo}: {str(e)}[/red]")
        return {}, None
|
||
|
||
def load_project(file_path: Path) -> Project | None:
    """Build a Project from the frontmatter of a Markdown file.

    Returns None, after printing the error, when the file cannot be read or
    its metadata does not form a valid Project.
    """
    try:
        metadata = frontmatter.load(file_path).metadata
        project = Project(**metadata)
    except Exception as err:
        print(f"[red]Error loading project from {file_path}: {str(err)}[/red]")
        return None
    return project
|
||
|
||
@app.command()
def parse(readme_path: Path = Path("README.md"), output_dir: str = "_projects"):
    """
    Parse local Awesome Django README and create individual project files with frontmatter.
    Preserves existing file content and metadata while updating with new information from README.
    """
    # NOTE: the docstring above is left as-is on purpose; Typer uses it as the
    # command's help text.
    if not readme_path.exists():
        print(f"[red]Error: README file not found at {readme_path}[/red]")
        raise typer.Exit(1)

    print(f"[bold blue]Reading README from {readme_path}...[/bold blue]")

    # parents=True so a nested output directory (e.g. "site/_projects") is
    # created instead of failing with FileNotFoundError.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Render the README to HTML and pull the project entries out of it.
    projects = parse_readme(read_readme(readme_path))

    print(f"[green]Found {len(projects)} projects[/green]")

    # One Markdown file per project, named after its slug.
    for project in projects:
        save_project(project, output_path)
        print(f"[green]Updated {project.name} in {project.slug}.md[/green]")
|
||
|
||
@app.command()
def update_metrics(projects_dir: Path = Path("_projects"), batch_size: int = 50):
    """
    Update GitHub metrics (stars, forks, last update) for all projects.
    """
    # NOTE: the docstring above is left as-is on purpose; Typer uses it as the
    # command's help text.
    if not projects_dir.exists():
        print(f"[red]Error: Projects directory not found at {projects_dir}[/red]")
        raise typer.Exit(1)

    # range() raises on a zero step and yields nothing for a negative one, so
    # reject both up front instead of crashing or silently doing no work.
    if batch_size < 1:
        print(f"[red]Error: batch_size must be at least 1, got {batch_size}[/red]")
        raise typer.Exit(1)

    print(
        f"[bold blue]Updating GitHub metrics for projects in {projects_dir}...[/bold blue]"
    )

    # Load every project file that parses; load_project reports the failures.
    projects = []
    for file in projects_dir.glob("*.md"):
        if project := load_project(file):
            projects.append((file, project))

    print(f"[green]Found {len(projects)} projects to update[/green]")

    # Projects are walked in batches; the progress bar advances once per batch.
    with httpx.Client() as client:
        for i in track(
            range(0, len(projects), batch_size), description="Updating projects"
        ):
            for _file_path, project in projects[i : i + batch_size]:
                github_info = extract_github_info(project.url)
                if not github_info:
                    # Not hosted on GitHub: nothing to fetch.
                    continue

                metrics, new_url = get_github_metrics(
                    github_info["owner"], github_info["repo"], client
                )
                if not metrics:
                    # The fetch failed and was already reported.
                    continue

                for key, value in metrics.items():
                    setattr(project, key, value)

                # Follow the repository to its new home, remembering the old
                # URL once (previous_urls is a declared field, so it always
                # exists on the model).
                if new_url and new_url != project.url:
                    if project.url not in project.previous_urls:
                        project.previous_urls.append(project.url)
                    project.url = new_url
                    print(
                        f"[yellow]Updated URL for {project.name}: {project.url}[/yellow]"
                    )

                # NOTE(review): save_project derives the file name from the
                # slug, not from the file the project was loaded from; confirm
                # the two always match.
                save_project(project, projects_dir)
                print(f"[green]Updated metrics for {project.name}[/green]")

    print("[bold blue]Finished updating GitHub metrics![/bold blue]")
|
||
|
||
# Hand control to the Typer app when the file is executed as a script.
if __name__ == "__main__":
    app()