-
Notifications
You must be signed in to change notification settings - Fork 0
/
metadata_fetcher.py
149 lines (118 loc) · 4.41 KB
/
metadata_fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import re
from dotenv import load_dotenv
from datetime import datetime
from typing import Union, List
from pydantic import BaseModel, Field
import arxiv
from github import Github, GithubException
from urllib.parse import urlparse, urlunparse
# Load environment variables
load_dotenv(override=True)
class ArxivPaper(BaseModel):
    """Metadata for a single arXiv paper.

    Built by ``fetch_arxiv_data``. The full abstract is kept on the model
    for downstream use but excluded from serialized output.
    """
    title: str
    abstract: str = Field(exclude=True)  # This field won't be included in serialization
    authors: List[str]  # author display names, in paper order
    publish_date: datetime  # arXiv publication timestamp
    arxiv_url: str  # canonical /abs/ URL (cleaned, version suffix removed)
    fetch_date: datetime = Field(default_factory=datetime.now)  # when this record was fetched
    summary: str  # initialized to "" by fetch_arxiv_data; presumably filled in later — TODO confirm
class GithubRepo(BaseModel):
    """Metadata for a single GitHub repository.

    Built by ``fetch_github_data``. The processed README text is kept on
    the model for downstream use but excluded from serialized output.
    """
    title: str  # repository name (without owner)
    about: str  # repo description, or a placeholder when missing
    readme: str = Field(exclude=True)  # This field won't be included in serialization
    author: str  # owner login
    last_commit_date: datetime  # repo's pushed_at timestamp
    stars: int  # stargazer count at fetch time
    license: Union[str, None]  # license name, or None when the repo has no license
    github_url: str  # cleaned URL (owner/repo only, no query/fragment)
    fetch_date: datetime = Field(default_factory=datetime.now)  # when this record was fetched
    summary: str  # initialized to "" by fetch_github_data; presumably filled in later — TODO confirm
def fetch_arxiv_data(url: str) -> ArxivPaper:
    """Fetch metadata for an arXiv paper.

    Args:
        url: An arXiv URL in ``/abs/`` or ``/pdf/`` form; it is normalized
            via ``clean_url`` before the paper id is extracted.

    Returns:
        An ``ArxivPaper`` with ``summary`` set to ``""``.

    Raises:
        ValueError: If the URL is not a valid arXiv/GitHub URL (from
            ``clean_url``) or no paper matches the extracted id.
        RuntimeError: For any other failure, chained to the original cause.
    """
    try:
        url = clean_url(url)
        # The cleaned /abs/ URL ends with the bare paper id.
        paper_id = url.split("/")[-1]
        search = arxiv.Search(id_list=[paper_id])
        # NOTE(review): Search.results() is deprecated in recent releases of
        # the arxiv package in favor of arxiv.Client().results(search) —
        # confirm the pinned version before migrating.
        paper = next(search.results())
        return ArxivPaper(
            title=paper.title,
            abstract=paper.summary,
            authors=[author.name for author in paper.authors],
            publish_date=paper.published,
            arxiv_url=url,
            summary="",
        )
    except StopIteration:
        # id_list matched nothing; surface as a caller-facing ValueError.
        raise ValueError(f"No paper found for the given URL: {url}") from None
    except ValueError:
        # Propagate clean_url's invalid-URL error with its type intact
        # (consistent with fetch_github_data) instead of wrapping it.
        raise
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Error fetching arXiv data: {str(e)}") from e
def fetch_github_data(url: str) -> GithubRepo:
    """Fetch metadata for a GitHub repository.

    Args:
        url: A GitHub repository URL; it is normalized via ``clean_url``
            and truncated to the ``owner/repo`` path.

    Returns:
        A ``GithubRepo`` with ``summary`` set to ``""``.

    Raises:
        ValueError: If the URL is invalid or the repository does not exist.
        RuntimeError: For other GitHub API errors or unexpected failures,
            chained to the original cause.
    """
    try:
        # Unauthenticated client — subject to GitHub's low anonymous rate limit.
        g = Github()
        url = clean_url(url)
        # Extract "owner/repo" from the URL path.
        parsed_url = urlparse(url)
        repo_full_name = "/".join(parsed_url.path.strip("/").split("/")[:2])
        if not repo_full_name:
            raise ValueError("Invalid GitHub URL")
        repo = g.get_repo(repo_full_name)
        try:
            readme = repo.get_readme().decoded_content.decode()
            readme = process_markdown(readme)
        except GithubException:
            # Repo has no README; keep a placeholder instead of failing.
            readme = "No README available"
        return GithubRepo(
            title=repo.name,
            about=repo.description or "No description available",
            readme=readme,
            author=repo.owner.login,
            last_commit_date=repo.pushed_at,
            stars=repo.stargazers_count,
            license=repo.license.name if repo.license else None,
            github_url=url,
            summary="",
        )
    except GithubException as e:
        if e.status == 404:
            raise ValueError(f"GitHub repository not found: {url}") from e
        raise RuntimeError(f"GitHub API error: {str(e)}") from e
    except ValueError:
        # Propagate invalid-URL errors with their type intact instead of
        # wrapping them in RuntimeError below.
        raise
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Error fetching GitHub data: {str(e)}") from e
def clean_url(url: str) -> str:
    """Cleans the URL to meet the requirements.

    GitHub URLs are reduced to ``https://github.com/owner/repo``; arXiv
    URLs are converted from ``/pdf/`` to ``/abs/`` form with any trailing
    version suffix (e.g. ``v2``) removed. Query strings, params, and
    fragments are stripped in both cases.

    Raises:
        ValueError: If the URL is neither an arXiv nor a GitHub URL.
    """
    parts = urlparse(url)
    host = parts.netloc

    if "github.com" in host:
        # Keep only the first two path segments (username and repo name).
        segments = parts.path.strip("/").split("/")[:2]
        repo_path = "/".join(segments)
        cleaned = parts._replace(
            path=f"/{repo_path}", params="", query="", fragment=""
        )
        return urlunparse(cleaned)

    if "arxiv.org" in host:
        # Replace 'pdf' with 'abs' in the path and drop the version number.
        new_path = re.sub(r"v\d+$", "", parts.path.replace("/pdf/", "/abs/"))
        cleaned = parts._replace(path=new_path, params="", query="", fragment="")
        return urlunparse(cleaned)

    raise ValueError("Invalid URL. Must be an arXiv or GitHub URL.")
def process_markdown(markdown):
    """Reduce markdown to plain-ish text.

    Applies four regex passes in order: strip HTML tags, unwrap markdown
    links (keeping the link text), replace fenced code blocks with the
    placeholder "(code example)", and replace pipe-delimited table cells
    with the placeholder "(table)".
    """
    # Strip HTML tags.
    without_html = re.sub(r"<[^>]*>", "", markdown)
    # Unwrap markdown links, keeping only the link text.
    without_links = re.sub(r"\[(.*?)\]\(.*?\)", r"\1", without_html)
    # Collapse fenced code blocks into a placeholder.
    without_code = re.sub(
        r"```.*?```", "(code example)", without_links, flags=re.DOTALL
    )
    # Collapse pipe-delimited table spans into a placeholder.
    return re.sub(r"\|.*?\|", "(table)", without_code, flags=re.DOTALL)