Commit 4f996c0

Merge pull request #21 from lsgrep/add-transformers
add transformers
2 parents 6314162 + c0a7923 commit 4f996c0

File tree (4 files changed, +69993 -6 lines changed):

- README.md
- crawler/generate_ml_keywords.py
- data/ml.json
- data/seed.yaml

README.md

Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@
 - [Jax](https://jax.readthedocs.io/en/latest/jax.html)
 - [Ray](https://docs.ray.io/en/latest/genindex.html)
 - [Langchain](https://api.python.langchain.com/en/latest/langchain_api_reference.html)
+- [Hugging Face Transformers](https://huggingface.co/docs/transformers/index)
 
 ## Install
 Checkout the [Releases](https://github.com/lsgrep/mldocs/releases), download the latest `mldocs.alfredworkflow`,

crawler/generate_ml_keywords.py

Lines changed: 270 additions & 4 deletions

@@ -1,9 +1,11 @@
 import json
 import re
 from pathlib import Path
+from typing import Dict, Any
 
 import requests
 import yaml
+from bs4 import BeautifulSoup
 
 data_dir = f'{str(Path(__file__).resolve().parent.parent)}/data'
 

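Both functions added in the next hunk follow the pattern these imports enable: fetch a page with requests, parse it with BeautifulSoup's html.parser, and harvest anchor hrefs. A minimal standalone sketch of that pattern (the URL is one the new code actually crawls; everything else here is illustrative, not part of the commit):

    import requests
    from bs4 import BeautifulSoup

    # Fetch one docs page and collect every link that points at a
    # main_classes page, mirroring the discovery step in the new crawler code.
    resp = requests.get('https://huggingface.co/docs/transformers/main_classes/configuration')
    soup = BeautifulSoup(resp.text, 'html.parser')
    hrefs = {a.get('href', '') for a in soup.find_all('a')}
    print(sorted(h for h in hrefs if '/main_classes/' in h))
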
@@ -73,23 +75,287 @@ def parse_generated_docs(link, pattern=None):
         doc_url = f'{base_url}/{href}'
         metadata = {'url': doc_url}
         data[k] = metadata
+    return data
+
+
+def parse_huggingface_main_classes() -> Dict[str, Any]:
+    """Parse Hugging Face Transformers main API classes documentation.
+
+    Returns:
+        Dictionary mapping class names to their documentation URLs
+    """
+    data = {}
+    base_domain = 'https://huggingface.co'
+
+    # Get main class pages from the API reference page
+    print('\nFetching main classes from API reference...')
+    resp = requests.get(f'{base_domain}/docs/transformers/main_classes/configuration')
+    soup = BeautifulSoup(resp.text, 'html.parser')
+
+    # Find the sidebar navigation
+    main_class_pages = []
+    for link in soup.select('a'):
+        href = link.get('href', '')
+        if '/main_classes/' in href:
+            page_name = href.split('/')[-1]
+            if page_name and page_name not in main_class_pages:
+                main_class_pages.append(page_name)
+
+    # Remove duplicates and sort
+    main_class_pages = sorted(set(main_class_pages))
+
+    print(f'\nFound main class pages: {main_class_pages}')
+    print(f'Processing {len(main_class_pages)} main class pages...')
+
+    # Process each main class page
+    for page in main_class_pages:
+        page_url = f'{base_domain}/docs/transformers/main_classes/{page}'
+
+        try:
+            resp = requests.get(page_url)
+            soup = BeautifulSoup(resp.text, 'html.parser')
+
+            # Find all class sections (h2 and h3 headers)
+            class_sections = soup.find_all(['h2', 'h3'])
+            current_class = None
+
+            # Track links for summary
+            page_links = set()
+            page_classes = set()
+
+            # Process all links first
+            for a in soup.find_all('a'):
+                href = a.get('href', '')
+                if href and '#transformers.' in href:
+                    page_links.add(href)
+
+            # Process class sections
+            for section in class_sections:
+                # Look for class name in the text
+                class_match = re.search(r'class transformers\.(\w+)', section.text)
+                if class_match:
+                    current_class = class_match.group(1)
+                    class_key = f'transformers.{current_class}'
+                    page_classes.add(class_key)
+
+                    data[class_key] = {
+                        'url': f'{page_url}#transformers.{current_class}'
+                    }
+                    continue
+
+                section_id = section.get('id', '')
+                if not section_id or section_id.startswith('_') or section_id.endswith('_'):
+                    continue
+
+                # If we're in a class context, this might be a method
+                if current_class:
+                    method_key = f'transformers.{current_class}.{section_id}'
+                    data[method_key] = {
+                        'url': f'{page_url}#transformers.{current_class}.{section_id}'
+                    }
+
+            # Print summary for this page
+            print(f'\n{page.upper()}:')
+            if page_classes:
+                print(f'  Classes ({len(page_classes)}): {sorted(page_classes)}')
+            print(f'  Total links: {len(page_links)}')
+
+        except Exception as e:
+            print(f'Error processing {page_url}: {str(e)}')
+            continue
+
+    return data
 
+def parse_huggingface_docs(base_url: str, test_mode: bool = False) -> Dict[str, Any]:
+    """Parse Hugging Face Transformers documentation.
+
+    Args:
+        base_url: The base URL of the Hugging Face documentation
+
+    Returns:
+        Dictionary mapping function/class names to their documentation URLs
+    """
+    data = {}
+    base_domain = 'https://huggingface.co'
+
+    # Get the main page content
+    resp = requests.get(base_url)
+    soup = BeautifulSoup(resp.text, 'html.parser')
+
+    # Find all model documentation links
+    model_links = []
+    for link in soup.find_all('a'):
+        href = link.get('href')
+        if href and 'model_doc' in href:
+            # Handle both absolute and relative URLs
+            if href.startswith('http'):
+                full_url = href
+            elif href.startswith('//'):
+                full_url = f'https:{href}'
+            else:
+                # Handle relative paths
+                if href.startswith('/'):
+                    full_url = f'{base_domain}{href}'
+                else:
+                    # Construct URL relative to the docs base path
+                    docs_base = '/'.join(base_url.split('/')[:-1])
+                    full_url = f'{docs_base}/{href}'
+            model_links.append(full_url)
+
+    # For testing, only process ZoeDepth
+    if test_mode:
+        model_links = ['https://huggingface.co/docs/transformers/model_doc/zoedepth']
+
+    # Process each model's documentation
+    for model_url in model_links:
+        try:
+            print(f'Processing {model_url}')
+            print('Found model links:', len(model_links))
+            model_resp = requests.get(model_url)
+            model_soup = BeautifulSoup(model_resp.text, 'html.parser')
+
+            # Extract model name from URL
+            model_name = model_url.split('/')[-1].replace('-', '_')
+
+            # Add the main model entry
+            model_key = f'transformers.{model_name}'
+            print(f'Adding model entry: {model_key}')
+            data[model_key] = {
+                'url': model_url
+            }
+
+            # Find all class definitions (h2 and h3 headers)
+            class_sections = model_soup.find_all(['h2', 'h3'])
+            current_class = None
+
+            for section in class_sections:
+                # Look for the class name in the text
+                class_match = re.search(r'class transformers\.([\w]+)', section.text)
+                if class_match:
+                    current_class = class_match.group(1)
+                    class_key = f'transformers.{model_name}.{current_class}'
+                    print(f'Found class: {class_key}')
+
+                    data[class_key] = {
+                        'url': f'{model_url}#transformers.{current_class.split(".")[-1]}'
+                    }
+                    continue
+
+                section_id = section.get('id', '')
+                if not section_id or section_id.startswith('_') or section_id.endswith('_'):
+                    continue
+
+                # If we're in a class context, this might be a method
+                if current_class:
+                    current_class = section_id
+                    class_key = f'transformers.{model_name}.{current_class}'
+                    print(f'Found class: {class_key}')
+
+                    # Get class description
+                    desc = ''
+                    next_p = section.find_next('p')
+                    if next_p:
+                        desc = next_p.text
+
+                    data[class_key] = {
+                        'url': f'{model_url}#{section_id}',
+                        'desc': desc
+                    }
+
+                    # Find and add all methods in this class
+                    method_sections = section.find_next(['h4', 'h5'])
+                    while method_sections and method_sections.find_previous(['h2', 'h3']) == section:
+                        method_id = method_sections.get('id', '')
+                        if method_id and not method_id.startswith('_'):
+                            method_key = f'{class_key}.{method_id}'
+
+                            # Get method description and parameters
+                            desc = ''
+                            params = []
+                            next_elem = method_sections.find_next(['p', 'ul'])
+                            while next_elem and next_elem.name in ['p', 'ul']:
+                                if next_elem.name == 'p':
+                                    desc += next_elem.text + ' '
+                                elif next_elem.name == 'ul':
+                                    for li in next_elem.find_all('li'):
+                                        params.append(li.text)
+                                next_elem = next_elem.find_next(['p', 'ul'])
+
+                            data[method_key] = {
+                                'url': f'{model_url}#{method_id}',
+                                'desc': desc.strip(),
+                                'params': params
+                            }
+
+                        method_sections = method_sections.find_next(['h4', 'h5'])
+
+                # If we're in a class context, this might be a method
+                elif current_class:
+                    method_key = f'transformers.{model_name}.{current_class}.{section_id}'
+
+                    data[method_key] = {
+                        'url': f'{model_url}#transformers.{current_class.split(".")[-1]}.{section_id}'
+                    }
+
+        except Exception as e:
+            print(f'Error processing {model_url}: {str(e)}')
+            continue
+
     return data
 
 
 if __name__ == '__main__':
-    data = prepare_base_keywords()
+    # Load existing data if it exists
+    doc_file = f'{data_dir}/ml.json'
+    try:
+        with open(doc_file, 'r') as f:
+            data = json.load(f)
+        print(f'Loaded {len(data)} existing entries')
+    except FileNotFoundError:
+        data = {}
+
+    # Add base keywords if not present
+    base_data = prepare_base_keywords()
+    for key, value in base_data.items():
+        if key not in data:
+            data[key] = value
+
     seed_file = f'{data_dir}/seed.yaml'
     seed = load_seed_file(seed_file)
+
+    # Process TensorFlow docs
     for tensorflow_doc in seed['tensorflow']:
         print(f'processing: {tensorflow_doc["name"]}')
         crawled = parse_tf_docs(tensorflow_doc['url'], tensorflow_doc['prefix'])
         data.update(crawled)
+
+    # Process generated docs
     for api_doc in seed['generated']:
         print(f'processing: {api_doc["name"]}')
         doc_url = api_doc['url']
-        data.update(parse_generated_docs(doc_url))
+
+        # Special handling for Hugging Face Transformers
+        if api_doc['name'] == 'transformers':
+            print('Processing Hugging Face Transformers documentation...')
+            crawled = parse_huggingface_docs(doc_url, test_mode=False)
+            print(f'Crawled model data keys: {list(crawled.keys())}')
+            # Preserve Hugging Face model entries
+            for key, value in crawled.items():
+                data[key] = value
+
+            print('\nProcessing Hugging Face main API classes...')
+            main_classes = parse_huggingface_main_classes()
+            print('\nMain API Classes found:')
+            for key, value in main_classes.items():
+                print(f'  {key} -> {value["url"]}')
+            # Add main API class entries
+            for key, value in main_classes.items():
+                data[key] = value
+        else:
+            data.update(parse_generated_docs(doc_url))
 
-    doc_file = f'{data_dir}/ml.json'
+    print(f'Writing {len(data)} entries to {doc_file}')
+    print('Sample keys:', list(data.keys())[:5], '...')
+    print('Hugging Face keys:', [k for k in data.keys() if k.startswith('transformers.')])
     with open(doc_file, 'w') as f:
-        json.dump(data, f)
+        json.dump(data, f, indent=2)
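
To spot-check the new parsers without regenerating data/ml.json, one could import and run them directly; test_mode=True restricts parse_huggingface_docs to the ZoeDepth page, as wired up in the code above. A hypothetical sketch (assumes it runs from the crawler/ directory with network access):

    # Not part of the commit: run the two new parsers standalone and
    # print a handful of the keyword -> URL entries they produce.
    from generate_ml_keywords import (parse_huggingface_docs,
                                      parse_huggingface_main_classes)

    models = parse_huggingface_docs('https://huggingface.co/docs/transformers/index',
                                    test_mode=True)  # only the ZoeDepth model page
    classes = parse_huggingface_main_classes()

    for key, meta in list({**models, **classes}.items())[:10]:
        print(key, '->', meta['url'])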

data/ml.json

Lines changed: 69719 additions & 1 deletion
Large diffs are not rendered by default.
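
The roughly 69k added lines are the crawled entries themselves. Judging from the crawler code above, each key is a dotted identifier mapped to a metadata dict: every entry carries a 'url', and model-page methods may also carry 'desc' and 'params'. An illustrative fragment, hand-written to show the shape rather than copied from the actual diff:

    ml_entries = {
        'transformers.zoedepth': {
            'url': 'https://huggingface.co/docs/transformers/model_doc/zoedepth',
        },
        'transformers.PretrainedConfig.from_pretrained': {
            'url': 'https://huggingface.co/docs/transformers/main_classes/configuration#transformers.PretrainedConfig.from_pretrained',
        },
    }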

data/seed.yaml

Lines changed: 3 additions & 1 deletion
@@ -36,4 +36,6 @@ generated: # API docs those are generated by tools
   - name: 'xgboost'
     url: 'https://xgboost.readthedocs.io/en/stable/genindex.html'
   - name: 'langchain'
-    url: 'https://api.python.langchain.com/en/latest/langchain_api_reference.html'
+    url: 'https://api.python.langchain.com/en/latest/langchain_api_reference.html'
+  - name: 'transformers'
+    url: 'https://huggingface.co/docs/transformers/index'
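
load_seed_file itself is not shown in this diff, but given the yaml import at the top of the crawler it presumably amounts to a yaml.safe_load call, and the generated list is exactly what the __main__ block iterates, branching on name == 'transformers'. A hypothetical sketch of that read path:

    import yaml

    # Hypothetical stand-in for load_seed_file, which is defined elsewhere
    # in generate_ml_keywords.py and untouched by this commit.
    def load_seed_file(path):
        with open(path) as f:
            return yaml.safe_load(f)

    seed = load_seed_file('data/seed.yaml')
    for api_doc in seed['generated']:
        print(api_doc['name'], api_doc['url'])  # 'transformers' now appears here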
