add transformers #21

Merged 1 commit on Feb 4, 2025
1 change: 1 addition & 0 deletions README.md
@@ -14,6 +14,7 @@
- [Jax](https://jax.readthedocs.io/en/latest/jax.html)
- [Ray](https://docs.ray.io/en/latest/genindex.html)
- [Langchain](https://api.python.langchain.com/en/latest/langchain_api_reference.html)
- [Hugging Face Transformers](https://huggingface.co/docs/transformers/index)

## Install
Check out the [Releases](https://github.com/lsgrep/mldocs/releases), download the latest `mldocs.alfredworkflow`,
274 changes: 270 additions & 4 deletions crawler/generate_ml_keywords.py
@@ -1,9 +1,11 @@
import json
import re
from pathlib import Path
from typing import Dict, Any

import requests
import yaml
from bs4 import BeautifulSoup

data_dir = f'{str(Path(__file__).resolve().parent.parent)}/data'

@@ -73,23 +75,287 @@ def parse_generated_docs(link, pattern=None):
        doc_url = f'{base_url}/{href}'
        metadata = {'url': doc_url}
        data[k] = metadata
    return data


def parse_huggingface_main_classes() -> Dict[str, Any]:
    """Parse Hugging Face Transformers main API classes documentation.

    Returns:
        Dictionary mapping class names to their documentation URLs
    """
    data = {}
    base_domain = 'https://huggingface.co'

    # Get main class pages from the API reference page
    print('\nFetching main classes from API reference...')
    resp = requests.get(f'{base_domain}/docs/transformers/main_classes/configuration')
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Find the sidebar navigation
    main_class_pages = []
    for link in soup.select('a'):
        href = link.get('href', '')
        if '/main_classes/' in href:
            page_name = href.split('/')[-1]
            if page_name and page_name not in main_class_pages:
                main_class_pages.append(page_name)

    # Remove duplicates and sort
    main_class_pages = sorted(set(main_class_pages))

    print(f'\nFound main class pages: {main_class_pages}')
    print(f'Processing {len(main_class_pages)} main class pages...')

    # Process each main class page
    for page in main_class_pages:
        page_url = f'{base_domain}/docs/transformers/main_classes/{page}'

        try:
            resp = requests.get(page_url)
            soup = BeautifulSoup(resp.text, 'html.parser')

            # Find all class sections (h2 and h3 headers)
            class_sections = soup.find_all(['h2', 'h3'])
            current_class = None

            # Track links for summary
            page_links = set()
            page_classes = set()

            # Process all links first
            for a in soup.find_all('a'):
                href = a.get('href', '')
                if href and '#transformers.' in href:
                    page_links.add(href)

            # Process class sections
            for section in class_sections:
                # Look for class name in the text
                class_match = re.search(r'class transformers\.(\w+)', section.text)
                if class_match:
                    current_class = class_match.group(1)
                    class_key = f'transformers.{current_class}'
                    page_classes.add(class_key)

                    data[class_key] = {
                        'url': f'{page_url}#transformers.{current_class}'
                    }
                    continue

                section_id = section.get('id', '')
                if not section_id or section_id.startswith('_') or section_id.endswith('_'):
                    continue

                # If we're in a class context, this might be a method
                if current_class:
                    method_key = f'transformers.{current_class}.{section_id}'
                    data[method_key] = {
                        'url': f'{page_url}#transformers.{current_class}.{section_id}'
                    }

            # Print summary for this page
            print(f'\n{page.upper()}:')
            if page_classes:
                print(f'  Classes ({len(page_classes)}): {sorted(page_classes)}')
            print(f'  Total links: {len(page_links)}')

        except Exception as e:
            print(f'Error processing {page_url}: {str(e)}')
            continue

    return data


def parse_huggingface_docs(base_url: str, test_mode: bool = False) -> Dict[str, Any]:
    """Parse Hugging Face Transformers documentation.

    Args:
        base_url: The base URL of the Hugging Face documentation
        test_mode: If True, only crawl the ZoeDepth model page (quick smoke test)

    Returns:
        Dictionary mapping function/class names to their documentation URLs
    """
    data = {}
    base_domain = 'https://huggingface.co'

    # Get the main page content
    resp = requests.get(base_url)
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Find all model documentation links
    model_links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and 'model_doc' in href:
            # Handle absolute, protocol-relative, and relative URLs
            if href.startswith('http'):
                full_url = href
            elif href.startswith('//'):
                full_url = f'https:{href}'
            elif href.startswith('/'):
                full_url = f'{base_domain}{href}'
            else:
                # Construct URL relative to the docs base path
                docs_base = '/'.join(base_url.split('/')[:-1])
                full_url = f'{docs_base}/{href}'
            model_links.append(full_url)

    # For testing, only process ZoeDepth
    if test_mode:
        model_links = ['https://huggingface.co/docs/transformers/model_doc/zoedepth']

    # Report how many model pages were found (once, before the crawl loop)
    print('Found model links:', len(model_links))

    # Process each model's documentation
    for model_url in model_links:
        try:
            print(f'Processing {model_url}')
            model_resp = requests.get(model_url)
            model_soup = BeautifulSoup(model_resp.text, 'html.parser')

            # Extract model name from URL
            model_name = model_url.split('/')[-1].replace('-', '_')

            # Add the main model entry
            model_key = f'transformers.{model_name}'
            print(f'Adding model entry: {model_key}')
            data[model_key] = {
                'url': model_url
            }

            # Find all class definitions (h2 and h3 headers)
            class_sections = model_soup.find_all(['h2', 'h3'])
            current_class = None

            for section in class_sections:
                # Look for the class name in the text
                class_match = re.search(r'class transformers\.([\w]+)', section.text)
                if class_match:
                    current_class = class_match.group(1)
                    class_key = f'transformers.{model_name}.{current_class}'
                    print(f'Found class: {class_key}')

                    data[class_key] = {
                        'url': f'{model_url}#transformers.{current_class}'
                    }
                    continue

                section_id = section.get('id', '')
                if not section_id or section_id.startswith('_') or section_id.endswith('_'):
                    continue

                # An id-bearing h2/h3 header inside a class context starts a
                # new class-level section; rebind current_class to its id
                if current_class:
                    current_class = section_id
                    class_key = f'transformers.{model_name}.{current_class}'
                    print(f'Found class: {class_key}')

                    # Get class description
                    desc = ''
                    next_p = section.find_next('p')
                    if next_p:
                        desc = next_p.text

                    data[class_key] = {
                        'url': f'{model_url}#{section_id}',
                        'desc': desc
                    }

                    # Find and add all methods in this class
                    method_sections = section.find_next(['h4', 'h5'])
                    while method_sections and method_sections.find_previous(['h2', 'h3']) == section:
                        method_id = method_sections.get('id', '')
                        if method_id and not method_id.startswith('_'):
                            method_key = f'{class_key}.{method_id}'

                            # Get method description and parameters
                            desc = ''
                            params = []
                            next_elem = method_sections.find_next(['p', 'ul'])
                            while next_elem and next_elem.name in ['p', 'ul']:
                                if next_elem.name == 'p':
                                    desc += next_elem.text + ' '
                                elif next_elem.name == 'ul':
                                    for li in next_elem.find_all('li'):
                                        params.append(li.text)
                                next_elem = next_elem.find_next(['p', 'ul'])

                            data[method_key] = {
                                'url': f'{model_url}#{method_id}',
                                'desc': desc.strip(),
                                'params': params
                            }

                        method_sections = method_sections.find_next(['h4', 'h5'])

                # Unreachable as written: the `if current_class:` branch above
                # matches the same condition, so this elif never fires
                elif current_class:
                    method_key = f'transformers.{model_name}.{current_class}.{section_id}'

                    data[method_key] = {
                        'url': f'{model_url}#transformers.{current_class.split(".")[-1]}.{section_id}'
                    }

        except Exception as e:
            print(f'Error processing {model_url}: {str(e)}')
            continue

    return data


if __name__ == '__main__':
    # Load existing data if it exists
    doc_file = f'{data_dir}/ml.json'
    try:
        with open(doc_file, 'r') as f:
            data = json.load(f)
        print(f'Loaded {len(data)} existing entries')
    except FileNotFoundError:
        data = {}

    # Add base keywords if not present
    base_data = prepare_base_keywords()
    for key, value in base_data.items():
        if key not in data:
            data[key] = value

    seed_file = f'{data_dir}/seed.yaml'
    seed = load_seed_file(seed_file)

    # Process TensorFlow docs
    for tensorflow_doc in seed['tensorflow']:
        print(f'processing: {tensorflow_doc["name"]}')
        crawled = parse_tf_docs(tensorflow_doc['url'], tensorflow_doc['prefix'])
        data.update(crawled)

    # Process generated docs
    for api_doc in seed['generated']:
        print(f'processing: {api_doc["name"]}')
        doc_url = api_doc['url']

        # Special handling for Hugging Face Transformers
        if api_doc['name'] == 'transformers':
            print('Processing Hugging Face Transformers documentation...')
            crawled = parse_huggingface_docs(doc_url, test_mode=False)
            print(f'Crawled model data keys: {list(crawled.keys())}')
            # Preserve Hugging Face model entries
            for key, value in crawled.items():
                data[key] = value

            print('\nProcessing Hugging Face main API classes...')
            main_classes = parse_huggingface_main_classes()
            print('\nMain API Classes found:')
            for key, value in main_classes.items():
                print(f'  {key} -> {value["url"]}')
            # Add main API class entries
            for key, value in main_classes.items():
                data[key] = value
        else:
            data.update(parse_generated_docs(doc_url))

    doc_file = f'{data_dir}/ml.json'
    print(f'Writing {len(data)} entries to {doc_file}')
    print('Sample keys:', list(data.keys())[:5], '...')
    print('Hugging Face keys:', [k for k in data.keys() if k.startswith('transformers.')])
    with open(doc_file, 'w') as f:
        json.dump(data, f, indent=2)
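
For local review, a quick smoke test of the two new crawler functions might look like the sketch below. This is a hypothetical helper script, not part of this PR; it assumes requests and beautifulsoup4 are installed and that generate_ml_keywords.py is on the import path.

# smoke_test.py — illustrative sketch only, not included in this PR
from generate_ml_keywords import parse_huggingface_docs, parse_huggingface_main_classes

# test_mode=True restricts the crawl to the ZoeDepth page, as hardcoded above
models = parse_huggingface_docs('https://huggingface.co/docs/transformers/index',
                                test_mode=True)
print(f'{len(models)} model entries, e.g. {sorted(models)[:3]}')

# Crawl the main_classes reference pages discovered from the sidebar
main_classes = parse_huggingface_main_classes()
print(f'{len(main_classes)} main-class entries')

# Every entry should carry a documentation URL
assert all('url' in v for v in models.values())
assert all('url' in v for v in main_classes.values())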
69,720 changes: 69,719 additions & 1 deletion data/ml.json

Large diffs are not rendered by default.
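
The ml.json diff itself is not rendered, but from the crawler code above the new entries presumably take roughly this shape (illustrative values only, not copied from the actual file):

# Approximate entry shape implied by the crawler (values are illustrative)
ml_json_sample = {
    'transformers.zoedepth': {
        'url': 'https://huggingface.co/docs/transformers/model_doc/zoedepth',
    },
    'transformers.zoedepth.ZoeDepthConfig': {
        'url': 'https://huggingface.co/docs/transformers/model_doc/zoedepth#transformers.ZoeDepthConfig',
        'desc': '...',      # first paragraph after the class header
        'params': ['...'],  # list items following the method header
    },
}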

4 changes: 3 additions & 1 deletion data/seed.yaml
@@ -36,4 +36,6 @@ generated: # API docs those are generated by tools
- name: 'xgboost'
  url: 'https://xgboost.readthedocs.io/en/stable/genindex.html'
- name: 'langchain'
  url: 'https://api.python.langchain.com/en/latest/langchain_api_reference.html'
- name: 'transformers'
  url: 'https://huggingface.co/docs/transformers/index'
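
With the transformers entry in seed.yaml, rerunning the crawler should fold the Hugging Face keys into data/ml.json. A reviewer could spot-check the output with a few lines like these (hypothetical check, assuming the crawl has already been run):

import json

with open('data/ml.json') as f:
    data = json.load(f)

# All new entries share the 'transformers.' prefix added by the crawler
hf_keys = [k for k in data if k.startswith('transformers.')]
print(f'{len(hf_keys)} transformers entries')
print(data['transformers.zoedepth']['url'])  # assumes the zoedepth page was crawled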