import json
import re
from pathlib import Path
+from typing import Dict, Any

import requests
import yaml
+from bs4 import BeautifulSoup

data_dir = f'{str(Path(__file__).resolve().parent.parent)}/data'

@@ -73,23 +75,287 @@ def parse_generated_docs(link, pattern=None):
        doc_url = f'{base_url}/{href}'
        metadata = {'url': doc_url}
        data[k] = metadata
+    return data
+
+
+def parse_huggingface_main_classes() -> Dict[str, Any]:
+    """Parse Hugging Face Transformers main API classes documentation.
+
+    Returns:
+        Dictionary mapping class names to their documentation URLs
+    """
+    data = {}
+    base_domain = 'https://huggingface.co'
+
+    # Get main class pages from the API reference page
+    print('\nFetching main classes from API reference...')
+    resp = requests.get(f'{base_domain}/docs/transformers/main_classes/configuration')
+    soup = BeautifulSoup(resp.text, 'html.parser')
+
+    # Find the sidebar navigation
+    main_class_pages = []
+    for link in soup.select('a'):
+        href = link.get('href', '')
+        if '/main_classes/' in href:
+            page_name = href.split('/')[-1]
+            if page_name and page_name not in main_class_pages:
+                main_class_pages.append(page_name)
+
+    # Remove duplicates and sort
+    main_class_pages = sorted(set(main_class_pages))
+
+    print(f'\nFound main class pages: {main_class_pages}')
+    print(f'Processing {len(main_class_pages)} main class pages...')
+
+    # Process each main class page
+    for page in main_class_pages:
+        page_url = f'{base_domain}/docs/transformers/main_classes/{page}'
+
+        try:
+            resp = requests.get(page_url)
+            soup = BeautifulSoup(resp.text, 'html.parser')
+
+            # Find all class sections (h2 and h3 headers)
+            class_sections = soup.find_all(['h2', 'h3'])
+            current_class = None
+
+            # Track links for summary
+            page_links = set()
+            page_classes = set()
+
+            # Process all links first
+            for a in soup.find_all('a'):
+                href = a.get('href', '')
+                if href and '#transformers.' in href:
+                    page_links.add(href)
+
+            # Process class sections
+            for section in class_sections:
+                # Look for class name in the text
+                class_match = re.search(r'class transformers\.(\w+)', section.text)
+                if class_match:
+                    current_class = class_match.group(1)
+                    class_key = f'transformers.{current_class}'
+                    page_classes.add(class_key)
+
+                    data[class_key] = {
+                        'url': f'{page_url}#transformers.{current_class}'
+                    }
+                    continue
+
+                section_id = section.get('id', '')
+                if not section_id or section_id.startswith('_') or section_id.endswith('_'):
+                    continue
+
+                # If we're in a class context, this might be a method
+                if current_class:
+                    method_key = f'transformers.{current_class}.{section_id}'
+                    data[method_key] = {
+                        'url': f'{page_url}#transformers.{current_class}.{section_id}'
+                    }
+
+            # Print summary for this page
+            print(f'\n{page.upper()}:')
+            if page_classes:
+                print(f' Classes ({len(page_classes)}): {sorted(page_classes)}')
+            print(f' Total links: {len(page_links)}')
+
+        except Exception as e:
+            print(f'Error processing {page_url}: {str(e)}')
+            continue
+
+    return data

+def parse_huggingface_docs(base_url: str, test_mode: bool = False) -> Dict[str, Any]:
+    """Parse Hugging Face Transformers documentation.
+
+    Args:
+        base_url: The base URL of the Hugging Face documentation
+        test_mode: If True, only crawl the ZoeDepth model page (for quick testing)
+
+    Returns:
+        Dictionary mapping function/class names to their documentation URLs
+    """
+    data = {}
+    base_domain = 'https://huggingface.co'
+
+    # Get the main page content
+    resp = requests.get(base_url)
+    soup = BeautifulSoup(resp.text, 'html.parser')
+
+    # Find all model documentation links
+    model_links = []
+    for link in soup.find_all('a'):
+        href = link.get('href')
+        if href and 'model_doc' in href:
+            # Handle both absolute and relative URLs
+            if href.startswith('http'):
+                full_url = href
+            elif href.startswith('//'):
+                full_url = f'https:{href}'
+            else:
+                # Handle relative paths
+                if href.startswith('/'):
+                    full_url = f'{base_domain}{href}'
+                else:
+                    # Construct URL relative to the docs base path
+                    docs_base = '/'.join(base_url.split('/')[:-1])
+                    full_url = f'{docs_base}/{href}'
+            model_links.append(full_url)
+
+    # For testing, only process ZoeDepth
+    if test_mode:
+        model_links = ['https://huggingface.co/docs/transformers/model_doc/zoedepth']
+
+    # Process each model's documentation
+    for model_url in model_links:
+        try:
+            print(f'Processing {model_url}')
+            print('Found model links:', len(model_links))
+            model_resp = requests.get(model_url)
+            model_soup = BeautifulSoup(model_resp.text, 'html.parser')
+
+            # Extract model name from URL
+            model_name = model_url.split('/')[-1].replace('-', '_')
+
+            # Add the main model entry
+            model_key = f'transformers.{model_name}'
+            print(f'Adding model entry: {model_key}')
+            data[model_key] = {
+                'url': model_url
+            }
+
+            # Find all class definitions (h2 and h3 headers)
+            class_sections = model_soup.find_all(['h2', 'h3'])
+            current_class = None
+
+            for section in class_sections:
+                # Look for the class name in the text
+                class_match = re.search(r'class transformers\.([\w]+)', section.text)
+                if class_match:
+                    current_class = class_match.group(1)
+                    class_key = f'transformers.{model_name}.{current_class}'
+                    print(f'Found class: {class_key}')
+
+                    data[class_key] = {
+                        'url': f'{model_url}#transformers.{current_class.split(".")[-1]}'
+                    }
+                    continue
+
+                section_id = section.get('id', '')
+                if not section_id or section_id.startswith('_') or section_id.endswith('_'):
+                    continue
+
+                # Already inside a class context: treat this header as a new
+                # class-level section and collect the methods that follow it
+                if current_class:
+                    current_class = section_id
+                    class_key = f'transformers.{model_name}.{current_class}'
+                    print(f'Found class: {class_key}')
+
+                    # Get class description
+                    desc = ''
+                    next_p = section.find_next('p')
+                    if next_p:
+                        desc = next_p.text
+
+                    data[class_key] = {
+                        'url': f'{model_url}#{section_id}',
+                        'desc': desc
+                    }
+
+                    # Find and add all methods in this class
+                    method_sections = section.find_next(['h4', 'h5'])
+                    while method_sections and method_sections.find_previous(['h2', 'h3']) == section:
+                        method_id = method_sections.get('id', '')
+                        if method_id and not method_id.startswith('_'):
+                            method_key = f'{class_key}.{method_id}'
+
+                            # Get method description and parameters
+                            desc = ''
+                            params = []
+                            next_elem = method_sections.find_next(['p', 'ul'])
+                            while next_elem and next_elem.name in ['p', 'ul']:
+                                if next_elem.name == 'p':
+                                    desc += next_elem.text + ' '
+                                elif next_elem.name == 'ul':
+                                    for li in next_elem.find_all('li'):
+                                        params.append(li.text)
+                                next_elem = next_elem.find_next(['p', 'ul'])
+
+                            data[method_key] = {
+                                'url': f'{model_url}#{method_id}',
+                                'desc': desc.strip(),
+                                'params': params
+                            }
+
+                        method_sections = method_sections.find_next(['h4', 'h5'])
+
+                # If we're in a class context, this might be a method
+                elif current_class:
+                    method_key = f'transformers.{model_name}.{current_class}.{section_id}'
+
+                    data[method_key] = {
+                        'url': f'{model_url}#transformers.{current_class.split(".")[-1]}.{section_id}'
+                    }
+
+        except Exception as e:
+            print(f'Error processing {model_url}: {str(e)}')
+            continue
+
    return data


if __name__ == '__main__':
-    data = prepare_base_keywords()
+    # Load existing data if it exists
+    doc_file = f'{data_dir}/ml.json'
+    try:
+        with open(doc_file, 'r') as f:
+            data = json.load(f)
+        print(f'Loaded {len(data)} existing entries')
+    except FileNotFoundError:
+        data = {}
+
+    # Add base keywords if not present
+    base_data = prepare_base_keywords()
+    for key, value in base_data.items():
+        if key not in data:
+            data[key] = value
+
    seed_file = f'{data_dir}/seed.yaml'
    seed = load_seed_file(seed_file)
+
+    # Process TensorFlow docs
    for tensorflow_doc in seed['tensorflow']:
        print(f'processing: {tensorflow_doc["name"]}')
        crawled = parse_tf_docs(tensorflow_doc['url'], tensorflow_doc['prefix'])
        data.update(crawled)
+
+    # Process generated docs
    for api_doc in seed['generated']:
        print(f'processing: {api_doc["name"]}')
        doc_url = api_doc['url']
-        data.update(parse_generated_docs(doc_url))
+
+        # Special handling for Hugging Face Transformers
+        if api_doc['name'] == 'transformers':
+            print('Processing Hugging Face Transformers documentation...')
+            crawled = parse_huggingface_docs(doc_url, test_mode=False)
+            print(f'Crawled model data keys: {list(crawled.keys())}')
+            # Preserve Hugging Face model entries
+            for key, value in crawled.items():
+                data[key] = value
+
+            print('\nProcessing Hugging Face main API classes...')
+            main_classes = parse_huggingface_main_classes()
+            print('\nMain API Classes found:')
+            for key, value in main_classes.items():
+                print(f' {key} -> {value["url"]}')
+            # Add main API class entries
+            for key, value in main_classes.items():
+                data[key] = value
+        else:
+            data.update(parse_generated_docs(doc_url))

-    doc_file = f'{data_dir}/ml.json'
+    print(f'Writing {len(data)} entries to {doc_file}')
+    print('Sample keys:', list(data.keys())[:5], '...')
+    print('Hugging Face keys:', [k for k in data.keys() if k.startswith('transformers.')])
    with open(doc_file, 'w') as f:
-        json.dump(data, f)
+        json.dump(data, f, indent=2)
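
For reference, a minimal sketch (not part of this commit) of how the generated ml.json index might be consumed. It only assumes the flat name -> {'url': ...} layout that the script above writes, and that the reader script lives in the same package directory so the data path resolves the same way:

# Sketch: query the crawled documentation index produced by the script above.
import json
from pathlib import Path

# Same data directory convention as the crawler (assumption: identical repo layout).
data_dir = f'{str(Path(__file__).resolve().parent.parent)}/data'

with open(f'{data_dir}/ml.json') as f:
    index = json.load(f)

# Every entry carries at least a 'url'; entries crawled by parse_huggingface_docs
# may additionally carry 'desc' and 'params'.
for name, meta in list(index.items())[:5]:
    print(name, '->', meta['url'])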