-
Notifications
You must be signed in to change notification settings - Fork 1
/
reqmul.py
43 lines (38 loc) · 1.23 KB
/
reqmul.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import subprocess
import os
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
d1 = pd.read_csv('disease_links.csv')
base_url = "https://rarediseases.info.nih.gov/"
links = []
for i in range(5910):
sub_url = d1.loc[i, "link"]
url_link = base_url + sub_url
links.append(url_link)
def download_page(link):
# Get the file name from the end of the URL
file_name = link.split('/')[-1]
# Make sure the file name is not empty
if file_name:
file_path = os.path.join('pages', file_name)
else:
file_path = os.path.join('pages', 'index.html')
# Check if the file already exists
if os.path.exists(file_path):
print('~', end="")
return
response = requests.get(link)
if response.status_code == 200:
# Save the web page to the specified directory
with open(file_path, 'w') as f:
f.write(response.text)
elif response.status_code == 502:
print("Uh oh! a server side restriction, perhaps an IP ban.")
else:
print(f'Error downloading {link}: {response.status_code}')
# Create a thread pool with 16 threads
with ThreadPoolExecutor(max_workers=16) as executor:
# Submit a task to download each web page
for link in links:
executor.submit(download_page, link)