-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebscrape_script.py
executable file
·70 lines (55 loc) · 2.39 KB
/
webscrape_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# In webscrape_script.py
import os
import requests
import importlib.util
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import argparse
# Function to download files from URLs
def download_files(urls, download_directory, timeout=30):
    """Download every URL in ``urls`` into ``download_directory``.

    Each file is named after the last component of the URL's *path*, so a
    query string or fragment (``.../file.ogg?token=abc``) does not end up in
    the file name. Response bodies are streamed to disk in 8 KiB chunks.

    Args:
        urls: Iterable of URL strings to fetch.
        download_directory: Directory that receives the files; it is not
            created here.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` so a
            stalled server cannot block the script indefinitely.

    Raises:
        ValueError: If a URL's path has no final component to use as a file
            name (for example, it ends with ``/``).
        requests.HTTPError: Propagated from ``raise_for_status()`` when the
            server answers with an error status.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    for url in urls:
        # Use only the path part of the URL; basename() on the raw URL would
        # keep "?query" and "#fragment" in the file name.
        base_name = os.path.basename(urlsplit(url).path)
        if not base_name:
            # Fail before any network traffic instead of trying to open the
            # download directory itself as a file.
            raise ValueError(
                "Cannot derive a file name from URL: {!r}".format(url)
            )
        file_name = os.path.join(download_directory, base_name)
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(file_name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
def import_module_from_file(module_name, file_path):
    """Load the Python file at ``file_path`` as a module named ``module_name``.

    Returns:
        The executed module object, or ``None`` when importlib cannot build
        an import spec for ``file_path``.
    """
    module_spec = importlib.util.spec_from_file_location(module_name, file_path)
    if module_spec is not None:
        loaded_module = importlib.util.module_from_spec(module_spec)
        # Run the file's top-level code inside the freshly created module.
        module_spec.loader.exec_module(loaded_module)
        return loaded_module
    return None
# Setup command-line argument parsing
parser = argparse.ArgumentParser(description='Scrape .ogg files from a webpage.')
parser.add_argument('url', type=str, help='The URL of the page to scrape.')
parser.add_argument('-o', '--output', type=str, help='The output folder to save the files.')
parser.add_argument('-s', '--selector', type=str, help='The selector module to use.', default='webscrape_mynoise.py')
args = parser.parse_args()

# Selector modules are looked up in ~/bin. An empty -s value falls back to the
# same default file name that argparse uses.
selector_name = args.selector or 'webscrape_mynoise.py'
selector_module_path = os.path.join(os.path.expanduser('~/bin'), selector_name)
selector_module = import_module_from_file('selector', selector_module_path)
if selector_module is None:
    # Report the problem before a browser is launched, instead of failing
    # later with an AttributeError on None.
    parser.error('Could not load selector module: {}'.format(selector_module_path))

# Setup headless chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize the webdriver
driver = webdriver.Chrome(options=chrome_options)
try:
    # Open the URL
    driver.get(args.url)

    # implicitly_wait() does not pause here; it sets how long later element
    # lookups on this driver may keep polling before giving up.
    # NOTE(review): presumably get_urls() relies on this for JavaScript-loaded
    # elements -- confirm against the selector module.
    driver.implicitly_wait(10)  # Adjust the wait time if necessary

    # Extract URLs using the selector module
    selectedUrls = selector_module.get_urls(driver)

    # Define the directory where files will be saved
    download_directory = args.output if args.output else os.path.join(os.getcwd(), 'downloaded_files')

    # Create the directory to save the files if it does not exist
    os.makedirs(download_directory, exist_ok=True)

    # Download files
    download_files(selectedUrls, download_directory)
finally:
    # Always close the browser, even when scraping or downloading fails, so
    # no headless Chrome process is left behind.
    driver.quit()