-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
72 lines (54 loc) · 2.64 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import sys
from io import open as iopen
from urllib.parse import urlsplit

import lxml.html
import requests
from lxml.cssselect import CSSSelector
# Directory that downloaded images are written into.
destinationDir = '/tmp/trilobites/'
# Scheme and host that the relative image links are appended to.
galleryHost = 'https://www.amnh.org'
# The master gallery is split across four pages by alphabetical range; the
# pages share one URL prefix and differ only in the trailing letter range.
galleryPagePrefix = (
    galleryHost
    + '/our-research/paleontology/paleontology-faq/trilobite-website'
    + '/gallery-of-trilobites/master-gallery-of-all-website-trilobites-'
)
urls = [
    galleryPagePrefix + letterRange
    for letterRange in ('a-c', 'd-i', 'j-p', 'q-z')
]
# Section elements collected from every gallery page.
sections = []
# Sections whose index is below this value are skipped; raise it to resume
# from a specific section after a failed/aborted run.
startSectionIndex = 0
# Fetch every gallery page and collect its trilobite section elements into
# `sections`. Any page that does not return HTTP 200 aborts the whole run,
# since the section indexes used for resuming would otherwise shift.
# The selector is identical for every page, so compile it once.
sectionSelector = CSSSelector('div.clearfix div.content')
for url in urls:
    # Without a timeout a stalled connection would hang the script forever.
    response = requests.get(url, timeout=30)
    if response.status_code != requests.codes.ok:
        print('=====> FAILED WITH STATUS ' + str(response.status_code) + ': ' + url)
        # Exit non-zero so a calling shell/script can see the scrape failed.
        sys.exit(1)
    dom = lxml.html.fromstring(response.text)
    sections.extend(sectionSelector(dom))
print('FOUND ' + str(len(sections)) + ' TRILOBITES...\n\n')
# Walk the collected sections and download each one's full-size image to
# `destinationDir` as '<imageId>-<trilobiteName>.jpg'. A section that cannot
# be downloaded is reported and skipped so one bad entry does not abort the
# rest of the run.
# Selectors are the same for every section, so compile them once.
headerSelector = CSSSelector('h1')
fullscreenLinkSelector = CSSSelector('a.fullscreen-link')
# Make sure the output directory exists before the first write.
os.makedirs(destinationDir, exist_ok=True)
for idx, section in enumerate(sections):
    # Allows resuming a previous run partway through the list.
    if idx < startSectionIndex:
        continue

    # Get trilobite name from header, falling back to a placeholder when the
    # header is missing or has no usable text.
    trilobiteName = 'NO_NAME'
    headers = headerSelector(section)
    if headers:
        trilobiteNameText = headers[0].text
        if trilobiteNameText:
            cleanedName = trilobiteNameText.strip(' \r\n')
            if cleanedName:
                # '/' would be read as a directory separator in the file name.
                trilobiteName = cleanedName.replace(' ', '_').replace('/', '_')

    # Get full size image download link; skip sections that have none
    # instead of crashing on an empty selector result.
    fullscreenLinks = fullscreenLinkSelector(section)
    downloadRelativePath = fullscreenLinks[0].get('href') if fullscreenLinks else None
    if not downloadRelativePath:
        print('=====> ' + str(idx) + ' - ' + trilobiteName + ' SKIPPED: NO FULLSCREEN LINK\n')
        continue
    downloadUrl = galleryHost + downloadRelativePath

    # Build an image ID from the last two path segments of the link (fewer
    # if the link is shorter than that).
    pathTokens = downloadRelativePath.split('/')
    imageId = '_'.join(pathTokens[-2:])

    # Download image. Network errors are handled like bad HTTP statuses:
    # report and move on to the next section.
    try:
        fileData = requests.get(downloadUrl, timeout=60)
    except requests.RequestException as error:
        print('=====> ' + imageId + '-' + trilobiteName + ' FAILED WITH ERROR ' + str(error) + ': ' + downloadUrl + '\n')
        continue
    if fileData.status_code != requests.codes.ok:
        print('=====> ' + imageId + '-' + trilobiteName + ' FAILED WITH STATUS ' + str(fileData.status_code) + ': ' + downloadUrl + '\n')
        continue

    # Save image to disk
    destinationFilePath = os.path.join(destinationDir, imageId + '-' + trilobiteName + '.jpg')
    with iopen(destinationFilePath, 'wb') as imageFile:
        imageFile.write(fileData.content)
    print(str(idx) + ' - ' + trilobiteName + ' - ' + downloadUrl)