imagesMetadata.py
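"""Scrape metadata (title, image URL, detail rows) from the La Biennale ASAC
fototeca search result pages on asacdati.labiennale.org and write one JSON
file per result page under ./output/.

Usage: python imagesMetadata.py <start_page> <end_page>
"""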
import os
import sys
import json
import requests
from urllib.parse import urljoin, urlparse, parse_qs
from bs4 import BeautifulSoup


def scrape_page(url):
    """Fetch one search-results page and return a list of record dicts, or None on a request error."""
    try:
        # A timeout keeps one stalled request from blocking the whole run (30 s is an arbitrary choice)
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print("Error fetching URL {}: {}".format(url, e))
        return None

    base_url = url
    soup = BeautifulSoup(response.text, 'html.parser')
    results = []

    # Each search hit is wrapped in a ".risultato" block
    for result in soup.select('.risultato'):
        # Extract the first image
        img_tag = result.select_one('.scheda-foto img')
        if img_tag and img_tag.get('src'):
            image_url = urljoin(base_url, img_tag['src'])
        else:
            image_url = ''

        # Extract title and title URL
        h3_link = result.select_one('h3 a')
        if h3_link:
            title = h3_link.get_text(strip=True)
            title_url = urljoin(base_url, h3_link['href'])
        else:
            title = ''
            title_url = ''

        record = {
            'title': title,
            'titleUrl': title_url,
            'imageUrl': image_url,
            'details': []
        }

        # Extract rows of data (definition/value pairs)
        rows = result.select('.tabella .riga')
        for row in rows:
            definition = row.select_one('.def')
            definition_text = definition.get_text(strip=True) if definition else ''
            dato_div = row.select_one('.dato')
            if not dato_div:
                continue
            link = dato_div.select_one('a')
            if link:
                # If dato contains a link
                value = link.get_text(strip=True)
                raw_link = urljoin(base_url, link['href'])
                parsed_url = urlparse(raw_link)
                query_params = parse_qs(parsed_url.query)
                scheda_param = query_params.get('scheda', [None])[0]
                scheda_link = None
                if scheda_param:
                    # Reconstruct the URL with only the 'scheda' param
                    scheda_link = "{}://{}{}?scheda={}".format(
                        parsed_url.scheme,
                        parsed_url.netloc,
                        parsed_url.path,
                        scheda_param
                    )
                record['details'].append({
                    'definition': definition_text,
                    'value': value,
                    'originalLink': raw_link,
                    'schedaParam': scheda_param,
                    'schedaLink': scheda_link
                })
            else:
                # If dato contains just text
                value = dato_div.get_text(strip=True)
                # If the definition is "Soggetto", split the value on commas
                if definition_text.lower() == 'soggetto':
                    value = [v.strip() for v in value.split(',')]
                record['details'].append({
                    'definition': definition_text,
                    'value': value
                })
        results.append(record)
    return results


def main():
    # Parse command line arguments
    if len(sys.argv) != 3:
        print("Usage: script.py <start_page> <end_page>")
        sys.exit(1)
    try:
        start_page = int(sys.argv[1])
        end_page = int(sys.argv[2])
    except ValueError:
        print("Please provide integer values for start_page and end_page.")
        sys.exit(1)

    # Ensure the output directory exists
    os.makedirs('output', exist_ok=True)

    base_url = "https://asacdati.labiennale.org/it/fondi/fototeca/sem-ricerca.php?cerca=1&p="
    for p in range(start_page, end_page + 1):
        page_url = "{}{}".format(base_url, p)
        print("Scraping page {}: {}".format(p, page_url))
        data = scrape_page(page_url)
        if data is not None:
            if len(data) > 0:
                # Save to JSON file
                output_file = os.path.join('output', "page_{}.json".format(p))
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(data, f, ensure_ascii=False, indent=2)
                print("Page {} scraped successfully, data saved to {}".format(p, output_file))
            else:
                print("Page {}: No results found.".format(p))
        else:
            print("Page {}: Failed to scrape page.".format(p))


if __name__ == "__main__":
    main()