-
Notifications
You must be signed in to change notification settings - Fork 0
/
history_scraper.py
71 lines (54 loc) · 2.17 KB
/
history_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import datetime
import locale
import requests
from pathlib import Path
import random
import time
import csv
from bs4 import BeautifulSoup
from collections import namedtuple
# Italian locale so strftime('%B') below yields Italian month names, which the
# it.tutiempo.net URLs require; raises locale.Error if 'it_IT.utf8' is not
# installed on the host system.
locale.setlocale(locale.LC_TIME, 'it_IT.utf8')
# Directory containing this script; checkpoint CSVs are written under it.
SCRIPT_DIR = Path(__file__).parent.resolve()
def write_checkpoint(file_name, csv_list):
    """Write the accumulated weather rows to *file_name* as a CSV file.

    The file is overwritten if it already exists.

    Parameters
    ----------
    file_name : str or Path
        Destination CSV file path.
    csv_list : iterable of tuple
        Rows matching the header (date, time, cond, temp, wind, hr, press).
    """
    # newline='' is required by the csv module: without it the writer emits
    # an extra blank line between rows on Windows. Explicit utf-8 keeps
    # degree signs and accented Italian text portable across platforms.
    with open(file_name, 'w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['date', 'time', 'cond', 'temp', 'wind', 'hr', 'press'])
        csv_out.writerows(csv_list)
# ---------------------------------------------------------------------------
# Scrape daily weather history pages (it.tutiempo.net, Lima) from 05-10-2019
# up to today, flushing a numbered CSV checkpoint every 30 days of data.
# ---------------------------------------------------------------------------

# One parsed table row: the page date plus the six scraped columns.
WeatherRec = namedtuple('WeatherRec', ['date', 'time', 'cond', 'temp', 'wind', 'hr', 'press'])


def _strip_suffix(text, suffix):
    """Remove *suffix* from the end of *text* if present (exact match).

    str.rstrip() strips a *set of characters*, not a suffix — e.g.
    'x'.rstrip(' km/h') would eat any trailing 'k', 'm', 'h', '/' or space —
    so an exact-suffix removal is used instead.
    """
    return text[:-len(suffix)] if suffix and text.endswith(suffix) else text


start = datetime.datetime.strptime("05-10-2019", "%d-%m-%Y")
end = datetime.datetime.today()
# One datetime per day in [start, end], inclusive.
date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days + 1)]
weather_list = []
n_pages = len(date_generated)
for idx, dat in enumerate(date_generated):
    # %-d (day without leading zero) is glibc-specific; %B relies on the
    # it_IT locale set at the top of the file to match the site's URLs.
    url = f"https://it.tutiempo.net/record/lime/{dat.strftime('%-d-%B-%Y')}.html"
    response = requests.get(url)
    print(f'Parsed {url} ({idx+1} of {n_pages}): {response.status_code}')
    time.sleep(random.uniform(0, 2))  # polite random delay between requests
    soup = BeautifulSoup(response.text, "html.parser")
    try:
        table_rows = soup.find(id="HistoricosData").div.table.tbody.find_all('tr')
    except AttributeError:
        # Page loaded but the expected data table is missing; skip this day.
        print('Error in parsing page: data non available')
        continue
    for roww in table_rows:
        d = roww.find_all('td', recursive=False)
        if not d:
            # Rows without direct <td> children (e.g. header rows) carry no data.
            continue
        weather_list.append(WeatherRec(
            dat.strftime('%d-%m-%Y'),
            d[0].text,
            d[1].span.text,
            _strip_suffix(d[2].text, '°'),
            _strip_suffix(d[3].text, ' km/h'),
            _strip_suffix(d[4].text, '%'),
            _strip_suffix(d[5].text, ' hPa')
        ))
    if (idx + 1) % 30 != 0:
        continue
    # Every 30 days, flush accumulated rows to a numbered checkpoint file.
    print(f'Saving checkpoint history_weather_{str((idx + 1) // 30).zfill(2)}.csv')
    file_path = SCRIPT_DIR.joinpath('data', f'history_weather_{str((idx + 1) // 30).zfill(2)}.csv')
    write_checkpoint(file_path, weather_list)
    weather_list.clear()

# Final partial checkpoint for the leftover days after the last flush.
# BUG FIX: the original wrote this file under 'dat' instead of 'data',
# scattering the last checkpoint into a different directory than the rest.
# Also guard against writing an empty file when the day count is an exact
# multiple of 30 (everything was already flushed inside the loop).
if weather_list:
    file_path = SCRIPT_DIR.joinpath('data', f'history_weather_{str((idx + 1) // 30 + 1).zfill(2)}.csv')
    write_checkpoint(file_path, weather_list)
print('DONE')