-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGHCNv4_automatic_download.py
86 lines (59 loc) · 2.13 KB
/
GHCNv4_automatic_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# -*- coding: utf-8 -*-
"""
@author: Adrien Wehrlé, GEUS (Geological Survey of Denmark and Greenland)
"""
import urllib.request
from urllib.error import URLError
import pandas as pd
import numpy as np
import os
from multiprocessing import Pool, freeze_support
import time
import requests
from bs4 import BeautifulSoup
# Root folder of the local repository checkout. The trailing slash matters:
# file names below are appended by plain string concatenation.
path = "/path/to/Github_repository/"

# Read the whitespace-separated station metadata table.
metadata_filename = path + "GHCNv4_stations.txt"
stations_metadata = pd.read_csv(metadata_filename, sep=r"\s+")

# Keep only the stations whose "Lat" value exceeds min_latitude, and renumber
# the rows from 0 so that they can be addressed by position.
min_latitude = 66.5
arctic_stations = stations_metadata.loc[
    stations_metadata["Lat"] > min_latitude
].reset_index(drop=True)
def GISTEMP_autodownload(k, timeout=60):
    """
    Download the GISTEMP v4 CSV file of one station.

    The station page on data.giss.nasa.gov is requested, the download link is
    read from it, and the linked file is saved as
    ``<path>GHCN_v4_data/<station ID>.csv``. Network failures and pages that
    do not contain the expected link are reported on stdout instead of being
    raised, so that a single bad station does not abort a whole ``Pool.map``
    run.

    Parameters
    ----------
    k : int
        Positional row index of the station in ``arctic_stations``.
    timeout : float, optional
        Seconds to wait for the station page before giving up (default 60).
        It applies to the page request only, not to the file download.

    Returns
    -------
    None
    """
    # local import: keeps this function self-contained without touching the
    # file-level import block
    from urllib.parse import urljoin

    # extract station metadata
    station = arctic_stations.iloc[k, :]

    station_page = (
        "https://data.giss.nasa.gov/cgi-bin/gistemp/"
        + "stdata_show_v4.cgi?id="
        + station.ID
        + "&ds=14&dt=1"
    )
    output_dir = path + "GHCN_v4_data"

    try:
        # access website; without a timeout a stalled connection would block
        # this worker forever
        response = requests.get(station_page, timeout=timeout)
        response.raise_for_status()

        # parse html
        soup = BeautifulSoup(response.text, "html.parser")

        # extract hyperlink
        # NOTE(review): the download link is taken to be the 16th <a> tag of
        # the page; this depends on the page layout -- confirm it still holds.
        link = soup.find_all("a")[15]["href"]

        # urljoin handles both "tmp/..." and "/tmp/..." style hrefs without
        # producing a double slash
        download_url = urljoin("https://data.giss.nasa.gov/", link)

        # make sure the target folder exists (safe if several workers race)
        os.makedirs(output_dir, exist_ok=True)
        urllib.request.urlretrieve(
            download_url, output_dir + os.sep + station.ID + ".csv"
        )
        print(k, "Downloading %s" % download_url)

    except (
        URLError,  # download failed
        requests.exceptions.RequestException,  # page request failed
        IndexError,  # page has fewer links than expected
        KeyError,  # selected <a> tag has no href attribute
    ) as err:
        print(k, "%s not found" % station.ID, err)

    return
if __name__ == "__main__":
    # multiprocessing hook for programs frozen into a Windows executable
    freeze_support()

    # number of worker processes downloading in parallel
    nb_cores = 7

    start_time = time.time()

    # one task per row of arctic_stations; map() blocks until all are done
    station_indices = range(len(arctic_stations))
    with Pool(processes=nb_cores) as pool:
        pool.map(GISTEMP_autodownload, station_indices)

    end_time = time.time()

    # elapsed wall-clock time, converted from seconds to minutes
    processing_time = (end_time - start_time) / 60

    print("--- Processing time: %s minutes ---" % processing_time)
    print("--- Start time: %s ---" % time.ctime(start_time))
    print("--- End time: %s ---" % time.ctime(end_time))