"""
This script is used for compressing a set of NSRDB files in order to get
faster load times. This script is used to compress files, which are then
uploaded onto AWS S3.
All years of NSRDB files for a given location are read in and combined into a
single compressed data file and associated info file.
toddkarin
"""
import nsrdbtools
import numpy as np
import os
import pandas as pd
# Directory to inspect for getting weather data.
data_dir = '/Users/toddkarin/Documents/NSRDB/'
# Directory to put pickled files.
pickle_dir = '/Users/toddkarin/Documents/NSRDB_compressed/'
# Build dataframe of file info.
filedata = nsrdbtools.inspect_database(data_dir)
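# inspect_database is assumed (from its usage below) to return a DataFrame
# with at least 'location_id', 'year', 'filename', and 'fullpath' columns,
# one row per csv file found.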
# NSRDB data has one csv file for each year for each location.
# Find all unique locations in the dataset.
unique_locs = list(set(filedata['location_id']))
print('Number of locations: ' + str(len(unique_locs)))
# Loop through locations.
# for j in range(10):  # (debugging: process only a small subset)
for j in range(len(unique_locs)):
    # Print progress.
    print('Iteration: {:.0f}, Percent done: {:.4f}'.format(
        j, j / len(unique_locs) * 100))

    # Find all data files at the given location.
    filedata_curr = filedata[filedata['location_id'] == unique_locs[j]]

    # Sort by year.
    filedata_curr = filedata_curr.sort_values('year')

    # Get the different parts of the filename.
    fname_parts = filedata_curr.filename.to_list()[0].split('_')

    # Name the output file.
    data_filename = fname_parts[0] + '_' + fname_parts[1] + '_' + \
                    fname_parts[2] + '.npz'
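
    # For example, a hypothetical input file named
    # '146312_38.45_-121.85_1998.csv' (location-id_latitude_longitude_year)
    # would yield the output name '146312_38.45_-121.85.npz'.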

    # Full path of the output file.
    data_fullpath = os.path.join(pickle_dir, data_filename)

    if os.path.isfile(data_fullpath):
        # Skip over any files that already exist.
        print('File already exists: ' + data_filename)
    else:
        # If the compressed file doesn't exist yet, read in and combine the
        # csv files for this location.
        df, info = nsrdbtools.combine_csv(filedata_curr['fullpath'])
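
        # Downcast each field to a compact dtype before saving; smaller
        # arrays compress into a smaller archive. The ranges are
        # assumptions about the NSRDB fields (e.g. irradiance in W/m^2
        # fits in int16, air temperature in degrees C fits in int8) and
        # are not checked here; astype truncates any fractional part on
        # the integer casts.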
        dni = np.array(df['DNI'].astype(np.int16))
        dhi = np.array(df['DHI'].astype(np.int16))
        ghi = np.array(df['GHI'].astype(np.int16))
        temp_air = np.array(df['Temperature'].astype(np.int8))
        wind_speed = np.array(df['Wind Speed'].astype(np.float16))
        year = np.array(df['Year'].astype(np.int16))
        month = np.array(df['Month'].astype(np.int8))
        day = np.array(df['Day'].astype(np.int8))
        hour = np.array(df['Hour'].astype(np.int8))
        minute = np.array(df['Minute'].astype(np.int8))

        # Save the location metadata and all data fields into a single
        # compressed .npz archive.
        np.savez_compressed(
            data_fullpath,
            Source=info['Source'],
            Location_ID=info['Location ID'],
            Latitude=info['Latitude'],
            Longitude=info['Longitude'],
            Elevation=info['Elevation'],
            local_time_zone=info['Local Time Zone'],
            interval_in_hours=info['interval_in_hours'],
            timedelta_in_years=info['timedelta_in_years'],
            Version=info['Version'],
            dni=dni,
            dhi=dhi,
            ghi=ghi,
            temp_air=temp_air,
            wind_speed=wind_speed,
            year=year,
            month=month,
            day=day,
            hour=hour,
            minute=minute)
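
# Quick sanity check: a minimal sketch of loading the most recent output
# file back in. np.load on an .npz archive returns an NpzFile keyed by the
# names passed to np.savez_compressed above.
if len(unique_locs) > 0 and os.path.isfile(data_fullpath):
    with np.load(data_fullpath) as compressed:
        print('Checked ' + data_filename)
        print('Location ID: {}'.format(compressed['Location_ID']))
        print('Number of time steps: {}'.format(len(compressed['dni'])))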