-
Notifications
You must be signed in to change notification settings - Fork 1
/
prepare_data.py
executable file
·169 lines (147 loc) · 5 KB
/
prepare_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import csv
import os
import datetime
import warnings
import numpy as np
import matplotlib.pyplot as plt
# Directory the raw APC CSV files are read from.
root_dir = os.path.join(
    '/Users/edwardyoung/Google Drive/CodeForSF/OpenTransit/',
    'OT Raw Data (not in shared drive)',
)

# Directory the per-column .npy files are written to and loaded back from.
clean_data_dir = os.path.join(
    '/Users/edwardyoung/Google Drive/CodeForSF/',
    'OpenTransit/clean_data',
)
def format_and_save_apc_production_data(apc_file):
    """
    Parses an APC production CSV file with ``format_apc_production_csv`` and
    writes every returned column to its own numpy file inside
    ``clean_data_dir``. Each file is named after its column.
    """
    columns = format_apc_production_csv(apc_file)
    for name, values in columns.items():
        target = os.path.join(clean_data_dir, f'{name}')
        np.save(target, values)
def format_apc_production_csv(apc_file):
    """
    Extracts the data from an APC production CSV file.

    The file is opened relative to ``root_dir``. Nothing is written to disk
    here; the parsed columns are returned to the caller.

    Parameters
    ----------
    apc_file : name of the CSV file inside ``root_dir``.

    Returns
    -------
    dict mapping every column name found in the CSV header to a list.
    Columns listed in the type table below hold one converted value per row
    (see ``convert_type``); columns that are not listed are left as empty
    lists. An empty dict is returned when the file has no header row.

    Raises
    ------
    Whatever ``convert_type`` raises for a cell that cannot be parsed as the
    column's declared type (e.g. ``ValueError`` from ``int``).
    """
    # Column name -> name of the type understood by convert_type.
    data_types = {
        'ACTUAL_SEQUENCE': 'int',
        'ACT_TRIP_START_TIME': 'datetime',
        'APC_DATE_TIME': 'datetime',
        'BLOCK_ID': 'int',
        # BOOKING_ID was previously declared as 'int'; it is read as 'str'.
        'BOOKING_ID': 'str',
        'BOOKING_NUM': 'int',
        'BOOKING_START_DATE': 'datetime',
        'BS_ID': 'int',
        'CLOSE_DATE_TIME': 'datetime',
        'CURRENT_ROUTE_ID': 'int',
        'DATENUMBER': 'int',
        'DATE_TYPE_VS': 'int',
        'DIRECTION_CODE_ID': 'int',
        'DWELL_TIME': 'int',
        'EFFECTIVE_DATE_KEY_FK': 'int',
        'EXT_TRIP_ID': 'int',
        'GARAGE_ID': 'int',
        'HEADSIGN_ROUTE': 'int',
        'IMPORT_ERROR': 'int',
        'IMPORT_TRIP_ERROR': 'int',
        'INSERT_DATE_TIME': 'datetime',
        'MAX_LOAD': 'int',
        'NON_REV_DISTANCE': 'int',
        'NON_REV_SECONDS': 'int',
        'NUM_STAT': 'int',
        'OFFS': 'int',
        'ONS': 'int',
        'OPEN_DATE_TIME': 'datetime',
        'OPERATOR_ID': 'int',
        'POSITION_SOURCE': 'int',
        'PRIMARY_KEY': 'int',
        'QUALITY_INDICATOR': 'int',
        'RAW_MAX_LOAD': 'int',
        'RAW_OFF': 'int',
        'RAW_ON': 'int',
        'REV_DISTANCE': 'float',
        'REV_SECONDS': 'int',
        'ROUTE_ID': 'int',
        'RUN_ID': 'int',
        'SCHED_TIME': 'datetime',
        'SEG_ARR_TIME': 'datetime',
        'SEG_DEP_TIME': 'datetime',
        'START_TRIP_TIME': 'datetime',
        'TIME_ID': 'int',
        'TP_ID': 'int',
        'TRANSIT_DATE_TIME': 'datetime',
        'VARIATION': 'str',
        'VEHICLE_ID': 'int',
        'VEH_LAT': 'float',
        'VEH_LONG': 'float',
    }

    with open(os.path.join(root_dir, apc_file), newline='') as csvfile:
        reader = csv.DictReader(csvfile)

        # DictReader.fieldnames is None for a completely empty file; treat
        # that as "no columns" instead of failing on iteration.
        keys = reader.fieldnames or []

        # One (initially empty) list per header column.
        data_dict = {k: [] for k in keys}

        # Decide once which columns get converted, rather than re-checking
        # the type table for every cell of every row.
        typed_keys = [k for k in keys if k in data_types]

        # Fill the data dictionary with values
        for counter, row in enumerate(reader):
            if counter % 10000 == 0:
                print(f'Loaded {counter} lines')
            for k in typed_keys:
                data_dict[k].append(convert_type(row[k], data_types[k]))

    return data_dict
def convert_type(dat, output_type):
    """
    Converts ``dat`` into the type named by ``output_type``.

    Accepted names are "int", "datetime", "float" and "str". For any other
    name a warning is issued and None is returned.
    """
    if output_type == "int":
        return int(dat)

    if output_type == "datetime":
        # Only the text before the first '.' is parsed, so any fractional
        # seconds are discarded.
        whole_seconds = dat.split('.')[0]
        return datetime.datetime.strptime(whole_seconds, '%Y-%m-%d %H:%M:%S')

    if output_type == "float":
        return float(dat)

    if output_type == "str":
        return str(dat)

    warnings.warn("Not an accepted datatype")
    return None
def plot_lat_lon(run_id, start_date, end_date):
    """
    Plots vehicle position and passenger counts over time for one run.

    Loads the OPEN_DATE_TIME, VEH_LAT, VEH_LONG, ONS, OFFS and RUN_ID arrays
    from ``clean_data_dir``, orders them by OPEN_DATE_TIME, keeps the records
    strictly between ``start_date`` and ``end_date``, and then selects those
    whose RUN_ID equals ``run_id``.

    Two figures are created (neither is shown or saved here):
      * four stacked panels sharing the time axis: latitude, longitude,
        ons together with offs, and cumulative ons minus cumulative offs;
      * a scatter of latitude against longitude.

    Parameters
    ----------
    run_id : value compared against the RUN_ID column.
    start_date, end_date : exclusive bounds compared against OPEN_DATE_TIME.
    """
    # Load the data.
    # OPEN_DATE_TIME is written by np.save from a list of datetime objects,
    # i.e. as a pickled object array, which np.load rejects unless
    # allow_pickle=True.
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # generated by this script.
    odt = np.load(os.path.join(clean_data_dir, 'OPEN_DATE_TIME.npy'),
                  allow_pickle=True)
    lats = np.load(os.path.join(clean_data_dir, 'VEH_LAT.npy'))
    lons = np.load(os.path.join(clean_data_dir, 'VEH_LONG.npy'))
    ons = np.load(os.path.join(clean_data_dir, 'ONS.npy'))
    offs = np.load(os.path.join(clean_data_dir, 'OFFS.npy'))
    run_ids = np.load(os.path.join(clean_data_dir, 'RUN_ID.npy'))

    # Coordinates that are exactly 0 are blanked out so they are not plotted.
    lats[lats == 0] = np.nan
    lons[lons == 0] = np.nan

    # Sort every array by time
    order = np.argsort(odt)
    run_ids, odt, lats, lons, ons, offs = (
        arr[order] for arr in (run_ids, odt, lats, lons, ons, offs)
    )

    # Keep only the records strictly inside the requested time window
    time_idx = np.logical_and(odt > start_date, odt < end_date)
    # NOTE(review): this reports the total number of records, not the number
    # that fall inside the window - confirm which one is intended.
    print(f'There are {len(time_idx)} elements')
    run_ids, odt, lats, lons, ons, offs = (
        arr[time_idx] for arr in (run_ids, odt, lats, lons, ons, offs)
    )

    # Choose run_id
    idx = np.where(run_ids == run_id)[0]

    fig, ax = plt.subplots(4, sharex=True, figsize=(5, 8))
    ax[0].plot(odt[idx], lats[idx])
    ax[1].plot(odt[idx], lons[idx])
    ax[2].plot(odt[idx], ons[idx])
    ax[2].plot(odt[idx], offs[idx])
    # Running difference between boardings and alightings
    ax[3].plot(odt[idx], np.cumsum(ons[idx]) - np.cumsum(offs[idx]))
    plt.tight_layout()

    plt.figure()
    plt.plot(lats[idx], lons[idx], '.')
if __name__ == "__main__":
    # Parse the APC export named below and write its columns as numpy files.
    # To get the parsed columns without writing anything, call
    # format_apc_production_csv(apc_file) instead.
    apc_file = 'Not Confirmed - ProductionDW_APC_20170101_to_20170601.csv'
    format_and_save_apc_production_data(apc_file)