# form_results.py
import os
from pathlib import Path

import pandas as pd
import yaml

class FormResults(object):
    """
    The results spreadsheet from the Data Portal metadata forms.

    Inputs:
        in_file_name: the name of the Excel spreadsheet downloaded from the
            "Initial PSRC Metadata Collection Form" (Microsoft form)
        in_sheet_name: the worksheet within in_file_name to read
        in_file_dir: the full directory path for in_file_name
        config_dir: the full directory path to the config files that
            determine the export behavior for each layer
        shared_column_def_path: the directory holding the shared
            field-definition spreadsheets
        fields_spreadsheet_path: the spreadsheet mapping DatasetID values
            to field names and descriptions
    """

    def __init__(self,
                 in_file_name,
                 in_sheet_name='Sheet1',
                 in_file_dir=r'Y:\Data-Portal\Metadata',
                 config_dir=r'.\Config\run_files',
                 shared_column_def_path=r'C:\Users\cpeak\OneDrive - Puget Sound Regional Council\Question 1',
                 fields_spreadsheet_path=r'C:\Users\cpeak\Repos\data-portal-tools\meta_workspace\fields.xlsx'
                 ):
        try:
            self.form_results_path = os.path.join(in_file_dir, in_file_name)
            self.shared_column_def_path = shared_column_def_path
            self.fields_spreadsheet_path = fields_spreadsheet_path
            # na_filter=False keeps empty cells as '' rather than NaN
            self.df = pd.read_excel(self.form_results_path,
                                    sheet_name=in_sheet_name,
                                    na_filter=False)
            self.set_column_dict()
            self.config_dir = config_dir
            # self.rename_columns()
            self.standardize_cols()
        except Exception as e:
            print(e.args[0])
            raise

    def standardize_cols(self):
        """Replace single quotes with backticks throughout the dataframe so
        free-text answers do not break the quoted fields in the generated
        YAML."""
        try:
            self.df = self.df.replace('\'', '`', regex=True)
        except Exception as e:
            print(e.args[0])
            raise

    def rename_columns(self):
        """Rename the raw form columns to the short names in self.column_dict."""
        try:
            self.df.columns = [self.column_dict[c] for c in self.df.columns]
        except Exception as e:
            print(e.args[0])
            raise

    def yamlize(self, in_series):
        """
        Build a nested metadata dictionary from one row of the form results.

        in_series: a row from self.df
        """
        ser = in_series
        out_yaml = yaml.load("""
        dataset:
          layer_params:
            title: {}
            tags: {}
            allow_edits: False
            share_level: everyone
            spatial_data: {}
            snippet: null
            accessInformation: null
            licenseInfo: null
            metadata:
              contact_name: {}
              contact_email: {}
              contact_street_address: 1201 3rd Ave. Ste. 500
              contact_city: Seattle
              contact_state: Washington
              contact_zip: 98101
              contact_phone: '{}'
              description: '{}'
              data_source: '{}'
              date_last_updated: '{}'
              data_lineage: '{}'
              assessment: '{}'
              organization_name: Puget Sound Regional Council
              psrc_website: '{}'
              summary_purpose: ''
              supplemental_info: '{}'
              time_period: '{}'
              tech_note_link: '{}'
              update_cadence: '{}'
        """.format(ser['DatasetName'],  # was title
                   ser['Tags'],
                   ser['SpatialData'],  # was Spatial Data
                   ser['ContactName'],
                   ser['ContactEmail'],
                   ser['ContactPhone'],
                   ser['Abstract'],
                   ser['DataSource'],
                   ser['DateLastUpdated'],
                   ser['DataCollectionProcess'],  # was DataLineage
                   ser['DataAssessment'],  # was Assessment
                   ser['WebpageLink'],  # was Webpage
                   ser['SupplementalInformation'],  # was SupplementalInfo
                   ser['TimePeriod'],
                   ser['TechnicalNotes'],  # was TechNoteLink
                   ser['UpdateCadence']
                   ),
                   yaml.FullLoader)
        return out_yaml
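
    # For reference, yamlize() returns a nested dict shaped like the run-file
    # YAML (ellipses stand in for the remaining metadata keys):
    #   {'dataset': {'layer_params': {'title': ..., 'tags': ...,
    #                                 'metadata': {'contact_name': ..., ...}}}}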

    def set_column_dict(self):
        """Map the raw column headers from the Microsoft form to the short
        column names used throughout this class."""
        self.column_dict = {
            'ID': 'ID',
            'Start time': 'Starttime',
            'Completion time': 'Timestamp',
            'Email': 'Email',
            'Name': 'Name',
            'Dataset Name': 'dataset name on form',
            'Suggested name': 'Title',
            'spatial_data': 'Spatial Data',
            'Abstract': 'Abstract',
            'Time period covered by the data': 'TimePeriod',
            'Link to PSRC webpage': 'Webpage',
            'Links to background information or technical notes': 'TechNoteLink',
            'Name of data contact': 'ContactName',
            'Email address of data contact': 'ContactEmail',
            'Phone number of data contact': 'ContactPhone',
            'When was the dataset last updated?': 'DateLastUpdated',
            'How often is the dataset updated?': 'UpdateCadence',
            'What is the data source?': 'DataSource',
            'Description of the data collection process': 'DataLineage',
            'Assessment of the data': 'Assessment',
            'Supplemental Information': 'SupplementalInfo',
            'Internal location of GIS dataset': 'InternalLocation',
            'Category': 'Category',
            'Tags': 'Tags',
            'Field Definitions': 'FieldDefinitions',
            'Suggestions for improving the dataset.': 'Suggestions'
        }

    def mergedict(self, source, destination):
        """
        Recursively merge two dictionaries into one, unioning their contents.
        Cribbed from stackoverflow user vincent:
        https://stackoverflow.com/questions/20656135/python-deep-merge-dictionary-data
        (The doctest below is from the original standalone function, where it
        could be run with nosetests --with-doctest file.py.)

        >>> a = {'first': {'all_rows': {'pass': 'dog', 'number': '1'}}}
        >>> b = {'first': {'all_rows': {'fail': 'cat', 'number': '5'}}}
        >>> mergedict(b, a) == {'first': {'all_rows': {'pass': 'dog', 'fail': 'cat', 'number': '5'}}}
        True
        """
        try:
            for key, value in source.items():
                if isinstance(value, dict):
                    # get the matching node in destination, or create one
                    node = destination.setdefault(key, {})
                    self.mergedict(value, node)
                else:
                    destination[key] = value
            return destination
        except Exception as e:
            print(e.args[0])
            raise

    def find_config_file(self, target_title):
        """Look through the YAML files in the run_files directory for a file
        with a dataset.layer_params.title value equal to target_title
        (case-insensitive).
        If one is found, return a dict with two keys:
            filepath: the path to the YAML file
            yamldict: the contents of that YAML file, in dictionary form
        If not, return False.
        """
        try:
            ret_value = False
            dir_path = Path(self.config_dir)
            target_title = target_title.lower()
            for yfile in os.listdir(dir_path):
                fpath = dir_path / yfile
                with open(fpath) as file:
                    my_yaml = yaml.load(file, Loader=yaml.FullLoader)
                    y_title = my_yaml['dataset']['layer_params']['title']
                    y_title = y_title.lower()
                    if y_title == target_title:
                        ret_dict = {}
                        ret_dict['filepath'] = fpath
                        ret_dict['yamldict'] = my_yaml
                        ret_value = ret_dict
            return ret_value
        except Exception as e:
            print(e.args[0])
            raise

    def fields_path(self, excel_name):
        """Build the full path to a field-definition spreadsheet in the
        shared column-definition directory."""
        try:
            file_path = os.path.join(self.shared_column_def_path, excel_name)
            return file_path
        except Exception as e:
            print(e.args[0])
            raise

    def df_to_list_of_dicts(self, in_df):
        """
        From a field-definitions dataframe, create a list of dictionaries
        of field names and definitions.
        """
        try:
            field_list = []
            for i, r in in_df.iterrows():
                if r.RemoveField != 'Yes':
                    # NaN != NaN, so this self-comparison is True only when
                    # the 'Description ' cell (the spreadsheet header has a
                    # trailing space) is not empty.
                    if r['Description '] == r['Description ']:
                        desc = r['Description ']
                    else:
                        desc = '(no description)'
                    f_dict = {'title': r['FieldName'], 'description': desc}
                    field_list.append(f_dict)
            return field_list
        except Exception as e:
            print(e.args[0])
            raise
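
    # Example of the shape df_to_list_of_dicts() returns (the field names
    # here are hypothetical):
    #   [{'title': 'county_id', 'description': 'County FIPS code'},
    #    {'title': 'geom', 'description': '(no description)'}]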

    def get_field_data2(self, row):
        """
        Given a row from the data_sets dataframe, return the field
        definitions as a list of dictionaries.
        """
        try:
            data_set_id = row.DatasetID
            df = pd.read_excel(self.fields_spreadsheet_path)
            # keep only the field definitions for this dataset
            df = df[df.DatasetID == data_set_id]
            field_list = self.df_to_list_of_dicts(df)
            return field_list
        except Exception as e:
            print(e.args[0])
            raise

    def get_field_data(self, row):
        """
        Given a series (row) with a column FieldDefinitions holding a URL to
        a field spreadsheet, return the field definitions as a list of
        dictionaries.
        """
        try:
            url = row.FieldDefinitions
            # pull the file name out of the sharing URL's '&file=' parameter
            fname = url.split('&file=')[1]
            fname = fname.split('&action=')[0]
            fname = fname.replace('%20', ' ')
            fpath = self.fields_path(fname)
            print(fpath)
            df_fields = pd.read_excel(fpath)
            field_list = self.df_to_list_of_dicts(df_fields)
            return field_list
        except Exception as e:
            print(e.args[0])
            raise
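
    # For illustration, a hypothetical sharing URL of the form
    #   https://example.sharepoint.com/...&file=My%20Fields.xlsx&action=default
    # parses to 'My Fields.xlsx', which fields_path() then resolves inside
    # shared_column_def_path.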

    def get_title_from_metadata(self, metadata_yaml):
        """Pull the layer title out of a nested metadata dictionary."""
        try:
            title = metadata_yaml['dataset']['layer_params']['title']
            return title
        except Exception as e:
            print(e.args[0])
            raise

    def integrate_fields(self):
        """
        For each record in the metadata form output:
            create a dictionary (new_metadata) from the row, including its
            field definitions;
            if there is a matching YAML config file, merge new_metadata
            into it;
            otherwise, write a new config file.
        """
        try:
            for i, r in self.df.iterrows():
                new_metadata = self.yamlize(r)
                fd = self.get_field_data2(r)
                new_metadata['dataset']['layer_params']['metadata']['fields'] = fd
                title = self.get_title_from_metadata(new_metadata)
                print('title: {}'.format(title))
                config_file = self.find_config_file(title)
                if config_file:
                    config_yaml = config_file['yamldict']
                    merged_yaml = self.mergedict(new_metadata, config_yaml)
                    with open(config_file['filepath'], 'w') as file:
                        yaml.dump(merged_yaml, file)
                    print("merged with {} and written to file.".format(config_file['filepath']))
                else:
                    print('no matching config file. Writing new one...')
                    config_yaml = new_metadata
                    dir_path = Path(self.config_dir)
                    config_filename = title + '.yml'
                    fpath = dir_path / config_filename
                    with open(fpath, 'w') as file:
                        yaml.dump(config_yaml, file)
                    print('{} written'.format(config_filename))
        except Exception as e:
            print(e.args[0])
            raise
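

if __name__ == '__main__':
    # Minimal usage sketch. The file name below is a placeholder, not a
    # confirmed input; the directory defaults come from __init__.
    results = FormResults(in_file_name='metadata_form_results.xlsx')
    results.integrate_fields()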