-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
260 lines (206 loc) · 13 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
import os
from dotenv import load_dotenv
import geopandas as gpd
import get_poi
import pandas as pd
from sqlalchemy import create_engine
# Load environment variables
load_dotenv()
# SQLAlchemy creates an engine that connects Pandas with a Postgres database
# Documentation: https://pandas.pydata.org/docs/reference/api/pandas.read_sql.html
# Establish connection to Postgres db
engine = create_engine(os.getenv('POSTGRES_CONNECTION', None))
def write_pandas_to_db(df, table_name, engine=engine, if_exists='replace'):
# Write DataFrame to db as `table_name` table.
df.to_sql(table_name, engine, if_exists=if_exists)
def read_table_to_pandas(table_name, engine=engine):
# Read SQL table with pandas.
return pd.read_sql(f"SELECT * FROM {table_name}", engine)
def preprocess_and_load_survey_df():
"""Extract and preprocess survey data
"""
# Read SQL table with pandas
df_all = pd.read_sql("SELECT * FROM survey_raw", engine)
# print(df_all.columns)
# clean the column names
# PLEASE NOTE WE DELETED THE FIRST COLUMN LABELED ASSESSMENT SINCE THERE IS NO DATA IN THIS COLUMN
cleaned_column_names = (df_all.columns
.str.strip()
.str.replace('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))', r'_\1', regex=True)
.str.lower()
.str.replace('[ _-]+', '_', regex=True)
.str.replace('[}{)(><.!?\\\\:;,-]', '', regex=True))
df_all.columns = cleaned_column_names
# deleted column 'assessment' it has no data, no entries
df_all = df_all.drop(columns=['assessment'])
df_all = df_all.rename(columns={'location_details/_landmarks': 'location'})
# converting the date to days only
df_all['creation_date'] = pd.to_datetime(df_all['creation_date']).dt.date
df_all['edit_date'] = pd.to_datetime(df_all['edit_date']).dt.date
# setting creation date to the proper data format (to date time)
df_all['creation_date'] = pd.to_datetime(
df_all['creation_date'], infer_datetime_format=True)
df_all['edit_date'] = pd.to_datetime(
df_all['edit_date'], infer_datetime_format=True)
# added line of code below
#df_all.fillna("N_A", inplace = True)
# Rename all the columns to snake_case convention
df_all = df_all.rename(columns={'location_details_landmarks': 'location', 'type_of_event': 'event_type', 'who_was_involved': 'person_involved', 'other_who_was_involved': 'other_person_involved', 'what_was_the_situation': 'what_situation', 'other_what_was_the_situation': 'other_situation', 'medical/_health_concerns': 'medical_health_concerns', 'other_medical/_health_concerns': 'other_medical_health_concerns', 'problematic_social_behaviours': 'problematic_social_behavior', 'streetscape_&_public_realm': 'streetscape_public_realm', 'other_streetscape_&_public_realm': 'other_streetscape_public_realm',
'engagement_requested': 'engage_request', 'other_engagement_requested': 'other_engage_request', 'engagement_provided': 'engage_provided', 'channelling':'channelling', 'other_channelling':'other_channelling','report_completed_by': 'report_completed', 'assessment1': 'assessment', 'relevant_notes/_description_of_incident': 'notes_description', 'if_the_person_is_a_resident/_visitor/_person_with_ubn_have_you_o': 'previous_engagement', 'list_their_distinctive_characteristics/tags': 'yes_characteristics_tags', 'if_"_yes"_how_many_times_including_this_time': 'yes_number_times', 'x': 'x_longitude', 'y': 'y_latitude'})
df_all = df_all.fillna('no_data')
df_all = df_all.replace('N_A', 'no_data', regex=True)
df_all = df_all.replace('N_A,', ',no_data', regex=True)
df_all = df_all.replace(',N_A,', ',no_data', regex=True)
df_all = df_all.replace('None', 'no_data')
df_all["event_type"] = df_all["event_type"].str.replace(
'Streetscape_Public_Realm', 'Streetscape', regex=False)
df_all["creator"] = df_all["creator"].str.replace(
'IvanOsorioAvila', 'Ivan Osorio Avila', regex=False)
df_all["creator"] = df_all["creator"].str.replace(
'carmen_12csi', 'Carmen Poon', regex=False)
df_all["editor"] = df_all["editor"].str.replace(
'IvanOsorioAvila', 'Ivan Osorio Avila', regex=False)
df_all["editor"] = df_all["editor"].str.replace(
'carmen_12csi', 'Carmen Poon', regex=False)
# combine person_involved with other_person_involved column
df_all['person_involved'] = df_all.person_involved.str.cat(
df_all.other_person_involved, sep=",")
df_all = df_all.drop(columns=['other_person_involved'])
df_all["person_involved"] = df_all["person_involved"].str.replace(
'other', 'no_data', regex=False)
# combine what_situation with other_situation column
df_all['what_situation'] = df_all.what_situation.str.cat(
df_all.other_situation, sep=",")
df_all = df_all.drop(columns=['other_situation'])
df_all["what_situation"] = df_all["what_situation"].str.replace(
'other', 'no_data', regex=False)
# combine medical_health_concerns with other_medical_health_concerns column
df_all['medical_health_concerns'] = df_all.medical_health_concerns.str.cat(
df_all.other_medical_health_concerns, sep=",")
df_all = df_all.drop(columns=['other_medical_health_concerns'])
df_all["medical_health_concerns"] = df_all["medical_health_concerns"].str.replace(
'other', 'no_data', regex=False)
# combine streetscape_public_realm with other_streetscape_public_realm column
df_all['streetscape_public_realm'] = df_all.streetscape_public_realm.str.cat(
df_all.other_streetscape_public_realm, sep=",")
df_all = df_all.drop(columns=['other_streetscape_public_realm'])
df_all["streetscape_public_realm"] = df_all["streetscape_public_realm"].str.replace(
'other', 'no_data', regex=False)
# combine engage_request with other_engage_request column
df_all['engage_request'] = df_all.engage_request.str.cat(
df_all.other_engage_request, sep=",")
df_all = df_all.drop(columns=['other_engage_request'])
df_all["engage_request"] = df_all["engage_request"].str.replace(
'other', 'no_data', regex=False)
# combine channelling with other_channelling column
df_all['channelling'] = df_all.channelling.str.cat(
df_all.other_channelling, sep=",")
df_all = df_all.drop(columns=['other_channelling'])
df_all["channelling"] = df_all["channelling"].str.replace(
'other', 'no_data', regex=False)
# combine referrals with other_referrals column
df_all['referrals'] = df_all.referrals.str.cat(
df_all.other_referrals, sep=",")
df_all = df_all.drop(columns=['other_referrals'])
df_all["referrals"] = df_all["referrals"].str.replace(
'other', 'no_data', regex=False)
# combine engage_provided with other_engagement_provided column
df_all['engage_provided'] = df_all.engage_provided.str.cat(
df_all.other_engagement_provided, sep=",")
df_all = df_all.drop(columns=['other_engagement_provided'])
df_all["engage_provided"] = df_all["engage_provided"].str.replace(
'other', 'no_data', regex=False)
points = get_poi.get_events_geoenriched()
points = points.drop(['index'], axis=1)
#print(points.dtypes)
coord = get_poi.get_places_geoenriched()
coord = coord.drop(['index'], axis=1)
#print(coord.dtypes)
# spatial join checking our x and y to see if inside the polygon co-ordinates
points_within = gpd.sjoin(points, coord.head(10))
#points_within = gpd.sjoin(points, coord.head(10), op='within')
points_within = points_within.rename(columns={'ObjectID': 'object_id'})
#print(points_within.dtypes)
# joining both dataframes based on object_id
df_all = pd.merge(
df_all, points_within[['object_id', 'name']], how='left', on=['object_id'])
df_all = df_all.rename(columns={'name': 'hot_spot'})
df_all.loc[df_all['hot_spot'].isna(), 'hot_spot'] = 'not in \'hotspot\''
df_all['general_location'] = df_all['location']
df_all = df_all[['index', 'object_id', 'global_id', 'creation_date', 'creator', 'edit_date', 'editor', 'location'] + ['general_location'] + ['event_type', 'person_involved', 'what_situation', 'medical_health_concerns', 'problematic_social_behavior',
'streetscape_public_realm', 'engage_request', 'engage_provided', 'channelling', 'referrals', 'report_completed', 'assessment', 'notes_description', 'previous_engagement', 'yes_characteristics_tags', 'yes_number_times', 'x_longitude', 'y_latitude', 'hot_spot']]
# data cleaning to make a general location - so dashboard will have better more accurate information
df_all['general_location'] = df_all['general_location'] = (
df_all.general_location.str.lower())
df_all["general_location"] = df_all["general_location"].str.replace(
'forrest', 'forest', regex=False)
df_all.loc[df_all['general_location'].str.contains(
'shoppers'), 'general_location'] = 'shoppers drug mart'
df_all.loc[df_all['general_location'].str.contains(
'big al’s'), 'general_location'] = 'big al’s bar and grill'
df_all.loc[df_all['general_location'].str.contains(
'victory'), 'general_location'] = 'victory outreach church'
df_all.loc[df_all['general_location'].str.contains(
'alex'), 'general_location'] = 'the alex'
df_all.loc[df_all['general_location'].str.contains(
'bravo'), 'general_location'] = 'bravo restaurant'
df_all.loc[df_all['general_location'].str.contains(
'chicago’s pub'), 'general_location'] = 'chicago pub'
df_all.loc[df_all['general_location'].str.contains(
'chicken the way'), 'general_location'] = 'chicken on the way'
df_all.loc[df_all['general_location'].str.contains(
'mekong'), 'general_location'] = 'mekong vietnamese restaurant'
df_all.loc[df_all['general_location'].str.contains(
'osmows'), 'general_location'] = 'osmows mediterranean cuisine'
df_all.loc[df_all['general_location'].str.contains(
'shawarma'), 'general_location'] = 'shawarma palace'
df_all.loc[df_all['general_location'].str.contains(
'across from 4201 19ave'), 'general_location'] = 'across street from 4201 19ave'
df_all.loc[df_all['general_location'].str.contains(
'ba mien'), 'general_location'] = 'ba mien quan vietnamese cafe'
df_all.loc[df_all['general_location'].str.contains(
'circle k'), 'general_location'] = 'circle k'
df_all.loc[df_all['general_location'].str.contains(
'forest lawn physiotherapy'), 'general_location'] = 'forest lawn physiotherapy'
df_all.loc[df_all['general_location'].str.contains(
'fuse'), 'general_location'] = 'fuse 33'
df_all.loc[df_all['general_location'].str.contains(
'kokonut kove'), 'general_location'] = 'kokonut kove'
df_all.loc[df_all['general_location'].str.contains(
'marsha allah'), 'general_location'] = 'mashaallah restaurant'
df_all.loc[df_all['general_location'].str.contains(
'mashaalah'), 'general_location'] = 'mashaallah restaurant'
df_all.loc[df_all['general_location'].str.contains(
'nature’s food in store'), 'general_location'] = 'nature’s food and spice'
return df_all
def target_columns():
target_columns = ['event_type', 'person_involved',
'what_situation', 'medical_health_concerns', 'problematic_social_behavior',
'streetscape_public_realm', 'engage_request', 'engage_provided',
'channelling', 'referrals', 'report_completed', 'assessment',
'notes_description', 'previous_engagement','hot_spot']
return target_columns
def input_data():
input_data = [
{'label':'Event Type', 'value': "event_type"},
{'label':'Person Involved', 'value': "person_involved"},
{'label':'What Situation', 'value': "what_situation"},
{'label':'Medical Health Concerns', 'value': "medical_health_concerns"},
{'label':'Problematic Social Behavior', 'value': "problematic_social_behavior"},
{'label':'Streetscape', 'value': "streetscape_public_realm"},
{'label':'Engagement Request', 'value': "engage_request"},
{'label':'Engagement Provided', 'value': "engage_provided"},
{'label':'Channelling', 'value': "channelling"},
{'label':'Referrals', 'value': "referrals"},
{'label':'Report Completed', 'value': "report_completed"},
{'label':'Assessment', 'value': "assessment"},
{'label':'Notes Description', 'value': "notes_description"},
{'label':'Previous Engagement', 'value': "previous_engagement"},
{'label':'Hot Spot', 'value': "hot_spot"},
]
return input_data
if __name__ == '__main__':
df = preprocess_and_load_survey_df()
write_pandas_to_db(df, 'preprocess')
# print(df)
#df.to_csv('df.csv')