#!/usr/bin/env python
# coding: utf-8
"""
Clean road ratings data
-----------------------
Description
-----------
Data cleaning for yearly audit of road quality.
- Appends all years of data together.
- Standardizes column names.
- Removes invalid values.
- Exports to CSV.
"""
import glob
import re

import pandas as pd
import yaml
# ## Static variables
with open('./../../datafiles.yaml') as config_file:
    DATA_PATHS = yaml.safe_load(config_file)
ROAD_RATINGS_DIR = DATA_PATHS['datadir']['raw_data_dir'] + '/Road_Ratings/'
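# Assumed layout of datafiles.yaml, inferred from the keys used in this script
# (the real file may contain additional entries):
#   datadir:
#     raw_data_dir: /path/to/raw/data
#     clean_data_dir: /path/to/clean/data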
# Regex to extract the four-digit year from each file name.
YEAR_REGEX = re.compile(r'\d{4}')
# Column names found in CSVs
COLUMN_NAMES = ['Street Name', 'From', 'To', 'Block', 'Ward',
'Overall', 'Crack', 'Patch', 'Date R8d', 'Length', 'Width',
'year_inspected'
]
# Column names to export to SQL
SQL_COLUMN_NAMES = ['street_name', 'street_from', 'street_to', 'block_number', 'ward_number',
'overall_rating', 'crack_rating', 'patch_rating', 'date_rated', 'street_length',
'street_width', 'year_inspected']
# Columns that should be integers (used to put in blanks where text is observed)
SQL_INT_COLUMNS = ['block_number', 'ward_number', 'overall_rating', 'crack_rating', 'patch_rating',
'street_length', 'street_width', 'year_inspected']
# Valid overall ratings; rows with ratings outside 0-10 are dropped below.
ACCEPTABLE_RATINGS = range(0, 11)
# ## Extract and append CSVs containing road ratings.
# Collect the CSV file names for every year of ratings.
csv_list = glob.glob(ROAD_RATINGS_DIR + '*.csv')
# Read each CSV, tag it with the inspection year parsed from its file name,
# and keep only the shared columns.
df_list = []
for file_name in csv_list:
    df = pd.read_csv(file_name)
    # Each file name is expected to contain a four-digit year.
    search = YEAR_REGEX.search(file_name)
    year = search.group(0)
    df['year_inspected'] = year
    if 'Rating' in df.columns:
        # 2002 data has column name of "Rating" instead of "Overall"
        df = df.rename(columns={'Rating': 'Overall'})
    trimmed = df[COLUMN_NAMES]
    df_list.append(trimmed)
# Stack all years into a single dataframe.
appended = pd.concat(df_list)
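# Note: the concatenated frame keeps each CSV's original row index; that is
# harmless here because the index is dropped when exporting below.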
# ## Filter invalid data and coerce incorrect data to null (e.g., text in numeric column)
# Keep only rows whose overall rating is a whole number from 0 to 10 (about 99% of the data).
appended['Numeric_Overall'] = pd.to_numeric(appended.Overall, errors='coerce')
filtered = appended[appended['Numeric_Overall'].isin(ACCEPTABLE_RATINGS)]
# Keep the shared columns and rename them for SQL.
trimmed = filtered[COLUMN_NAMES].copy()
trimmed.columns = SQL_COLUMN_NAMES
# Coerce text in columns that should be numeric to null (exported as blanks in the CSV).
for col in SQL_INT_COLUMNS:
    trimmed[col] = pd.to_numeric(trimmed[col], errors='coerce')
# ## Export to CSV to copy into DB
trimmed.to_csv(DATA_PATHS['datadir']['clean_data_dir'] + '/road_ratings.csv', index=False)
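# Usage (assumed): run from this script's own directory so the relative path to
# datafiles.yaml resolves, e.g.
#   python wrangle_road_ratings.py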