diff --git a/.gitignore b/.gitignore
index 5cd2556d..9de0fa30 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@ cache/
 *.rds
 *.zip
 *.csv
+!docs/data-dict.csv
 *.xlsx
 *.xlsm
 *.html
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3d9242f2..8bbd70f7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -27,3 +27,10 @@ repos:
         entry: Cannot commit .Rhistory, .RData, .Rds or .rds.
         language: fail
         files: '\.(Rhistory|RData|Rds|rds)$'
+      - id: check-data-dict
+        name: Data dictionary must be up to date with params file
+        entry: Rscript R/hooks/check-data-dict.R
+        files: (^|/)((params\.yaml)|(data-dict\.csv))$
+        language: r
+        additional_dependencies:
+          - yaml
diff --git a/R/hooks/check-data-dict.R b/R/hooks/check-data-dict.R
new file mode 100644
index 00000000..80e8aeec
--- /dev/null
+++ b/R/hooks/check-data-dict.R
@@ -0,0 +1,62 @@
+#!/usr/bin/env Rscript
+# Pre-commit hook that checks that the data dictionary file is up to date
+# with the latest feature set.
+#
+# Compares the `variable_name` column of the data dictionary CSV against the
+# `model$predictor$all` entry of the params file, and stops with an error
+# listing every name that appears in only one of the two.
+library(yaml)
+
+params_filename <- "params.yaml"
+data_dict_filename <- "docs/data-dict.csv"
+
+params <- read_yaml(params_filename)
+data_dict <- read.csv(data_dict_filename)
+
+# Fail with a specific message if either side of the comparison is missing;
+# otherwise setdiff() against NULL would flag every feature as out of sync
+if (is.null(params$model$predictor$all)) {
+  stop(
+    "No predictors found under model$predictor$all in ",
+    params_filename,
+    call. = FALSE
+  )
+}
+if (!"variable_name" %in% names(data_dict)) {
+  stop(
+    "No variable_name column found in ",
+    data_dict_filename,
+    call. = FALSE
+  )
+}
+
+# unlist() flattens the predictors in case the YAML parser returns a list
+predictors <- as.character(unlist(params$model$predictor$all))
+dict_vars <- as.character(data_dict[["variable_name"]])
+
+# Names that are present in exactly one of the two files
+symmetric_diff <- c(
+  setdiff(dict_vars, predictors),
+  setdiff(predictors, dict_vars)
+)
+symmetric_diff_len <- length(symmetric_diff)
+
+if (symmetric_diff_len > 0) {
+  # The condition is a scalar, so use if/else rather than vectorized ifelse()
+  err_msg_prefix <- if (symmetric_diff_len == 1) "Param is" else "Params are"
+  err_msg <- paste0(
+    err_msg_prefix,
+    " not present in both ",
+    params_filename,
+    " and ",
+    data_dict_filename,
+    ": ",
+    paste(symmetric_diff, collapse = ", "),
+    ". ",
+    "Did you forget to reknit README.Rmd after updating ",
+    params_filename,
+    "?"
+  )
+  # call. = FALSE keeps the call out of the error message
+  stop(err_msg, call. = FALSE)
+}
diff --git a/README.Rmd b/README.Rmd
index 8ba2a2e4..a5dbcbf4 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -231,10 +231,11 @@ Model accuracy for each parameter combination is measured on a validation set us
 
 ### Features Used
 
-The residential model uses a variety of individual and aggregate features to determine a property's assessed value. We've tested a long list of possible features over time, including [walk score](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob/9407d1fae1986c5ce1f5434aa91d3f8cf06c8ea1/output/test_new_variables/county_walkscore.html), [crime rate](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob/9407d1fae1986c5ce1f5434aa91d3f8cf06c8ea1/output/test_new_variables/chicago_crimerate.html), [school districts](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob/9407d1fae1986c5ce1f5434aa91d3f8cf06c8ea1/output/test_new_variables/county_school_boundaries_mean_encoded.html), and many others. The features in the table below are the ones that made the cut. They're the right combination of easy to understand and impute, powerfully predictive, and well-behaved. Most of them are in use in the model as of `r Sys.Date()`.
+The residential model uses a variety of individual and aggregate features to determine a property's assessed value.
We've tested a long list of possible features over time, including [walk score](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob/9407d1fae1986c5ce1f5434aa91d3f8cf06c8ea1/output/test_new_variables/county_walkscore.html), [crime rate](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob/9407d1fae1986c5ce1f5434aa91d3f8cf06c8ea1/output/test_new_variables/chicago_crimerate.html), [school districts](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob/9407d1fae1986c5ce1f5434aa91d3f8cf06c8ea1/output/test_new_variables/county_school_boundaries_mean_encoded.html), and many others. The features in the table below are the ones that made the cut. They're the right combination of easy to understand and impute, powerfully predictive, and well-behaved. ```{r feature_guide, message=FALSE, results='asis', echo=FALSE} library(dplyr) +library(readr) library(tidyr) library(yaml) library(jsonlite) @@ -316,38 +317,71 @@ param_notes <- param_tbl$value %>% )) %>% unlist() -ccao::vars_dict %>% - inner_join( - param_tbl %>% mutate(description = param_notes), - by = c("var_name_model" = "value") +param_tbl_fmt <- param_tbl %>% + mutate(description = param_notes) %>% + left_join( + ccao::vars_dict, + by = c("value" = "var_name_model") ) %>% group_by(var_name_pretty) %>% mutate(row = paste0("X", row_number())) %>% distinct( - `Feature Name` = var_name_pretty, - Category = var_type, - Type = var_data_type, - Notes = description, - var_value, row + feature_name = var_name_pretty, + variable_name = value, + description, + category = var_type, + type = var_data_type, + var_code, var_value, row ) %>% - mutate(Category = recode( - Category, + mutate(category = recode( + category, char = "Characteristic", acs5 = "ACS5", loc = "Location", prox = "Proximity", ind = "Indicator", time = "Time", - meta = "Meta", other = "Other", ccao = "Other" + meta = "Meta", other = "Other", ccao = "Other", shp = "Parcel Shape" )) %>% pivot_wider( - 
id_cols = `Feature Name`:`Notes`, + id_cols = `feature_name`:`category`, names_from = row, - values_from = var_value + values_from = c(var_code, var_value) + ) %>% + unite( + "possible_codes", + starts_with("var_code_X"), + sep = ", ", + na.rm = TRUE + ) %>% + unite( + "possible_values", + starts_with("var_value_X"), + sep = ", ", + na.rm = TRUE + ) %>% + mutate(description = replace_na(description, "")) %>% + arrange(category) + +# Write machine-readable version of the table to file +param_tbl_fmt %>% + write_csv("docs/data-dict.csv") + +# Render human-readable version of the table to the doc +param_tbl_fmt %>% + rename( + "Feature Name" = "feature_name", + "Variable Name" = "variable_name", + "Description" = "description", + "Category" = "category", + "Possible Values (Encoded)" = "possible_codes", + "Possible Values (Semantic)" = "possible_values", ) %>% - unite("Possible Values", starts_with("X"), sep = ", ", na.rm = TRUE) %>% - mutate(Notes = replace_na(Notes, "")) %>% - arrange(Category) %>% - relocate(Notes, .after = everything()) %>% knitr::kable(format = "markdown") ``` +We maintain a few useful resources for working with these features: + +- Once you've [pulled the input data](#getting-data), you can inner join the data to the CSV version of the data dictionary ([`docs/data-dict.csv`](./docs/data-dict.csv)) to filter for only the features that we use in the model. +- You can browse our [data catalog](https://ccao-data.github.io/data-architecture/#!/overview) to see more details about these features, in particular the [residential model input view](https://ccao-data.github.io/data-architecture/#!/model/model.ccao_data_athena.model.vw_card_res_input) which is the source of our training data. 
+- You can use the [`ccao` R package](https://ccao-data.github.io/ccao/) or its [Python equivalent](https://ccao-data.github.io/ccao/python/) to programmatically convert variable names to their human-readable versions ([`ccao::vars_rename()`](https://ccao-data.github.io/ccao/reference/vars_rename.html)) or convert numerically-encoded variables to human-readable values ([`ccao::vars_recode()`](https://ccao-data.github.io/ccao/reference/vars_recode.html)). The [`ccao::vars_dict` object](https://ccao-data.github.io/ccao/reference/vars_dict.html) is also useful for inspecting the raw crosswalk that powers the rename and recode functions. + #### Data Sources We rely on numerous third-party sources to add new features to our data. These features are used in the primary valuation model and thus need to be high-quality and error-free. A non-exhaustive list of features and their respective sources includes: diff --git a/README.md b/README.md index 351267f2..10f698e4 100644 --- a/README.md +++ b/README.md @@ -367,107 +367,134 @@ rate](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob districts](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob/9407d1fae1986c5ce1f5434aa91d3f8cf06c8ea1/output/test_new_variables/county_school_boundaries_mean_encoded.html), and many others. The features in the table below are the ones that made the cut. They’re the right combination of easy to understand and impute, -powerfully predictive, and well-behaved. Most of them are in use in the -model as of 2024-04-12. 
- -| Feature Name | Category | Type | Possible Values | Notes | -|:------------------------------------------------------------------------|:---------------|:------------|:---------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------| -| Percent Population Age, Under 19 Years Old | ACS5 | numeric | | Percent of the people 17 years or younger | -| Percent Population Age, Over 65 Years Old | ACS5 | numeric | | Percent of the people 65 years or older | -| Median Population Age | ACS5 | numeric | | Median age for whole population | -| Percent Population Mobility, In Same House 1 Year Ago | ACS5 | numeric | | Percent of people (older than 1 year) who have not moved in the past 12 months | -| Percent Population Mobility, Moved From Other State in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved from another state in the past 12 months | -| Percent Households Family, Married | ACS5 | numeric | | Percent of households that are family, married | -| Percent Households Nonfamily, Living Alone | ACS5 | numeric | | Percent of households that are non-family, alone (single) | -| Percent Population Education, High School Degree | ACS5 | numeric | | Percent of people older than 25 who attained a high school degree | -| Percent Population Education, Bachelor Degree | ACS5 | numeric | | Percent of people older than 25 who attained a bachelor’s degree | -| Percent Population Education, Graduate Degree | ACS5 | numeric | | Percent of people older than 25 who attained a graduate degree | -| Percent Population Income, Below Poverty Level | ACS5 | numeric | | Percent of people above the poverty level in the last 12 months | -| Median Income, Household in Past Year | ACS5 | numeric | | Median income per household in the past 12 months | -| Median Income, Per Capita in Past Year | ACS5 | numeric | | 
Median income per capita in the past 12 months | -| Percent Population Income, Received SNAP in Past Year | ACS5 | numeric | | Percent of households that received SNAP in the past 12 months | -| Percent Population Employment, Unemployed | ACS5 | numeric | | Percent of people 16 years and older unemployed | -| Median Occupied Household, Total, Year Built | ACS5 | numeric | | Median year built for all occupied households | -| Median Occupied Household, Renter, Gross Rent | ACS5 | numeric | | Median gross rent for only renter-occupied units | -| Percent Occupied Households, Owner | ACS5 | numeric | | Percent of households that are owner-occupied | -| Percent Occupied Households, Total, One or More Selected Conditions | ACS5 | numeric | | Percent of occupied households with selected conditions | -| Percent Population Mobility, Moved From Within Same County in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved in county in the past 12 months | -| Year Built | Characteristic | numeric | | Year the property was constructed | -| Central Air Conditioning | Characteristic | categorical | Central A/C, No Central A/C | Indicator for central air | -| Apartments | Characteristic | categorical | Two, Three, Four, Five, Six, None | Number of apartments for class 211 and 212 properties | -| Attic Finish | Characteristic | categorical | Living Area, Partial, None | Attic finish | -| Attic Type | Characteristic | categorical | Full, Partial, None | Attic type | -| Bedrooms | Characteristic | numeric | | Number of bedrooms in the building | -| Building Square Feet | Characteristic | numeric | | Square footage of the building, as measured from the exterior | -| Basement Type | Characteristic | categorical | Full, Slab, Partial, Crawl | Basement type | -| Basement Finish | Characteristic | categorical | Formal Rec Room, Apartment, Unfinished | Basement finish | -| Exterior Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco 
| Exterior wall construction | -| Full Baths | Characteristic | numeric | | Number of full bathrooms | -| Fireplaces | Characteristic | numeric | | Number of fireplaces | -| Garage 1 Attached | Characteristic | categorical | Yes, No | Indicator for garage attached | -| Garage 1 Ext. Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | Garage exterior wall construction | -| Garage 1 Size | Characteristic | categorical | 1 cars, 1.5 cars, 2 cars, 2.5 cars, 3 cars, 3.5 cars, 0 cars, 4 cars | Garage size (number of cars) | -| Half Baths | Characteristic | numeric | | Number of half baths | -| Land Square Feet | Characteristic | numeric | | Square footage of the land (not just the building) of the property | -| Central Heating | Characteristic | categorical | Warm Air Furnace, Hot Water Steam, Electric Heater, None | Interior heating type | -| Number of Commercial Units | Characteristic | numeric | | Number of commercial units | -| Porch | Characteristic | categorical | None, Frame Enclosed, Masonry Enclosed | Porch type | -| Roof Material | Characteristic | categorical | Shingle + Asphalt, Tar + Gravel, Slate, Shake, Tile, Other | Roof material / construction | -| Rooms | Characteristic | numeric | | Number of total rooms in the building (excluding baths) | -| Cathedral Ceiling | Characteristic | categorical | Yes, No | Deprecated | -| Type of Residence | Characteristic | categorical | 1 Story, 2 Story, 3 Story +, Split Level, 1.5 Story, Missing | Type of residence | -| Recent Renovation | Characteristic | logical | | Indicates whether or not a property was renovated within the last 3 years | -| Property Class | Characteristic | character | | Card-level property type and/or use | -| Longitude | Location | numeric | | X coordinate in degrees (global longitude) | -| Latitude | Location | numeric | | Y coordinate in degrees (global latitude) | -| Census Tract GEOID | Location | character | | 11-digit ACS/Census tract GEOID | -| First 
Street Factor | Location | numeric | | First Street flood factor The flood factor is a risk score, where 10 is the highest risk and 1 is the lowest risk | -| School Elementary District GEOID | Location | character | | School district (elementary) GEOID | -| School Secondary District GEOID | Location | character | | School district (secondary) GEOID | -| Municipality Name | Location | character | | Taxing district name, as seen on Cook County tax bills | -| CMAP Walkability Score (No Transit) | Location | numeric | | CMAP walkability score for a given PIN, excluding transit walkability | -| CMAP Walkability Total Score | Location | numeric | | CMAP walkability score for a given PIN, including transit walkability | -| Airport Noise DNL | Location | numeric | | O’Hare and Midway noise, measured as DNL | -| Township Code | Meta | character | | Cook County township code | -| Neighborhood Code | Meta | character | | Assessor neighborhood code | -| Number of sales within previous N years of sale/lien date | Meta | numeric | | Number of sales within previous N years of sale/lien date | -| Property Tax Bill Aggregate Rate | Other | numeric | | Tax bill rate for the taxing district containing a given PIN | -| School District (Elementary) GreatSchools Rating | Other | numeric | | Average GreatSchools rating of elementary schools within the district of a given PIN | -| School District (Secondary) GreatSchools Rating | Other | numeric | | Average GreatSchools rating of secondary schools within the district of a given PIN | -| Corner Lot | Other | logical | | Corner lot indicator | -| Active Homeowner Exemption | Other | logical | | Parcel has an active homeowner exemption | -| Number of Years Active Homeowner Exemption | Other | numeric | | Number of years parcel has had an active homeowner exemption | -| Number of PINs in Half Mile | Proximity | numeric | | Number of PINs within half mile | -| Number of Bus Stops in Half Mile | Proximity | numeric | | Number of bus stops 
within half mile | -| Number of Foreclosures Per 1000 PINs (Past 5 Years) | Proximity | numeric | | Number of foreclosures per 1000 PINs, within half mile (past 5 years) | -| Number of Schools in Half Mile | Proximity | numeric | | Number of schools (any kind) within half mile | -| Number of Schools with Rating in Half Mile | Proximity | numeric | | Number of schools (any kind) within half mile | -| Average School Rating in Half Mile | Proximity | numeric | | Average school rating of schools within half mile | -| Nearest Bike Trail Distance (Feet) | Proximity | numeric | | Nearest bike trail distance (feet) | -| Nearest Cemetery Distance (Feet) | Proximity | numeric | | Nearest cemetery distance (feet) | -| Nearest CTA Route Distance (Feet) | Proximity | numeric | | Nearest CTA route distance (feet) | -| Nearest CTA Stop Distance (Feet) | Proximity | numeric | | Nearest CTA stop distance (feet) | -| Nearest Hospital Distance (Feet) | Proximity | numeric | | Nearest hospital distance (feet) | -| Lake Michigan Distance (Feet) | Proximity | numeric | | Distance to Lake Michigan shoreline (feet) | -| Nearest Major Road Distance (Feet) | Proximity | numeric | | Nearest major road distance (feet) | -| Nearest Metra Route Distance (Feet) | Proximity | numeric | | Nearest Metra route distance (feet) | -| Nearest Metra Stop Distance (Feet) | Proximity | numeric | | Nearest Metra stop distance (feet) | -| Nearest Park Distance (Feet) | Proximity | numeric | | Nearest park distance (feet) | -| Nearest Railroad Distance (Feet) | Proximity | numeric | | Nearest railroad distance (feet) | -| Nearest Secondary Road Distance (Feet) | Proximity | numeric | | Nearest secondary road distance (feet) | -| Nearest University Distance (Feet) | Proximity | numeric | | Nearest university distance (feet) | -| Nearest Vacant Land Parcel Distance (Feet) | Proximity | numeric | | Nearest vacant land (class 100) parcel distance (feet) | -| Nearest Water Distance (Feet) | Proximity | numeric | | 
Nearest water distance (feet) | -| Nearest Golf Course Distance (Feet) | Proximity | numeric | | Nearest golf course distance (feet) | -| Total Airport Noise DNL | Proximity | numeric | | Estimated DNL for a PIN, assuming a baseline DNL of 50 (“quiet suburban”) and adding predicted noise from O’Hare and Midway airports to that baseline | -| Sale Year | Time | numeric | | Sale year calculated as the number of years since 0 B.C.E | -| Sale Day | Time | numeric | | Sale day calculated as the number of days since January 1st, 1997 | -| Sale Quarter of Year | Time | character | | Character encoding of quarter of year (Q1 - Q4) | -| Sale Month of Year | Time | character | | Character encoding of month of year (Jan - Dec) | -| Sale Day of Year | Time | numeric | | Numeric encoding of day of year (1 - 365) | -| Sale Day of Month | Time | numeric | | Numeric encoding of day of month (1 - 31) | -| Sale Day of Week | Time | numeric | | Numeric encoding of day of week (1 - 7) | -| Sale After COVID-19 | Time | logical | | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) | +powerfully predictive, and well-behaved. 
+ +| Feature Name | Variable Name | Description | Category | Possible Values (Encoded) | Possible Values (Semantic) | +|:----------------------------------------------------------------------------|:------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------|:--------------------------|:---------------------------------------------------------------------| +| Percent Population Age, Under 19 Years Old | acs5_percent_age_children | Percent of the people 17 years or younger | ACS5 | | | +| Percent Population Age, Over 65 Years Old | acs5_percent_age_senior | Percent of the people 65 years or older | ACS5 | | | +| Median Population Age | acs5_median_age_total | Median age for whole population | ACS5 | | | +| Percent Households Family, Married | acs5_percent_household_family_married | Percent of households that are family, married | ACS5 | | | +| Percent Households Nonfamily, Living Alone | acs5_percent_household_nonfamily_alone | Percent of households that are non-family, alone (single) | ACS5 | | | +| Percent Population Education, High School Degree | acs5_percent_education_high_school | Percent of people older than 25 who attained a high school degree | ACS5 | | | +| Percent Population Education, Bachelor Degree | acs5_percent_education_bachelor | Percent of people older than 25 who attained a bachelor’s degree | ACS5 | | | +| Percent Population Education, Graduate Degree | acs5_percent_education_graduate | Percent of people older than 25 who attained a graduate degree | ACS5 | | | +| Percent Population Income, Below Poverty Level | acs5_percent_income_below_poverty_level | Percent of people above the poverty level in the last 12 months | ACS5 | | | +| Median Income, Household in Past Year | acs5_median_income_household_past_year | Median income per household in the past 12 months | ACS5 | | | +| Median 
Income, Per Capita in Past Year | acs5_median_income_per_capita_past_year | Median income per capita in the past 12 months | ACS5 | | | +| Percent Population Income, Received SNAP in Past Year | acs5_percent_income_household_received_snap_past_year | Percent of households that received SNAP in the past 12 months | ACS5 | | | +| Percent Population Employment, Unemployed | acs5_percent_employment_unemployed | Percent of people 16 years and older unemployed | ACS5 | | | +| Median Occupied Household, Total, Year Built | acs5_median_household_total_occupied_year_built | Median year built for all occupied households | ACS5 | | | +| Median Occupied Household, Renter, Gross Rent | acs5_median_household_renter_occupied_gross_rent | Median gross rent for only renter-occupied units | ACS5 | | | +| Percent Occupied Households, Owner | acs5_percent_household_owner_occupied | Percent of households that are owner-occupied | ACS5 | | | +| Year Built | char_yrblt | Year the property was constructed | Characteristic | | | +| Central Air Conditioning | char_air | Indicator for central air | Characteristic | 1, 2 | Central A/C, No Central A/C | +| Apartments | char_apts | Number of apartments for class 211 and 212 properties | Characteristic | 1, 2, 3, 4, 5, 6 | Two, Three, Four, Five, Six, None | +| Attic Finish | char_attic_fnsh | Attic finish | Characteristic | 1, 2, 3 | Living Area, Partial, None | +| Attic Type | char_attic_type | Attic type | Characteristic | 1, 2, 3 | Full, Partial, None | +| Bedrooms | char_beds | Number of bedrooms in the building | Characteristic | | | +| Building Square Feet | char_bldg_sf | Square footage of the building, as measured from the exterior | Characteristic | | | +| Basement Type | char_bsmt | Basement type | Characteristic | 1, 2, 3, 4 | Full, Slab, Partial, Crawl | +| Basement Finish | char_bsmt_fin | Basement finish | Characteristic | 1, 2, 3 | Formal Rec Room, Apartment, Unfinished | +| Property Class | char_class | Card-level property type 
and/or use | Characteristic | | | +| Exterior Wall Material | char_ext_wall | Exterior wall construction | Characteristic | 1, 2, 3, 4 | Frame, Masonry, Frame + Masonry, Stucco | +| Full Baths | char_fbath | Number of full bathrooms | Characteristic | | | +| Fireplaces | char_frpl | Number of fireplaces | Characteristic | | | +| Garage 1 Attached | char_gar1_att | Indicator for garage attached | Characteristic | 1, 2 | Yes, No | +| Garage 1 Ext. Wall Material | char_gar1_cnst | Garage exterior wall construction | Characteristic | 1, 2, 3, 4 | Frame, Masonry, Frame + Masonry, Stucco | +| Garage 1 Size | char_gar1_size | Garage size (number of cars) | Characteristic | 1, 2, 3, 4, 5, 6, 7, 8 | 1 cars, 1.5 cars, 2 cars, 2.5 cars, 3 cars, 3.5 cars, 0 cars, 4 cars | +| Half Baths | char_hbath | Number of half baths | Characteristic | | | +| Land Square Feet | char_land_sf | Square footage of the land (not just the building) of the property | Characteristic | | | +| Central Heating | char_heat | Interior heating type | Characteristic | 1, 2, 3, 4 | Warm Air Furnace, Hot Water Steam, Electric Heater, None | +| Number of Commercial Units | char_ncu | Number of commercial units | Characteristic | | | +| Porch | char_porch | Porch type | Characteristic | 0, 1, 2 | None, Frame Enclosed, Masonry Enclosed | +| Roof Material | char_roof_cnst | Roof material / construction | Characteristic | 1, 2, 3, 4, 5, 6 | Shingle + Asphalt, Tar + Gravel, Slate, Shake, Tile, Other | +| Rooms | char_rooms | Number of total rooms in the building (excluding baths) | Characteristic | | | +| Cathedral Ceiling | char_tp_dsgn | Deprecated | Characteristic | 1, 2 | Yes, No | +| Type of Residence | char_type_resd | Type of residence | Characteristic | 1, 2, 3, 4, 5, 9.9 | 1 Story, 2 Story, 3 Story +, Split Level, 1.5 Story, Missing | +| Recent Renovation | char_recent_renovation | Indicates whether or not a property was renovated within the last 3 years | Characteristic | | | +| Longitude | 
loc_longitude | X coordinate in degrees (global longitude) | Location | | | +| Latitude | loc_latitude | Y coordinate in degrees (global latitude) | Location | | | +| Census Tract GEOID | loc_census_tract_geoid | 11-digit ACS/Census tract GEOID | Location | | | +| First Street Factor | loc_env_flood_fs_factor | First Street flood factor The flood factor is a risk score, where 10 is the highest risk and 1 is the lowest risk | Location | | | +| School Elementary District GEOID | loc_school_elementary_district_geoid | School district (elementary) GEOID | Location | | | +| School Secondary District GEOID | loc_school_secondary_district_geoid | School district (secondary) GEOID | Location | | | +| CMAP Walkability Score (No Transit) | loc_access_cmap_walk_nta_score | CMAP walkability score for a given PIN, excluding transit walkability | Location | | | +| CMAP Walkability Total Score | loc_access_cmap_walk_total_score | CMAP walkability score for a given PIN, including transit walkability | Location | | | +| Municipality Name | loc_tax_municipality_name | Taxing district name, as seen on Cook County tax bills | Location | | | +| Township Code | meta_township_code | Cook County township code | Meta | | | +| Neighborhood Code | meta_nbhd_code | Assessor neighborhood code | Meta | | | +| Number of sales within previous N years of sale/lien date | meta_sale_count_past_n_years | Number of sales within previous N years of sale/lien date | Meta | | | +| Property Tax Bill Aggregate Rate | other_tax_bill_rate | Tax bill rate for the taxing district containing a given PIN | Other | | | +| School District (Elementary) GreatSchools Rating | other_school_district_elementary_avg_rating | Average GreatSchools rating of elementary schools within the district of a given PIN | Other | | | +| School District (Secondary) GreatSchools Rating | other_school_district_secondary_avg_rating | Average GreatSchools rating of secondary schools within the district of a given PIN | Other | | | +| 
Active Homeowner Exemption | ccao_is_active_exe_homeowner | Parcel has an active homeowner exemption | Other | | | +| Number of Years Active Homeowner Exemption | ccao_n_years_exe_homeowner | Number of years parcel has had an active homeowner exemption | Other | | | +| Standard Deviation Distance From Parcel Centroid to Vertices (Feet) | shp_parcel_centroid_dist_ft_sd | Standard deviation of the distance from each major parcel vertex to the parcel centroid | Parcel Shape | | | +| Standard Deviation Parcel Edge Length (Feet) | shp_parcel_edge_len_ft_sd | Standard deviation of the edge length between parcel vertices | Parcel Shape | | | +| Standard Deviation Parcel Interior Angle (Degrees) | shp_parcel_interior_angle_sd | Standard deviation of the interior angles of the parcel polygon | Parcel Shape | | | +| Ratio of Parcel Area to Minimum Rotated Bounding Rectangle | shp_parcel_mrr_area_ratio | Ratio of the parcel’s area to the area of its minimum rotated bounding rectangle | Parcel Shape | | | +| Ratio of Parcel Minimum Rotated Bounding Rectangle Longest to Shortest Side | shp_parcel_mrr_side_ratio | Ratio of the longest to the shortest side of the parcel’s minimum rotated bounding rectangle | Parcel Shape | | | +| Number of Parcel Vertices | shp_parcel_num_vertices | The number of vertices of the parcel | Parcel Shape | | | +| Number of PINs in Half Mile | prox_num_pin_in_half_mile | Number of PINs within half mile | Proximity | | | +| Number of Bus Stops in Half Mile | prox_num_bus_stop_in_half_mile | Number of bus stops within half mile | Proximity | | | +| Number of Foreclosures Per 1000 PINs (Past 5 Years) | prox_num_foreclosure_per_1000_pin_past_5_years | Number of foreclosures per 1000 PINs, within half mile (past 5 years) | Proximity | | | +| Average School Rating in Half Mile | prox_avg_school_rating_in_half_mile | Average school rating of schools within half mile | Proximity | | | +| Total Airport Noise DNL | prox_airport_dnl_total | Estimated DNL for a 
PIN, assuming a baseline DNL of 50 (“quiet suburban”) and adding predicted noise from O’Hare and Midway airports to that baseline | Proximity | | | +| Nearest Bike Trail Distance (Feet) | prox_nearest_bike_trail_dist_ft | Nearest bike trail distance (feet) | Proximity | | | +| Nearest Cemetery Distance (Feet) | prox_nearest_cemetery_dist_ft | Nearest cemetery distance (feet) | Proximity | | | +| Nearest CTA Route Distance (Feet) | prox_nearest_cta_route_dist_ft | Nearest CTA route distance (feet) | Proximity | | | +| Nearest CTA Stop Distance (Feet) | prox_nearest_cta_stop_dist_ft | Nearest CTA stop distance (feet) | Proximity | | | +| Nearest Hospital Distance (Feet) | prox_nearest_hospital_dist_ft | Nearest hospital distance (feet) | Proximity | | | +| Lake Michigan Distance (Feet) | prox_lake_michigan_dist_ft | Distance to Lake Michigan shoreline (feet) | Proximity | | | +| Nearest Metra Route Distance (Feet) | prox_nearest_metra_route_dist_ft | Nearest Metra route distance (feet) | Proximity | | | +| Nearest Metra Stop Distance (Feet) | prox_nearest_metra_stop_dist_ft | Nearest Metra stop distance (feet) | Proximity | | | +| Nearest Park Distance (Feet) | prox_nearest_park_dist_ft | Nearest park distance (feet) | Proximity | | | +| Nearest Railroad Distance (Feet) | prox_nearest_railroad_dist_ft | Nearest railroad distance (feet) | Proximity | | | +| Nearest University Distance (Feet) | prox_nearest_university_dist_ft | Nearest university distance (feet) | Proximity | | | +| Nearest Vacant Land Parcel Distance (Feet) | prox_nearest_vacant_land_dist_ft | Nearest vacant land (class 100) parcel distance (feet) | Proximity | | | +| Nearest Water Distance (Feet) | prox_nearest_water_dist_ft | Nearest water distance (feet) | Proximity | | | +| Nearest Golf Course Distance (Feet) | prox_nearest_golf_course_dist_ft | Nearest golf course distance (feet) | Proximity | | | +| Nearest Highway Distance (Feet) | prox_nearest_road_highway_dist_ft | Distance to nearest highway 
road | Proximity | | | +| Nearest Arterial Road Distance (Feet) | prox_nearest_road_arterial_dist_ft | Distance to nearest arterial road | Proximity | | | +| Nearest Collector Road Distance (Feet) | prox_nearest_road_collector_dist_ft | Distance to nearest collector road | Proximity | | | +| Average Daily Traffic Count on Nearest Highway | prox_nearest_road_highway_daily_traffic | Daily traffic of nearest highway road | Proximity | | | +| Average Daily Traffic Count on Nearest Arterial Road | prox_nearest_road_arterial_daily_traffic | Daily traffic of nearest arterial road | Proximity | | | +| Average Daily Traffic Count on Nearest Collector Road | prox_nearest_road_collector_daily_traffic | Daily traffic of nearest collector road | Proximity | | | +| Nearest New Construction (Feet) | prox_nearest_new_construction_dist_ft | Nearest new construction distance (feet) | Proximity | | | +| Nearest Major Stadium (Feet) | prox_nearest_stadium_dist_ft | Nearest stadium distance (feet) | Proximity | | | +| Sale Year | time_sale_year | Sale year calculated as the number of years since 0 B.C.E | Time | | | +| Sale Day | time_sale_day | Sale day calculated as the number of days since January 1st, 1997 | Time | | | +| Sale Quarter of Year | time_sale_quarter_of_year | Character encoding of quarter of year (Q1 - Q4) | Time | | | +| Sale Month of Year | time_sale_month_of_year | Character encoding of month of year (Jan - Dec) | Time | | | +| Sale Day of Year | time_sale_day_of_year | Numeric encoding of day of year (1 - 365) | Time | | | +| Sale Day of Month | time_sale_day_of_month | Numeric encoding of day of month (1 - 31) | Time | | | +| Sale Day of Week | time_sale_day_of_week | Numeric encoding of day of week (1 - 7) | Time | | | +| Sale After COVID-19 | time_sale_post_covid | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) | Time | | | + +We maintain a few useful resources for working with these features: + +- Once you’ve 
[pulled the input data](#getting-data), you can inner join + the data to the CSV version of the data dictionary + ([`docs/data-dict.csv`](./docs/data-dict.csv)) to filter for only the + features that we use in the model. +- You can browse our [data + catalog](https://ccao-data.github.io/data-architecture/#!/overview) to + see more details about these features, in particular the [residential + model input + view](https://ccao-data.github.io/data-architecture/#!/model/model.ccao_data_athena.model.vw_card_res_input) + which is the source of our training data. +- You can use the [`ccao` R package](https://ccao-data.github.io/ccao/) + or its [Python equivalent](https://ccao-data.github.io/ccao/python/) + to programmatically convert variable names to their human-readable + versions + ([`ccao::vars_rename()`](https://ccao-data.github.io/ccao/reference/vars_rename.html)) + or convert numerically-encoded variables to human-readable values + ([`ccao::vars_recode()`](https://ccao-data.github.io/ccao/reference/vars_recode.html)). + The [`ccao::vars_dict` + object](https://ccao-data.github.io/ccao/reference/vars_dict.html) is + also useful for inspecting the raw crosswalk that powers the rename + and recode functions. #### Data Sources @@ -1465,8 +1492,8 @@ commands: dependency as necessary 4. Run `renv::snapshot()` to update the reporting lockfile with the dependencies defined in the `DESCRIPTION` file -5. Run `renv::activate(profile = "default")` if you would like to switch - back to the default renv profile +5. 
Run `renv::activate(profile = "default")` if you would like to + switch back to the default renv profile ## Troubleshooting diff --git a/docs/data-dict.csv b/docs/data-dict.csv new file mode 100644 index 00000000..4efac0d7 --- /dev/null +++ b/docs/data-dict.csv @@ -0,0 +1,101 @@ +feature_name,variable_name,description,category,possible_codes,possible_values +"Percent Population Age, Under 19 Years Old",acs5_percent_age_children,Percent of the people 17 years or younger,ACS5,, +"Percent Population Age, Over 65 Years Old",acs5_percent_age_senior,Percent of the people 65 years or older,ACS5,, +Median Population Age,acs5_median_age_total,Median age for whole population,ACS5,, +"Percent Households Family, Married",acs5_percent_household_family_married,"Percent of households that are family, married",ACS5,, +"Percent Households Nonfamily, Living Alone",acs5_percent_household_nonfamily_alone,"Percent of households that are non-family, alone (single)",ACS5,, +"Percent Population Education, High School Degree",acs5_percent_education_high_school,Percent of people older than 25 who attained a high school degree,ACS5,, +"Percent Population Education, Bachelor Degree",acs5_percent_education_bachelor,Percent of people older than 25 who attained a bachelor's degree,ACS5,, +"Percent Population Education, Graduate Degree",acs5_percent_education_graduate,Percent of people older than 25 who attained a graduate degree,ACS5,, +"Percent Population Income, Below Poverty Level",acs5_percent_income_below_poverty_level,Percent of people below the poverty level in the last 12 months,ACS5,, +"Median Income, Household in Past Year",acs5_median_income_household_past_year,Median income per household in the past 12 months,ACS5,, +"Median Income, Per Capita in Past Year",acs5_median_income_per_capita_past_year,Median income per capita in the past 12 months,ACS5,, +"Percent Population Income, Received SNAP in Past Year",acs5_percent_income_household_received_snap_past_year,Percent of households 
that received SNAP in the past 12 months,ACS5,, +"Percent Population Employment, Unemployed",acs5_percent_employment_unemployed,Percent of people 16 years and older unemployed,ACS5,, +"Median Occupied Household, Total, Year Built",acs5_median_household_total_occupied_year_built,Median year built for all occupied households,ACS5,, +"Median Occupied Household, Renter, Gross Rent",acs5_median_household_renter_occupied_gross_rent,Median gross rent for only renter-occupied units,ACS5,, +"Percent Occupied Households, Owner",acs5_percent_household_owner_occupied,Percent of households that are owner-occupied,ACS5,, +Year Built,char_yrblt,Year the property was constructed,Characteristic,, +Central Air Conditioning,char_air,Indicator for central air,Characteristic,"1, 2","Central A/C, No Central A/C" +Apartments,char_apts,Number of apartments for class 211 and 212 properties,Characteristic,"1, 2, 3, 4, 5, 6","Two, Three, Four, Five, Six, None" +Attic Finish,char_attic_fnsh,Attic finish,Characteristic,"1, 2, 3","Living Area, Partial, None" +Attic Type,char_attic_type,Attic type,Characteristic,"1, 2, 3","Full, Partial, None" +Bedrooms,char_beds,Number of bedrooms in the building,Characteristic,, +Building Square Feet,char_bldg_sf,"Square footage of the building, as measured from the exterior",Characteristic,, +Basement Type,char_bsmt,Basement type,Characteristic,"1, 2, 3, 4","Full, Slab, Partial, Crawl" +Basement Finish,char_bsmt_fin,Basement finish,Characteristic,"1, 2, 3","Formal Rec Room, Apartment, Unfinished" +Property Class,char_class,Card-level property type and/or use,Characteristic,, +Exterior Wall Material,char_ext_wall,Exterior wall construction,Characteristic,"1, 2, 3, 4","Frame, Masonry, Frame + Masonry, Stucco" +Full Baths,char_fbath,Number of full bathrooms,Characteristic,, +Fireplaces,char_frpl,Number of fireplaces,Characteristic,, +Garage 1 Attached,char_gar1_att,Indicator for garage attached,Characteristic,"1, 2","Yes, No" +Garage 1 Ext. 
Wall Material,char_gar1_cnst,Garage exterior wall construction,Characteristic,"1, 2, 3, 4","Frame, Masonry, Frame + Masonry, Stucco" +Garage 1 Size,char_gar1_size,Garage size (number of cars),Characteristic,"1, 2, 3, 4, 5, 6, 7, 8","1 cars, 1.5 cars, 2 cars, 2.5 cars, 3 cars, 3.5 cars, 0 cars, 4 cars" +Half Baths,char_hbath,Number of half baths,Characteristic,, +Land Square Feet,char_land_sf,Square footage of the land (not just the building) of the property,Characteristic,, +Central Heating,char_heat,Interior heating type,Characteristic,"1, 2, 3, 4","Warm Air Furnace, Hot Water Steam, Electric Heater, None" +Number of Commercial Units,char_ncu,Number of commercial units,Characteristic,, +Porch,char_porch,Porch type,Characteristic,"0, 1, 2","None, Frame Enclosed, Masonry Enclosed" +Roof Material,char_roof_cnst,Roof material / construction,Characteristic,"1, 2, 3, 4, 5, 6","Shingle + Asphalt, Tar + Gravel, Slate, Shake, Tile, Other" +Rooms,char_rooms,Number of total rooms in the building (excluding baths),Characteristic,, +Cathedral Ceiling,char_tp_dsgn,Deprecated,Characteristic,"1, 2","Yes, No" +Type of Residence,char_type_resd,Type of residence,Characteristic,"1, 2, 3, 4, 5, 9.9","1 Story, 2 Story, 3 Story +, Split Level, 1.5 Story, Missing" +Recent Renovation,char_recent_renovation,Indicates whether or not a property was renovated within the last 3 years,Characteristic,, +Longitude,loc_longitude,X coordinate in degrees (global longitude),Location,, +Latitude,loc_latitude,Y coordinate in degrees (global latitude),Location,, +Census Tract GEOID,loc_census_tract_geoid,11-digit ACS/Census tract GEOID,Location,, +First Street Factor,loc_env_flood_fs_factor,"First Street flood factor. The flood factor is a risk score, where 10 is the highest risk and 1 is the lowest risk",Location,, +School Elementary District GEOID,loc_school_elementary_district_geoid,School district (elementary) GEOID,Location,, +School Secondary District 
GEOID,loc_school_secondary_district_geoid,School district (secondary) GEOID,Location,, +CMAP Walkability Score (No Transit),loc_access_cmap_walk_nta_score,"CMAP walkability score for a given PIN, excluding transit walkability",Location,, +CMAP Walkability Total Score,loc_access_cmap_walk_total_score,"CMAP walkability score for a given PIN, including transit walkability",Location,, +Municipality Name,loc_tax_municipality_name,"Taxing district name, as seen on Cook County tax bills",Location,, +Township Code,meta_township_code,Cook County township code,Meta,, +Neighborhood Code,meta_nbhd_code,Assessor neighborhood code,Meta,, +Number of sales within previous N years of sale/lien date,meta_sale_count_past_n_years,Number of sales within previous N years of sale/lien date,Meta,, +Property Tax Bill Aggregate Rate,other_tax_bill_rate,Tax bill rate for the taxing district containing a given PIN,Other,, +School District (Elementary) GreatSchools Rating,other_school_district_elementary_avg_rating,Average GreatSchools rating of elementary schools within the district of a given PIN,Other,, +School District (Secondary) GreatSchools Rating,other_school_district_secondary_avg_rating,Average GreatSchools rating of secondary schools within the district of a given PIN,Other,, +Active Homeowner Exemption,ccao_is_active_exe_homeowner,Parcel has an active homeowner exemption,Other,, +Number of Years Active Homeowner Exemption,ccao_n_years_exe_homeowner,Number of years parcel has had an active homeowner exemption,Other,, +Standard Deviation Distance From Parcel Centroid to Vertices (Feet),shp_parcel_centroid_dist_ft_sd,Standard deviation of the distance from each major parcel vertex to the parcel centroid,Parcel Shape,, +Standard Deviation Parcel Edge Length (Feet),shp_parcel_edge_len_ft_sd,Standard deviation of the edge length between parcel vertices,Parcel Shape,, +Standard Deviation Parcel Interior Angle (Degrees),shp_parcel_interior_angle_sd,Standard deviation of the interior angles 
of the parcel polygon,Parcel Shape,, +Ratio of Parcel Area to Minimum Rotated Bounding Rectangle,shp_parcel_mrr_area_ratio,Ratio of the parcel's area to the area of its minimum rotated bounding rectangle,Parcel Shape,, +Ratio of Parcel Minimum Rotated Bounding Rectangle Longest to Shortest Side,shp_parcel_mrr_side_ratio,Ratio of the longest to the shortest side of the parcel's minimum rotated bounding rectangle,Parcel Shape,, +Number of Parcel Vertices,shp_parcel_num_vertices,The number of vertices of the parcel,Parcel Shape,, +Number of PINs in Half Mile,prox_num_pin_in_half_mile,Number of PINs within half mile,Proximity,, +Number of Bus Stops in Half Mile,prox_num_bus_stop_in_half_mile,Number of bus stops within half mile,Proximity,, +Number of Foreclosures Per 1000 PINs (Past 5 Years),prox_num_foreclosure_per_1000_pin_past_5_years,"Number of foreclosures per 1000 PINs, within half mile (past 5 years)",Proximity,, +Average School Rating in Half Mile,prox_avg_school_rating_in_half_mile,Average school rating of schools within half mile,Proximity,, +Total Airport Noise DNL,prox_airport_dnl_total,"Estimated DNL for a PIN, assuming a baseline DNL of 50 (""quiet suburban"") and adding predicted noise from O'Hare and Midway airports to that baseline",Proximity,, +Nearest Bike Trail Distance (Feet),prox_nearest_bike_trail_dist_ft,Nearest bike trail distance (feet),Proximity,, +Nearest Cemetery Distance (Feet),prox_nearest_cemetery_dist_ft,Nearest cemetery distance (feet),Proximity,, +Nearest CTA Route Distance (Feet),prox_nearest_cta_route_dist_ft,Nearest CTA route distance (feet),Proximity,, +Nearest CTA Stop Distance (Feet),prox_nearest_cta_stop_dist_ft,Nearest CTA stop distance (feet),Proximity,, +Nearest Hospital Distance (Feet),prox_nearest_hospital_dist_ft,Nearest hospital distance (feet),Proximity,, +Lake Michigan Distance (Feet),prox_lake_michigan_dist_ft,Distance to Lake Michigan shoreline (feet),Proximity,, +Nearest Metra Route Distance 
(Feet),prox_nearest_metra_route_dist_ft,Nearest Metra route distance (feet),Proximity,, +Nearest Metra Stop Distance (Feet),prox_nearest_metra_stop_dist_ft,Nearest Metra stop distance (feet),Proximity,, +Nearest Park Distance (Feet),prox_nearest_park_dist_ft,Nearest park distance (feet),Proximity,, +Nearest Railroad Distance (Feet),prox_nearest_railroad_dist_ft,Nearest railroad distance (feet),Proximity,, +Nearest University Distance (Feet),prox_nearest_university_dist_ft,Nearest university distance (feet),Proximity,, +Nearest Vacant Land Parcel Distance (Feet),prox_nearest_vacant_land_dist_ft,Nearest vacant land (class 100) parcel distance (feet),Proximity,, +Nearest Water Distance (Feet),prox_nearest_water_dist_ft,Nearest water distance (feet),Proximity,, +Nearest Golf Course Distance (Feet),prox_nearest_golf_course_dist_ft,Nearest golf course distance (feet),Proximity,, +Nearest Highway Distance (Feet),prox_nearest_road_highway_dist_ft,Distance to nearest highway road,Proximity,, +Nearest Arterial Road Distance (Feet),prox_nearest_road_arterial_dist_ft,Distance to nearest arterial road,Proximity,, +Nearest Collector Road Distance (Feet),prox_nearest_road_collector_dist_ft,Distance to nearest collector road,Proximity,, +Average Daily Traffic Count on Nearest Highway,prox_nearest_road_highway_daily_traffic,Daily traffic of nearest highway road,Proximity,, +Average Daily Traffic Count on Nearest Arterial Road,prox_nearest_road_arterial_daily_traffic,Daily traffic of nearest arterial road,Proximity,, +Average Daily Traffic Count on Nearest Collector Road,prox_nearest_road_collector_daily_traffic,Daily traffic of nearest collector road,Proximity,, +Nearest New Construction (Feet),prox_nearest_new_construction_dist_ft,Nearest new construction distance (feet),Proximity,, +Nearest Major Stadium (Feet),prox_nearest_stadium_dist_ft,Nearest stadium distance (feet),Proximity,, +Sale Year,time_sale_year,Sale year calculated as the number of years since 0 B.C.E,Time,, +Sale 
Day,time_sale_day,"Sale day calculated as the number of days since January 1st, 1997",Time,, +Sale Quarter of Year,time_sale_quarter_of_year,Character encoding of quarter of year (Q1 - Q4),Time,, +Sale Month of Year,time_sale_month_of_year,Character encoding of month of year (Jan - Dec),Time,, +Sale Day of Year,time_sale_day_of_year,Numeric encoding of day of year (1 - 365),Time,, +Sale Day of Month,time_sale_day_of_month,Numeric encoding of day of month (1 - 31),Time,, +Sale Day of Week,time_sale_day_of_week,Numeric encoding of day of week (1 - 7),Time,, +Sale After COVID-19,time_sale_post_covid,"Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020)",Time,,