diff --git a/R/setup.R b/R/setup.R index f955b06..e434345 100644 --- a/R/setup.R +++ b/R/setup.R @@ -19,6 +19,7 @@ suppressPackageStartupMessages({ # Resolve package namespace conflicts, preferring the library::function pair # shown over other functions with the same name from different libraries conflicts_prefer( + data.table::`:=`, dplyr::filter, dplyr::first, dplyr::lag, diff --git a/README.md b/README.md index b0c92d2..c341d59 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,24 @@ ones used in the most recent assessment model. | Average Daily Traffic Count on Nearest Collector Road | prox_nearest_road_collector_daily_traffic | Daily traffic of nearest collector road | Proximity | numeric | X | | Nearest New Construction (Feet) | prox_nearest_new_construction_dist_ft | Nearest new construction distance (feet) | Proximity | numeric | X | | Nearest Major Stadium (Feet) | prox_nearest_stadium_dist_ft | Nearest stadium distance (feet) | Proximity | numeric | X | +| NA | time_sale_roll_mean_nbhd_sf_t0_w1 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t0_w2 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t0_w3 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t1_w1 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t1_w2 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t1_w3 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t2_w1 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t2_w2 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t2_w3 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t0_w1 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t0_w2 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t0_w3 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t1_w1 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t1_w2 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t1_w3 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t2_w1 | | NA | NA | X | +| NA | 
time_sale_roll_mean_nbhd_condo_t2_w2 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t2_w3 | | NA | NA | X | | Percent Population Age, Under 19 Years Old | acs5_percent_age_children | Percent of the people 17 years or younger | ACS5 | numeric | | | Percent Population Age, Over 65 Years Old | acs5_percent_age_senior | Percent of the people 65 years or older | ACS5 | numeric | | | Median Population Age | acs5_median_age_total | Median age for whole population | ACS5 | numeric | | diff --git a/docs/data-dict.csv b/docs/data-dict.csv index fd8cbac..00306ed 100644 --- a/docs/data-dict.csv +++ b/docs/data-dict.csv @@ -25,6 +25,24 @@ Average Daily Traffic Count on Nearest Arterial Road,prox_nearest_road_arterial_ Average Daily Traffic Count on Nearest Collector Road,prox_nearest_road_collector_daily_traffic,Daily traffic of nearest collector road,Proximity,numeric,TRUE Nearest New Construction (Feet),prox_nearest_new_construction_dist_ft,Nearest new construction distance (feet),Proximity,numeric,TRUE Nearest Major Stadium (Feet),prox_nearest_stadium_dist_ft,Nearest stadium distance (feet),Proximity,numeric,TRUE +NA,time_sale_roll_mean_nbhd_sf_t0_w1,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t0_w2,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t0_w3,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t1_w1,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t1_w2,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t1_w3,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t2_w1,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t2_w2,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t2_w3,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t0_w1,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t0_w2,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t0_w3,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t1_w1,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t1_w2,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t1_w3,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t2_w1,,NA,NA,TRUE 
+NA,time_sale_roll_mean_nbhd_condo_t2_w2,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t2_w3,,NA,NA,TRUE "Percent Population Age, Under 19 Years Old",acs5_percent_age_children,Percent of the people 17 years or younger,ACS5,numeric,FALSE "Percent Population Age, Over 65 Years Old",acs5_percent_age_senior,Percent of the people 65 years or older,ACS5,numeric,FALSE Median Population Age,acs5_median_age_total,Median age for whole population,ACS5,numeric,FALSE diff --git a/dvc.lock b/dvc.lock index 71c5fdd..df1d6af 100644 --- a/dvc.lock +++ b/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: pipeline/00-ingest.R hash: md5 - md5: f758cc2d2c8dbe928806ffb0a46ab821 - size: 24134 + md5: 32c710a20efbbecb9efdfcd6135431be + size: 32221 params: params.yaml: assessment: @@ -31,12 +31,12 @@ stages: outs: - path: input/assessment_data.parquet hash: md5 - md5: b1462cc55efa7d8beb5ec2af9a649a9b - size: 76103136 + md5: 8680103dca75682cd34aaf843d92ca94 + size: 87157277 - path: input/char_data.parquet hash: md5 - md5: 09a842b0910fa84c9fa7834593ee488c - size: 149301395 + md5: 5889fbe740480bae3c2da1bbc2b7c720 + size: 155750977 - path: input/condo_strata_data.parquet hash: md5 md5: ded3ecde590af57e6b98a8935fae0215 @@ -47,8 +47,8 @@ stages: size: 6019 - path: input/training_data.parquet hash: md5 - md5: ef87ceb9be93d8ae85118991ab5269f2 - size: 76713007 + md5: 14f460da4e8e49f58e3dda8dab06f792 + size: 100615128 train: cmd: Rscript pipeline/01-train.R deps: diff --git a/dvc.yaml b/dvc.yaml index 8c6dfcf..b6eaefa 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -4,52 +4,51 @@ stages: desc: > Ingest training and assessment data from Athena + generate condo strata deps: - - pipeline/00-ingest.R + - pipeline/00-ingest.R params: - - assessment - - input + - assessment + - input outs: - - input/assessment_data.parquet - - input/char_data.parquet - - input/condo_strata_data.parquet - - input/land_nbhd_rate_data.parquet - - input/training_data.parquet + - input/assessment_data.parquet + - input/char_data.parquet + 
- input/condo_strata_data.parquet + - input/land_nbhd_rate_data.parquet + - input/training_data.parquet frozen: true - train: cmd: Rscript pipeline/01-train.R desc: > Train a LightGBM model with cross-validation. Generate model objects, data recipes, and predictions on the test set (most recent 10% of sales) deps: - - pipeline/01-train.R - - input/training_data.parquet + - pipeline/01-train.R + - input/training_data.parquet params: - - cv - - model.engine - - model.hyperparameter - - model.objective - - model.parameter - - model.predictor - - model.seed - - model.verbose - - ratio_study - - toggle.cv_enable + - cv + - model.engine + - model.hyperparameter + - model.objective + - model.parameter + - model.predictor + - model.seed + - model.verbose + - ratio_study + - toggle.cv_enable outs: - - output/intermediate/timing/model_timing_train.parquet: - cache: false - - output/parameter_final/model_parameter_final.parquet: - cache: false - - output/parameter_range/model_parameter_range.parquet: - cache: false - - output/parameter_search/model_parameter_search.parquet: - cache: false - - output/test_card/model_test_card.parquet: - cache: false - - output/workflow/fit/model_workflow_fit.zip: - cache: false - - output/workflow/recipe/model_workflow_recipe.rds: - cache: false + - output/intermediate/timing/model_timing_train.parquet: + cache: false + - output/parameter_final/model_parameter_final.parquet: + cache: false + - output/parameter_range/model_parameter_range.parquet: + cache: false + - output/parameter_search/model_parameter_search.parquet: + cache: false + - output/test_card/model_test_card.parquet: + cache: false + - output/workflow/fit/model_workflow_fit.zip: + cache: false + - output/workflow/recipe/model_workflow_recipe.rds: + cache: false assess: cmd: Rscript pipeline/02-assess.R @@ -58,25 +57,25 @@ stages: County. 
Also generate flags, calculate land values, and make any post-modeling changes deps: - - pipeline/02-assess.R - - input/assessment_data.parquet - - input/condo_strata_data.parquet - - input/land_nbhd_rate_data.parquet - - input/training_data.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds + - pipeline/02-assess.R + - input/assessment_data.parquet + - input/condo_strata_data.parquet + - input/land_nbhd_rate_data.parquet + - input/training_data.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds params: - - assessment - - model.predictor.all - - pv - - ratio_study + - assessment + - model.predictor.all + - pv + - ratio_study outs: - - output/assessment_card/model_assessment_card.parquet: - cache: false - - output/assessment_pin/model_assessment_pin.parquet: - cache: false - - output/intermediate/timing/model_timing_assess.parquet: - cache: false + - output/assessment_card/model_assessment_card.parquet: + cache: false + - output/assessment_pin/model_assessment_pin.parquet: + cache: false + - output/intermediate/timing/model_timing_assess.parquet: + cache: false evaluate: cmd: Rscript pipeline/03-evaluate.R @@ -86,23 +85,23 @@ stages: 2. 
An assessor-specific ratio study comparing estimated assessments to the previous year's sales deps: - - pipeline/03-evaluate.R - - output/assessment_pin/model_assessment_pin.parquet - - output/test_card/model_test_card.parquet + - pipeline/03-evaluate.R + - output/assessment_pin/model_assessment_pin.parquet + - output/test_card/model_test_card.parquet params: - - assessment - - ratio_study + - assessment + - ratio_study outs: - - output/performance/model_performance_test.parquet: - cache: false - - output/performance_quantile/model_performance_quantile_test.parquet: - cache: false - - output/performance/model_performance_assessment.parquet: - cache: false - - output/performance_quantile/model_performance_quantile_assessment.parquet: - cache: false - - output/intermediate/timing/model_timing_evaluate.parquet: - cache: false + - output/performance/model_performance_test.parquet: + cache: false + - output/performance_quantile/model_performance_quantile_test.parquet: + cache: false + - output/performance/model_performance_assessment.parquet: + cache: false + - output/performance_quantile/model_performance_quantile_assessment.parquet: + cache: false + - output/intermediate/timing/model_timing_evaluate.parquet: + cache: false interpret: cmd: Rscript pipeline/04-interpret.R @@ -110,20 +109,20 @@ stages: Generate SHAP values for each card and feature as well as feature importance metrics for each feature deps: - - pipeline/04-interpret.R - - input/assessment_data.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds + - pipeline/04-interpret.R + - input/assessment_data.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds params: - - toggle.shap_enable - - model.predictor.all + - toggle.shap_enable + - model.predictor.all outs: - - output/shap/model_shap.parquet: - cache: false - - output/feature_importance/model_feature_importance.parquet: - cache: false - - 
output/intermediate/timing/model_timing_interpret.parquet: - cache: false + - output/shap/model_shap.parquet: + cache: false + - output/feature_importance/model_feature_importance.parquet: + cache: false + - output/intermediate/timing/model_timing_interpret.parquet: + cache: false finalize: cmd: Rscript pipeline/05-finalize.R @@ -131,28 +130,28 @@ stages: Save run timings and run metadata to disk and render a performance report using Quarto. deps: - - pipeline/05-finalize.R - - output/intermediate/timing/model_timing_train.parquet - - output/intermediate/timing/model_timing_assess.parquet - - output/intermediate/timing/model_timing_evaluate.parquet - - output/intermediate/timing/model_timing_interpret.parquet + - pipeline/05-finalize.R + - output/intermediate/timing/model_timing_train.parquet + - output/intermediate/timing/model_timing_assess.parquet + - output/intermediate/timing/model_timing_evaluate.parquet + - output/intermediate/timing/model_timing_interpret.parquet params: - - run_note - - toggle - - input - - cv - - model - - pv - - ratio_study + - run_note + - toggle + - input + - cv + - model + - pv + - ratio_study outs: - - output/intermediate/timing/model_timing_finalize.parquet: - cache: false - - output/timing/model_timing.parquet: - cache: false - - output/metadata/model_metadata.parquet: - cache: false - - reports/performance/performance.html: - cache: false + - output/intermediate/timing/model_timing_finalize.parquet: + cache: false + - output/timing/model_timing.parquet: + cache: false + - output/metadata/model_metadata.parquet: + cache: false + - reports/performance/performance.html: + cache: false upload: cmd: Rscript pipeline/06-upload.R @@ -162,24 +161,24 @@ stages: outputs prior to upload and attach a unique run ID. 
This step requires access to the CCAO Data AWS account, and so is assumed to be internal-only deps: - - pipeline/06-upload.R - - output/parameter_final/model_parameter_final.parquet - - output/parameter_range/model_parameter_range.parquet - - output/parameter_search/model_parameter_search.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds - - output/test_card/model_test_card.parquet - - output/assessment_card/model_assessment_card.parquet - - output/assessment_pin/model_assessment_pin.parquet - - output/performance/model_performance_test.parquet - - output/performance_quantile/model_performance_quantile_test.parquet - - output/performance/model_performance_assessment.parquet - - output/performance_quantile/model_performance_quantile_assessment.parquet - - output/shap/model_shap.parquet - - output/feature_importance/model_feature_importance.parquet - - output/metadata/model_metadata.parquet - - output/timing/model_timing.parquet - - reports/performance/performance.html + - pipeline/06-upload.R + - output/parameter_final/model_parameter_final.parquet + - output/parameter_range/model_parameter_range.parquet + - output/parameter_search/model_parameter_search.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds + - output/test_card/model_test_card.parquet + - output/assessment_card/model_assessment_card.parquet + - output/assessment_pin/model_assessment_pin.parquet + - output/performance/model_performance_test.parquet + - output/performance_quantile/model_performance_quantile_test.parquet + - output/performance/model_performance_assessment.parquet + - output/performance_quantile/model_performance_quantile_assessment.parquet + - output/shap/model_shap.parquet + - output/feature_importance/model_feature_importance.parquet + - output/metadata/model_metadata.parquet + - output/timing/model_timing.parquet + - reports/performance/performance.html export: cmd: 
Rscript pipeline/07-export.R @@ -188,11 +187,11 @@ stages: run. NOT automatically run since it is typically only run once. Manually run once a model is selected deps: - - pipeline/07-export.R + - pipeline/07-export.R params: - - assessment.year - - input.min_sale_year - - input.max_sale_year - - ratio_study - - export + - assessment.year + - input.min_sale_year + - input.max_sale_year + - ratio_study + - export frozen: true diff --git a/params.yaml b/params.yaml index be11570..c6c1008 100644 --- a/params.yaml +++ b/params.yaml @@ -226,6 +226,24 @@ model: - "shp_parcel_num_vertices" - "meta_strata_1" - "meta_strata_2" + - "time_sale_roll_mean_nbhd_sf_t0_w1" + - "time_sale_roll_mean_nbhd_sf_t0_w2" + - "time_sale_roll_mean_nbhd_sf_t0_w3" + - "time_sale_roll_mean_nbhd_sf_t1_w1" + - "time_sale_roll_mean_nbhd_sf_t1_w2" + - "time_sale_roll_mean_nbhd_sf_t1_w3" + - "time_sale_roll_mean_nbhd_sf_t2_w1" + - "time_sale_roll_mean_nbhd_sf_t2_w2" + - "time_sale_roll_mean_nbhd_sf_t2_w3" + - "time_sale_roll_mean_nbhd_condo_t0_w1" + - "time_sale_roll_mean_nbhd_condo_t0_w2" + - "time_sale_roll_mean_nbhd_condo_t0_w3" + - "time_sale_roll_mean_nbhd_condo_t1_w1" + - "time_sale_roll_mean_nbhd_condo_t1_w2" + - "time_sale_roll_mean_nbhd_condo_t1_w3" + - "time_sale_roll_mean_nbhd_condo_t2_w1" + - "time_sale_roll_mean_nbhd_condo_t2_w2" + - "time_sale_roll_mean_nbhd_condo_t2_w3" # List of predictors included in predictor.all which are categoricals. It is # CRITICAL that any categorical variables are included in this list, else diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index d271ab4..4f57a84 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -228,6 +228,44 @@ land_nbhd_rate_data <- dbGetQuery( ) tictoc::toc() +# Pull single family sales data to construct rolling price averages by +# neighborhood. 
+tictoc::tic("Single-family sales data pulled") +sf_sales_data <- dbGetQuery( + conn = AWS_ATHENA_CONN_NOCTUA, glue(" + SELECT + sale.doc_no AS meta_sale_document_num, + sale.sale_price AS meta_sale_price, + sale.sale_date AS meta_sale_date, + sale.sv_is_outlier, + res.meta_township_code, + res.meta_nbhd_code + FROM model.vw_card_res_input res + INNER JOIN default.vw_pin_sale sale + ON sale.pin = res.meta_pin + AND sale.year = res.year + WHERE res.year + BETWEEN '{params$input$min_sale_year}' + AND '{params$input$max_sale_year}' + --AND CAST({params$input$max_sale_year} AS int) + AND sale.deed_type IN ('01', '02', '05') + AND NOT sale.is_multisale + AND NOT sale.sale_filter_same_sale_within_365 + AND NOT sale.sale_filter_less_than_10k + AND NOT sale.sale_filter_deed_type + ") +) %>% + # We keep multicard sales since we are only using them to construct sale price + # trends, but we still need the sales sample to be unique by document number + distinct(meta_sale_document_num, .keep_all = TRUE) %>% + # Only exclude explicit outliers from training.
Sales with missing validation + # outcomes will be considered non-outliers + mutate( + sv_is_outlier = replace_na(sv_is_outlier, FALSE), + ind_pin_is_multicard = FALSE + ) +tictoc::toc() + # Close connection to Athena dbDisconnect(AWS_ATHENA_CONN_NOCTUA) rm(AWS_ATHENA_CONN_NOCTUA) @@ -393,6 +431,245 @@ training_data_clean <- training_data_fil %>% as_tibble() +# Stack single-family and condo sales data to construct rolling means for both +all_sales_data <- sf_sales_data %>% + mutate(regression_group = "sf") %>% + bind_rows( + training_data_clean %>% + select( + meta_sale_document_num, meta_sale_price, meta_sale_date, sv_is_outlier, + meta_township_code, meta_nbhd_code + ) %>% + mutate( + ind_pin_is_multicard = FALSE, + regression_group = "condo" + ) + ) %>% + mutate( + meta_sale_price_sf = ifelse(regression_group == "sf", meta_sale_price, NA), + meta_sale_price_condo = ifelse( + regression_group == "condo", meta_sale_price, NA + ), + ) %>% + arrange(meta_sale_date) + +all_sales_data_rolling <- data.table::as.data.table(all_sales_data)[ + !sv_is_outlier & !ind_pin_is_multicard, + `:=`( + lag_nbhd_sf_t0_price = data.table::shift( + meta_sale_price_sf, 1, + type = "lag" + ), + lag_nbhd_sf_t1_shift = (seq_len(.N) - findInterval( + meta_sale_date %m-% months(3), + meta_sale_date + )) * 2 - 1, + lag_nbhd_sf_t2_shift = (seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), + meta_sale_date + )) * 2 - 1, + lag_nbhd_condo_t0_price = data.table::shift(meta_sale_price_condo, 1, + type = "lag" + ), + lag_nbhd_condo_t1_shift = (seq_len(.N) - findInterval( + meta_sale_date %m-% months(3), meta_sale_date + )) * 2 - 1, + lag_nbhd_condo_t2_shift = (seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + )) * 2 - 1 + ), + by = .(meta_nbhd_code) +][ + !sv_is_outlier & !ind_pin_is_multicard, + `:=`( + lag_nbhd_sf_t1_price = meta_sale_price_sf[replace( + seq(.N) - lag_nbhd_sf_t1_shift, seq(.N) <= lag_nbhd_sf_t1_shift, NA + )], + lag_nbhd_sf_t2_price = meta_sale_price_sf[replace( +
seq(.N) - lag_nbhd_sf_t2_shift, seq(.N) <= lag_nbhd_sf_t2_shift, NA + )], + lag_nbhd_condo_t1_price = meta_sale_price_condo[replace( + seq(.N) - lag_nbhd_condo_t1_shift, seq(.N) <= lag_nbhd_condo_t1_shift, NA + )], + lag_nbhd_condo_t2_price = meta_sale_price_condo[replace( + seq(.N) - lag_nbhd_condo_t2_shift, seq(.N) <= lag_nbhd_condo_t2_shift, NA + )] + ), + by = .(meta_nbhd_code) +][ + !sv_is_outlier & !ind_pin_is_multicard, + `:=`( + time_sale_roll_mean_nbhd_sf_t0_w1 = data.table::frollmean( + lag_nbhd_sf_t0_price, + seq_len(.N) - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t0_w2 = data.table::frollmean( + lag_nbhd_sf_t0_price, + seq_len(.N) - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t0_w3 = data.table::frollmean( + lag_nbhd_sf_t0_price, + seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + ), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t1_w1 = data.table::frollmean( + lag_nbhd_sf_t1_price, + seq_len(.N) - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t1_w2 = data.table::frollmean( + lag_nbhd_sf_t1_price, + seq_len(.N) - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t1_w3 = data.table::frollmean( + lag_nbhd_sf_t1_price, + seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + ), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t2_w1 = data.table::frollmean( + lag_nbhd_sf_t2_price, + seq_len(.N) - findInterval(meta_sale_date
%m-% months(3), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t2_w2 = data.table::frollmean( + lag_nbhd_sf_t2_price, + seq_len(.N) - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t2_w3 = data.table::frollmean( + lag_nbhd_sf_t2_price, + seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + ), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t0_w1 = data.table::frollmean( + lag_nbhd_condo_t0_price, + seq_len(.N) - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t0_w2 = data.table::frollmean( + lag_nbhd_condo_t0_price, + seq_len(.N) - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t0_w3 = data.table::frollmean( + lag_nbhd_condo_t0_price, + seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + ), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t1_w1 = data.table::frollmean( + lag_nbhd_condo_t1_price, + seq_len(.N) - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t1_w2 = data.table::frollmean( + lag_nbhd_condo_t1_price, + seq_len(.N) - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t1_w3 = data.table::frollmean( + lag_nbhd_condo_t1_price, + seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + ), + align = "right", +
adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t2_w1 = data.table::frollmean( + lag_nbhd_condo_t2_price, + seq_len(.N) - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t2_w2 = data.table::frollmean( + lag_nbhd_condo_t2_price, + seq_len(.N) - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t2_w3 = data.table::frollmean( + lag_nbhd_condo_t2_price, + seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + ), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ) + ), + by = .(meta_nbhd_code) +] %>% + as_tibble() %>% + # Replace NaNs (e.g. from windows with no non-missing prices) with NA + mutate(across(.cols = starts_with("time"), ~ ifelse(is.nan(.x), NA, .x))) + +# Join rolling sales means for condo and single-family sales onto training data. +training_data_clean <- training_data_clean %>% + left_join( + all_sales_data_rolling %>% + select(meta_sale_document_num, starts_with("time")), + by = "meta_sale_document_num" + ) + ## 4.2. Assessment Data -------------------------------------------------------- # Clean the assessment data. This is the target data that the trained model is @@ -444,8 +721,21 @@ assessment_data_clean <- assessment_data %>% ) %>% relocate(starts_with("ind_"), .after = starts_with("meta_")) %>% relocate(starts_with("char_"), .after = starts_with("ind_")) %>% - as_tibble() - + as_tibble() %>% + # Join rolling sales means for condo and single-family sales onto assessment + # data. Use the most recent rolling mean per neighborhood.
There is no need to + # remove outliers from the right side of the join since their rolling means + # are NA and so cannot influence the fill + left_join( + all_sales_data_rolling %>% + group_by(meta_nbhd_code) %>% + arrange(desc(meta_sale_date), .by_group = TRUE) %>% + fill(starts_with("time"), .direction = "up") %>% + slice_head(n = 1) %>% + ungroup() %>% + select(meta_nbhd_code, starts_with("time")), + by = "meta_nbhd_code" + ) ## 4.3. Land Rates ------------------------------------------------------------- message("Saving land rates")