Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Train with multi-cards and aggregate sqft #331

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 54 additions & 2 deletions pipeline/01-train.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,62 @@
# NOTE: It is critical to trim "multicard" sales when training. Multicard means
# there are multiple buildings on a PIN. Since these sales include multiple
# buildings, they are typically higher than a "normal" sale and must be removed
training_data_full <- read_parquet(paths$input$training$local) %>%
filter(!ind_pin_is_multicard, !sv_is_outlier) %>%

# - - - - - -
# TESTING - multi-card training strategy
# - - - - - -

# Load the training sales with flagged outliers dropped. No multi-card filter
# is applied here, so multi-card sales stay in. Rows are ordered by ascending
# sale date
training_data_w_multi_card <-
  read_parquet(paths$input$training$local) %>%
  filter(!sv_is_outlier) %>%
  arrange(meta_sale_date)



# Collapse each sale (one PIN + one sale document number) to a single row.
# The row kept is the card with the largest building square footage, and its
# char_bldg_sf is replaced with the total building square footage of all cards
# in the sale. All other columns come from the kept card unchanged.
# NOTE: sum() has no na.rm, so a sale with any card missing char_bldg_sf ends
# up with NA total square footage
training_data_full <- training_data_w_multi_card %>%
  group_by(meta_pin, meta_sale_document_num) %>%
  # Total building sf per sale. as.numeric() keeps the result a double even if
  # the source column is stored as integer
  mutate(total_char_bldg_sf = sum(as.numeric(char_bldg_sf))) %>%
  # Put the largest card first within each sale. NAs sort last, and ties keep
  # their incoming order
  arrange(desc(char_bldg_sf), .by_group = TRUE) %>%
  # Keep only that first (largest) card
  filter(row_number() == 1) %>%
  # Overwrite the kept card's sf with the sale-level total
  mutate(char_bldg_sf = total_char_bldg_sf) %>%
  select(-total_char_bldg_sf) %>%
  ungroup() %>%
  # arrange(.by_group = TRUE) above re-sorted the whole frame by PIN and
  # document number, so restore the chronological order set when the data was
  # loaded. NOTE(review): the time-based train/test split that follows
  # presumably depends on row order - confirm
  arrange(meta_sale_date)



# * * * *
# some eda stuff
# * * * *
# training_data_w_multi_card_processed %>%
# mutate(temp_price_per_sqft = meta_sale_price/char_bldg_sf) %>%
# select(meta_pin, meta_sale_document_num, ind_pin_is_multicard,
# meta_sale_price, temp_price_per_sqft,
# sv_is_outlier, char_bldg_sf,
# meta_sale_date) %>% View()
#
#
# training_data_w_multi_card_processed %>%
# mutate(temp_price_per_sqft = meta_sale_price/char_bldg_sf) %>%
# mutate(meta_sale_price_sf_decile = ntile(temp_price_per_sqft, 10)) %>%
# group_by(meta_sale_price_sf_decile, ind_pin_is_multicard) %>%
# summarise(count = n(), .groups = "drop") %>%
# group_by(meta_sale_price_sf_decile) %>%
# mutate(percentage = count / sum(count) * 100)

Check warning on line 83 in pipeline/01-train.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/01-train.R,line=83,col=5,[commented_code_linter] Commented code should be removed.

# training_data_full <- read_parquet(paths$input$training$local) %>%
# filter(!ind_pin_is_multicard, !sv_is_outlier) %>%
# arrange(meta_sale_date)

Check warning on line 87 in pipeline/01-train.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/01-train.R,line=87,col=5,[commented_code_linter] Commented code should be removed.

# Create train/test split by time, with most recent observations in the test set
# We want our best model(s) to be predictive of the future, since properties are
# assessed on the basis of past sales
Expand Down
36 changes: 18 additions & 18 deletions pipeline/02-assess.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,28 +79,28 @@
# across multiple PINs sometimes receives different values from the model
group_by(meta_tieback_key_pin, meta_card_num, char_land_sf) %>%
mutate(
pred_card_intermediate_fmv = ifelse(
pred_pin_card_sum = ifelse(
is.na(meta_tieback_key_pin),
pred_card_initial_fmv,
mean(pred_card_initial_fmv)
)
) %>%
# Aggregate multi-cards to the PIN-level by summing the predictions
# of all cards. We use a heuristic here to limit the PIN-level total
# value; this prevents super-high-value back-buildings/ADUs from
# blowing up the PIN-level AV
group_by(meta_pin) %>%
mutate(
pred_pin_card_sum = ifelse(
sum(pred_card_intermediate_fmv) * meta_tieback_proration_rate <=
params$pv$multicard_yoy_cap * first(meta_1yr_pri_board_tot * 10) |
is.na(meta_1yr_pri_board_tot) |
n() != 2,
sum(pred_card_intermediate_fmv),
max(pred_card_intermediate_fmv)
)
) %>%
ungroup()
) #%>%
# # Aggregate multi-cards to the PIN-level by summing the predictions
# # of all cards. We use a heuristic here to limit the PIN-level total
# # value, this is to prevent super-high-value back-buildings/ADUs from

Check warning on line 90 in pipeline/02-assess.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/02-assess.R,line=90,col=3,[commented_code_linter] Commented code should be removed.
# # blowing up the PIN-level AV
# group_by(meta_pin) %>%
# mutate(
# pred_pin_card_sum = ifelse(
# sum(pred_card_intermediate_fmv) * meta_tieback_proration_rate <=
# params$pv$multicard_yoy_cap * first(meta_1yr_pri_board_tot * 10) |
# is.na(meta_1yr_pri_board_tot) |
# n() != 2,

Check warning on line 98 in pipeline/02-assess.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/02-assess.R,line=98,col=9,[commented_code_linter] Commented code should be removed.
# sum(pred_card_intermediate_fmv),

Check warning on line 99 in pipeline/02-assess.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/02-assess.R,line=99,col=7,[commented_code_linter] Commented code should be removed.
# max(pred_card_intermediate_fmv)

Check warning on line 100 in pipeline/02-assess.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/02-assess.R,line=100,col=7,[commented_code_linter] Commented code should be removed.
# )
# ) %>%
# ungroup()

Check warning on line 103 in pipeline/02-assess.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/02-assess.R,line=103,col=3,[commented_code_linter] Commented code should be removed.


## 3.2. Townhomes --------------------------------------------------------------
Expand Down
Loading