diff --git a/.envrc b/.envrc new file mode 100644 index 00000000..dd7c815d --- /dev/null +++ b/.envrc @@ -0,0 +1,3 @@ +source venv/bin/activate +dotenv +export PASSWORD=VA.TlSR#Z%mu**Q9 diff --git a/api/Dockerfile b/api/Dockerfile index cb1144de..c289af07 100644 --- a/api/Dockerfile +++ b/api/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3 +FROM python:3.12 WORKDIR /usr/src/app diff --git a/api/main.py b/api/main.py index fbfcea0b..319b55ed 100644 --- a/api/main.py +++ b/api/main.py @@ -197,37 +197,34 @@ async def read_blue_zone(year: Year, radius: Radius, db=Depends(get_db)): ], } - @app.get("/predict") async def read_predict( - blue_zone_radius: Radius, - yellow_zone_line_radius: Radius, - yellow_zone_stop_radius: Radius, - blue_zone_limit: Limit, - yellow_zone_limit: Limit, - year: Year, - db=Depends(get_db), - predictor=Depends(get_predictor), -): - result = predictor.predict_cumulative( + blue_zone_radius: Radius, + yellow_zone_line_radius: Radius, + yellow_zone_stop_radius: Radius, + blue_zone_limit: Limit, + yellow_zone_limit: Limit, + year: Year, + db=Depends(get_db), + predictor=Depends(get_predictor), + ): + + result = predictor.predict_cumulative_by_year( db, - intervention=( - { - "radius_blue": blue_zone_radius, - "limit_blue": blue_zone_limit, - "radius_yellow_line": yellow_zone_line_radius, - "radius_yellow_stop": yellow_zone_stop_radius, - "limit_yellow": yellow_zone_limit, - "reform_year": year, - } - ), + intervention={ + "radius_blue": blue_zone_radius, + "limit_blue": blue_zone_limit, + "radius_yellow_line": yellow_zone_line_radius, + "radius_yellow_stop": yellow_zone_stop_radius, + "limit_yellow": yellow_zone_limit, + "reform_year": year, + }, ) return { "census_tracts": [str(t) for t in result["census_tracts"]], - "housing_units_factual": [t.item() for t in result["housing_units_factual"]], - "housing_units_counterfactual": [ - t.tolist() for t in result["housing_units_counterfactual"] - ], + "years": result["years"], + "housing_units_factual": result["housing_units_factual"], + "housing_units_counterfactual": result["housing_units_counterfactual"], } diff --git a/cities/deployment/tracts_minneapolis/predict.py b/cities/deployment/tracts_minneapolis/predict.py index 39966076..7fc004a7 100644 --- a/cities/deployment/tracts_minneapolis/predict.py +++ b/cities/deployment/tracts_minneapolis/predict.py @@ -11,8 +11,8 @@ from dotenv import load_dotenv from pyro.infer import Predictive -from cities.modeling.zoning_models.zoning_tracts_population import ( - TractsModelPopulation as TractsModel, +from cities.modeling.zoning_models.zoning_tracts_continuous_interactions_model import ( + TractsModelContinuousInteractions as TractsModel, ) from cities.utils.data_grabber import find_repo_root from cities.utils.data_loader import select_from_data, select_from_sql @@ -32,6 +32,21 @@ if local_user == "rafal": load_dotenv(os.path.expanduser("~/.env_pw")) +num_samples = 100 + +# this disables assertions for speed +dev_mode = False + +local_user = os.getenv("USER") +if local_user == "rafal": + load_dotenv(os.path.expanduser("~/.env_pw")) + +num_samples = 100 + +# this disables assertions for speed +dev_mode = False + + class TractsModelPredictor: kwargs = { @@ -40,6 +55,7 @@ class TractsModelPredictor: "housing_units", "housing_units_original", "total_value", + "total_value_original", "total_population", "population_density", "median_value", @@ -50,7 +66,9 @@ class TractsModelPredictor: "white_original", "parcel_sqm", "downtown_overlap", + "downtown_overlap_original", "university_overlap", + "university_overlap_original", }, "outcome": "housing_units", } @@ -60,23 +78,13 @@ class TractsModelPredictor: census_tract, year_, case - when downtown_yn then 0 - when not downtown_yn - and year_ >= %(reform_year)s - and distance_to_transit <= %(radius_blue)s - then %(limit_blue)s - when not downtown_yn - and year_ >= %(reform_year)s - and distance_to_transit > %(radius_blue)s - and (distance_to_transit_line <= %(radius_yellow_line)s - or distance_to_transit_stop <= %(radius_yellow_stop)s) + when downtown_yn or university_yn then 0 + when year_ < %(reform_year)s then 1 + when distance_to_transit <= %(radius_blue)s then %(limit_blue)s + when distance_to_transit_line <= %(radius_yellow_line)s + or distance_to_transit_stop <= %(radius_yellow_stop)s then %(limit_yellow)s - when not downtown_yn - and year_ >= %(reform_year)s - and distance_to_transit_line > %(radius_yellow_line)s - and distance_to_transit_stop > %(radius_yellow_stop)s - then 1 - else limit_con + else 1 end as intervention from tracts_model__parcels """ @@ -118,13 +126,8 @@ def __init__(self, conn): TractsModelPredictor.kwargs, ) - # set to zero whenever the university overlap is above 1 - # TODO this should be handled at the data processing stage - self.data["continuous"]["mean_limit_original"] = torch.where( - self.data["continuous"]["university_overlap"] > 1, - torch.zeros_like(self.data["continuous"]["mean_limit_original"]), - self.data["continuous"]["mean_limit_original"], - ) + # R: I assume this this is Jack's workaround to ensure the limits align, correct? + self.data["continuous"]["mean_limit_original"] = self.obs_limits(conn) self.subset = select_from_data(self.data, TractsModelPredictor.kwargs) @@ -143,25 +146,6 @@ def __init__(self, conn): "housing_units_original" ].mean() - # interaction_pairs - # ins = [ - # ("university_overlap", "limit"), - # ("downtown_overlap", "limit"), - # ("distance", "downtown_overlap"), - # ("distance", "university_overlap"), - # ("distance", "limit"), - # ("median_value", "segregation"), - # ("distance", "segregation"), - # ("limit", "sqm"), - # ("segregation", "sqm"), - # ("distance", "white"), - # ("income", "limit"), - # ("downtown_overlap", "median_value"), - # ("downtown_overlap", "segregation"), - # ("median_value", "white"), - # ("distance", "income"), - # ] - ins = [ ("university_overlap", "limit"), ("downtown_overlap", "limit"), @@ -178,15 +162,6 @@ def __init__(self, conn): ("downtown_overlap", "segregation"), ("median_value", "white"), ("distance", "income"), - # from density/pop stage 1 - ("population", "sqm"), - ("density", "income"), - ("density", "white"), - ("density", "segregation"), - ("density", "sqm"), - ("density", "downtown_overlap"), - ("density", "university_overlap"), - ("population", "density"), ] model = TractsModel( @@ -195,15 +170,15 @@ def __init__(self, conn): housing_units_continuous_interaction_pairs=ins, ) - # moved most of this logic here to avoid repeated computations - with open(self.guide_path, "rb") as file: self.guide = dill.load(file) pyro.clear_param_store() pyro.get_param_store().load(self.param_path) - self.predictive = Predictive(model=model, guide=self.guide, num_samples=100) + self.predictive = Predictive( + model=model, guide=self.guide, num_samples=num_samples + ) self.subset_for_preds = copy.deepcopy(self.subset) self.subset_for_preds["continuous"]["housing_units"] = None @@ -219,6 +194,19 @@ def _tracts_intervention( limit_yellow, reform_year, ): + """Return the mean parking limits at the tracts level that result from the given intervention. + + Parameters: + - conn: database connection + - radius_blue: radius of the blue zone (meters) + - limit_blue: parking limit for blue zone + - radius_yellow_line: radius of the yellow zone around lines (meters) + - radius_yellow_stop: radius of the yellow zone around stops (meters) + - limit_yellow: parking limit for yellow zone + - reform_year: year of the intervention + + Returns: Tensor of parking limits sorted by tract and year + """ params = { "reform_year": reform_year, "radius_blue": radius_blue, @@ -232,6 +220,10 @@ def _tracts_intervention( ) return torch.tensor(df["intervention"].values, dtype=torch.float32) + def obs_limits(self, conn): + """Return the observed (factual) parking limits at the tracts level.""" + return self._tracts_intervention(conn, 106.7, 0, 402.3, 804.7, 0.5, 2015) + def predict_cumulative(self, conn, intervention): """Predict the total number of housing units built from 2011-2020 under intervention. @@ -274,7 +266,6 @@ def predict_cumulative(self, conn, intervention): result_cf * self.housing_units_std + self.housing_units_mean ).clamp(min=0) - # calculate cumulative housing units (factual) obs_limits = {} cf_limits = {} obs_cumsums = {} @@ -300,46 +291,63 @@ def predict_cumulative(self, conn, intervention): f_units.append(f_housing_units_raw[:, mask]) cf_units.append(cf_housing_units_raw[:, mask]) + key_str = str(key.item()) obs_cumsum = torch.cumsum(torch.stack(obs_units), dim=0).flatten() - obs_limits[key] = torch.stack(obs_limits_list).flatten() - cf_limits[key] = torch.stack(cf_limits_list).flatten() + obs_limits[key_str] = torch.stack(obs_limits_list).flatten() + cf_limits[key_str] = torch.stack(cf_limits_list).flatten() f_cumsum = torch.cumsum(torch.stack(f_units), dim=0).squeeze() cf_cumsum = torch.cumsum(torch.stack(cf_units), dim=0).squeeze() - obs_cumsums[key] = obs_cumsum - f_cumsums[key] = f_cumsum - cf_cumsums[key] = cf_cumsum - - # presumably outdated - - tracts = self.data["categorical"]["census_tract"] - - # calculate cumulative housing units (factual) - f_totals = {} - for i in range(tracts.shape[0]): - key = tracts[i].item() - if key not in f_totals: - f_totals[key] = 0 - f_totals[key] += obs_housing_units_raw[i] - - # calculate cumulative housing units (counterfactual) - cf_totals = {} - for i in range(tracts.shape[0]): - year = self.years[i].item() - key = tracts[i].item() - if key not in cf_totals: - cf_totals[key] = 0 - if year < intervention["reform_year"]: - cf_totals[key] += obs_housing_units_raw[i] - else: - cf_totals[key] = cf_totals[key] + cf_housing_units_raw[:, i] - cf_totals = {k: torch.clamp(v, 0) for k, v in cf_totals.items()} - - census_tracts = list(cf_totals.keys()) - f_housing_units = [f_totals[k] for k in census_tracts] - cf_housing_units = [cf_totals[k] for k in census_tracts] + obs_cumsums[key_str] = obs_cumsum + f_cumsums[key_str] = f_cumsum + cf_cumsums[key_str] = cf_cumsum + + # R: I'd recommend keeping "cumsums", as well as "observed/factual/counterfactual" + # in variable names + # to make terminology clear and transparent + cumsums_observed = torch.stack(list(obs_cumsums.values())).T.tolist() + + cumsums_factual = [ + [_.tolist() for _ in __.unbind(dim=-2)] + for __ in torch.stack(list(f_cumsums.values())).unbind(dim=-2) + ] + + cumsums_counterfactual = [ + [_.tolist() for _ in __.unbind(dim=-2)] + for __ in torch.stack(list(cf_cumsums.values())).unbind(dim=-2) + ] + + if dev_mode: + assert ( + len(cumsums_factual) + == len(cumsums_observed) + == len(cumsums_counterfactual) + == 10 + ) + # the number of years + assert ( + len(cumsums_factual[0]) == len(cumsums_counterfactual[0]) == 113 + ) # the number of unique tracts + assert ( + len(cumsums_factual[0][0]) + == len(cumsums_counterfactual[0][0]) + == num_samples + ) + assert list(obs_cumsums.keys()) == [ + str(_) for _ in self.tracts.unique().tolist() + ] return { + # these are lists whose structures are dictated + # by the frontend demands + "census_tracts": list(obs_cumsums.keys()), + "years": self.years.unique().tolist(), + "cumsums_observed": cumsums_observed, + "cumsums_factual": cumsums_factual, + "cumsums_counterfactual": cumsums_counterfactual, + # more direct dictionaries used for notebooks and debugging + # if they slow anything down + # we can revisit and make an optional output "obs_cumsums": obs_cumsums, "f_cumsums": f_cumsums, "cf_cumsums": cf_cumsums, @@ -349,12 +357,37 @@ def predict_cumulative(self, conn, intervention): "raw_obs_housing_units": obs_housing_units_raw, "raw_f_housing_units": f_housing_units_raw, "raw_cf_housing_units": cf_housing_units_raw, - # presumably outdated - "census_tracts": census_tracts, - "housing_units_factual": f_housing_units, - "housing_units_counterfactual": cf_housing_units, } + +# This the desired structure of the output +# (except, we need to correct for the observed/factual distinction +# (and make our terminology consistent with the concepts) +# { +# "census_tracts": ["27053000100", "27053000200", ...], # List of census tract IDs +# "years": [2011, 2012, 2013, ..., 2019], # List of years + +# "housing_units_factual": [ +# [100, 150, ...], # Cumulative counts for each tract in 2011 +# [120, 180, ...], # Cumulative counts for each tract in 2012 +# ... +# ], + +# "housing_units_counterfactual": [ +# [ # Year 2011 +# [101, 102, ..., 105], # 100 samples for tract 27053000100 +# [151, 153, ..., 158], # 100 samples for tract 27053000200 +# ... +# ], +# [ # Year 2012 +# [122, 124, ..., 128], # 100 samples for tract 27053000100 +# [182, 185, ..., 190], # 100 samples for tract 27053000200 +# ... +# ], +# ... +# ] +# } + # return { # "census_tracts": census_tracts, # "housing_units_factual": f_housing_units, @@ -373,6 +406,7 @@ def predict_cumulative(self, conn, intervention): start = time.time() for iter in range(5): + local_start = time.time() result = predictor.predict_cumulative( conn, intervention={ @@ -384,5 +418,7 @@ def predict_cumulative(self, conn, intervention): "reform_year": 2015, }, ) + local_end = time.time() + print(f"Counterfactual in {local_end - local_start} seconds") end = time.time() - print(f"Counterfactual in {end - start} seconds") + print(f"5 counterfactuals in {end - start} seconds") diff --git a/cities/deployment/tracts_minneapolis/predict_old.py b/cities/deployment/tracts_minneapolis/predict_old.py new file mode 100644 index 00000000..17aa84ed --- /dev/null +++ b/cities/deployment/tracts_minneapolis/predict_old.py @@ -0,0 +1,281 @@ +import copy +import os + +import dill +import pandas as pd +import pyro +import torch +from chirho.counterfactual.handlers import MultiWorldCounterfactual +from chirho.indexed.ops import IndexSet, gather +from chirho.interventional.handlers import do +from dotenv import load_dotenv +from pyro.infer import Predictive + +from cities.modeling.zoning_models.zoning_tracts_sqm_model import ( + TractsModelSqm as TractsModel, +) +from cities.utils.data_grabber import find_repo_root +from cities.utils.data_loader import select_from_data, select_from_sql + +load_dotenv() + +local_user = os.getenv("USER") +if local_user == "rafal": + load_dotenv(os.path.expanduser("~/.env_pw")) + + +class TractsModelPredictor: + kwargs = { + "categorical": ["year", "year_original", "census_tract"], + "continuous": { + "housing_units", + "total_value", + "median_value", + "median_distance", + "income", + "segregation_original", + "white_original", + "housing_units_original", + "parcel_sqm", + }, + "outcome": "housing_units", + } + + kwargs_subset = { + "categorical": ["year", "census_tract"], + "continuous": { + "housing_units", + "total_value", + "median_value", + "mean_limit_original", + "median_distance", + "income", + "segregation_original", + "white_original", + "parcel_sqm", + }, + "outcome": "housing_units", + } + + parcel_intervention_sql = """ + select + census_tract, + year_, + case + when downtown_yn or university_yn then 0 + when year_ < %(reform_year)s then 1 + when distance_to_transit <= %(radius_blue)s then %(limit_blue)s + when distance_to_transit_line <= %(radius_yellow_line)s + or distance_to_transit_stop <= %(radius_yellow_stop)s + then %(limit_yellow)s + else 1 + end as intervention + from tracts_model__parcels + """ + + tracts_intervention_sql = f""" + with parcel_interventions as ({parcel_intervention_sql}) + select + census_tract, + year_, + avg(intervention) as intervention + from parcel_interventions + group by census_tract, year_ + order by census_tract, year_ + """ + + def __init__(self, conn): + self.conn = conn + + root = find_repo_root() + deploy_path = os.path.join(root, "cities/deployment/tracts_minneapolis") + + guide_path = os.path.join(deploy_path, "tracts_model_guide.pkl") + self.param_path = os.path.join(deploy_path, "tracts_model_params.pth") + + need_to_train_flag = False + if not os.path.isfile(guide_path): + need_to_train_flag = True + print(f"Warning: '{guide_path}' does not exist.") + if not os.path.isfile(self.param_path): + need_to_train_flag = True + print(f"Warning: '{self.param_path}' does not exist.") + + if need_to_train_flag: + print("Please run 'train_model.py' to generate the required files.") + + with open(guide_path, "rb") as file: + guide = dill.load(file) + + self.data = select_from_sql( + "select * from tracts_model__census_tracts order by census_tract, year", + conn, + TractsModelPredictor.kwargs, + ) + self.data["continuous"]["mean_limit_original"] = self.obs_limits(conn) + self.subset = select_from_data(self.data, TractsModelPredictor.kwargs_subset) + + categorical_levels = { + "year": torch.unique(self.subset["categorical"]["year"]), + "census_tract": torch.unique(self.subset["categorical"]["census_tract"]), + } + + self.housing_units_std = self.data["continuous"]["housing_units_original"].std() + self.housing_units_mean = self.data["continuous"][ + "housing_units_original" + ].mean() + + model = TractsModel(**self.subset, categorical_levels=categorical_levels) + self.predictive = Predictive(model=model, guide=guide, num_samples=100) + + # these are at the tracts level + def _tracts_intervention( + self, + conn, + radius_blue, + limit_blue, + radius_yellow_line, + radius_yellow_stop, + limit_yellow, + reform_year, + ): + """Return the mean parking limits at the tracts level that result from the given intervention. + + Parameters: + - conn: database connection + - radius_blue: radius of the blue zone (meters) + - limit_blue: parking limit for blue zone + - radius_yellow_line: radius of the yellow zone around lines (meters) + - radius_yellow_stop: radius of the yellow zone around stops (meters) + - limit_yellow: parking limit for yellow zone + - reform_year: year of the intervention + + Returns: Tensor of parking limits sorted by tract and year + """ + params = { + "reform_year": reform_year, + "radius_blue": radius_blue, + "limit_blue": limit_blue, + "radius_yellow_line": radius_yellow_line, + "radius_yellow_stop": radius_yellow_stop, + "limit_yellow": limit_yellow, + } + df = pd.read_sql( + TractsModelPredictor.tracts_intervention_sql, conn, params=params + ) + return torch.tensor(df["intervention"].values, dtype=torch.float32) + + def obs_limits(self, conn): + """Return the observed (factual) parking limits at the tracts level.""" + return self._tracts_intervention(conn, 106.7, 0, 402.3, 804.7, 0.5, 2015) + + def predict_cumulative_by_year(self, conn, intervention): + """Predict the cumulative number of housing units built from 2011-2019 under intervention, by year. + + Returns a dictionary with keys: + - 'census_tracts': the tracts considered + - 'years': the years considered (2011-2019) + - 'housing_units_factual': cumulative housing units built according to real housing data, by year + - 'housing_units_counterfactual': samples from prediction of cumulative housing units built, by year + """ + pyro.clear_param_store() + pyro.get_param_store().load(self.param_path) + + subset_for_preds = copy.deepcopy(self.subset) + subset_for_preds["continuous"]["housing_units"] = None + + limit_intervention = self._tracts_intervention(conn, **intervention) + + with MultiWorldCounterfactual() as mwc: + with do(actions={"limit": limit_intervention}): + result_all = self.predictive(**subset_for_preds)["housing_units"] + with mwc: + result = gather( + result_all, IndexSet(**{"limit": {1}}), event_dims=0 + ).squeeze() + + years = self.data["categorical"]["year_original"] + tracts = self.data["categorical"]["census_tract"] + f_housing_units = self.data["continuous"]["housing_units_original"] + cf_housing_units = result * self.housing_units_std + self.housing_units_mean + + # Organize cumulative data by year and tract + f_data = {} + cf_data = {} + unique_years = sorted(set(years.tolist())) + unique_years = [ + year for year in unique_years if year <= 2019 + ] # Exclude years after 2019 + unique_tracts = sorted(set(tracts.tolist())) + + for year in unique_years: + f_data[year] = {tract: 0 for tract in unique_tracts} + cf_data[year] = {tract: [0] * 100 for tract in unique_tracts} + + for i in range(tracts.shape[0]): + year = years[i].item() + if year > 2019: + continue # Skip data for years after 2019 + tract = tracts[i].item() + + # Update factual data + for y in unique_years: + if y >= year: + f_data[y][tract] += f_housing_units[i].item() + + # Update counterfactual data + if year < intervention["reform_year"]: + for y in unique_years: + if y >= year: + cf_data[y][tract] = [ + x + f_housing_units[i].item() for x in cf_data[y][tract] + ] + else: + for y in unique_years: + if y >= year: + cf_data[y][tract] = [ + x + y + for x, y in zip( + cf_data[y][tract], cf_housing_units[:, i].tolist() + ) + ] + + # Convert to lists for easier JSON serialization + housing_units_factual = [ + [f_data[year][tract] for tract in unique_tracts] for year in unique_years + ] + housing_units_counterfactual = [ + [cf_data[year][tract] for tract in unique_tracts] for year in unique_years + ] + + return { + "census_tracts": unique_tracts, + "years": unique_years, + "housing_units_factual": housing_units_factual, + "housing_units_counterfactual": housing_units_counterfactual, + } + + +if __name__ == "__main__": + import time + + from cities.utils.data_loader import db_connection + + with db_connection() as conn: + predictor = TractsModelPredictor(conn) + start = time.time() + + for iter in range(5): # added for time testing + result = predictor.predict_cumulative_by_year( + conn, + intervention={ + "radius_blue": 106.7, + "limit_blue": 0, + "radius_yellow_line": 402.3, + "radius_yellow_stop": 804.7, + "limit_yellow": 0.5, + "reform_year": 2015, + }, + ) + end = time.time() + print(f"Counterfactual in {end - start} seconds") diff --git a/cities/deployment/tracts_minneapolis/train_model.py b/cities/deployment/tracts_minneapolis/train_model.py index 5cb141d2..3fb44744 100644 --- a/cities/deployment/tracts_minneapolis/train_model.py +++ b/cities/deployment/tracts_minneapolis/train_model.py @@ -7,22 +7,12 @@ from dotenv import load_dotenv from cities.modeling.svi_inference import run_svi_inference -from cities.modeling.zoning_models.zoning_tracts_population import ( - TractsModelPopulation as TractsModel, +from cities.modeling.zoning_models.zoning_tracts_continuous_interactions_model import ( + TractsModelContinuousInteractions as TractsModel, ) - -# from cities.modeling.zoning_models.zoning_tracts_continuous_interactions_model import ( -# # TractsModelContinuousInteractions as TractsModel, -# ) from cities.utils.data_grabber import find_repo_root from cities.utils.data_loader import db_connection, select_from_sql -# from cities.modeling.zoning_models.zoning_tracts_model import TractsModel -# from cities.modeling.zoning_models.zoning_tracts_sqm_model import ( -# TractsModelSqm as TractsModel, -# ) - - n_steps = 1500 load_dotenv() @@ -74,25 +64,6 @@ ############################# # interaction terms -# ins = [ -# ("university_overlap", "limit"), -# ("downtown_overlap", "limit"), -# ("distance", "downtown_overlap"), -# ("distance", "university_overlap"), -# ("distance", "limit"), -# ("median_value", "segregation"), -# ("distance", "segregation"), -# ("limit", "sqm"), -# ("segregation", "sqm"), -# ("distance", "white"), -# ("income", "limit"), -# ("downtown_overlap", "median_value"), -# ("downtown_overlap", "segregation"), -# ("median_value", "white"), -# ("distance", "income"), -# ] - - ins = [ ("university_overlap", "limit"), ("downtown_overlap", "limit"), @@ -109,14 +80,6 @@ ("downtown_overlap", "segregation"), ("median_value", "white"), ("distance", "income"), - ("population", "sqm"), - ("density", "income"), - ("density", "white"), - ("density", "segregation"), - ("density", "sqm"), - ("density", "downtown_overlap"), - ("density", "university_overlap"), - ("population", "density"), ] diff --git a/dbt/models/tracts_model/intermediate/census_tracts_population.sql b/dbt/models/tracts_model/intermediate/census_tracts_population.sql new file mode 100644 index 00000000..f7f8c8b2 --- /dev/null +++ b/dbt/models/tracts_model/intermediate/census_tracts_population.sql @@ -0,0 +1,14 @@ +-- Population and population density by census tract +with +demographics as (select * from {{ ref('demographics') }}), +population as ( + select * from demographics + where name_ = 'B01003_001E' -- total population +), +census_tracts as (select * from {{ ref('tracts_model_int__census_tracts_filtered') }}) +select + census_tracts.census_tract_id, + population.value_ as total_population, + population.value_ / st_area(census_tracts.geom) as population_density +from + census_tracts left join population using (census_tract, year_) diff --git a/dbt/models/tracts_model/intermediate/parcels_parking_limits.sql b/dbt/models/tracts_model/intermediate/parcels_parking_limits.sql index aebd7b00..a44cbd31 100644 --- a/dbt/models/tracts_model/intermediate/parcels_parking_limits.sql +++ b/dbt/models/tracts_model/intermediate/parcels_parking_limits.sql @@ -1,21 +1,24 @@ with parcels as (select * from {{ ref('tracts_model_int__parcels_filtered') }}), transit as (select * from {{ ref('high_frequency_transit_lines') }}), -downtown as (select * from {{ ref('downtown') }}), +downtown as (select * from {{ ref('downtown') }} limit 1), +university as (select * from {{ ref('university') }} limit 1), with_is_downtown as ( select parcels.parcel_id, parcels.census_tract_id, parcels.valid, parcels.geom, - st_intersects(parcels.geom, downtown.geom) as is_downtown - from downtown, parcels + st_intersects(parcels.geom, downtown.geom) as is_downtown, + st_intersects(parcels.geom, university.geom) as is_university + from downtown, university, parcels ), with_limit as ( select parcels.parcel_id, parcels.census_tract_id, parcels.is_downtown, + parcels.is_university, case when parcels.is_downtown then 'eliminated' when parcels.valid << '[2015-01-01,)'::daterange then 'full' @@ -35,6 +38,7 @@ with_limit_numeric as ( parcels.parcel_id, parcels.census_tract_id, parcels.is_downtown, + parcels.is_university, parcels.limit_, case limit_ when 'full' then 1 diff --git a/dbt/models/tracts_model/tracts_model__census_tracts.sql b/dbt/models/tracts_model/tracts_model__census_tracts.sql index d767c4e0..00d0c89f 100644 --- a/dbt/models/tracts_model/tracts_model__census_tracts.sql +++ b/dbt/models/tracts_model/tracts_model__census_tracts.sql @@ -5,55 +5,43 @@ }} with -housing_units as (select * from {{ ref('census_tracts_housing_units') }}) -, property_values as (select * from {{ ref('census_tracts_property_values') }}) -, distance_to_transit as (select * from {{ ref('census_tracts_distance_to_transit') }}) -, parcel_area as (select * from {{ ref('census_tracts_parcel_area') }}) -, parking_limits as (select * from {{ ref('census_tracts_parking_limits') }}) -, regions as (select * from {{ ref('census_tracts_regions') }}) -, demographics as (select * from {{ ref('demographics') }}) -, census_tracts as (select * from {{ ref('tracts_model_int__census_tracts_filtered') }}) +housing_units as (select * from {{ ref('census_tracts_housing_units') }}), +property_values as (select * from {{ ref('census_tracts_property_values') }}), +distance_to_transit as (select * from {{ ref('census_tracts_distance_to_transit') }}), +parcel_area as (select * from {{ ref('census_tracts_parcel_area') }}), +parking_limits as (select * from {{ ref('census_tracts_parking_limits') }}), +regions as (select * from {{ ref('census_tracts_regions') }}), +population as (select * from {{ ref('census_tracts_population') }}), +demographics as (select * from {{ ref('demographics') }}), +census_tracts as (select * from {{ ref('tracts_model_int__census_tracts_filtered') }}), -- Demographic data -, white as ( - select * from demographics - where name_ = 'B03002_003E' -- white non-hispanic population -) -, population as ( - select * from demographics - where name_ = 'B01003_001E' -- total population -) -, white_frac as ( - select white.census_tract, white.year_, {{ safe_divide('white.value_', 'population.value_') }} as value_ - from white inner join population using (census_tract, year_) -) -, income as ( - select * from demographics - where name_ = 'B19013_001E' -- median household income -) -, segregation as ( - select * from demographics - where description = 'segregation_index_annual_city' -) +white as (select * from demographics where name_ = 'B03002_003E'), -- white non-hispanic population +income as (select * from demographics where name_ = 'B19013_001E'), -- median household income +segregation as ( +select * from demographics where description = 'segregation_index_annual_city' +), -, raw_data as ( +raw_data as ( select - census_tracts.census_tract::bigint - , census_tracts.year_::smallint as "year" - , coalesce(housing_units.num_units, 0) as housing_units - , property_values.total_value - , property_values.median_value - , distance_to_transit.median_distance_to_transit as median_distance - , distance_to_transit.mean_distance_to_transit as mean_distance - , parcel_area.parcel_sqm::double precision - , parcel_area.parcel_mean_sqm::double precision - , parcel_area.parcel_median_sqm::double precision - , parking_limits.mean_limit::double precision - , white_frac.value_ as white - , income.value_ as income - , segregation.value_ as segregation - , regions.downtown_overlap - , regions.university_overlap + census_tracts.census_tract::bigint, + census_tracts.year_::smallint as "year", + coalesce(housing_units.num_units, 0) as housing_units, + property_values.total_value, + property_values.median_value, + distance_to_transit.median_distance_to_transit as median_distance, + distance_to_transit.mean_distance_to_transit as mean_distance, + parcel_area.parcel_sqm::double precision, + parcel_area.parcel_mean_sqm::double precision, + parcel_area.parcel_median_sqm::double precision, + parking_limits.mean_limit::double precision, + population.total_population, + population.population_density, + {{ safe_divide('white.value_', 'population.total_population') }} as white, + income.value_ as income, + segregation.value_ as segregation, + regions.downtown_overlap, + regions.university_overlap from census_tracts inner join housing_units using (census_tract_id) @@ -61,21 +49,25 @@ from inner join distance_to_transit using (census_tract_id) inner join parcel_area using (census_tract_id) inner join parking_limits using (census_tract_id) + inner join population using (census_tract_id) left join segregation using (census_tract, year_) - left join white_frac using (census_tract, year_) + left join white using (census_tract, year_) left join income using (census_tract, year_) left join regions using (census_tract_id) -) -, with_std as ( +), + +with_std as ( select - census_tract - , {{ standardize_cat(['year']) }} - , {{ standardize_cont(['housing_units', 'total_value', 'median_value', - 'median_distance', 'mean_distance', 'parcel_sqm', - 'parcel_mean_sqm', 'parcel_median_sqm', 'white', - 'income', 'mean_limit', 'segregation', - 'downtown_overlap', 'university_overlap' ]) }} + census_tract, + {{ standardize_cat(['year']) }}, + {{ standardize_cont(['housing_units', 'total_value', 'median_value', + 'median_distance', 'mean_distance', 'parcel_sqm', + 'parcel_mean_sqm', 'parcel_median_sqm', 'white', + 'income', 'mean_limit', 'segregation', + 'downtown_overlap', 'university_overlap', + 'total_population', 'population_density' ]) }} from raw_data ) + select * from with_std diff --git a/dbt/models/tracts_model/tracts_model__parcels.sql b/dbt/models/tracts_model/tracts_model__parcels.sql index 35cfadba..7ea34057 100644 --- a/dbt/models/tracts_model/tracts_model__parcels.sql +++ b/dbt/models/tracts_model/tracts_model__parcels.sql @@ -17,7 +17,8 @@ select parcels_distance_to_transit.line_distance as distance_to_transit_line, parcels_distance_to_transit.stop_distance as distance_to_transit_stop, parcels_parking_limits.limit_numeric as limit_con, - parcels_parking_limits.is_downtown as downtown_yn + parcels_parking_limits.is_downtown as downtown_yn, + parcels_parking_limits.is_university as university_yn from parcels join census_tracts using (census_tract_id) diff --git a/scripts/test.sh b/scripts/test.sh index 146a47f0..e9460f44 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -1,4 +1,4 @@ #!/bin/bash set -euxo pipefail -CI=1 cd tests && pytest \ No newline at end of file +CI=1 python -m pytest tests/ \ No newline at end of file diff --git a/tests/modeling/test_tracts_model.py b/tests/modeling/test_tracts_model.py index 320a3e5c..14d5c073 100644 --- a/tests/modeling/test_tracts_model.py +++ b/tests/modeling/test_tracts_model.py @@ -26,6 +26,7 @@ num_samples = 10 +# data_path = os.path.join(root, "data/minneapolis/processed/pg_census_tracts_dataset.pt") data_path = os.path.join(root, "data/minneapolis/processed/pg_census_tracts_dataset.pt") dataset_read = torch.load(data_path, weights_only=False) @@ -60,8 +61,6 @@ pg_subset = select_from_data(data, kwargs) pg_dataset_read = torch.load(data_path, weights_only=False) -print("shape for pg", pg_subset["categorical"]["year"].shape) - @pytest.mark.parametrize( "model_class",