
Commit 96e3842

Merge branch 'ru-tracts-minimal-deployment' of github.com:BasisResearch/cities into ru-tracts-minimal-deployment
jfeser committed Sep 12, 2024
2 parents 459b39d + d9c7d3e commit 96e3842
Showing 14 changed files with 262 additions and 81 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -13,6 +13,8 @@ tests/__pycache__/
**/*.Rproj.user/
**/*.Rproj/
**/*.Rhistory
**/*.Rdata
**/*.RData


.vscode/settings.json
6 changes: 0 additions & 6 deletions Makefile
@@ -22,12 +22,6 @@ test_all: FORCE
test_notebooks: FORCE
./scripts/test_notebooks.sh

done: FORCE
./scripts/clean.sh
./scripts/lint.sh
./scripts/test.sh
./scripts/test_notebooks.sh

api/requirements.txt: FORCE
pip-compile --extra api --output-file api/requirements.txt

48 changes: 45 additions & 3 deletions cities/deployment/tracts_minneapolis/generate_torch_loader.py
@@ -2,14 +2,21 @@
import time

import sqlalchemy
import torch
from dotenv import load_dotenv

from cities.utils.data_loader import select_from_sql
from cities.utils.data_grabber import find_repo_root
from cities.utils.data_loader import ZoningDataset, select_from_sql

load_dotenv()

# A local torch loader is needed for subsampling in evaluation and for comparison to the previous dataset; it is also useful for EDA.

USERNAME = os.getenv("USERNAME")

DB_USERNAME = os.getenv("DB_USERNAME")
HOST = os.getenv("HOST")
DATABASE = os.getenv("DATABASE")
PASSWORD = os.getenv("PASSWORD")

#####################
# data load and prep
@@ -19,20 +19,23 @@
"categorical": ["year", "census_tract"],
"continuous": {
"housing_units",
"housing_units_original",
"total_value",
"total_value_original",
"median_value",
"mean_limit_original",
"median_distance",
"income",
"segregation_original",
"white_original",
"parcel_sqm",
},
"outcome": "housing_units",
}

load_start = time.time()
with sqlalchemy.create_engine(
f"postgresql://{USERNAME}@{HOST}/{DATABASE}"
f"postgresql://{DB_USERNAME}:{PASSWORD}@{HOST}/{DATABASE}"
).connect() as conn:
subset = select_from_sql(
"select * from dev.tracts_model__census_tracts order by census_tract, year",
@@ -41,3 +51,35 @@
)
load_end = time.time()
print(f"Data loaded in {load_end - load_start} seconds")


columns_to_standardize = [
"housing_units_original",
"total_value_original",
]

new_standardization_dict = {}

for column in columns_to_standardize:
new_standardization_dict[column] = {
"mean": subset["continuous"][column].mean(),
"std": subset["continuous"][column].std(),
}


assert "parcel_sqm" in subset["continuous"].keys()

root = find_repo_root()

pg_census_tracts_dataset = ZoningDataset(
subset["categorical"],
subset["continuous"],
standardization_dictionary=new_standardization_dict,
)
assert "parcel_sqm" in subset["continuous"].keys()

pg_census_tracts_data_path = os.path.join(
root, "data/minneapolis/processed/pg_census_tracts_dataset.pt"
)

torch.save(pg_census_tracts_dataset, pg_census_tracts_data_path)
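
Aside (not part of the commit): a minimal sketch of how the artifact written above could be loaded back for EDA or subsampling. The path is taken from the script; whether ZoningDataset exposes a standardization_dictionary attribute is an assumption, so it is read defensively here.

import os

import torch

from cities.utils.data_grabber import find_repo_root

root = find_repo_root()
dataset_path = os.path.join(
    root, "data/minneapolis/processed/pg_census_tracts_dataset.pt"
)

# torch.load unpickles the ZoningDataset object saved by the script above.
dataset = torch.load(dataset_path)
print(type(dataset))
# Assumed attribute: the standardization dictionary passed to the constructor.
print(getattr(dataset, "standardization_dictionary", None))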
39 changes: 31 additions & 8 deletions cities/deployment/tracts_minneapolis/predict.py
@@ -7,13 +7,25 @@
import sqlalchemy
import torch
from chirho.counterfactual.handlers import MultiWorldCounterfactual
from chirho.indexed.ops import IndexSet, gather
from chirho.interventional.handlers import do
from dotenv import load_dotenv
from pyro.infer import Predictive

from cities.modeling.zoning_models.zoning_tracts_model import TractsModel
from cities.modeling.zoning_models.zoning_tracts_sqm_model import (
TractsModelSqm as TractsModel,
)
from cities.utils.data_grabber import find_repo_root
from cities.utils.data_loader import select_from_data, select_from_sql

load_dotenv()


DB_USERNAME = os.getenv("DB_USERNAME")
HOST = os.getenv("HOST")
DATABASE = os.getenv("DATABASE")
PASSWORD = os.getenv("PASSWORD")


class TractsModelPredictor:
kwargs = {
@@ -28,6 +40,7 @@ class TractsModelPredictor:
"segregation_original",
"white_original",
"housing_units_original",
"parcel_sqm",
},
"outcome": "housing_units",
}
@@ -43,6 +56,7 @@ class TractsModelPredictor:
"income",
"segregation_original",
"white_original",
"parcel_sqm",
},
"outcome": "housing_units",
}
@@ -89,7 +103,6 @@ def __init__(self, conn):
deploy_path = os.path.join(root, "cities/deployment/tracts_minneapolis")

guide_path = os.path.join(deploy_path, "tracts_model_guide.pkl")
print("guide path", guide_path)
with open(guide_path, "rb") as file:
guide = dill.load(file)

@@ -150,11 +163,21 @@ def predict(self, conn, intervention=None):
else:
intervention = self._tracts_intervention(conn, **intervention)
print(intervention.shape, intervention)
with MultiWorldCounterfactual():
with do(actions={"limit": intervention}):
result = self.predictive(**subset_for_preds)[
"housing_units"
].squeeze()[:, 1, :]
# with MultiWorldCounterfactual():
# with do(actions={"limit": intervention}):
# result = self.predictive(**subset_for_preds)[
# "housing_units"
# ].squeeze()[:, 1, :]

# RU: if you use mwc, you should not use squeezing and indexing to look into possible worlds.
# There is a nicer and more robust way to do so:
with MultiWorldCounterfactual() as mwc:
with do(actions={"limit": intervention}):
result_all = self.predictive(**subset_for_preds)["housing_units"]
with mwc:
result = gather(
result_all, IndexSet(**{"limit": {1}}), event_dims=0
).squeeze()

# undo standardization
result = result * self.housing_units_std + self.housing_units_mean
@@ -236,7 +259,7 @@ def predict_cumulative(self, conn, intervention):
PASSWORD = os.getenv("PASSWORD")

with sqlalchemy.create_engine(
f"postgresql://{USERNAME}:{PASSWORD}@{HOST}/{DATABASE}"
f"postgresql://{DB_USERNAME}:{PASSWORD}@{HOST}/{DATABASE}"
).connect() as conn:
predictor = TractsModelPredictor(conn)
start = time.time()
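
To make the reviewer's point about gather concrete, here is a self-contained toy sketch (not from the repository; the model, site names, and values are invented) of the MultiWorldCounterfactual / do / gather pattern used above:

import pyro
import pyro.distributions as dist
import torch
from chirho.counterfactual.handlers import MultiWorldCounterfactual
from chirho.indexed.ops import IndexSet, gather
from chirho.interventional.handlers import do


def toy_model():
    # A tiny stand-in for TractsModel: "limit" causally drives "units".
    limit = pyro.sample("limit", dist.Normal(0.0, 1.0))
    units = pyro.sample("units", dist.Normal(2.0 * limit, 0.1))
    return units


with MultiWorldCounterfactual() as mwc:
    with do(actions={"limit": torch.tensor(1.0)}):
        units = toy_model()

# Re-enter the handler and address possible worlds by name instead of by
# positional squeezing: index 0 is the factual world, index 1 the intervened one.
with mwc:
    counterfactual_units = gather(
        units, IndexSet(**{"limit": {1}}), event_dims=0
    ).squeeze()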
(The rendered view of the next changed file, a Jupyter notebook, could not be displayed; its source diff follows.)
@@ -10,12 +10,12 @@
"source": [
"## What is this project about?\n",
"\n",
"We use state-of-the-art bayesian causal models to investigate the role of parking zoning reform in Minneapolis on the development of new housing units, at a relatively fine-grained level of census tracts. Minneapolis is an example of a city which somewhat sucessfuly navigates the housing crisis, and a parking zoning reform has been claimed to be connected to this outcome (see for example [here](https://reason.com/2024/02/27/fear-loathing-and-zoning-reform-in-minnesota/) and [here](https://www.strongtowns.org/journal/2023/9/15/ending-minimum-parking-requirements-was-a-policy-win-for-the-twin-cities)).\n",
"We use state-of-the-art Bayesian causal modeling tools ([ChiRho](https://github.com/BasisResearch/chirho)) to investigate the role of parking zoning reform in Minneapolis on the development of new housing units, at a relatively fine-grained level of census tracts. Minneapolis is an example of a city which somewhat sucessfuly navigates the housing crisis, and a parking zoning reform has been claimed to be connected to this outcome (see for example [here](https://reason.com/2024/02/27/fear-loathing-and-zoning-reform-in-minnesota/) and [here](https://www.strongtowns.org/journal/2023/9/15/ending-minimum-parking-requirements-was-a-policy-win-for-the-twin-cities)).\n",
"\n",
"\n",
"%TODO Someone should perhaps check if there are better links to include here\n",
"\n",
"It is quite clear that the number of housing units increased faster after the reform. What is not quite clear, is whether this is a mere correlation or not.We decided to take a deep dive and connect parcel-level data with demographic variables within a carefully devised causal model to investigate. \n"
"Whether this is so, to what extent and with what uncertainty has been unclear. Yes, the number of housing units in the city increased faster after the reform. But it is not ovious whether this isn't a mere correlation arising from other variables being causally responsible, or random variation. We decided to take a deep dive and connect detailed census tracts data with demographic variables within a carefully devised causal model to investigate. Due to data availability limitations, we start at year 2010. Since a major world-wide event changed too many things in 2020, this is where our data collection stops, to be able to separate the zoning concerns from the complex and unprecedented events that follow. It turns out that even with 10 years of data only, causal modelling allows us to offer some (admittedly, uncertain) answers."
]
},
{
@@ -24,10 +24,9 @@
"source": [
"## Why this is not a typical machine learning project\n",
"\n",
"A typical predictive project in machine learning tends to use as much data as possible and uses algorithms to identify patters, focusing only on predictive accuracy. While such an approach is useful, the key limitation is that such models have a hard time distinguishing accidental correlations from causal connections, and therefore are not realiable guides to counterfactual predictions and causal effect evaluation. Moreover, a typical model often disregards information that humans use heavily: temporal or causal structures, which are needed to generalize well outside the training data.\n",
"A typical predictive project in machine learning tends to use as much data as possible and algorithms to identify patters, focusing only on predictive accuracy. While such an approach is useful, the key limitation is that such models have a hard time distinguishing accidental correlations from causal connections, and therefore are not realiable guides to counterfactual predictions and causal effect estimation. Moreover, a typical model often disregards information that humans use heavily: temporal, spatial or causal structures, which are needed to generalize well outside the training data.\n",
"\n",
"Instead, we use our core open source technology, [ChiRho](https://github.com/BasisResearch/chirho) to build **bayesian causal** models using hand-picked relevant variables. This way, we work with humans and domain experts in the loop. The fact that we use Bayesian methods, allow for the injection of their deep understanding of the ecosystem, which can be work in symbiosis with the data, even if the latter is somewhat limited. The fact that the models is causal gives us a chance to try to draw conclusions about effects of intereventions, and quantify the uncertainties involved and be honest about them.\n",
"\n"
"Instead, we use our core open source technology, [ChiRho](https://github.com/BasisResearch/chirho) to build **bayesian causal models** using hand-picked relevant variables. This way, we can work with humans and in the loop. The fact that we use Bayesian methods, allows for the injection of human understanding of the causal dependecies, which then are made work in symbiosis with the data, even if the latter is somewhat limited, and for honest assessment of the resulting uncertainties. The fact that the models is causal gives us a chance to address counterfactual queries involving alternative interventions.\n"
]
},
{
@@ -38,13 +37,14 @@
"\n",
"## Why care about different types of questions?\n",
"\n",
"Once we start thinking in causal terms, there are **multiple types of queries** that we can distinguish and answer using the model, and such questions typically have different answers. \n",
"Once we start thinking in causal terms, there are **multiple types of queries** that we can distinguish and answer using the model, and such questions typically have different answers. While assosciative information is often useful or revealing, equally often we wwant to be able to evaluate potential consequences of acting one way or another, and in this mode of reflection, we rather turn to thinking in terms of interventions and counterfactuals.\n",
"\n",
"- *Association*. Example: Is there a correlation between increased green spaces and decreased crime rate in an area? Perhaps, areas with more green spaces do tend to have lower crime rates for various reasons.\n",
"\n",
"- *Intervention* If the city implements a zoning change to create more green spaces, how would this impact the crime rate in the area? The answer might differ here: factors other than the policy change probably influence crime rates to a larger extent.\n",
"- *Intervention* If the city implements a zoning change to create more green spaces, how would this impact the crime rate in the area? The answer might differ here: factors other than the policy change probably influence crime rates to a large extent.\n",
"\n",
"- *Counterfactual* Suppose you did create more green spaces and the crime rate in the area did go down. Are you to be thanked? This depends on whether the crime rate would have gone down had you not created more green space in the area. Would it?\n",
"\n",
"- *Counterfactual* Suppose you did create more green spaces and the crime rate in the area did go down. Are you to be thanked? This depends on whether the crime rate would have gone down had you not created more green space in the area.\n",
"\n",
"\n"
]
@@ -55,18 +55,25 @@
"source": [
"## Counterfactual modeling of the zoning reform\n",
"\n",
"In the case at hand, we allow you, the user, to investigate predicted counterfactual outcomes of a zoning reform, specifed in terms of where the two zones start, what parking limits are to be imposed in different zones, and what year the reform has been introduced. The variables\n",
"In the case at hand, we allow you, the user, to investigate predicted counterfactual outcomes of a zoning reform, specifed in terms of where the two zones start, what parking limits are to be imposed in different zones, and what year the reform has been introduced. From among the available variables we hand-picked the ones that are most useful and meaningfully causally connected. The model simultaneously learns the strenghts of over 30 causal connections and uses this information to inform its counterfactual predictions. The structural assumptions we have made at a high level can be described by the diagram below. However, a moderately competent user can use our [open source codebase](https://github.com/BasisResearch/cities) to tweak or modify these assumptions and invesigate the consequences of doing so.\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"\n",
"<img src=\"tracts_dag_plot_high_density.png\" alt=\"DAG Plot\" width=\"800\"/>\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## How does the model perform?\n",
"\n",
"The causal layer, nevertheless, should not take place at the cost of predictive power. The models went through a battery of tests on split data, each time being able to account for around 25-30% variation in the data (which for such noisy problems is fairly decent peformance), effectively on average improving predictions of new housing units appearing in each of census tracts at each of a given years by the count of 35-40 over a null model. A detailed notebook with model testing is also available at our open source codebase. "
]
}
],
"metadata": {
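
As a companion to the association / intervention / counterfactual discussion in the notebook text above, a toy sketch (not from the repository; the model is invented and deliberately simplistic) of how the first two kinds of queries differ in code:

import pyro
import pyro.distributions as dist
import torch
from chirho.interventional.handlers import do


def toy_city():
    # Wealth drives both green space and (negatively) crime; green space
    # has no causal effect on crime in this toy model.
    wealth = pyro.sample("wealth", dist.Normal(0.0, 1.0))
    green_space = pyro.sample("green_space", dist.Normal(wealth, 1.0))
    crime = pyro.sample("crime", dist.Normal(-wealth, 1.0))
    return green_space, crime


# Association: conditioning on observed green space shifts beliefs about wealth,
# and hence about crime, under the posterior.
observed_city = pyro.condition(toy_city, data={"green_space": torch.tensor(2.0)})

# Intervention: forcing green space to the same value severs its dependence on
# wealth, so the distribution of crime is left unchanged here.
with do(actions={"green_space": torch.tensor(2.0)}):
    green_space, crime = toy_city()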
26 changes: 20 additions & 6 deletions cities/deployment/tracts_minneapolis/train_model.py
@@ -8,9 +8,16 @@
from dotenv import load_dotenv

from cities.modeling.svi_inference import run_svi_inference
from cities.modeling.zoning_models.zoning_tracts_model import TractsModel

# from cities.modeling.zoning_models.zoning_tracts_model import TractsModel
from cities.modeling.zoning_models.zoning_tracts_sqm_model import (
TractsModelSqm as TractsModel,
)
from cities.utils.data_grabber import find_repo_root
from cities.utils.data_loader import select_from_sql

n_steps = 2000

load_dotenv()


@@ -34,6 +41,7 @@
"income",
"segregation_original",
"white_original",
"parcel_sqm",
},
"outcome": "housing_units",
}
@@ -64,15 +72,21 @@

pyro.clear_param_store()

guide = run_svi_inference(tracts_model, n_steps=2000, lr=0.03, plot=False, **subset)
guide = run_svi_inference(tracts_model, n_steps=n_steps, lr=0.03, plot=False, **subset)

##########################################
# save guide and params in the same folder
##########################################
root = find_repo_root()

deploy_path = os.path.join(root, "cities/deployment/tracts_minneapolis")
guide_path = os.path.join(deploy_path, "tracts_model_guide.pkl")
param_path = os.path.join(deploy_path, "tracts_model_params.pth")

guide_path = os.path.join(deploy_path, "tracts_model_guide.pkl")
serialized_guide = dill.dumps(guide)
file_path = "tracts_model_guide.pkl"
with open(file_path, "wb") as file:
with open(guide_path, "wb") as file:
file.write(serialized_guide)

param_path = "tracts_model_params.pth"
pyro.get_param_store().save(param_path)
with open(param_path, "wb") as file:
pyro.get_param_store().save(param_path)
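
For completeness, a short sketch (not part of the commit) of reading these artifacts back, mirroring what predict.py above already does; the paths follow the diff:

import os

import dill
import pyro

from cities.utils.data_grabber import find_repo_root

deploy_path = os.path.join(find_repo_root(), "cities/deployment/tracts_minneapolis")

# The guide was serialized with dill, so it is deserialized the same way.
with open(os.path.join(deploy_path, "tracts_model_guide.pkl"), "rb") as file:
    guide = dill.load(file)

# Pyro's param store restores the parameters saved by train_model.py.
pyro.get_param_store().load(os.path.join(deploy_path, "tracts_model_params.pth"))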
2 changes: 1 addition & 1 deletion cities/queries/causal_insight_slim.py
@@ -6,7 +6,7 @@
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

from cities.utils.cleaning_scripts.cleaning_utils import (
from cities.utils.cleaning_utils import (
revert_prediction_df,
revert_standardize_and_scale_scaler,
sigmoid,
Binary file modified data/minneapolis/processed/pg_census_tracts_dataset.pt
Binary file not shown.
Binary file modified docs/experimental_notebooks/zoning/tracts_dag_plot.pdf
Binary file not shown.