
Commit 96e3842

Merge branch 'ru-tracts-minimal-deployment' of github.com:BasisResearch/cities into ru-tracts-minimal-deployment
jfeser committed Sep 12, 2024
2 parents 459b39d + d9c7d3e commit 96e3842
Showing 14 changed files with 262 additions and 81 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -13,6 +13,8 @@ tests/__pycache__/
**/*.Rproj.user/
**/*.Rproj/
**/*.Rhistory
**/*.Rdata
**/*.RData


.vscode/settings.json
6 changes: 0 additions & 6 deletions Makefile
@@ -22,12 +22,6 @@ test_all: FORCE
test_notebooks: FORCE
./scripts/test_notebooks.sh

done: FORCE
./scripts/clean.sh
./scripts/lint.sh
./scripts/test.sh
./scripts/test_notebooks.sh

api/requirements.txt: FORCE
pip-compile --extra api --output-file api/requirements.txt

48 changes: 45 additions & 3 deletions cities/deployment/tracts_minneapolis/generate_torch_loader.py
@@ -2,14 +2,21 @@
import time

import sqlalchemy
import torch
from dotenv import load_dotenv

from cities.utils.data_loader import select_from_sql
from cities.utils.data_grabber import find_repo_root
from cities.utils.data_loader import ZoningDataset, select_from_sql

load_dotenv()

# A local torch loader is needed for subsampling in evaluation and for comparison to the previous dataset; it is also useful for EDA.

USERNAME = os.getenv("USERNAME")

DB_USERNAME = os.getenv("DB_USERNAME")
HOST = os.getenv("HOST")
DATABASE = os.getenv("DATABASE")
PASSWORD = os.getenv("PASSWORD")

#####################
# data load and prep
@@ -19,20 +19,23 @@
"categorical": ["year", "census_tract"],
"continuous": {
"housing_units",
"housing_units_original",
"total_value",
"total_value_original",
"median_value",
"mean_limit_original",
"median_distance",
"income",
"segregation_original",
"white_original",
"parcel_sqm",
},
"outcome": "housing_units",
}

load_start = time.time()
with sqlalchemy.create_engine(
f"postgresql://{USERNAME}@{HOST}/{DATABASE}"
f"postgresql://{DB_USERNAME}:{PASSWORD}@{HOST}/{DATABASE}"
).connect() as conn:
subset = select_from_sql(
"select * from dev.tracts_model__census_tracts order by census_tract, year",
@@ -41,3 +51,35 @@
)
load_end = time.time()
print(f"Data loaded in {load_end - load_start} seconds")


columns_to_standardize = [
"housing_units_original",
"total_value_original",
]

new_standardization_dict = {}

for column in columns_to_standardize:
new_standardization_dict[column] = {
"mean": subset["continuous"][column].mean(),
"std": subset["continuous"][column].std(),
}


assert "parcel_sqm" in subset["continuous"].keys()

root = find_repo_root()

pg_census_tracts_dataset = ZoningDataset(
subset["categorical"],
subset["continuous"],
standardization_dictionary=new_standardization_dict,
)
assert "parcel_sqm" in subset["continuous"].keys()

pg_census_tracts_data_path = os.path.join(
root, "data/minneapolis/processed/pg_census_tracts_dataset.pt"
)

torch.save(pg_census_tracts_dataset, pg_census_tracts_data_path)
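
Aside (not part of the commit): a minimal sketch of how the artifact written above could be loaded back for EDA or subsampling. The path is taken from the script; whether ZoningDataset exposes a standardization_dictionary attribute is an assumption, so it is read defensively here.

import os

import torch

from cities.utils.data_grabber import find_repo_root

root = find_repo_root()
dataset_path = os.path.join(
    root, "data/minneapolis/processed/pg_census_tracts_dataset.pt"
)

# torch.load unpickles the ZoningDataset object saved by the script above.
dataset = torch.load(dataset_path)
print(type(dataset))
# Assumed attribute: the standardization dictionary passed to the constructor.
print(getattr(dataset, "standardization_dictionary", None))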
39 changes: 31 additions & 8 deletions cities/deployment/tracts_minneapolis/predict.py
@@ -7,13 +7,25 @@
import sqlalchemy
import torch
from chirho.counterfactual.handlers import MultiWorldCounterfactual
from chirho.indexed.ops import IndexSet, gather
from chirho.interventional.handlers import do
from dotenv import load_dotenv
from pyro.infer import Predictive

from cities.modeling.zoning_models.zoning_tracts_model import TractsModel
from cities.modeling.zoning_models.zoning_tracts_sqm_model import (
TractsModelSqm as TractsModel,
)
from cities.utils.data_grabber import find_repo_root
from cities.utils.data_loader import select_from_data, select_from_sql

load_dotenv()


DB_USERNAME = os.getenv("DB_USERNAME")
HOST = os.getenv("HOST")
DATABASE = os.getenv("DATABASE")
PASSWORD = os.getenv("PASSWORD")


class TractsModelPredictor:
kwargs = {
@@ -28,6 +40,7 @@ class TractsModelPredictor:
"segregation_original",
"white_original",
"housing_units_original",
"parcel_sqm",
},
"outcome": "housing_units",
}
@@ -43,6 +56,7 @@ class TractsModelPredictor:
"income",
"segregation_original",
"white_original",
"parcel_sqm",
},
"outcome": "housing_units",
}
@@ -89,7 +103,6 @@ def __init__(self, conn):
deploy_path = os.path.join(root, "cities/deployment/tracts_minneapolis")

guide_path = os.path.join(deploy_path, "tracts_model_guide.pkl")
print("guide path", guide_path)
with open(guide_path, "rb") as file:
guide = dill.load(file)

@@ -150,11 +163,21 @@ def predict(self, conn, intervention=None):
else:
intervention = self._tracts_intervention(conn, **intervention)
print(intervention.shape, intervention)
with MultiWorldCounterfactual():
with do(actions={"limit": intervention}):
result = self.predictive(**subset_for_preds)[
"housing_units"
].squeeze()[:, 1, :]
# with MultiWorldCounterfactual():
# with do(actions={"limit": intervention}):
# result = self.predictive(**subset_for_preds)[
# "housing_units"
# ].squeeze()[:, 1, :]

# RU: if you use mwc, you should not use squeezing and indexing to look into possible worlds.
# There is a nicer and more robust way to do so:
with MultiWorldCounterfactual() as mwc:
with do(actions={"limit": intervention}):
result_all = self.predictive(**subset_for_preds)["housing_units"]
with mwc:
result = gather(
result_all, IndexSet(**{"limit": {1}}), event_dims=0
).squeeze()

# undo standardization
result = result * self.housing_units_std + self.housing_units_mean
@@ -236,7 +259,7 @@ def predict_cumulative(self, conn, intervention):
PASSWORD = os.getenv("PASSWORD")

with sqlalchemy.create_engine(
f"postgresql://{USERNAME}:{PASSWORD}@{HOST}/{DATABASE}"
f"postgresql://{DB_USERNAME}:{PASSWORD}@{HOST}/{DATABASE}"
).connect() as conn:
predictor = TractsModelPredictor(conn)
start = time.time()
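
To make the reviewer's point about gather concrete, here is a self-contained toy sketch (not from the repository; the model, site names, and values are invented) of the MultiWorldCounterfactual / do / gather pattern used above:

import pyro
import pyro.distributions as dist
import torch
from chirho.counterfactual.handlers import MultiWorldCounterfactual
from chirho.indexed.ops import IndexSet, gather
from chirho.interventional.handlers import do


def toy_model():
    # A tiny stand-in for TractsModel: "limit" causally drives "units".
    limit = pyro.sample("limit", dist.Normal(0.0, 1.0))
    units = pyro.sample("units", dist.Normal(2.0 * limit, 0.1))
    return units


with MultiWorldCounterfactual() as mwc:
    with do(actions={"limit": torch.tensor(1.0)}):
        units = toy_model()

# Re-enter the handler and address possible worlds by name instead of by
# positional squeezing: index 0 is the factual world, index 1 the intervened one.
with mwc:
    counterfactual_units = gather(
        units, IndexSet(**{"limit": {1}}), event_dims=0
    ).squeeze()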
(The rendered view of the next changed file, a Jupyter notebook, could not be displayed; its source diff follows.)
@@ -10,12 +10,12 @@
"source": [
"## What is this project about?\n",
"\n",
"We use state-of-the-art bayesian causal models to investigate the role of parking zoning reform in Minneapolis on the development of new housing units, at a relatively fine-grained level of census tracts. Minneapolis is an example of a city which somewhat sucessfuly navigates the housing crisis, and a parking zoning reform has been claimed to be connected to this outcome (see for example [here](https://reason.com/2024/02/27/fear-loathing-and-zoning-reform-in-minnesota/) and [here](https://www.strongtowns.org/journal/2023/9/15/ending-minimum-parking-requirements-was-a-policy-win-for-the-twin-cities)).\n",
"We use state-of-the-art Bayesian causal modeling tools ([ChiRho](https://github.com/BasisResearch/chirho)) to investigate the role of parking zoning reform in Minneapolis on the development of new housing units, at a relatively fine-grained level of census tracts. Minneapolis is an example of a city which somewhat sucessfuly navigates the housing crisis, and a parking zoning reform has been claimed to be connected to this outcome (see for example [here](https://reason.com/2024/02/27/fear-loathing-and-zoning-reform-in-minnesota/) and [here](https://www.strongtowns.org/journal/2023/9/15/ending-minimum-parking-requirements-was-a-policy-win-for-the-twin-cities)).\n",
"\n",
"\n",
"%TODO Someone should perhaps check if there are better links to include here\n",
"\n",
"It is quite clear that the number of housing units increased faster after the reform. What is not quite clear, is whether this is a mere correlation or not.We decided to take a deep dive and connect parcel-level data with demographic variables within a carefully devised causal model to investigate. \n"
"Whether this is so, to what extent and with what uncertainty has been unclear. Yes, the number of housing units in the city increased faster after the reform. But it is not ovious whether this isn't a mere correlation arising from other variables being causally responsible, or random variation. We decided to take a deep dive and connect detailed census tracts data with demographic variables within a carefully devised causal model to investigate. Due to data availability limitations, we start at year 2010. Since a major world-wide event changed too many things in 2020, this is where our data collection stops, to be able to separate the zoning concerns from the complex and unprecedented events that follow. It turns out that even with 10 years of data only, causal modelling allows us to offer some (admittedly, uncertain) answers."
]
},
{
@@ -24,10 +24,9 @@
"source": [
"## Why this is not a typical machine learning project\n",
"\n",
"A typical predictive project in machine learning tends to use as much data as possible and uses algorithms to identify patters, focusing only on predictive accuracy. While such an approach is useful, the key limitation is that such models have a hard time distinguishing accidental correlations from causal connections, and therefore are not realiable guides to counterfactual predictions and causal effect evaluation. Moreover, a typical model often disregards information that humans use heavily: temporal or causal structures, which are needed to generalize well outside the training data.\n",
"A typical predictive project in machine learning tends to use as much data as possible and algorithms to identify patters, focusing only on predictive accuracy. While such an approach is useful, the key limitation is that such models have a hard time distinguishing accidental correlations from causal connections, and therefore are not realiable guides to counterfactual predictions and causal effect estimation. Moreover, a typical model often disregards information that humans use heavily: temporal, spatial or causal structures, which are needed to generalize well outside the training data.\n",
"\n",
"Instead, we use our core open source technology, [ChiRho](https://github.com/BasisResearch/chirho) to build **bayesian causal** models using hand-picked relevant variables. This way, we work with humans and domain experts in the loop. The fact that we use Bayesian methods, allow for the injection of their deep understanding of the ecosystem, which can be work in symbiosis with the data, even if the latter is somewhat limited. The fact that the models is causal gives us a chance to try to draw conclusions about effects of intereventions, and quantify the uncertainties involved and be honest about them.\n",
"\n"
"Instead, we use our core open source technology, [ChiRho](https://github.com/BasisResearch/chirho) to build **bayesian causal models** using hand-picked relevant variables. This way, we can work with humans and in the loop. The fact that we use Bayesian methods, allows for the injection of human understanding of the causal dependecies, which then are made work in symbiosis with the data, even if the latter is somewhat limited, and for honest assessment of the resulting uncertainties. The fact that the models is causal gives us a chance to address counterfactual queries involving alternative interventions.\n"
]
},
{
@@ -38,13 +37,14 @@
"\n",
"## Why care about different types of questions?\n",
"\n",
"Once we start thinking in causal terms, there are **multiple types of queries** that we can distinguish and answer using the model, and such questions typically have different answers. \n",
"Once we start thinking in causal terms, there are **multiple types of queries** that we can distinguish and answer using the model, and such questions typically have different answers. While assosciative information is often useful or revealing, equally often we wwant to be able to evaluate potential consequences of acting one way or another, and in this mode of reflection, we rather turn to thinking in terms of interventions and counterfactuals.\n",
"\n",
"- *Association*. Example: Is there a correlation between increased green spaces and decreased crime rate in an area? Perhaps, areas with more green spaces do tend to have lower crime rates for various reasons.\n",
"\n",
"- *Intervention* If the city implements a zoning change to create more green spaces, how would this impact the crime rate in the area? The answer might differ here: factors other than the policy change probably influence crime rates to a larger extent.\n",
"- *Intervention* If the city implements a zoning change to create more green spaces, how would this impact the crime rate in the area? The answer might differ here: factors other than the policy change probably influence crime rates to a large extent.\n",
"\n",
"- *Counterfactual* Suppose you did create more green spaces and the crime rate in the area did go down. Are you to be thanked? This depends on whether the crime rate would have gone down had you not created more green space in the area. Would it?\n",
"\n",
"- *Counterfactual* Suppose you did create more green spaces and the crime rate in the area did go down. Are you to be thanked? This depends on whether the crime rate would have gone down had you not created more green space in the area.\n",
"\n",
"\n"
]
@@ -55,18 +55,25 @@
"source": [
"## Counterfactual modeling of the zoning reform\n",
"\n",
"In the case at hand, we allow you, the user, to investigate predicted counterfactual outcomes of a zoning reform, specifed in terms of where the two zones start, what parking limits are to be imposed in different zones, and what year the reform has been introduced. The variables\n",
"In the case at hand, we allow you, the user, to investigate predicted counterfactual outcomes of a zoning reform, specifed in terms of where the two zones start, what parking limits are to be imposed in different zones, and what year the reform has been introduced. From among the available variables we hand-picked the ones that are most useful and meaningfully causally connected. The model simultaneously learns the strenghts of over 30 causal connections and uses this information to inform its counterfactual predictions. The structural assumptions we have made at a high level can be described by the diagram below. However, a moderately competent user can use our [open source codebase](https://github.com/BasisResearch/cities) to tweak or modify these assumptions and invesigate the consequences of doing so.\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"\n",
"<img src=\"tracts_dag_plot_high_density.png\" alt=\"DAG Plot\" width=\"800\"/>\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## How does the model perform?\n",
"\n",
"The causal layer, nevertheless, should not take place at the cost of predictive power. The models went through a battery of tests on split data, each time being able to account for around 25-30% variation in the data (which for such noisy problems is fairly decent peformance), effectively on average improving predictions of new housing units appearing in each of census tracts at each of a given years by the count of 35-40 over a null model. A detailed notebook with model testing is also available at our open source codebase. "
]
}
],
"metadata": {
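
As a companion to the association / intervention / counterfactual discussion in the notebook text above, a toy sketch (not from the repository; the model is invented and deliberately simplistic) of how the first two kinds of queries differ in code:

import pyro
import pyro.distributions as dist
import torch
from chirho.interventional.handlers import do


def toy_city():
    # Wealth drives both green space and (negatively) crime; green space
    # has no causal effect on crime in this toy model.
    wealth = pyro.sample("wealth", dist.Normal(0.0, 1.0))
    green_space = pyro.sample("green_space", dist.Normal(wealth, 1.0))
    crime = pyro.sample("crime", dist.Normal(-wealth, 1.0))
    return green_space, crime


# Association: conditioning on observed green space shifts beliefs about wealth,
# and hence about crime, under the posterior.
observed_city = pyro.condition(toy_city, data={"green_space": torch.tensor(2.0)})

# Intervention: forcing green space to the same value severs its dependence on
# wealth, so the distribution of crime is left unchanged here.
with do(actions={"green_space": torch.tensor(2.0)}):
    green_space, crime = toy_city()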
26 changes: 20 additions & 6 deletions cities/deployment/tracts_minneapolis/train_model.py
@@ -8,9 +8,16 @@
from dotenv import load_dotenv

from cities.modeling.svi_inference import run_svi_inference
from cities.modeling.zoning_models.zoning_tracts_model import TractsModel

# from cities.modeling.zoning_models.zoning_tracts_model import TractsModel
from cities.modeling.zoning_models.zoning_tracts_sqm_model import (
TractsModelSqm as TractsModel,
)
from cities.utils.data_grabber import find_repo_root
from cities.utils.data_loader import select_from_sql

n_steps = 2000

load_dotenv()


@@ -34,6 +41,7 @@
"income",
"segregation_original",
"white_original",
"parcel_sqm",
},
"outcome": "housing_units",
}
@@ -64,15 +72,21 @@

pyro.clear_param_store()

guide = run_svi_inference(tracts_model, n_steps=2000, lr=0.03, plot=False, **subset)
guide = run_svi_inference(tracts_model, n_steps=n_steps, lr=0.03, plot=False, **subset)

##########################################
# save guide and params in the same folder
##########################################
root = find_repo_root()

deploy_path = os.path.join(root, "cities/deployment/tracts_minneapolis")
guide_path = os.path.join(deploy_path, "tracts_model_guide.pkl")
param_path = os.path.join(deploy_path, "tracts_model_params.pth")

guide_path = os.path.join(deploy_path, "tracts_model_guide.pkl")
serialized_guide = dill.dumps(guide)
file_path = "tracts_model_guide.pkl"
with open(file_path, "wb") as file:
with open(guide_path, "wb") as file:
file.write(serialized_guide)

param_path = "tracts_model_params.pth"
pyro.get_param_store().save(param_path)
with open(param_path, "wb") as file:
pyro.get_param_store().save(param_path)
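
For completeness, a short sketch (not part of the commit) of reading these artifacts back, mirroring what predict.py above already does; the paths follow the diff:

import os

import dill
import pyro

from cities.utils.data_grabber import find_repo_root

deploy_path = os.path.join(find_repo_root(), "cities/deployment/tracts_minneapolis")

# The guide was serialized with dill, so it is deserialized the same way.
with open(os.path.join(deploy_path, "tracts_model_guide.pkl"), "rb") as file:
    guide = dill.load(file)

# Pyro's param store restores the parameters saved by train_model.py.
pyro.get_param_store().load(os.path.join(deploy_path, "tracts_model_params.pth"))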
2 changes: 1 addition & 1 deletion cities/queries/causal_insight_slim.py
@@ -6,7 +6,7 @@
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

from cities.utils.cleaning_scripts.cleaning_utils import (
from cities.utils.cleaning_utils import (
revert_prediction_df,
revert_standardize_and_scale_scaler,
sigmoid,
Binary file modified data/minneapolis/processed/pg_census_tracts_dataset.pt
Binary file not shown.
Binary file modified docs/experimental_notebooks/zoning/tracts_dag_plot.pdf
Binary file not shown.