zoning model
rfl-urbaniak committed Jun 21, 2024
1 parent e4db7ef commit 4eaaa9a
Showing 8 changed files with 456 additions and 1,090 deletions.
214 changes: 126 additions & 88 deletions cities/modeling/evaluation.py
@@ -1,162 +1,202 @@
import os

import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader, random_split

from cities.modeling.svi_inference import run_svi_inference
from cities.utils.data_grabber import find_repo_root
from cities.utils.data_loader import select_from_data
from pyro.infer import Predictive

root = find_repo_root()


def prep_data_for_test(train_size=0.8):
    zoning_data_path = os.path.join(
        root, "data/minneapolis/processed/zoning_dataset.pt"
    )
    zoning_dataset_read = torch.load(zoning_data_path)

    train_size = int(train_size * len(zoning_dataset_read))
    test_size = len(zoning_dataset_read) - train_size

    train_dataset, test_dataset = random_split(
        zoning_dataset_read, [train_size, test_size]
    )

    train_loader = DataLoader(train_dataset, batch_size=train_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=test_size, shuffle=False)

    categorical_levels = zoning_dataset_read.categorical_levels

    return train_loader, test_loader, categorical_levels


def test_performance(
    model_class,
    kwarg_names,
    train_loader,
    test_loader,
    categorical_levels,
    n_steps=600,
    plot=True,
):
    # TODO perhaps remove the original categorical levels here

    assert all(
        item in kwarg_names.keys() for item in ["categorical", "continuous", "outcome"]
    )
    assert kwarg_names["outcome"] not in kwarg_names["continuous"]

    train_data = next(iter(train_loader))
    test_data = next(iter(test_loader))

    _train_data = select_from_data(train_data, kwarg_names)
    _test_data = select_from_data(test_data, kwarg_names)

    #####################################################
    # eliminate test categories not in the training data
    #####################################################
    def apply_mask(data, mask):
        return {key: val[mask] for key, val in data.items()}

    mask = torch.ones(len(_test_data["outcome"]), dtype=torch.bool)
    for key, value in _test_data["categorical"].items():
        mask = mask * torch.isin(
            _test_data["categorical"][key], _train_data["categorical"][key].unique()
        )

    # raise error if sum(mask) < .5 * len(test_data['outcome']);
    # checked before masking so the comparison uses the full test size
    if sum(mask) < 0.5 * len(_test_data["outcome"]):
        raise ValueError(
            "Sampled test data has too many new categorical levels, consider decreasing train size"
        )

    _test_data["categorical"] = apply_mask(_test_data["categorical"], mask)
    _test_data["continuous"] = apply_mask(_test_data["continuous"], mask)
    _test_data["outcome"] = _test_data["outcome"][mask]

    for key in _test_data["categorical"].keys():
        assert _test_data["categorical"][key].shape[0] == mask.sum()
    for key in _test_data["continuous"].keys():
        assert _test_data["continuous"][key].shape[0] == mask.sum()

    ######################################
    # recode categorical variables to have
    # no index gaps in the training data
    ######################################
    mappings = {}
    for name in _train_data["categorical"].keys():
        unique_train = torch.unique(_train_data["categorical"][name])
        mappings[name] = {v.item(): i for i, v in enumerate(unique_train)}
        _train_data["categorical"][name] = torch.tensor(
            [mappings[name][x.item()] for x in _train_data["categorical"][name]]
        )
        _test_data["categorical"][name] = torch.tensor(
            [mappings[name][x.item()] for x in _test_data["categorical"][name]]
        )

    ######################
    # train and test
    ######################
    model = model_class(**_train_data)
    guide = run_svi_inference(
        model, n_steps=n_steps, lr=0.01, verbose=True, **_train_data
    )

    predictive = Predictive(model, guide=guide, num_samples=1000)

    categorical_levels = model.categorical_levels
    # with pyro.poutine.trace() as tr:
    #     with pyro.plate("samples", size=1000, dim=-10):
    samples_training = predictive(
        categorical=_train_data["categorical"],
        continuous=_train_data["continuous"],
        outcome=None,
        categorical_levels=categorical_levels,
    )

    samples_test = predictive(
        categorical=_test_data["categorical"],
        continuous=_test_data["continuous"],
        outcome=None,
        categorical_levels=categorical_levels,
    )

    train_predicted_mean = samples_training["outcome_observed"].squeeze().mean(dim=0)
    train_predicted_lower = (
        samples_training["outcome_observed"].squeeze().quantile(0.05, dim=0)
    )
    train_predicted_upper = (
        samples_training["outcome_observed"].squeeze().quantile(0.95, dim=0)
    )

    coverage_training = (
        _train_data["outcome"].squeeze().gt(train_predicted_lower).float()
        * _train_data["outcome"].squeeze().lt(train_predicted_upper).float()
    )
    residuals_train = _train_data["outcome"].squeeze() - train_predicted_mean
    mae_train = torch.abs(residuals_train).mean().item()

    rsquared_train = 1 - residuals_train.var() / _train_data["outcome"].squeeze().var()

    test_predicted_mean = samples_test["outcome_observed"].squeeze().mean(dim=0)
    test_predicted_lower = (
        samples_test["outcome_observed"].squeeze().quantile(0.05, dim=0)
    )
    test_predicted_upper = (
        samples_test["outcome_observed"].squeeze().quantile(0.95, dim=0)
    )

    coverage_test = (
        _test_data["outcome"].squeeze().gt(test_predicted_lower).float()
        * _test_data["outcome"].squeeze().lt(test_predicted_upper).float()
    )
    residuals_test = _test_data["outcome"].squeeze() - test_predicted_mean
    mae_test = torch.abs(residuals_test).mean().item()

    rsquared_test = 1 - residuals_test.var() / _test_data["outcome"].squeeze().var()

    if plot:
        fig, axs = plt.subplots(2, 2, figsize=(14, 10))

        axs[0, 0].scatter(
            x=_train_data["outcome"], y=train_predicted_mean, s=6, alpha=0.5
        )
        axs[0, 0].set_title(
            "Training data, ratio of outcomes within 95% CI: {:.2f}".format(
                coverage_training.mean().item()
            )
        )
        axs[0, 0].set_xlabel("true outcome")
        axs[0, 0].set_ylabel("mean predicted outcome")

        axs[0, 1].hist(residuals_train, bins=50)
        axs[0, 1].set_title(
            "Training set residuals, Rsquared: {:.2f}".format(rsquared_train.item())
        )
        axs[0, 1].set_xlabel("residuals")
        axs[0, 1].set_ylabel("frequency")

        axs[1, 0].scatter(
            x=_test_data["outcome"], y=test_predicted_mean, s=6, alpha=0.5
        )
        axs[1, 0].set_title(
            "Test data, ratio of outcomes within 95% CI: {:.2f}".format(
                coverage_test.mean().item()
            )
        )
        axs[1, 0].set_xlabel("true outcome")
        axs[1, 0].set_ylabel("mean predicted outcome")

        axs[1, 1].hist(residuals_test, bins=50)
        axs[1, 1].set_title(
            "Test set residuals, Rsquared: {:.2f}".format(rsquared_test.item())
        )
        axs[1, 1].set_xlabel("residuals")
        axs[1, 1].set_ylabel("frequency")

@@ -173,7 +213,5 @@ def apply_mask(data, mask):
        "rsquared_train": rsquared_train,
        "rsquared_test": rsquared_test,
        "coverage_train": coverage_training.mean().item(),
        "coverage_test": coverage_test.mean().item(),
    }
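
For orientation, a minimal usage sketch of the two functions above. The column names in kwarg_names are hypothetical placeholders, and SimpleLinear (from cities.modeling.simple_linear, imported in earlier versions of this file) is assumed to accept the categorical/continuous/outcome keyword groups that test_performance passes via model_class(**_train_data).

# Hedged usage sketch -- the dataset column names below are hypothetical,
# not taken from this commit's diff.
from cities.modeling.evaluation import prep_data_for_test, test_performance
from cities.modeling.simple_linear import SimpleLinear

train_loader, test_loader, categorical_levels = prep_data_for_test(train_size=0.8)

kwarg_names = {
    "categorical": ["zone_type"],   # hypothetical categorical column
    "continuous": ["parcel_area"],  # hypothetical continuous column
    "outcome": "housing_units",     # hypothetical outcome column
}

metrics = test_performance(
    SimpleLinear,
    kwarg_names,
    train_loader,
    test_loader,
    categorical_levels,
    n_steps=600,
    plot=False,
)
print(metrics["rsquared_test"], metrics["coverage_test"])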


7 changes: 4 additions & 3 deletions cities/modeling/simple_linear.py
@@ -112,7 +112,6 @@ def forward(
                name
            ].squeeze(-1)

            objects_cat_weighted[name] = weights_categorical_outcome[name][
                ..., categorical[name]
            ]
@@ -222,8 +221,10 @@ def unconditioned_model():
                    f"continuous_{key}", dist.Normal(0, 1)
                )
            return self.model(
                categorical=_categorical,
                continuous=_continuous,
                outcome=None,
                categorical_levels=self.categorical_levels,
            )

        self.unconditioned_model = unconditioned_model
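
As a rough sketch of what this hunk enables: because unconditioned_model resamples its inputs from Normal(0, 1) priors, it can be queried with no observed data, e.g. for prior-predictive draws. The constructor signature and toy tensors below are assumptions for illustration, not part of this diff.

# Hedged sketch: assumes SimpleLinear takes categorical/continuous/outcome
# keyword arguments (as model_class(**_train_data) in evaluation.py suggests)
# and exposes the zero-argument unconditioned_model attribute set above.
import torch
from pyro.infer import Predictive

from cities.modeling.simple_linear import SimpleLinear

categorical = {"zone_type": torch.tensor([0, 1, 1, 2])}  # toy placeholder data
continuous = {"parcel_area": torch.randn(4)}             # toy placeholder data
outcome = torch.randn(4)                                 # toy placeholder data

model = SimpleLinear(categorical=categorical, continuous=continuous, outcome=outcome)

# No inputs needed: the resampled sites stand in for the data.
prior_predictive = Predictive(model.unconditioned_model, num_samples=100)
prior_samples = prior_predictive()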
50 changes: 41 additions & 9 deletions cities/utils/data_loader.py
@@ -1,30 +1,62 @@
from typing import Dict, List

import torch
from torch.utils.data import Dataset


class ZoningDataset(Dataset):
    def __init__(
        self,
        categorical,
        continuous,
        standardization_dictionary=None,
        index_dictionary=None,
    ):
        self.categorical = categorical
        self.continuous = continuous

        if index_dictionary is None:
            self.index_dictionary = {
                "zoning_ordering": [
                    "downtown",
                    "blue_zone",
                    "yellow_zone",
                    "other_non_university",
                ],
                "limit_ordering": ["eliminated", "reduced", "full"],
            }

        self.standardization_dictionary = standardization_dictionary

        if self.categorical:
            self.categorical_levels = dict()
            for name in self.categorical.keys():
                self.categorical_levels[name] = torch.unique(categorical[name])

    def __len__(self):
        return len(self.categorical["parcel_id"])

    def __getitem__(self, idx):
        cat_data = {key: val[idx] for key, val in self.categorical.items()}
        cont_data = {key: val[idx] for key, val in self.continuous.items()}
        return {
            "categorical": cat_data,
            "continuous": cont_data,
        }


def select_from_data(data, kwarg_names: Dict[str, List[str]]):
    _data = {}
    _data["outcome"] = data["continuous"][kwarg_names["outcome"]]
    _data["categorical"] = {
        key: val
        for key, val in data["categorical"].items()
        if key in kwarg_names["categorical"]
    }
    _data["continuous"] = {
        key: val
        for key, val in data["continuous"].items()
        if key in kwarg_names["continuous"]
    }

    return _data
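
A small usage example of the dataset plus the new select_from_data helper. The tensors are toys; only "parcel_id" is a real requirement (ZoningDataset.__len__ reads it), the other column names are invented for illustration.

# Hedged example with invented column names.
import torch

from cities.utils.data_loader import ZoningDataset, select_from_data

categorical = {
    "parcel_id": torch.arange(6),
    "zone_type": torch.tensor([0, 1, 1, 2, 0, 3]),  # invented column
}
continuous = {
    "parcel_area": torch.randn(6),    # invented column
    "housing_units": torch.randn(6),  # invented column
}

dataset = ZoningDataset(categorical, continuous)
assert len(dataset) == 6

batch = dataset[:]  # a dict with "categorical" and "continuous" sub-dicts

kwarg_names = {
    "categorical": ["zone_type"],
    "continuous": ["parcel_area"],
    "outcome": "housing_units",
}
selected = select_from_data(batch, kwarg_names)
# selected["outcome"] is the housing_units tensor; the sub-dicts keep
# only the requested keys.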
