Skip to content

Commit

Permalink
Merge pull request #1 from ostojanovic/revision
Browse files Browse the repository at this point in the history
Revision
  • Loading branch information
jleugeri authored Jul 25, 2019
2 parents 435f6ac + 9b81991 commit 6db615d
Show file tree
Hide file tree
Showing 39 changed files with 849 additions and 607 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# data directory
data/

# log files
logs/

Expand Down
Binary file modified figures/curves.pdf
Binary file not shown.
Binary file modified figures/curves_borreliosis_appendix.pdf
Binary file not shown.
Binary file modified figures/curves_campylobacter_appendix.pdf
Binary file not shown.
Binary file modified figures/curves_rotavirus_appendix.pdf
Binary file not shown.
Binary file modified figures/exogenous_components.pdf
Binary file not shown.
Binary file added figures/forest_borreliosis.pdf
Binary file not shown.
Binary file added figures/forest_campylobacter.pdf
Binary file not shown.
Binary file added figures/forest_rotavirus.pdf
Binary file not shown.
Binary file modified figures/interaction_kernels.pdf
Binary file not shown.
Binary file removed figures/interaction_weights_borreliosis_appendix.pdf
Binary file not shown.
Binary file not shown.
Binary file removed figures/interaction_weights_rotavirus_appendix.pdf
Binary file not shown.
Binary file removed figures/kernels_appendix.pdf
Binary file not shown.
Binary file modified figures/measures.pdf
Binary file not shown.
Binary file added figures/rhat.pdf
Binary file not shown.
Binary file removed figures/schematic.pdf
Binary file not shown.
Binary file modified figures/temporal_contribution.pdf
Binary file not shown.
Empty file added logs/.keep
Empty file.
162 changes: 92 additions & 70 deletions src/BaseModel.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re, pandas as pd, datetime, numpy as np, scipy as sp, pymc3 as pm, patsy as pt, theano, theano.tensor as tt
import theano
import re, pandas as pd, datetime, numpy as np, scipy as sp, pymc3 as pm, patsy as pt, theano.tensor as tt
theano.config.compute_test_value = 'off' # BUG: may throw an error for flat RVs
from collections import OrderedDict
from sampling_utils import *
Expand Down Expand Up @@ -66,6 +67,7 @@ def __init__(self, var, filenames, weeks, counties):
with open(filename, "rb") as f:
tmp=pkl.load(f)
except FileNotFoundError:
print("Warning: File {} not found!".format(filename))
pass
except Exception as e:
print(e)
Expand Down Expand Up @@ -107,7 +109,7 @@ class BaseModel(object):
* interaction effects (functions of distance in time and space relative to each datapoint)
"""

def __init__(self, trange, counties, ia_effect_filenames, num_ia=16, model=None, include_ia=True, include_eastwest=True, include_demographics=True, include_temporal=True):
def __init__(self, trange, counties, ia_effect_filenames, num_ia=16, model=None, include_ia=True, include_eastwest=True, include_demographics=True, include_temporal=True, orthogonalize=False):
self.county_info = counties
self.ia_effect_filenames = ia_effect_filenames
self.num_ia = num_ia if include_ia else 0
Expand All @@ -116,7 +118,7 @@ def __init__(self, trange, counties, ia_effect_filenames, num_ia=16, model=None,
self.include_demographics = include_demographics
self.include_temporal = include_temporal
self.trange = trange

first_year=self.trange[0][0]
last_year=self.trange[1][0]
self.features = {
Expand All @@ -126,110 +128,130 @@ def __init__(self, trange, counties, ia_effect_filenames, num_ia=16, model=None,
"spatial": {"eastwest": SpatialEastWestFeature(self.county_info)} if self.include_eastwest else {},
"exposure": {"exposure": SpatioTemporalYearlyDemographicsFeature(self.county_info, "total", 1.0/100000)}
}


self.Q = np.eye(16, dtype=np.float32)
if orthogonalize:
# transformation to orthogonalize IA features
T = np.linalg.inv(np.linalg.cholesky(gaussian_gram([6.25,12.5,25.0,50.0]))).T
for i in range(4):
self.Q[i*4:(i+1)*4, i*4:(i+1)*4] = T


def evaluate_features(self, weeks, counties):
all_features = {}
for group_name,features in self.features.items():
group_features = {}
for feature_name,feature in features.items():
feature_matrix = feature(weeks, counties)
group_features[feature_name] = pd.DataFrame(feature_matrix[:,:], index=weeks, columns=counties).stack()
all_features[group_name] = pd.DataFrame(group_features)
all_features[group_name] = pd.DataFrame([], index=pd.MultiIndex.from_product([weeks,counties]), columns=[]) if len(group_features)==0 else pd.DataFrame(group_features)
return all_features

def model(self, target):
def init_model(self, target):
weeks,counties = target.index, target.columns

# extract features
features = self.evaluate_features(weeks, counties)

Y = target.stack().values.astype(np.float32)
T_S = features["temporal_seasonal"].values
T_T = features["temporal_trend"].values
TS = features["spatiotemporal"].values
S = features["spatial"].values
exposure = features["exposure"].values.ravel()

with pm.Model() as model:
α = pm.HalfCauchy("α", beta=2.0)
IA = pm.Flat("IA", shape=(len(Y),self.num_ia))

W_ia = pm.HalfNormal("W_ia", sd=1, shape=self.num_ia)
W_t_s = pm.Normal("W_t_s", mu=0, sd=1, shape=T_S.shape[1])
W_t_t = pm.Normal("W_t_t", mu=0, sd=1, shape=T_T.shape[1])

s = tt.dot(IA, W_ia) + tt.dot(T_S, W_t_s) + tt.dot(T_T, W_t_t)

if TS.shape[1]!=0:
W_ts = pm.Normal("W_ts", mu=0, sd=1, shape=TS.shape[1])
s += tt.dot(TS, W_ts)

if S.shape[1]!=0:
W_s = pm.Normal("W_s", mu=0, sd=1, shape=S.shape[1])
s += tt.dot(S, W_s)

μ = pm.Deterministic("μ", tt.exp(s)*exposure)
pm.NegativeBinomial("Y", mu=μ, alpha=α, observed=Y)

return model
Y_obs = target.stack().values.astype(np.float32)
T_S = features["temporal_seasonal"].values.astype(np.float32)
T_T = features["temporal_trend"].values.astype(np.float32)
TS = features["spatiotemporal"].values.astype(np.float32)
S = features["spatial"].values.astype(np.float32)

log_exposure = np.log(features["exposure"].values.astype(np.float32).ravel())

# extract dimensions
num_obs = np.prod(target.shape)
num_t_s = T_S.shape[1]
num_t_t = T_T.shape[1]
num_ts = TS.shape[1]
num_s = S.shape[1]


with pm.Model() as self.model:
# interaction effects are generated externally -> flat prior
IA = pm.Flat("IA", testval=np.ones((num_obs, self.num_ia)),shape=(num_obs, self.num_ia))

# priors
#δ = 1/√α
δ = pm.HalfCauchy("δ", 10, testval=1.0)
α = pm.Deterministic("α", np.float32(1.0)/δ)
W_ia = pm.Normal("W_ia", mu=0, sd=10, testval=np.zeros(self.num_ia), shape=self.num_ia)
W_t_s = pm.Normal("W_t_s", mu=0, sd=10, testval=np.zeros(num_t_s), shape=num_t_s)
W_t_t = pm.Normal("W_t_t", mu=0, sd=10, testval=np.zeros(num_t_t), shape=num_t_t)
W_ts = pm.Normal("W_ts", mu=0, sd=10, testval=np.zeros(num_ts), shape=num_ts)
W_s = pm.Normal("W_s", mu=0, sd=10, testval=np.zeros(num_s), shape=num_s)
self.param_names = ["δ", "W_ia", "W_t_s", "W_t_t", "W_ts", "W_s"]
self.params = [δ, W_ia, W_t_s, W_t_t, W_ts, W_s]

# calculate interaction effect
IA_ef = tt.dot(tt.dot(IA, self.Q), W_ia)

def sample_parameters(self, target, samples=1000, chains=None, cores=8, init="auto", **kwargs):
# calculate mean rates
μ = pm.Deterministic("μ",
# (1.0+tt.exp(IA_ef))*
tt.exp(IA_ef + tt.dot(T_S, W_t_s) + tt.dot(T_T, W_t_t) + tt.dot(TS, W_ts) + tt.dot(S, W_s) + log_exposure)
)

# constrain to observations
pm.NegativeBinomial("Y", mu=μ, alpha=α, observed=Y_obs)


def sample_parameters(self, target, n_init=100, samples=1000, chains=None, cores=8, init="advi", target_accept=0.8, max_treedepth=10, **kwargs):
"""
sample_parameters(target, samples=1000, cores=8, init="auto", **kwargs)
Samples from the posterior parameter distribution, given a training dataset.
The basis functions are designed to be causal, i.e. only data points strictly predating the predicted time points are used (this implies "one-step-ahead"-predictions).
"""
model = self.model(target)

# model = self.model(target)

self.init_model(target)

if chains is None:
chains = max(2,cores)

with model:
ia_effect_loader = IAEffectLoader(model.IA, self.ia_effect_filenames, target.index, target.columns)

vars = [model.α, model.W_ia, model.W_t_s, model.W_t_t]
if hasattr(model,"W_ts"):
vars += [model.W_ts]
if hasattr(model,"W_s"):
vars += [model.W_s]

steps = ([ia_effect_loader] if self.include_ia else [] ) + \
[pm.step_methods.NUTS(vars=vars)]
trace = pm.sample(samples, steps, chains=chains, cores=cores, init=init, compute_convergence_checks=False, **kwargs)

with self.model:
# run!
ia_effect_loader = IAEffectLoader(self.model.IA, self.ia_effect_filenames, target.index, target.columns)
nuts = pm.step_methods.NUTS(vars=self.params, target_accept=target_accept, max_treedepth=max_treedepth)
steps = (([ia_effect_loader] if self.include_ia else [] ) + [nuts] )
trace = pm.sample(samples, steps, chains=chains, cores=cores, compute_convergence_checks=False, **kwargs)
# trace = pm.sample(0, steps, tune=samples+tune, discard_tuned_samples=False, chains=chains, cores=cores, compute_convergence_checks=False, **kwargs)
# trace = trace[tune:]
return trace

def sample_predictions(self, target_weeks, target_counties, parameters, init="auto"):
# extract features
features = self.evaluate_features(target_weeks, target_counties)

T_S = features["temporal_seasonal"].values
T_T = features["temporal_trend"].values
TS = features["spatiotemporal"].values
S = features["spatial"].values
exposure = features["exposure"].values.reshape((-1,1))
log_exposure = np.log(features["exposure"].values.ravel())

α = parameters["α"].reshape((1,-1))
# extract coefficient samples
α = parameters["α"]
W_ia = parameters["W_ia"]
W_t_s = parameters["W_t_s"]
W_t_t = parameters["W_t_t"]
W_ts = parameters["W_ts"]
W_s = parameters["W_s"]


ia_l = IAEffectLoader(None, self.ia_effect_filenames, target_weeks, target_counties)

num_predictions = len(target_weeks)*len(target_counties)
num_parameter_samples = α.size
with pm.Model() as model:
IA = pm.Flat("IA", shape=(num_predictions,self.num_ia))
ia_effect_loader = IAEffectLoader(model.IA, self.ia_effect_filenames, target_weeks, target_counties)
IA_trace = pm.sample(num_parameter_samples, ia_effect_loader, chains=1, cores=1, init=init, compute_convergence_checks=False)["IA"]

μs = np.exp(np.dot(T_S, W_t_s.T) + np.dot(T_T, W_t_t.T) + np.dot(TS, W_ts.T) + np.dot(S, W_s.T) + (IA_trace*W_ia[:,np.newaxis,:]).sum(axis=-1).T)*exposure
ys = pm.NegativeBinomial.dist(mu=μs, alpha=α).random()

new_trace = {}
for varname in parameters.varnames:
new_trace[varname] = parameters[varname]
new_trace["IA"] = IA_trace.T
new_trace["μ"] = μs.T
new_trace["Y"] = ys.T

return new_trace
y = np.zeros((num_parameter_samples, num_predictions), dtype=int)
μ = np.zeros((num_parameter_samples, num_predictions), dtype=np.float32)

for i in range(num_parameter_samples):
IA_ef = np.dot(np.dot(ia_l.samples[np.random.choice(len(ia_l.samples))], self.Q), W_ia[i])
# μ[i,:] = (1.0+np.exp(IA_ef))*np.exp(np.dot(T_S, W_t_s[i]) + np.dot(T_T, W_t_t[i]) + np.dot(TS, W_ts[i]) + np.dot(S, W_s[i]) + log_exposure)
μ[i,:] = np.exp(IA_ef + np.dot(T_S, W_t_s[i]) + np.dot(T_T, W_t_t[i]) + np.dot(TS, W_ts[i]) + np.dot(S, W_s[i]) + log_exposure)
y[i,:] = pm.NegativeBinomial.dist(mu=μ[i,:], alpha=α[i]).random()

return {"y": y, "μ": μ, "α": α}
7 changes: 7 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import itertools as it

diseases = ["campylobacter", "rotavirus", "borreliosis"]
prediction_regions = ["germany", "bavaria"]

combinations_age_eastwest = [(False,False),(False,True),(True,True)]
combinations = list(it.product(range(len(combinations_age_eastwest)), diseases))
55 changes: 26 additions & 29 deletions src/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
from config import *
from shared_utils import *
import pickle as pkl
import numpy as np
from shared_utils import dss, deviance_negbin, load_data, split_data, parse_yearweek
from collections import OrderedDict
import pandas as pd

diseases = ["campylobacter", "rotavirus", "borreliosis"]
measures = {
"deviance": (lambda target_val, pred_val, alpha_val: deviance_negbin(target_val, pred_val, alpha_val)),
"DS score": (lambda target_val, pred_val, alpha_val: dss(target_val, pred_val, pred_val+pred_val**2/alpha_val))
Expand All @@ -13,7 +11,6 @@
with open('../data/comparison.pkl',"rb") as f:
best_model=pkl.load(f)


with open('../data/counties/counties.pkl',"rb") as f:
counties = pkl.load(f)

Expand All @@ -22,14 +19,13 @@
for i,disease in enumerate(diseases):
use_age = best_model[disease]["use_age"]
use_eastwest = best_model[disease]["use_eastwest"]
filename_pred = "../data/mcmc_samples/predictions_{}_{}_{}.pkl".format(disease, use_age, use_eastwest)
prediction_region = "bavaria" if disease=="borreliosis" else "germany"

with open(filename_pred,"rb") as f:
res = pkl.load(f)
if disease=="borreliosis":
prediction_region = "bavaria"
use_eastwest = False
else:
prediction_region = "germany"

mean_predicted_μ = np.reshape(res['μ'],(800,104,-1)).mean(axis=0)
mean_predicted_α = res['α'].mean()
res = load_pred(disease, use_age, use_eastwest)

with open('../data/hhh4_results_{}.pkl'.format(disease),"rb") as f:
res_hhh4 = pkl.load(f)
Expand All @@ -41,22 +37,23 @@
_, _, _, target = split_data(data)
county_ids = target.columns

models = {
"our model": (pd.DataFrame(mean_predicted_μ, columns=target.columns, index=target.index), pd.Series(np.repeat(mean_predicted_α, target.shape[0]), index=target.index)),
"hhh4 model": (res_hhh4["test prediction mean"], pd.Series(np.repeat(1.0/res_hhh4["test alpha"], target.shape[0]), index=target.index)),
}

assert np.all(models["our model"][0].columns == models["hhh4 model"][0].columns), "Column names don't match!"

summary[disease] = {}
for model,(prediction, alpha) in models.items():
summary[disease][model] = {}
summary = {}
# hhh4

for name in ["our model", "hhh4 model"]:
summary[name] = {}
for measure,f in measures.items():
measure_df = pd.DataFrame(f(target.values.ravel(), prediction.values.ravel(), alpha.values.repeat(target.shape[1])).reshape(target.shape), index=target.index, columns=target.columns)
summary[disease][model][measure] = measure_df
summary[disease][model][measure + " mean"] = np.mean(measure_df.mean())
summary[disease][model][measure + " sd"] = np.std(measure_df.mean())

with open('../data/measures_summary.pkl',"wb") as f:
pkl.dump(summary, f)
print("Evaluating {} for disease {}, measure {}".format(name, disease, measure))
if name == "our model":
measure_df = pd.DataFrame(f(target.values.astype(np.float32).reshape((1,-1)).repeat(res["y"].shape[0], axis=0), res["μ"].astype(np.float32), res["α"].astype(np.float32).reshape((-1,1))).mean(axis=0).reshape(target.shape), index=target.index, columns=target.columns)
else:
measure_df = pd.DataFrame(f(target.values.astype(np.float32).ravel(), res_hhh4["test prediction mean"].values.astype(np.float32).ravel(), np.float32(1.0/res_hhh4["test alpha"])).reshape(target.shape), index=target.index, columns=target.columns)

summary[name][measure] = measure_df
summary[name][measure + " mean"] = np.mean(measure_df.mean())
summary[name][measure + " sd"] = np.std(measure_df.mean())

with open("../data/measures_{}_summary.pkl".format(disease),"wb") as f:
pkl.dump(summary, f)

del summary
14 changes: 11 additions & 3 deletions src/gridjob_sample_posterior.sge
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
#!/bin/bash
#$ -N RKI_MCMC
#$ -t 1-6
#$ -l mem=30G
#$ -t 1-9
#$ -l mem=14G
#$ -m n
#$ -q ni.q
#$ -l h='!(ramsauer.ikw.uni-osnabrueck.de|vector.cv.uni-osnabrueck.de|righty.ni.uni-osnabrueck.de)'
#$ -wd /net/store/ni/projects/BSTIM/src/
#$ -e /net/store/ni/projects/BSTIM/logs_backup/e_$TASK_ID.txt
#$ -o /net/store/ni/projects/BSTIM/logs_backup/o_$TASK_ID.txt
#$ -pe default 4

echo "Running job ${JOB_ID}, task ${SGE_TASK_ID} on `hostname`."

mkdir "/tmp/${JOB_ID}_${SGE_TASK_ID}"

THEANO_FLAGS="base_compiledir=/tmp/${JOB_ID}_${SGE_TASK_ID}/,floatX=float32,device=cpu,openmp=True,mode=FAST_RUN,warn_float64=warn" OMP_NUM_THREADS=8 python3 sample_posterior.py
source /net/store/users/jleugeri/virt/bin/activate
#THEANO_FLAGS="base_compiledir=/tmp/${JOB_ID}_${SGE_TASK_ID}/,floatX=float32,device=cpu,openmp=True,mode=FAST_RUN,warn_float64=warn" OMP_NUM_THREADS=4 python3 sample_posterior.py
THEANO_FLAGS="base_compiledir=/tmp/${JOB_ID}_${SGE_TASK_ID}/,floatX=float32,device=cpu,mode=FAST_RUN,warn_float64=warn" python3 sample_posterior.py
Loading

0 comments on commit 6db615d

Please sign in to comment.