common.py
import matplotlib.pyplot as plt
import matplotlib.collections
import numpy as np
import descartes
import os, datetime, collections
import open_cp.sources.chicago
import open_cp.geometry
import open_cp.plot
import open_cp.predictors
import open_cp.evaluation
import open_cp.kernels
import pandas as pd
import scipy.stats
def load_geometry(datadir):
open_cp.sources.chicago.set_data_directory(datadir)
return open_cp.sources.chicago.get_side("North")
def make_grid(geo):
grid = open_cp.data.Grid(150, 150, 0, 0)
return open_cp.geometry.mask_grid_by_intersection(geo, grid)
def load(datadir):
"""Load and compute initial data.
:return: `(geometry, grid)`
"""
chicago = load_geometry(datadir)
return chicago, make_grid(chicago)
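
# Usage sketch (not executed on import).  The data directory below is a
# placeholder assumption -- point it at a local copy of the Chicago data files
# expected by `open_cp.sources.chicago`:
#
#   geometry, masked_grid = load("/path/to/chicago/data")
#   print(masked_grid.xextent, masked_grid.yextent)
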
class BaseModel:
def __init__(self, grid):
self._grid = grid
def prob(self, x, y):
want, pt = self._in_grid(x, y)
if want:
return self._probs[pt[1], pt[0]]
return 0.0
def max_prob(self):
return np.max(np.ma.array(self._probs, mask=self._grid.mask))
@property
def grid(self):
return self._grid
def to_grid_coords(self, x, y):
pt = np.asarray([x,y]) - [self.grid.xoffset, self.grid.yoffset]
        pt = np.floor_divide(pt, [self.grid.xsize, self.grid.ysize]).astype(int)
return pt
def in_grid(self, x, y):
pt = self.to_grid_coords(x, y)
if np.all((pt >= [0, 0]) & (pt < [self.grid.xextent, self.grid.yextent])):
return self.grid.is_valid(*pt)
return False
def to_prediction(self):
mat = np.array(self._probs)
pred = open_cp.predictors.GridPredictionArray(self.grid.xsize, self.grid.ysize,
mat, self.grid.xoffset, self.grid.yoffset)
pred.mask_with(self.grid)
return pred
def map_to_grid(self, x, y):
"""Map a point in :math:`[0,1]^2` to the bounding box of the grid"""
pt = np.asarray([x,y])
pt = pt * [self._grid.xsize * self._grid.xextent, self._grid.ysize * self._grid.yextent]
return pt + [self._grid.xoffset, self._grid.yoffset]
def _in_grid(self, x, y):
pt = self.to_grid_coords(x, y)
if np.all((pt >= [0, 0]) & (pt < [self.grid.xextent, self.grid.yextent])):
return self.grid.is_valid(*pt), pt
return False, pt
def _set_probs(self, probs):
probs = self.grid.mask_matrix(probs)
self._probs = probs / np.sum(probs)
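
# Coordinate-helper sketch (assumes `masked_grid` as returned by :func:`load`).
# `map_to_grid` stretches the unit square over the grid's bounding box, and
# `to_grid_coords` / `in_grid` then work in real coordinates:
#
#   model = BaseModel(masked_grid)        # subclasses set self._probs
#   x, y = model.map_to_grid(0.5, 0.5)    # centre of the bounding box
#   model.to_grid_coords(x, y)            # -> integer cell indices
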
class Model1(BaseModel):
"""Homogeneous poisson process."""
def __init__(self, grid):
super().__init__(grid)
        probs = np.ones((grid.yextent, grid.xextent))
self._set_probs(probs)
def to_randomised_prediction(self):
mat = np.array(self._probs)
mat += np.random.random(mat.shape) * 1e-7
pred = open_cp.predictors.GridPredictionArray(self.grid.xsize, self.grid.ysize,
mat, self.grid.xoffset, self.grid.yoffset)
pred.mask_with(self.grid)
pred = pred.renormalise()
return pred
def __str__(self):
return "Model1"
class Model2(BaseModel):
"""Inhomogeneous Poisson process, linearly increases left to right across
the grid."""
def __init__(self, grid):
super().__init__(grid)
probs = np.linspace(0, 1, grid.xextent)
probs = probs[None,:] + np.zeros(grid.yextent)[:,None]
self._set_probs(probs)
def __str__(self):
return "Model2"
class Model3(BaseModel):
"""Inhomogeneous Poisson process, on `[0,1]^2` has intensity
:math:`2 \exp(-20(x-y)^2) + y`
then spread out to the grid.
"""
def __init__(self, grid):
super().__init__(grid)
probs = np.empty((grid.yextent, grid.xextent))
for x in range(grid.xextent):
for y in range(grid.yextent):
probs[y,x] = self.cts_prob((x+0.5) / grid.xextent, (y+0.5) / grid.yextent)
self._set_probs(probs)
def cts_prob(self, x, y):
return 2 * np.exp(-(x-y)**2 * 20) + y
def __str__(self):
return "Model3"
def kde_scorer(grid):
"""Use a "plugin" bandwidth estimator to form a probability surface from
the actual events. Then return the integral of :math:`| f(x) - p(x)|^2`.
:param grid: The grid to conform the KDE to.
    :return: A function object with signature `func(pred, tps)`.
"""
def score_kde(pred, tps):
points = np.asarray([tps.xcoords, tps.ycoords])
if tps.number_data_points <= 2:
raise ValueError("Need at least 3 events.")
kernel = open_cp.kernels.GaussianEdgeCorrectGrid(points, grid)
kde_pred = open_cp.predictors.grid_prediction_from_kernel_and_masked_grid(kernel, grid, samples=5)
kde_pred = kde_pred.renormalise()
return np.sum((pred.intensity_matrix - kde_pred.intensity_matrix)**2) * grid.xsize * grid.ysize
return score_kde
def kde_scorer_fixed_bandwidth(grid, bandwidth):
"""As :func:`kde_scorer` but with a fixed bandwidth.
:param grid: The grid to conform the KDE to.
    :param bandwidth: The bandwidth to use.
    :return: A function object with signature `func(pred, tps)`.
"""
def score_kde(pred, tps):
points = np.asarray([tps.xcoords, tps.ycoords])
if tps.number_data_points <= 2:
raise ValueError("Need at least 3 events.")
kernel = open_cp.kernels.GaussianEdgeCorrectGrid(points, grid)
kernel.covariance_matrix = [[1,0],[0,1]]
kernel.bandwidth = bandwidth
kde_pred = open_cp.predictors.grid_prediction_from_kernel_and_masked_grid(kernel, grid, samples=5)
kde_pred = kde_pred.renormalise()
return np.sum((pred.intensity_matrix - kde_pred.intensity_matrix)**2) * grid.xsize * grid.ysize
return score_kde
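
# A scorer is just a callable `func(pred, tps)`; for example (sketch -- the
# bandwidth of 500 grid units is an illustrative assumption):
#
#   scorer = kde_scorer_fixed_bandwidth(masked_grid, bandwidth=500)
#   score = scorer(pred, tps)   # squared L2 distance between prediction and KDE
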
def multiscale_brier(pred, tps):
"""Score the prediction using multiscale Brier."""
maxsize = min(pred.xextent, pred.yextent)
    return {s: open_cp.evaluation.multiscale_brier_score(pred, tps, s) for s in range(1, maxsize + 1)}
def multiscale_kl(pred, tps):
"""Score the prediction using multiscale Kullback-Leibler."""
maxsize = min(pred.xextent, pred.yextent)
    return {s: open_cp.evaluation.multiscale_kl_score(pred, tps, s) for s in range(1, maxsize + 1)}
def sample(model, size):
"""Return points from the model.
:param model: The model to use to sample from.
:param size: Number of points to return.
:return: Array of shape `(size, 2)`.
"""
out = []
renorm = model.max_prob()
while len(out) < size:
pt = model.map_to_grid(*np.random.random(2))
if model.in_grid(*pt):
if model.prob(*pt) > np.random.random() * renorm:
out.append(pt)
return np.asarray(out)
def sample_to_timed_points(model, size):
"""As :func:`sample` but return in :class:`open_cp.data.TimedPoints`.
"""
t = [datetime.datetime(2017,1,1)] * size
pts = sample(model, size)
assert pts.shape == (size, 2)
return open_cp.data.TimedPoints.from_coords(t, *pts.T)
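
# Rejection-sampling sketch: draw 100 events from the "diagonal ridge" model
# (assumes `masked_grid` as returned by :func:`load`):
#
#   tps = sample_to_timed_points(Model3(masked_grid), 100)
#   tps.number_data_points   # == 100; all timestamps are 2017-01-01
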
def make_data_preds(grid, num_trials=1000, base_intensity=10):
"""Make the data in one go.
:param grid: The masked grid to base everything on.
    :param num_trials: The number of trials to run.  Trials with zero events
      are re-drawn, so each list contains exactly this many pairs.
:param base_intensity: Each trial has `n` events where `n` is distributed
as a Poisson with mean `base_intensity`.
:return: A dictionary from `key` to a list of pairs `(pred, tps)`. Here
`key` is the pair `(source_model_name, pred_model_name)`, `pred` is the
prediction as a :class:`GridPredictionArray` instance, and `tps` is a
:class:`TimedPoints` instance.
"""
predictions = collections.defaultdict(list)
for trial in range(num_trials):
for SourceModel in [Model1, Model2, Model3]:
source_model = SourceModel(grid)
while True:
num_pts = np.random.poisson(base_intensity)
if num_pts > 0:
break
tps = sample_to_timed_points(source_model, num_pts)
for PredModel in [Model1, Model2, Model3]:
pred_model = PredModel(grid)
pred = pred_model.to_prediction()
                try:
                    # Only Model1 supplies a randomised prediction; the other
                    # models keep the plain prediction computed above.
                    pred = pred_model.to_randomised_prediction()
                except AttributeError:
                    pass
key = (str(source_model), str(pred_model))
predictions[key].append((pred, tps))
return predictions
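
# End-to-end data generation sketch (a smaller `num_trials` than the default,
# purely to keep the example quick):
#
#   all_data = make_data_preds(masked_grid, num_trials=100)
#   list(all_data.keys())   # 9 (source_model, prediction_model) pairs
#   pred, tps = all_data[("Model1", "Model2")][0]
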
def process(all_data, func):
"""`func` should be a function object with signature `func(pred, tps)`, and
should return some object.
:param all_data: The output of :func:`make_data_preds`.
:return: A dictionary from `key` to `object` where each key will be `(k1,
k2, i)` with `k1,k2` as before, and `i` a counter. `object` is the
return value of func.
"""
    return {key + (i,): func(pred, tps) for key in all_data
            for i, (pred, tps) in enumerate(all_data[key])}
def constrain_to_number_events(all_data, minimum_event_count):
"""Remove trials with too few events.
:param all_data: The output of :func:`make_data_preds`; or data conforming
to this dictionary style.
:param minimum_event_count: The minimum number of events we want in a
trial.
    :return: The same format of data, but each "actual occurrence" will have at
      least `minimum_event_count` events.
"""
return {k:[(g,tps) for g,tps in v if tps.number_data_points >= minimum_event_count]
for k,v in all_data.items()}
def plot_models(data, func):
"""Helper method to plot a 3 by 3 grid of results.
:param data: A dictionary from `key` to object. Each `key` is a tuple
`(k1, k2, *)` where `k1` is the source name (which model generated the
points) and `k2` is the prediction name (which model generated the
prediction).
:param func: A function object with signature `func(result, ax, key)`
where `result` is a list of the objects passed in `data` which have
appropriate `k1` and `k2`; `ax` is the `matplotlib` `axis` object to
plot to; `key` is `(k1,k2)`. This function should process the object
appropriately, and draw to the axis.
"""
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(16,12))
labels = ["Model1", "Model2", "Model3"]
for axe, k1 in zip(axes, labels):
for ax, k2 in zip(axe, labels):
ax.set_title("{}/{}".format(k1,k2))
result = [data[k] for k in data if k[:2] == (k1, k2)]
func(result, ax, (k1, k2))
fig.tight_layout()
def plot_three(data, func):
"""As :func:`plot_models`, but will pass a _dictionary_ `results` as the
first argument, where this is map from `k2` (the prediction model) to a
list of objects. Allows plotting all predictions (for one given
generation model) on a single axis object.
"""
fig, axes = plt.subplots(ncols=3, figsize=(16,5))
labels = ["Model1", "Model2", "Model3"]
for ax, k1 in zip(axes, labels):
ax.set_title("Data from {}".format(k1))
results = {k2 : [data[k] for k in data if k[:2] == (k1, k2)] for k2 in labels}
func(results, ax, k1)
fig.tight_layout()
def data_by_source_model(data):
"""Assuming `data` is as in the format returned from :func:`process`,
yield data in the same format, for from source model 1, then model 2,
then model 3 (with the key now being `(prediction_model, trial_number)`).
"""
for key in ["Model1", "Model2", "Model3"]:
yield {k[1:] : v for k, v in data.items() if k[0] == key}
PairedItem = collections.namedtuple("PairedItem", "key one two")
def paired_data(data_for_one_source):
"""Assuming `data_for_one_source` is as returned by
:func:`data_by_source_model`, yield a list of :class:`PairedItem` objects,
in the order of "models 1/2 compared" then "1/3" and then "2/3".
"""
for k1, k2 in [(1,2), (1,3), (2,3)]:
k1 = "Model{}".format(k1)
k2 = "Model{}".format(k2)
keys = {k[1:] for k in data_for_one_source if k[0] == k1}
keys.intersection_update({k[1:] for k in data_for_one_source if k[0] == k2})
keys = list(keys)
keys.sort()
yield [PairedItem(k, data_for_one_source[(k1,*k)], data_for_one_source[(k2,*k)]) for k in keys]
def plot_paired_data(data, plot_func):
"""Plot a 3x3 array of plots, each row being data from one source model,
and the columns being "models 1/2 compared" then "1/3" and then "2/3".
:param data: As before, return of :func:`process`.
    :param plot_func: Should have signature `plot_func(ax, paired_data)`
where `paired_data` is a list of :class:`PairedItem` objects.
"""
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(16,12))
models = ["Model1", "Model2", "Model3"]
comparisons = [(1,2), (1,3), (2,3)]
for axe, source_name, row_data in zip(axes, models, data_by_source_model(data)):
for ax, key, d in zip(axe, comparisons, paired_data(row_data)):
plot_func(ax, d)
ax.set_title("{} -> {} vs {}".format(source_name, *key))
fig.tight_layout()
return fig, axes
def comparison_uni_paired(lower_bound, upper_bound):
"""Returns a `plot_func` which is suitable for passing to
`plot_paired_data`. Assumes that the "objects" returned from
:func:`process` are just numbers, and plots the difference between "one"
and "two" in the :class:`PairedItem` instance. Plots an estimate of the
cumulative density.
    :param lower_bound: Lower end of the `x` axis range.
    :param upper_bound: Upper end of the `x` axis range.
"""
def plot_func(ax, data):
diff = np.asarray([d.one - d.two for d in data])
y = np.sum(diff <= 0) / len(diff)
lc = matplotlib.collections.LineCollection([[[lower_bound,y],[0,y]], [[0,0],[0,y]]], color="black", linewidth=1)
ax.add_collection(lc)
x = np.linspace(lower_bound, upper_bound, 100)
cumulative = []
for cutoff in x:
cumulative.append( np.sum(diff <= cutoff) )
ax.plot(x, np.asarray(cumulative) / len(diff))
return plot_func
def scatter_uni_paired_plot_func(ax, data):
"""A `plot_func` which is suitable for passing to `plot_paired_data`.
Assumes that the "objects" returned from :func:`process` are just numbers,
and plots a scatter diagram."""
x = [d.one for d in data]
y = [d.two for d in data]
ax.scatter(x, y)
start = min(min(x), min(y))
end = max(max(x), max(y))
d = (end - start) * 0.1
ax.plot([start-d, end+d], [start-d, end+d], color="red", linewidth=1)
def label_scatter_uni_paired(fig, axes):
"""Suitable for calling after using :func:`scatter_uni_paired_plot_func`.
Sets labels on each axis, and tightly lays out the figure."""
for axes_row in axes:
for ax, (x,y) in zip(axes_row, [(1,2), (1,3), (2,3)]):
ax.set(xlabel="Model{}".format(x), ylabel="Model{}".format(y))
fig.tight_layout()
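
# Putting the pieces together for the paired scatter plots (sketch; the
# KDE-based scorer yields one number per trial, which is what the paired
# plotting helpers expect):
#
#   scores = process(all_data, kde_scorer(masked_grid))
#   fig, axes = plot_paired_data(scores, scatter_uni_paired_plot_func)
#   label_scatter_uni_paired(fig, axes)
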
#############################################################################
# More tentative, data vis stuff.
#############################################################################
def hitrate_inverse_to_hitrate(inv_dict, coverages):
"""Convert the "inverse hitrate dictionary" to a more traditional
lookup, using `coverages`."""
out = dict()
for cov in coverages:
choices = [k for k,v in inv_dict.items() if v <= cov]
out[cov] = 0 if len(choices) == 0 else max(choices)
return out
def plot_hit_rate(data):
"""Draws a 3x3 grid; plots the mean and 25%, 75% percentiles of coverage
against hit rate."""
coverages = list(range(0,102,2))
def to_by_coverage(result):
by_cov = {cov : [] for cov in coverages}
for row in result:
out = hitrate_inverse_to_hitrate(row, coverages)
for c in out:
by_cov[c].append(out[c])
return by_cov
def plot_func(result, ax, key):
frame = pd.DataFrame(to_by_coverage(result)).describe().T
ax.plot(frame["mean"], label="mean")
ax.plot(frame["25%"], label="25% percentile")
ax.plot(frame["75%"], label="75% percentile")
ax.set(xlabel="Coverage (%)", ylabel="Hit rate (%)")
ax.set(xlim=[-5,105], ylim=[-5,105])
ax.legend()
plot_models(data, plot_func)
def plot_likelihood(data, grid):
"""Draws a 3x3 grid; plots an estimated distribution. For model1, we have
a delta function, which is drawn as a red line on the other plots for
comparison."""
def plot_func(result, ax, key):
        kernel = scipy.stats.gaussian_kde(result)
xx = -np.log(np.sum(~grid.mask))
if key[1] == "Model1":
d = 0.0002
x = np.linspace(xx - d, xx + d, 100)
else:
x = np.linspace(-9, -6, 100)
ax.plot(x, kernel(x))
line = [[xx,0], [xx,100000]]
ax.add_collection(matplotlib.collections.LineCollection([line], color="red"))
plot_models(data, plot_func)
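
# Likelihood-plot sketch (assumes each value in `log_lik_data` -- a
# hypothetical name -- is a single log-likelihood number per
# `(source, prediction, trial)` key, as produced by a suitable `func` passed
# to :func:`process`):
#
#   plot_likelihood(log_lik_data, masked_grid)
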