-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfitting_polynomials_fourier_1d.py
448 lines (378 loc) · 16 KB
/
fitting_polynomials_fourier_1d.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
"""
fitting_polynomials_fourier_1d.py
=================================
Examples of regression in one dimension, described in the paper "No patterns
in regression residuals," illustrating underspecified, correctly specified, and
overfitting scenarios. Chebyshev polynomial of the first kind series models,
and Fourier series models, are combined with additive iid Gaussian noise and
subject to ordinary linear regression.
Saves output into a uniquely timestamped subfolder of
./plots/polynomials_fourier_1d/.
"""
import os
import pickle
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import numpy as np
import numpy.polynomial.chebyshev
import pandas as pd
import fitting_polynomials_2d
from fitting_polynomials_2d import PLTDIR
# Parameters
# ==========
# Number of data points
nx = 100
# Standard deviation (sigma) of the iid Gaussian noise added to each data point
noise_sigma = 1.
# Series model degrees, keyed first by model set ("true", "lo", "hi", "vhi") and then by curve
# family ("cheb" = Chebyshev polynomial series, "sinu" = Fourier series)
fit_degree = {}
# Degree of the real signal (ideal model) in the simulations (1D polynomial / Fourier series);
# it is also used as a model set for regression
fit_degree["true"] = {"cheb": 12, "sinu": 8}
fit_degree["lo"] = {"cheb": 6, "sinu": 4} # underspecified model sets
fit_degree["hi"] = {"cheb": 24, "sinu": 16} # overspecified model sets
fit_degree["vhi"] = {"cheb": 48, "sinu": 32} # added to illustrate more extreme behaviour clearly
# Per coefficient "signal to noise" in the random true pattern: the standard deviation of the
# randomly drawn true curve coefficients (compare with noise_sigma)
coeff_signal_to_noise = 1.
# Define x coordinates as nx linearly spaced points on a half-open interval (endpoint excluded):
# [-1, 1) for the Chebyshev series and [0, 1) for the Fourier series
x = {
    "cheb": np.linspace(-1., 1., num=nx, endpoint=False),
    "sinu": np.linspace(0., 1., num=nx, endpoint=False),
}
# Plot settings
FIGSIZE = (10, 4)
FIGSIZE_RESIDUALS = (10, 1.25)
# Colour limits and colour map used for the residual images
CLIM = [-2.5, 2.5]
CMAP = "Greys_r"
TITLE_SIZE = "x-large"
# Title display strings for plots, keyed by model set
FIT_DISPLAY = {
    "lo": "Low degree",
    "true": "Matching",
    "hi": "High degree",
    "vhi": "Very high degree",
}
# Display names of the curve families, used in plot titles and output file names
CURVE_FAMILY_DISPLAY = {"cheb": "polynomial", "sinu": "Fourier"}
# Periodogram chart settings: log-scale y ticks and limits spanning 1e-32 to 1e4
PERIODOGRAM_YTICKS = 10**np.linspace(-32., 4., num=10, dtype=float)
PERIODOGRAM_YLIM = 10**np.asarray([-32, 4.], dtype=float)
# Autocorrelation function chart settings: largest lag shown
ACF_MAX_LAG = 10
# Output folder structure: project dir
PROJDIR = os.path.join(PLTDIR, "polynomials_fourier_1d")
# Output file types; each figure is saved once per extension
OUTFILE_EXTENSIONS = (".png", ".pdf")
# Functions
# =========
def sinusoid_design_matrix(x, degree):
    """Returns the sinusoid design matrix with columns ordered [cosines, sines].

    Integer frequencies j = 0, ..., degree - 1 are used, so each half holds
    `degree` columns: cos(2 pi j x) first, then sin(2 pi j x).  The j = 0 sine
    column is identically zero.
    """
    # One row of phase angles per integer frequency
    phases = np.asarray([2. * np.pi * float(freq) * x for freq in range(degree)])
    cos_block = np.cos(phases).T
    sin_block = np.sin(phases).T
    return np.hstack((cos_block, sin_block))
def chebyshev_design_matrix(x, degree):
    """Returns the Chebyshev (first kind) design matrix for orders 0 to degree inclusive.

    Column k holds the order-k Chebyshev polynomial evaluated at x, giving
    1 + degree columns in total.
    """
    evaluate = numpy.polynomial.chebyshev.chebval
    per_order = []
    # Each row of the identity is a coefficient vector selecting a single polynomial order
    for unit_coeffs in np.eye(1 + degree):
        per_order.append(evaluate(x, unit_coeffs))
    return np.asarray(per_order).T
def plot_regressions(xarr, yarrs, curve_family_display, tstmp, outdir, show=True):
    """Draws the ideal model, the data and four OLS fits on one axes and saves the figure.

    Args:
        xarr:
            numpy array-like containing x coordinates shared by all arrays in
            yarrs
        yarrs:
            sequence of 6 array-likes holding values of the dependent
            variable y, in this order:
            - ideal model
            - data (= ideal model + iid errors)
            - Low degree model set OLS prediction
            - Matching degree model set OLS prediction
            - High degree model set OLS prediction
            - Very high degree model set OLS prediction
        curve_family_display: one of {'polynomial', 'Fourier'}
        tstmp: timestamp string embedded in the output file names
        outdir: folder the figure files are written to
        show: if True, plt.show() is called before the figure is closed
    """
    # (position in yarrs, FIT_DISPLAY key, line styling) for each fitted curve, in draw order
    fit_curves = (
        (2, "lo", {"color": "red", "ls": "--", "linewidth": 1}),
        (3, "true", {"color": "k", "ls": "-", "linewidth": 1}),
        (4, "hi", {"color": "blue", "ls": "-.", "linewidth": 1}),
        (5, "vhi", {"color": "purple", "ls": ":", "linewidth": 1.25}),
    )

    fig, ax = plt.subplots(figsize=FIGSIZE)
    title = f"{curve_family_display.title()} series regression in one dimension"
    ax.set_title(title, size=TITLE_SIZE)

    ax.plot(xarr, yarrs[0], color="k", ls="-", linewidth=2, label="Ideal model")
    ax.plot(xarr, yarrs[1], "k+", markersize=15, label="Data")
    for position, fit_key, line_style in fit_curves:
        ax.plot(xarr, yarrs[position], label=FIT_DISPLAY[fit_key], **line_style)

    ax.set_xlabel(r"$x$")
    ax.grid()
    ax.legend()
    fig.tight_layout()

    # One output file per configured extension
    family_slug = curve_family_display.lower().replace(' ', '_')
    for extension in OUTFILE_EXTENSIONS:
        outfile = os.path.join(outdir, f"curves_{family_slug}_{tstmp}{extension}")
        print(f"Saving to {outfile}")
        fig.savefig(outfile)

    if show:
        plt.show()
    plt.close(fig)
def plot_residuals(residuals, fit_display, curve_family_display, tstmp, outdir, show=True):
    """Renders 1D regression residuals as a single-row pcolor image and saves it.

    Args:
        residuals: 1D numpy array of residuals (it is reshaped to one row)
        fit_display: display str for fit, e.g. Low degree, Matching degree etc.
        curve_family_display: one of {'polynomial', 'Fourier'}
        tstmp: timestamp string embedded in the output file names
        outdir: folder the figure files are written to
        show: if True, plt.show() is called before the figure is closed
    """
    fig = plt.figure(figsize=FIGSIZE_RESIDUALS)
    ax = fig.add_axes([0.075, 0.3, 0.855, 0.45])

    # Lay the residual vector out as a 1 x N strip so it can be drawn as an image
    strip = residuals.reshape((1, len(residuals)))
    image = ax.pcolor(strip, cmap=CMAP, clim=CLIM)
    ax.set_yticklabels([])
    ax.set_title(f"{fit_display} {curve_family_display} residual map", size=TITLE_SIZE)

    # See https://stackoverflow.com/a/39938019 for colormap handling
    # The returned divider is not used below; the call is retained in case it adjusts the
    # layout of ax - TODO confirm
    _unused_divider = make_axes_locatable(ax)
    colorbar_axes = fig.add_axes([0.945, 0.3, 0.01, 0.45])
    fig.colorbar(image, cax=colorbar_axes, orientation='vertical')

    # One output file per configured extension
    fit_slug = fit_display.lower().replace(' ', '_')
    family_slug = curve_family_display.lower().replace(' ', '_')
    for extension in OUTFILE_EXTENSIONS:
        outfile = os.path.join(outdir, f"residuals_{fit_slug}_{family_slug}_{tstmp}{extension}")
        print(f"Saving to {outfile}")
        fig.savefig(outfile)

    if show:
        plt.show()
    plt.close(fig)
def plot_periodograms(periodograms, nfull, curve_family_display, tstmp, outdir, show=True):
    """Plots error and residual periodograms from 1D regressions on a log scale and saves them.

    Args:
        periodograms:
            sequence of 5 array-likes containing the following 1d periodograms
            (in order):
            - iid errors
            - Low degree model set residuals
            - Matching degree model set residuals
            - High degree model set residuals
            - Very high degree model set residuals
        nfull:
            int full size of the original dataset; bin index k is plotted at
            frequency k / nfull
        curve_family_display: one of {'polynomial', 'Fourier'}
        tstmp: timestamp string embedded in the output file names
        outdir: folder the figure files are written to
        show: if True, plt.show() is called before the figure is closed
    """
    # (position in periodograms, FIT_DISPLAY key, line styling) for the residual curves
    residual_curves = (
        (1, "lo", {"color": "red", "ls": "--"}),
        (2, "true", {"color": "k", "ls": "-"}),
        (3, "hi", {"color": "blue", "ls": "-."}),
        (4, "vhi", {"color": "purple", "ls": ":"}),
    )

    fig, ax = plt.subplots(figsize=FIGSIZE)
    ax.set_title(
        f"{curve_family_display.title()} series regression periodograms", size=TITLE_SIZE)

    # Reference curve: periodogram of the iid errors alone
    errors = periodograms[0]
    ax.plot(
        np.arange(len(errors)) / nfull, errors,
        color="k", ls="--", linewidth=1, label="iid errors",
    )
    for position, fit_key, line_style in residual_curves:
        spectrum = periodograms[position]
        frequencies = np.arange(len(spectrum)) / nfull
        ax.plot(frequencies, spectrum, linewidth=1.5, label=FIT_DISPLAY[fit_key], **line_style)

    ax.set_yscale("log")
    ax.set_yticks(PERIODOGRAM_YTICKS)
    ax.set_ylim(PERIODOGRAM_YLIM)
    ax.set_xlabel("Frequency")
    ax.grid()
    ax.legend()
    fig.tight_layout()

    # One output file per configured extension
    family_slug = curve_family_display.lower().replace(' ', '_')
    for extension in OUTFILE_EXTENSIONS:
        outfile = os.path.join(outdir, f"periodograms_{family_slug}_{tstmp}{extension}")
        print(f"Saving to {outfile}")
        fig.savefig(outfile)

    if show:
        plt.show()
    plt.close(fig)
def plot_acfs(acfs, nfull, curve_family_display, tstmp, outdir, show=True):
    """Plots error and residual autocorrelation functions from 1D regressions and saves them.

    Args:
        acfs:
            sequence of 5 array-likes containing the following 1d
            autocorrelation functions (in order), each plotted against the
            integer lags 0, 1, ..., len(acf) - 1:
            - iid errors
            - Low degree model set residuals
            - Matching degree model set residuals
            - High degree model set residuals
            - Very high degree model set residuals
        nfull:
            int full size of the original dataset; horizontal guide lines are
            drawn at +/-1 / sqrt(nfull) and +/-2 / sqrt(nfull)
        curve_family_display: one of {'polynomial', 'Fourier'}
        tstmp: timestamp string embedded in the output file names
        outdir: folder the figure files are written to
        show: if True, plt.show() is called before the figure is closed
    """
    # (position in acfs, FIT_DISPLAY key, marker and line styling) for the residual curves
    residual_curves = (
        (1, "lo", {"marker": "o", "color": "red", "ls": "--"}),
        (2, "true", {"marker": "x", "color": "k", "ls": "-"}),
        (3, "hi", {"marker": "+", "color": "blue", "ls": "-."}),
        (4, "vhi", {"marker": ".", "color": "purple", "ls": ":"}),
    )
    # Horizontal shift applied per curve (position * offset); currently zero, so no shift
    offset = 0.

    fig, ax = plt.subplots(figsize=FIGSIZE)
    ax.set_title(
        f"{curve_family_display.title()} series regression circular autocorrelation functions",
        size=TITLE_SIZE,
    )

    # Reference curve: autocorrelation function of the iid errors alone
    ax.plot(
        np.arange(len(acfs[0])), acfs[0], color="k", ls="--", linewidth=1, label="iid errors")
    for position, fit_key, curve_style in residual_curves:
        lags = position * offset + np.arange(len(acfs[position]))
        ax.plot(
            lags, acfs[position], linewidth=1.5, label=FIT_DISPLAY[fit_key], **curve_style)

    # Guide lines at multiples of 1 / sqrt(nfull), with a solid line at zero
    root_n = np.sqrt(nfull)
    for multiple in (-2., -1.):
        ax.axhline(multiple / root_n, ls=":", linewidth=1.2, color="k")
    ax.axhline(+0., ls="-", linewidth=1, color="k")
    for multiple in (+1., +2.):
        ax.axhline(multiple / root_n, ls=":", linewidth=1.2, color="k")

    ax.set_xlabel("Lag")
    ax.grid()
    ax.legend()
    fig.tight_layout()

    # One output file per configured extension
    family_slug = curve_family_display.lower().replace(' ', '_')
    for extension in OUTFILE_EXTENSIONS:
        outfile = os.path.join(outdir, f"acfs_{family_slug}_{tstmp}{extension}")
        print(f"Saving to {outfile}")
        fig.savefig(outfile)

    if show:
        plt.show()
    plt.close(fig)
# Main script
# ===========
if __name__ == "__main__":

    # Current timestamp, used in I/O
    tstmp = pd.Timestamp.now().isoformat().replace(":", "")
    outdir = fitting_polynomials_2d.build_output_folder_structure(tstmp, project_dir=PROJDIR)

    # Output dict collecting every simulated and derived array, keyed by name.
    # NOTE(review): originally described as "will be pickled", but nothing in this script
    # section writes it to disk - confirm whether a pickle.dump step is intended
    output = {}

    # Design matrices for the different model sets, keyed by model set then curve family
    features = {
        _degree: {
            _k: _f(x=x[_k], degree=fit_degree[_degree][_k])
            for _k, _f in (("cheb", chebyshev_design_matrix), ("sinu", sinusoid_design_matrix))
        }
        for _degree in ("lo", "true", "hi", "vhi")
    }

    for _cf in ("cheb", "sinu"):  # Big outer loop over curve family

        # Build the true 1d curve coefficients
        output[f"{_cf}_coeffs_true"] = coeff_signal_to_noise * np.random.randn(
            features["true"][_cf].shape[-1])

        # Build the true 1d curves from these coefficients
        output[f"ytrue_{_cf}"] = np.matmul(features["true"][_cf], output[f"{_cf}_coeffs_true"])

        # Add random Gaussian iid errors to generate our simulation dataset y values
        output[f"e_{_cf}"] = noise_sigma * np.random.randn(nx)
        output[f"y_{_cf}"] = output[f"ytrue_{_cf}"] + output[f"e_{_cf}"]

        # Plot scatter plots of data, ideal model and predictions
        # First perform regression at different degrees to generate predictions
        for _fit in ("lo", "true", "hi", "vhi"):
            _design_matrix = features[_fit][_cf]
            _coeffs = np.linalg.lstsq(_design_matrix, output[f"y_{_cf}"], rcond=None)[0]
            _yfit = _design_matrix.dot(_coeffs.T)
            output[f"ypred_{_cf}_{_fit}"] = _yfit

        # Plot ideal model, data, and ordinary least squares regression predictions
        plot_regressions(
            xarr=x[_cf],
            yarrs=[
                output[f"ytrue_{_cf}"],  # ideal model
                output[f"y_{_cf}"],  # data
                output[f"ypred_{_cf}_lo"],
                output[f"ypred_{_cf}_true"],
                output[f"ypred_{_cf}_hi"],
                output[f"ypred_{_cf}_vhi"]
            ],
            curve_family_display=CURVE_FAMILY_DISPLAY[_cf],
            tstmp=tstmp,
            outdir=outdir,
            show=True,
        )

        # Now plot residuals, but using imaging to bring out patterns
        for _fit in ("lo", "true", "hi", "vhi"):

            # Residuals = data - model
            _res = output[f"y_{_cf}"] - output[f"ypred_{_cf}_{_fit}"]
            output[f"res_{_cf}_{_fit}"] = _res.copy()  # store residuals
            plot_residuals(
                residuals=_res,
                fit_display=FIT_DISPLAY[_fit],
                curve_family_display=CURVE_FAMILY_DISPLAY[_cf],
                tstmp=tstmp,
                outdir=outdir,
                show=True,
            )
            # Calculate residual periodogram via FFT and store
            output[f"rp_{_cf}_{_fit}"] = np.abs(np.fft.rfft(_res))**2 / len(_res)

        # Calculate periodograms of just the errors for plotting
        output[f"ep_{_cf}"] = np.abs(np.fft.rfft(output[f"e_{_cf}"]))**2 / len(output[f"e_{_cf}"])

        # Now we plot error and residual periodograms
        plot_periodograms(
            [
                output[f"ep_{_cf}"],  # iid errors periodogram for comparison
                output[f"rp_{_cf}_lo"],
                output[f"rp_{_cf}_true"],
                output[f"rp_{_cf}_hi"],
                output[f"rp_{_cf}_vhi"],
            ],
            nfull=nx,
            curve_family_display=CURVE_FAMILY_DISPLAY[_cf],
            tstmp=tstmp,
            outdir=outdir,
            show=True,
        )

        # Calculate (circular) autocorrelation functions via inverse FFT of residual periodograms.
        # n=nx is passed explicitly: the default irfft output length 2 * (m - 1) only equals the
        # data length when nx is even, so odd nx would otherwise give a wrong-length ACF
        for _fit in ("lo", "true", "hi", "vhi"):
            output[f"racf_{_cf}_{_fit}"] = np.fft.irfft(output[f"rp_{_cf}_{_fit}"], n=nx)
            output[f"racf_{_cf}_{_fit}"] /= output[f"racf_{_cf}_{_fit}"][0]  # variance normalize
            output[f"racf_{_cf}_{_fit}"] = (  # take only first n // 2 + 1 elements due to symmetry
                output[f"racf_{_cf}_{_fit}"][:len(output[f"rp_{_cf}_{_fit}"])])

        # Calculate (circular) autocorrelation function of just the errors for plotting
        output[f"eacf_{_cf}"] = np.fft.irfft(output[f"ep_{_cf}"], n=nx)
        output[f"eacf_{_cf}"] /= output[f"eacf_{_cf}"][0]  # variance normalize
        # take only first n // 2 + 1 elements due to symmetry
        output[f"eacf_{_cf}"] = output[f"eacf_{_cf}"][:len(output[f"ep_{_cf}"])]

        # Now plot autocorrelation functions, truncated to lags 0..ACF_MAX_LAG
        plot_acfs(
            [
                output[f"eacf_{_cf}"][:(1 + ACF_MAX_LAG)],  # iid errors ACF for comparison
                output[f"racf_{_cf}_lo"][:(1 + ACF_MAX_LAG)],
                output[f"racf_{_cf}_true"][:(1 + ACF_MAX_LAG)],
                output[f"racf_{_cf}_hi"][:(1 + ACF_MAX_LAG)],
                output[f"racf_{_cf}_vhi"][:(1 + ACF_MAX_LAG)],
            ],
            nfull=nx,
            curve_family_display=CURVE_FAMILY_DISPLAY[_cf],
            tstmp=tstmp,
            outdir=outdir,
            show=True,
        )