function dembcm()
% dembcm - Demo program for BCM approximation for large scale GP regression
%
% Synopsis:
% dembcm;
%
% Description:
% This routine demonstrates how the provided routines for the Bayesian
% Committee Machine can be used for analyzing data.
% Basic steps are
% - Generate a data set (linear combination of some random basis
% functions) of 500 data points
% - Split the data into modules of 100 points each
% - Train Gaussian process models for each module
% - Use the BCM approximation to obtain a prediction
%
% Also, the demo compares the prediction accuracy with
% - A Gaussian Process model that is trained on all 500 points
% - A Gaussian Process model trained on only 300 points
%
% The Bayesian Committee Machine is used in two variants:
% - In the standard form, training data are assigned modules at random
% - Alternatively, use k-means clustering, and assign the points
%   from each cluster to a module
%
% See also: bcm, gp, bcminit, bcmtrain, bcmprepare
%
% Author(s): Anton Schwaighofer, Nov 2004
% $Id: dembcm.m,v 1.1 2005/11/16 17:12:41 anton Exp $
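% Note: besides the BCM routines, this demo assumes the Netlab toolbox
% (gp, gpinit, netopt, gpfwd, foptions, kmeans, dist2) is on the MATLAB
% path.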
% The demo also works for other random states, no worries ;-)
randstate = 1;
rand('state', randstate);
randn('state', randstate);
% ----------------------------------------------------------------------
fprintf('Generating training and test data...\n');
% 500 training data points, save basis points to later generate test
% data. Generate low noise data
noiselevel = 0.1;
[Xtrain, Ytrain, Xbasis, Ybasis, Ytrain0] = art_data(500, 5, 0, noiselevel);
% Generate 2000 test data from the same function. Use the "true" function
% values Ytest0 (not the ones corrupted by noise) for testing
[Xtest, Ytest, dummy1, dummy2, Ytest0] = art_data(2000, 5, 0, noiselevel, ...
    5, Xbasis, Ybasis);
% Options for scg (Netlab foptions format):
scgopt = foptions;
scgopt(1) = 1;      % display error values during training
scgopt(2) = 1e-4;   % precision goal for the parameters
scgopt(3) = 1e-4;   % precision goal for the error function
scgopt(14) = 15;    % at most 15 iterations
% ----------------------------------------------------------------------
fprintf('Training a full GP model on all training data...\n');
% Full GP model: use the standard Netlab routines for training
% The rational quadratic kernel works much better on this data than the
% squared exponential ('sqexp') kernel
fullgp = gp(5, 'ratquad');
fullgp = gpinit(fullgp, Xtrain, Ytrain);
fullgp = netopt(fullgp, scgopt, Xtrain, Ytrain, 'scg');
[fullpred,fullvar] = gpfwd(fullgp, Xtest);
% ----------------------------------------------------------------------
fprintf('Training a full GP model on 300 (out of the 500) training data...\n');
% We also compare with a full GP trained on only 300 (out of 500) points
full1gp = gp(5, 'ratquad');
full1gp = gpinit(full1gp, Xtrain(1:300,:), Ytrain(1:300));
full1gp = netopt(full1gp, scgopt, Xtrain(1:300,:), Ytrain(1:300), 'scg');
[full1pred,full1var] = gpfwd(full1gp, Xtest);
% ----------------------------------------------------------------------
fprintf('Training the modules of the Bayesian Committee Machine...\n');
% Now build BCM model: start with defining a 'template' GP that is the
% basis for each BCM module
gp0 = gp(5, 'ratquad');
% Build a BCM model from the template
bcm0 = bcm(gp0);
% Give the BCM its data. The training data will be split up such that
% each module gets 100 points
fprintf('BCM: Each module has 100 data points\n');
bcm0 = bcminit(bcm0, Xtrain, Ytrain, 100);
% Fit the BCM modules. Do this by optimizing evidence for each module
% with shared hyperparameters
bcm1 = bcmtrain(bcm0, 'shared', 'scg', scgopt);
bcm1 = bcmprepare(bcm1);
[bcm1pred, bcm1var] = bcmfwd(bcm1, Xtest);
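% For reference: bcmfwd combines the module predictions according to the
% standard BCM rule (Tresp, 2000). A sketch: with module i predicting
% mean m_i and covariance C_i at the test points, Sigma the GP prior
% covariance of the test points, and M modules,
%   inv(C) = sum_i inv(C_i) - (M-1)*inv(Sigma)
%   m      = C * (sum_i inv(C_i)*m_i)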
% ----------------------------------------------------------------------
fprintf('Starting to cluster the training data...\n');
% Clustered BCM: run Netlab's kmeans, with 5 centres initialized from a
% random subset of the training points
% foptions-style options: display on, precision goals 1e-5/1e-4, at most
% 30 iterations
kmeansopt = [1 1e-5 1e-4 0 0 0 0 0 0 0 0 0 0 30];
r = randperm(size(Xtrain,1));
[centres,opt,post] = kmeans(Xtrain(r(1:5),:), Xtrain, kmeansopt);
% post contains the one-of-5 cluster memberships; each point is assigned
% to the module given by its cluster
[m,assignment] = max(post,[],2);
for i = 1:5,
  fprintf('Clustered BCM: Module %i has %i data points\n', i, nnz(assignment==i));
end
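% Assigning each module the points of one spatial cluster (rather than a
% random subset) tends to improve the BCM approximation: each module then
% carries most of the information about its own region of input space.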
% ----------------------------------------------------------------------
fprintf('Training the modules of the clustered Bayesian Committee Machine...\n');
bcm5 = bcminit(bcm0, Xtrain, Ytrain, assignment);
bcm5a = bcmtrain(bcm5, 'shared', 'scg', scgopt);
bcm5a = bcmprepare(bcm5a);
[bcm5apred, bcm5avar] = bcmfwd(bcm5a, Xtest);
% ----------------------------------------------------------------------
fprintf('\nEvaluating all models in terms of\n');
fprintf('RMSE (root mean squared error)\n');
fprintf('logProb (negative log probability of test data under the predictive distribution)\n\n');
loss_negLogProb = inline('0.5*(log(2*pi*var) + ((pred-label).^2)./var)','label','pred','var');
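% loss_negLogProb is the pointwise negative log density of a Gaussian
% N(pred, var) evaluated at the true label; for both RMSE and logProb,
% smaller values are better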
fprintf('Full GP model:\n');
pred = fullpred; var = fullvar;
fprintf('RMSE = %f, logProb = %f\n\n', sqrt(mean((pred-Ytest0).^2)), ...
        mean(loss_negLogProb(Ytest0, pred, var)));
fprintf('Full GP model on 300 data points:\n');
pred = full1pred; var = full1var;
fprintf('RMSE = %f, logProb = %f\n\n', sqrt(mean((pred-Ytest0).^2)), ...
        mean(loss_negLogProb(Ytest0, pred, var)));
fprintf('BCM model with shared hyperparams:\n');
pred = bcm1pred; var = bcm1var;
fprintf('RMSE = %f, logProb = %f\n\n', sqrt(mean((pred-Ytest0).^2)), ...
        mean(loss_negLogProb(Ytest0, pred, var)));
fprintf('Clustered BCM model with shared hyperparams:\n');
pred = bcm5apred; var = bcm5avar;
fprintf('RMSE = %f, logProb = %f\n\n', sqrt(mean((pred-Ytest0).^2)), ...
        mean(loss_negLogProb(Ytest0, pred, var)));
% $$$ figure(10);
% $$$ clf;
% $$$ val_errorbars(Ytest0', fullpred', sqrt(fullvar)');
% $$$ set(gcf, 'Name', 'Full GP with ratquad');
% $$$ figure(11);
% $$$ clf;
% $$$ val_errorbars(Ytest0', full1pred', sqrt(full1var)');
% $$$ set(gcf, 'Name', 'Full GP on subset of 300 points');
% $$$ figure(12);
% $$$ clf;
% $$$ val_errorbars(Ytest0', bcm1pred', sqrt(bcm1var)');
% $$$ set(gcf, 'Name', 'BCM shared');
% $$$ figure(15);
% $$$ clf;
% $$$ val_errorbars(Ytest0', bcm5apred', sqrt(bcm5avar)');
% $$$ set(gcf, 'Name', 'Clustered BCM shared');
return
function [X, Y, Xbasis, Ybasis, Ynoisefree] = art_data(npoints, ndim, classification, noise, nbasis, Xbasis, Ybasis)
% ART_DATA - Generate Volker's artificial data set
% Inputs are drawn uniformly from [-1,1]^NDIM; targets are a normalized
% RBF combination of NBASIS random basis functions plus Gaussian noise of
% std. dev. NOISE (thresholded to -1/+1 if CLASSIFICATION is nonzero).
% Pass XBASIS/YBASIS to reuse the basis functions of a previous call.
% Set all random seeds to 0 before calling to reproduce the exact data set.
%
if nargin<6,
  Xbasis = [];
  Ybasis = [];
end
if nargin<5,
  nbasis = 5;
end
if nargin<4,
  noise = 0;
end
if nargin<3,
  classification = 0;
end
if nargin<2,
  ndim = 5;
end
[Nb, dimb] = size(Xbasis);
if (Nb>0) & (~all(size(Ybasis) == [Nb, 1])),
  error('Sizes of basis function matrices XBASIS and YBASIS do not match');
end
if Nb==0,
  % No basis functions: generate new ones randomly
  % X-Prototypes in range [-1...+1]
  Xbasis = 2*rand(nbasis, ndim)-1;
  % Target values of the prototypes
  Ybasis = randn(nbasis, 1);
  if classification & (ndim==5) & (nbasis==5),
    % Volker's modification to generate a more balanced classification
    % data-set, works only with random seed 0
    Ybasis(2) = -Ybasis(2);
    Ybasis(3) = -Ybasis(3);
  end
end
% Kernel width: a quarter of the average distance between basis centres
avgdist = mean(mean(sqrt(dist2(Xbasis, Xbasis))));
sig = avgdist/4;
% Inputs uniform in [-1...+1]
X = 2*rand(npoints, ndim)-1;
% Noise-free targets: normalized RBF interpolation of the basis targets
ad = sqrt(dist2(X, Xbasis));
Yex = exp(-1/(2*(sig*sig))*(ad.*ad));
Ynoisefree = (Yex * Ybasis) ./ sum(Yex, 2);
Y = Ynoisefree + noise * randn(npoints,1);
if classification,
  Y = sign(Y);
end
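% Example (a minimal sketch of a fresh call; the output names are
% illustrative):
%   [Xdemo, Ydemo] = art_data(100, 2, 0, 0.05);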