diff --git a/README b/README
new file mode 100644
index 0000000..ce1cdaf
--- /dev/null
+++ b/README
@@ -0,0 +1,49 @@
+
+Bayesian Committee Machine
+Version 1.0, November 2005
+
+The Bayesian Committee Machine (BCM) is an approximation method for
+large-scale Gaussian process regression.
+
+What you should know beforehand:
+
+- The code is for Matlab.
+
+- It requires the Netlab toolbox. You can download Netlab from
+  http://www.ncrg.aston.ac.uk/netlab/
+
+- Install Netlab *before* trying to run any of the programs here.
+
+- To get started and to check your installation, try running 'dembcm.m'.
+
+- If you are looking for example code to run the BCM, have a look at
+  dembcm.m. All of the main features are used and explained there.
+
+
+Relevant publications:
+
+V. Tresp. A Bayesian committee machine. Neural Computation, 12, 2000
+
+A. Schwaighofer and V. Tresp. Transductive and inductive methods for
+approximate Gaussian process regression. In S. Becker, S. Thrun, and
+K. Obermayer, editors, Advances in Neural Information Processing Systems
+15. MIT Press, 2003
+
+============================================================
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the
+Free Software Foundation, Inc.,
+59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+
diff --git a/bcm.m b/bcm.m
new file mode 100644
index 0000000..90a11a1
--- /dev/null
+++ b/bcm.m
@@ -0,0 +1,62 @@
+function net = bcm(gpnet)
+% bcm - Bayesian Committee Machine
+%
+% Synopsis:
+%   net = bcm(gpnet)
+%
+% Arguments:
+%   gpnet: A Gaussian process template for BCM modules, as output by Netlab's
+%       function gp.m. Each module of the BCM will inherit its initial
+%       parameters from gpnet.
+%
+% Returns:
+%   net: Structure describing the BCM
+%
+% Description:
+%   The Bayesian Committee Machine (BCM) is an approximation method for
+%   large-scale Gaussian process regression. The training data is split
+%   into a number of blocks, for which individual Gaussian process
+%   predictors ("modules") are trained. The prediction of a BCM is a
+%   weighted combination of the predictions of the individual modules on
+%   the test data. Also, test data is processed in blocks, which leads
+%   to improved performance.
+%   The code here is a wrapper routine for Gaussian process routines
+%   provided by the Netlab toolbox. Netlab is thus required for this code
+%   to run.
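+%   As a sketch of the combination rule (following Tresp, 2000; m_i, C_i
+%   and Kss are notation for this comment only, not variables in the
+%   code): with module predictions m_i, module predictive covariances
+%   C_i, M modules, and test prior covariance Kss, a test block gets
+%     inv(C) = sum_i inv(C_i) - (M-1)*inv(Kss)
+%     mean   = C * sum_i inv(C_i)*m_i
+%   which is what bcmfwd.m accumulates module by module.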
+%
+% Examples:
+% Building a BCM for 7-dimensional input, where each module is a GP
+% with squared-exponential kernel:
+%   gpnet = gp(7, 'sqexp');
+%   net = bcm(gpnet);
+% Equip the BCM with its training data, split up into modules of size
+% 500:
+%   net = bcminit(net, Xtrain, Ytrain, 500);
+% Fit each module's hyperparameters, and pre-compute a few matrices:
+%   net = bcmtrain(net, 'individual');
+%   net = bcmprepare(net);
+% For increased performance: cluster the training data beforehand (10
+% clusters in the example below), then assign clusters to modules:
+%   options = [1 1e-5 1e-4 0 0 0 0 0 0 0 0 0 0 30];
+%   r = randperm(size(Xtrain,1));
+%   [centres,opt,post] = kmeans(Xtrain(r(1:10),:),Xtrain,options);
+%   [m,assignment] = max(post,[],2);
+%   net = bcminit(net, Xtrain, Ytrain, assignment);
+%   net = bcmprepare(net);
+% Now we can do prediction:
+%   [pred, errorBar] = bcmfwd(net, Xtest, 400);
+%
+% See also: bcminit,bcmprepare,bcmtrain,bcmfwd,bcmerr,bcmgrad,bcmpak,bcmunpak
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcm.m,v 1.1 2004/11/18 21:18:24 anton Exp $

+error(nargchk(1, 1, nargin));
+
+net = struct('type', 'bcm', 'gpnet', gpnet);
+net.nin = gpnet.nin;
+net.nout = 1;
+net.module = [];
+net.invPrior = {};
+net.weight = {};
diff --git a/bcmerr.m b/bcmerr.m
new file mode 100644
index 0000000..bba1fdb
--- /dev/null
+++ b/bcmerr.m
@@ -0,0 +1,114 @@
+function [e,edata,eprior] = bcmerr(net, x, t, Xtest)
+% bcmerr - Error function for Bayesian Committee Machine
+%
+% Synopsis:
+%   [e,edata,eprior] = bcmerr(net, x, t)
+%   e = bcmerr(net, [], [], Xtest)  (use with care, see below)
+%
+% Arguments:
+%   net: BCM structure
+%   x, t: Ignored; accepted only for compatibility with Netlab's error
+%       function interface
+%   Xtest: [Q net.nin] matrix of test data
+%
+% Returns:
+%   e: Value of the error function (negative log marginal likelihood)
+%   edata: Data contribution to e
+%   eprior: Prior contribution to e
+%
+% Description:
+%   This function returns the sum of the error functions of each module,
+%   that is, the unnormalized negative log-likelihood. The error function
+%   is computed on the basis of the pre-initialized data in each GP
+%   module, thus no data is required as input. Still, to be compatible
+%   with the standard Netlab error functions, bcmerr.m accepts input
+%   arguments in the form bcmerr(net, x, t).
+%
+%   In the second calling syntax, bcmerr(net, [], [], Xtest), the exact
+%   BCM evidence is returned. This is given by
+%     -1/2 log det C - 1/2 t' C^{-1} t + const
+%   with C given by
+%     C = BD[K] + sigma^2 I + (K_c K_t^{-1} K_c' - BD[K_c K_t^{-1} K_c'])
+%   where BD[...] denotes a block-diagonal approximation of the argument,
+%   K_c is the kernel matrix of all training points versus test points,
+%   K_t is the test point kernel matrix, and t is a vector of all
+%   training targets.
+%   C is a matrix of size [N N] for N training data, thus the exact BCM
+%   evidence can only be computed for cases where an exact GP solution
+%   could also be found. Use this feature only with moderately sized
+%   problems.
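+%
+% Examples:
+% A sketch of the two calling syntaxes (assuming net has been set up
+% with bcminit): the summed per-module evidence is obtained via
+%   e = bcmerr(net, [], []);
+% and the exact BCM evidence, given test inputs Xtest, via
+%   e = bcmerr(net, [], [], Xtest);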
+%
+%
+% See also: bcm,bcmtrain,bcminit
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmerr.m,v 1.2 2004/11/23 21:43:51 anton Exp $

+% Input arguments x and t are effectively ignored; in the exact-evidence
+% case, the training targets are reconstructed from the modules.

+error(nargchk(3, 4, nargin));
+if nargin<4,
+  Xtest = [];
+end
+
+if isempty(Xtest),
+  % No test data given: Default case of summing up individual modules' evidence
+  e = 0;
+  edata = 0;
+  eprior = 0;
+  for i = 1:length(net.module),
+    netI = net.module(i);
+    [a, b, c] = gperr(netI, netI.tr_in, netI.tr_targets);
+    e = e+a;
+    edata = edata+b;
+    eprior = eprior+c;
+  end
+else
+  % Compute the BCM error with the actual BCM covariance matrix. This
+  % matrix has size [N N] for N training points, thus we can typically
+  % not hold it in memory. Computing it requires knowledge of the test
+  % data.
+
+  % For the block diagonal approximation, we need to know the number of
+  % data in each module:
+  modSize = zeros(1, length(net.module));
+  for i = 1:length(net.module),
+    modSize(i) = length(net.module(i).tr_targets);
+  end
+  % Reconstruct the full training data and targets from the modules
+  N = sum(modSize);
+  Xtrain = zeros(N, net.nin);
+  t = zeros(N, 1);
+  ind = 1;
+  for i = 1:length(net.module),
+    netI = net.module(i);
+    Xtrain(ind:(ind+length(netI.tr_targets)-1),:) = netI.tr_in;
+    t(ind:(ind+length(netI.tr_targets)-1)) = netI.tr_targets;
+    ind = ind+length(netI.tr_targets);
+  end
+  % The major part of the overall kernel matrix is a form of Schur complement:
+  Kt = gpcovarp(net.gpnet, Xtest, Xtest);
+  Kc = gpcovarp(net.gpnet, Xtrain, Xtest);
+  smallEye = eps^(2/3)*speye(size(Kt));
+  C = Kc*inv(Kt+smallEye)*Kc';
+  % Overwrite diagonal blocks with the exact covariance matrix, meaning that
+  % the kernel matrix is exact for points within the same module
+  startInd = 1;
+  for i = 1:length(net.module),
+    ind = startInd:(startInd+modSize(i)-1);
+    netI = net.module(i);
+    % Use gpcovar here, so that the contribution of the noise variance is
+    % already taken into account
+    C(ind,ind) = gpcovar(net.gpnet, Xtrain(ind,:));
+    startInd = startInd+modSize(i);
+  end
+  % With this matrix C, we can compute evidence as usual:
+  C(isnan(C)) = realmax;
+  C(isinf(C)&(C<0)) = -realmax;
+  C(isinf(C)&(C>0)) = realmax;
+  eigC = eig(C, 'nobalance');
+  % Guard against possible tiny negative eigenvalues (e.g. in the Matern
+  % kernel with large values of nu)
+  if any(eigC<=0),
+    warning('Skipping some negative eigenvalues. Results may be inaccurate');
+  end
+  edata = 0.5*(sum(log(eigC(eigC>0)))+t'*inv(C)*t);
+  eprior = 0;
+  e = edata+eprior;
+end
diff --git a/bcmfwd.m b/bcmfwd.m
new file mode 100644
index 0000000..316db25
--- /dev/null
+++ b/bcmfwd.m
@@ -0,0 +1,134 @@
+function [Ypred, Yvar] = bcmfwd(net,Xtest,querySize,verbosity)
+% bcmfwd - Forward propagation in Bayesian Committee Machine
+%
+% Synopsis:
+%   Ypred = bcmfwd(net,Xtest)
+%   [Ypred,Yvar] = bcmfwd(net,Xtest,querySize,verbosity)
+%
+% Arguments:
+%   net: BCM structure
+%   Xtest: [Q d] matrix of test data, Q points in d dimensions
+%   querySize: Size of query set (prediction is based on blocks of test
+%       points of size querySize). Default value, if omitted: 500.
+%   verbosity: (optional) Use a value >0 to display progress information
+%
+% Returns:
+%   Ypred: [Q 1] vector of predictions (predictive mean) for each test point
+%   Yvar: [Q 1] vector of predictive variances (error bars) for each test
+%       point
+%
+% Description:
+%   Forward propagation in the Bayesian Committee Machine. The test data
+%   is split up into blocks of size querySize. For each block, all GP
+%   modules in the BCM make their predictions; these are then weighted
+%   by the inverse predictive covariance, summed, and normalized to give
+%   the BCM output.
+%   Typically, the performance of the BCM increases as querySize
+%   increases.
+%   Instead of passing querySize as a parameter, it can also be set in a
+%   field 'querySize' of the BCM structure.
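+%   As a rough cost sketch (assuming M modules with B training points
+%   each and query blocks of size Q): each block costs on the order of
+%   M*(Q*B^2 + Q^2*B + Q^3) operations, so querySize trades cubic cost
+%   per block against the quality of the approximation.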
+%
+% Examples:
+% Build a BCM with modules that contain 500 training points each:
+%   gp1 = gp(5, 'sqexp');
+%   bcm1 = bcm(gp1);
+%   bcm1.querySize = 500;
+%   bcm1 = bcminit(bcm1, Xtrain, Ytrain, 500);
+% Train the BCM, by maximizing the training data marginal likelihood
+% for each module individually:
+%   bcm1 = bcmtrain(bcm1,'individual','scg');
+% Compare the predictions of the BCM with different query set sizes:
+%   pred1 = bcmfwd(bcm1, Xtest, 10);
+%   pred2 = bcmfwd(bcm1, Xtest, 800);
+%
+%
+% See also: bcm,bcminit,bcmtrain,bcmprepare
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmfwd.m,v 1.2 2004/11/23 23:23:58 anton Exp $

+error(nargchk(2, 4, nargin));
+error(consist(net, 'bcm', Xtest));
+if nargin<3,
+  querySize = [];
+end
+if isempty(querySize),
+  if isfield(net, 'querySize'),
+    querySize = net.querySize;
+  else
+    querySize = 500;
+  end
+end
+if nargin<4,
+  verbosity = 0;
+end
+
+if isempty(net.invPrior) | isempty(net.weight),
+  net = bcmprepare(net);
+end
+P = size(Xtest, 1);
+% Number of query sets of maximum size querySize
+nQueries = ceil(P/querySize);
+nModules = length(net.module);
+Ypred = zeros([P 1]);
+Yvar = zeros([P 1]);
+
+if verbosity>0,
+  fprintf('\nStarting forward propagation (%i query sets).\n', nQueries);
+end
+if verbosity==1,
+  fprintf('Query set ');
+end
+t1 = cputime;
+for j = 1:nQueries,
+  if verbosity==1,
+    fprintf('%i ', j);
+  end
+  if verbosity==2,
+    fprintf('Query set %i: ', j);
+  end
+  ind1 = (1+(j-1)*querySize):min(P, j*querySize);
+  Xtest1 = Xtest(ind1, :);
+  % A small regularization matrix for inversions
+  smallEye = eps^(2/3)*speye(length(ind1));
+  % Prediction for the current query set
+  Ypred1 = zeros([length(ind1) 1]);
+  % Overall covariance matrix for current query set
+  Ycov1 = 0;
+  % The original BCM where all modules share the same hyperparameters:
+% $$$   K11 = gpcovarp(net.module(1), Xtest1, Xtest1);
+% $$$   Ycov1 = -(nModules-1)*inv(K11+smallEye);
+  startInd = 1;
+  for i = 1:length(net.module),
+    netI = net.module(i);
+    K11 = gpcovarp(netI, Xtest1, Xtest1);
+    K12 = gpcovarp(netI, Xtest1, netI.tr_in);
+    % Prediction of current module
+    Ypred2 = K12*net.weight{i};
+    % Covariance of current module
+    Ycov2 = K11-K12*net.invPrior{i}*K12';
+    invYcov2 = inv(Ycov2+smallEye);
+    % Add weighted prediction of the current module
+    Ypred1 = Ypred1+invYcov2*Ypred2;
+    % Update overall covariance matrix
+    Ycov1 = Ycov1+invYcov2;
+    % Instead of the (M-1)*inv(K11) term in the commented-out code
+    % above: subtract one prior covariance contribution for each module
+    % except one.
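+    % (Multiplying the M module posteriors over-counts the prior M
+    % times; the BCM divides by M-1 copies of the prior, which in
+    % inverse-covariance form means subtracting (M-1) times inv(K11).)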
+    % The last module is usually the smallest, so drop that one.
+    if i~=length(net.module),
+      Ycov1 = Ycov1 - inv(K11+smallEye);
+    end
+    if verbosity==2,
+      fprintf('.');
+    end
+  end
+  % Ycov1 is the *inverse* covariance of the overall prediction
+  Ycov1 = inv(Ycov1+smallEye);
+  % Rescale the sum of the modules' predictions and write into result
+  Ypred(ind1) = Ycov1*Ypred1;
+  Yvar(ind1) = diag(Ycov1);
+  if verbosity>0,
+    fprintf('\n');
+  end
+end
diff --git a/bcmgrad.m b/bcmgrad.m
new file mode 100644
index 0000000..d339589
--- /dev/null
+++ b/bcmgrad.m
@@ -0,0 +1,31 @@
+function g = bcmgrad(net, x, t)
+% bcmgrad - Error gradient for Bayesian Committee Machine
+%
+% Synopsis:
+%   g = bcmgrad(net)
+%
+% Arguments:
+%   net: BCM structure
+%
+% Returns:
+%   g: Gradient of the error function (negative log marginal likelihood)
+%       with respect to the kernel parameters
+%
+% Description:
+%   Error function and gradient are computed on the basis of the
+%   pre-initialized data in each GP module, thus no data is required as
+%   input (arguments x and t are ignored).
+%
+%
+% See also: bcm,bcmtrain,bcminit,bcmerr
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmgrad.m,v 1.1 2004/11/18 21:19:46 anton Exp $

+g = 0;
+for i = 1:length(net.module),
+  netI = net.module(i);
+  gI = gpgrad(netI, netI.tr_in, netI.tr_targets);
+  g = g+gI;
+end
diff --git a/bcminit.m b/bcminit.m
new file mode 100644
index 0000000..0228d6f
--- /dev/null
+++ b/bcminit.m
@@ -0,0 +1,96 @@
+function net = bcminit(net, Xtrain, Ytrain, assignment)
+% bcminit - Initialization for Bayesian Committee Machine (BCM)
+%
+% Synopsis:
+%   net = bcminit(net,Xtrain,Ytrain,assignment)
+%
+% Arguments:
+%   net: BCM structure, as output by bcm.m
+%   Xtrain: [N d] matrix of training data, N points in d dimensions
+%   Ytrain: [N 1] vector of training targets
+%   assignment: Scalar or [N 1] vector. Number of training data that are
+%       assigned to each module. If assignment is a scalar K, each module
+%       is assigned K points, possibly with the last module given fewer
+%       points. If assignment is a vector of length N, module I will be
+%       assigned all points J for which assignment(J)==I.
+%
+% Returns:
+%   net: BCM structure. net has the following newly added fields:
+%       .module: Structure array, containing the Netlab-like GP
+%           description for module I in net.module(I)
+%       .invPrior: Cell array, with the inverse covariance of module I's
+%           training data in net.invPrior{I} (initialized empty, filled
+%           by bcmprepare)
+%       .weight: Cell array, with the GP weight vector of module I in
+%           net.weight{I} (initialized empty, filled by bcmprepare)
+%
+% Description:
+%   The purpose of this routine is to split the training data among the
+%   individual GP modules. Each module is a replica of the template GP
+%   given in net.gpnet.
+%
+%
+% Examples:
+% Give each module an equal share of 500 training points:
+%   net = bcminit(net, Xtrain, Ytrain, 500);
+% For improved performance, use kmeans clustering to get modules that
+% are spatially separated.
+% Initialize kmeans with 5 random centres:
+%   options = [1 1e-5 1e-4 0 0 0 0 0 0 0 0 0 0 30];
+%   r = randperm(size(Xtrain,1));
+%   centres = Xtrain(r(1:5),:);
+%   [centres,opt,post] = kmeans(centres,Xtrain,options);
+% Extract the point/centres assignment and use it directly in bcminit:
+%   [m,assignment] = max(post,[],2);
+%   net = bcminit(net, Xtrain, Ytrain, assignment);
+%
+%
+% See also: bcm,bcmprepare,bcmtrain,bcmfwd
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcminit.m,v 1.1 2004/11/18 21:19:53 anton Exp $

+error(nargchk(4, 4, nargin));
+error(consist(net, 'bcm', Xtrain, Ytrain));
+
+[N, dim] = size(Xtrain);
+% Assignment given as a scalar:
+if prod(size(assignment))==1,
+  modulesize = assignment;
+  nModules = ceil(N/modulesize);
+  r = rem(N, modulesize);
+  % Each module gets an equal share of the training data
+  if r==0,
+    modulesize = repmat(modulesize, [1 nModules]);
+  else
+    modulesize = [repmat(modulesize, [1 nModules-1]) r];
+  end
+  % Generate the assignment vector: assignment(j)==i if point j goes to
+  % module i
+  assignment = zeros([N 1]);
+  start = 1;
+  for i = 1:length(modulesize),
+    ind = start:(start+modulesize(i)-1);
+    assignment(ind) = i;
+    start = ind(end)+1;
+  end
+else
+  if length(assignment)~=N,
+    error('Length of vector assignment must match the number of training data');
+  end
+  % Uniquify the whole thing, this gets rid of any nonsense data,
+  % wrong module numbers, and such
+  [B, dummy, assignment] = unique(assignment);
+  nModules = length(B);
+end
+
+% Initialize the GPs for each module with its data. The first module
+% overwrites net.module wholesale, since gpinit adds fields (tr_in,
+% tr_targets) that the template gpnet does not have.
+for i = 1:nModules,
+  netI = net.gpnet;
+  ind = (assignment==i);
+  netI = gpinit(netI, Xtrain(ind,:), Ytrain(ind,:));
+  if i==1,
+    net.module = netI;
+  else
+    net.module(i) = netI;
+  end
+end
+% Initialize empty data for inverse prior matrices and weight vectors
+net.invPrior = [];
+net.weight = [];
diff --git a/bcmpak.m b/bcmpak.m
new file mode 100644
index 0000000..2edc547
--- /dev/null
+++ b/bcmpak.m
@@ -0,0 +1,28 @@
+function w = bcmpak(net)
+% bcmpak - Combine kernel parameters of BCM into vector
+%
+% Synopsis:
+%   w = bcmpak(net)
+%
+% Arguments:
+%   net: BCM structure
+%
+% Returns:
+%   w: Vector of GP parameters taken from field net.gpnet
+%
+% Description:
+%   This routine is only meant to be used as a subroutine of
+%   bcmtrain.m. The routine returns the GP parameters taken from
+%   net.gpnet, which are assumed to be equal to all of the BCM's module
+%   parameters.
+%
+%
+% See also: bcm,bcmtrain,bcmunpak
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmpak.m,v 1.1 2004/11/18 21:20:47 anton Exp $

+error(nargchk(1, 1, nargin));
+error(consist(net, 'bcm'));
+w = gppak(net.gpnet);
diff --git a/bcmprepare.m b/bcmprepare.m
new file mode 100644
index 0000000..4b754bb
--- /dev/null
+++ b/bcmprepare.m
@@ -0,0 +1,54 @@
+function net = bcmprepare(net, verbosity)
+% bcmprepare - Pre-compute prior matrices for Bayesian Committee Machine (BCM)
+%
+% Synopsis:
+%   net = bcmprepare(net)
+%   net = bcmprepare(net,verbosity)
+%
+% Arguments:
+%   net: Initialized BCM structure, as output by bcminit.m (training data must
+%       already be assigned to each module)
+%   verbosity: (optional) Use a value >0 to display progress information
+%
+% Returns:
+%   net: Modified BCM structure, where now the fields net.invPrior and
+%       net.weight are computed
+%
+% Description:
+%   Pre-compute the matrices that are repeatedly used in BCM forward
+%   propagation, namely the inverse covariance matrix of each module's
+%   training data and the weight vector for GP predictions.
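+%   As a sketch of what is stored (matching the code below): for module
+%   i with training inputs X_i and targets y_i,
+%     net.invPrior{i} = inv(gpcovar(net.module(i), X_i))
+%     net.weight{i}   = net.invPrior{i} * y_i
+%   so that bcmfwd can form each module's mean prediction as
+%   K12*net.weight{i}.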
+%
+%
+% See also: bcm,bcminit
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmprepare.m,v 1.1 2004/11/18 21:20:55 anton Exp $

+error(nargchk(1, 2, nargin));
+error(consist(net, 'bcm'));
+if nargin<2,
+  verbosity = 0;
+end
+
+if verbosity>0,
+  fprintf('Pre-computing prior matrices for %i modules ', length(net.module));
+end
+for i = 1:length(net.module),
+  netI = net.module(i);
+  % gpcovar computes the kernel matrix of the given points, and also adds
+  % the measurement noise.
+  Kprior = gpcovar(netI, netI.tr_in);
+  net.invPrior{i} = inv(Kprior);
+  % Measurement noise is restricted to a minimum value of 1e-8 in the
+  % Netlab routines. Thus, the matrices should be so well conditioned
+  % that we can solve the linear system by inversion, instead of mldivide
+  net.weight{i} = net.invPrior{i} * netI.tr_targets;
+  if verbosity==2,
+    fprintf('.');
+  end
+end
+if verbosity==2,
+  fprintf('\n');
+end
diff --git a/bcmtrain.m b/bcmtrain.m
new file mode 100644
index 0000000..8c85456
--- /dev/null
+++ b/bcmtrain.m
@@ -0,0 +1,77 @@
+function net = bcmtrain(net,method,alg,options)
+% bcmtrain - Kernel parameter optimization for Bayesian Committee Machine
+%
+% Synopsis:
+%   net = bcmtrain(net,method,alg,options)
+%
+% Arguments:
+%   net: Initialized BCM structure, as output by bcminit.m
+%   method: String, one of 'shared', 'individual'. If method=='shared', all
+%       modules share the same hyperparameters, chosen such that the sum of
+%       all module marginal likelihoods is maximized. If
+%       method=='individual', kernel parameters are optimized for each
+%       module, such that each module's marginal likelihood is maximal.
+%   alg: Optimization routine to use for kernel parameters, e.g. 'scg'
+%   options: Options vector for the optimization routine
+%
+% Returns:
+%   net: Modified BCM structure
+%
+% Description:
+%   For reasons of efficiency, BCM hyperparameter selection is only
+%   implemented as a heuristic that considers the marginal likelihood of
+%   individual BCM modules. Two strategies are available:
+%   'shared': All BCM modules share the same hyperparameters (e.g.,
+%       kernel parameters or noise variance). Training is done by
+%       maximizing the sum of marginal likelihoods of all modules.
+%   'individual': Each BCM module has its own distinct set of
+%       hyperparameters. Training is done by maximizing the marginal
+%       likelihood of each module separately.
+%   In most cases, it seems that shared hyperparameters
+%   (method=='shared') lead to better performance than individual
+%   hyperparameters.
+%
+%
+% See also: bcm,bcminit,bcmprepare,bcmfwd
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmtrain.m,v 1.2 2004/11/23 22:49:37 anton Exp $

+error(nargchk(2, 4, nargin));
+if nargin<4,
+  options = zeros([1 18]);
+  options(1) = 0;
+  options(2) = 1e-4;
+  options(3) = 1e-6;
+  options(9) = 0;
+  options(14) = 50;
+end
+if nargin<3,
+  alg = 'scg';
+end
+
+% Invalidate any previously computed prior matrices
+net.invPrior = {};
+net.weight = {};
+
+
+net.method = method;
+switch method
+  case 'shared'
+    % Use default netopt. bcmerr and bcmgrad have been adapted such that
+    % they are appropriate for this type of kernel parameter
+    % optimization: bcmerr computes the sum of the individual module
+    % likelihoods, bcmgrad computes the sum of the gradients
+    net = netopt(net, options, [], [], alg);
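+    % (netopt operates on the single parameter vector returned by
+    % bcmpak; bcmunpak copies that vector back into net.gpnet and into
+    % every module, which is what keeps the hyperparameters shared.)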
+  case 'individual'
+    % This is easy: just loop over all modules and train them via
+    % standard evidence maximization
+    for i = 1:length(net.module),
+      netI = net.module(i);
+      netI = netopt(netI, options, netI.tr_in, netI.tr_targets, alg);
+      net.module(i) = netI;
+    end
+  otherwise
+    error('Invalid value for parameter ''method''');
+end
diff --git a/bcmunpak.m b/bcmunpak.m
new file mode 100644
index 0000000..6edea35
--- /dev/null
+++ b/bcmunpak.m
@@ -0,0 +1,33 @@
+function net = bcmunpak(net,w)
+% bcmunpak - Copy kernel parameters for BCM from vector
+%
+% Synopsis:
+%   net = bcmunpak(net,w)
+%
+% Arguments:
+%   net: BCM structure
+%   w: Vector of GP parameters
+%
+% Returns:
+%   net: Modified BCM structure, where the GP parameters for each module are
+%       set to the values given in vector w
+%
+% Description:
+%   This routine is only meant to be used as a subroutine of
+%   bcmtrain.m. The routine copies the given GP parameters into each GP
+%   module, as well as into the GP given in net.gpnet.
+%
+%
+% See also: bcmpak
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmunpak.m,v 1.1 2004/11/18 21:21:11 anton Exp $

+error(nargchk(2, 2, nargin));
+error(consist(net, 'bcm'));
+
+net.gpnet = gpunpak(net.gpnet, w);
+for i = 1:length(net.module),
+  net.module(i) = gpunpak(net.module(i), w);
+end
diff --git a/dembcm.m b/dembcm.m
new file mode 100644
index 0000000..21063a9
--- /dev/null
+++ b/dembcm.m
@@ -0,0 +1,207 @@
+function dembcm()
+% dembcm - Demo program for BCM approximation for large scale GP regression
+%
+% Synopsis:
+%   dembcm;
+%
+% Description:
+%   This routine demonstrates how the provided routines for the Bayesian
+%   Committee Machine can be used for analyzing data.
+%   Basic steps are
+%   - Generate a data set (linear combination of some random basis
+%     functions) of 500 data points
+%   - Split the data into modules of 100 points each
+%   - Train Gaussian process models for each module
+%   - Use the BCM approximation to obtain a prediction
+%
+%   Also, the demo compares the prediction accuracy with
+%   - A Gaussian process model that is trained on all 500 points
+%   - A Gaussian process model trained on only 300 points
+%
+%   The Bayesian Committee Machine is used in two variants:
+%   - In the standard form, training data are assigned to modules at random
+%   - Alternatively, use k-means clustering, and assign the points from
+%     each cluster to a module.
+%
+% See also: bcm,gp,bcminit,bcmtrain,bcmprepare
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: dembcm.m,v 1.1 2005/11/16 17:12:41 anton Exp $

+% The demo also works for other random states, no worries ;-)
+randstate = 1;
+
+rand('state', randstate);
+randn('state', randstate);
+
+% ----------------------------------------------------------------------
+fprintf('Generating training and test data...\n');
+% 500 training data points, save basis points to later generate test
+% data. Generate low noise data
+noiselevel = 0.1;
+[Xtrain, Ytrain, Xbasis, Ybasis, Ytrain0] = art_data(500, 5, 0, noiselevel);
+% Generate 2000 test data from the same function. Use the "true" function
+% values Ytest0 (not the ones corrupted by noise) for testing
+[Xtest, Ytest, dummy1, dummy2, Ytest0] = art_data(2000, 5, 0, noiselevel, ...
+                                                  5, Xbasis, Ybasis);
+
+% Options for scg:
+scgopt = foptions;
+scgopt(1) = 1;
+scgopt(2) = 1e-4;
+scgopt(3) = 1e-4;
+scgopt(14) = 15;
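+% (In Netlab's foptions vector, index 1 toggles display of error values,
+% indices 2 and 3 are the parameter and objective convergence
+% tolerances, and index 14 is the maximum number of iterations.)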
+
+% ----------------------------------------------------------------------
+fprintf('Training a full GP model on all training data...\n');
+
+% Full gp model: Use the standard Netlab routines to train the thingy.
+% The rational quadratic kernel is much better than the sqexp kernel
+fullgp = gp(5, 'ratquad');
+fullgp = gpinit(fullgp, Xtrain, Ytrain);
+fullgp = netopt(fullgp, scgopt, Xtrain, Ytrain, 'scg');
+[fullpred,fullvar] = gpfwd(fullgp, Xtest);
+
+% ----------------------------------------------------------------------
+fprintf('Training a full GP model on 300 (out of the 500) training data...\n');
+
+% We also compare with a full GP trained on only 300 (out of 500) points
+full1gp = gp(5, 'ratquad');
+full1gp = gpinit(full1gp, Xtrain(1:300,:), Ytrain(1:300));
+full1gp = netopt(full1gp, scgopt, Xtrain(1:300,:), Ytrain(1:300), 'scg');
+[full1pred,full1var] = gpfwd(full1gp, Xtest);
+
+% ----------------------------------------------------------------------
+fprintf('Training the modules of the Bayesian Committee Machine...\n');
+
+% Now build the BCM model: start with defining a 'template' GP that is
+% the basis for each BCM module
+gp0 = gp(5, 'ratquad');
+% Build a BCM model from the template
+bcm0 = bcm(gp0);
+% Give the BCM its data. The training data will be split up such that
+% each module gets 100 points
+fprintf('BCM: Each module has 100 data points\n');
+bcm0 = bcminit(bcm0, Xtrain, Ytrain, 100);
+% Fit the BCM modules. Do this by optimizing evidence for each module
+% with shared hyperparameters
+bcm1 = bcmtrain(bcm0, 'shared', 'scg', scgopt);
+bcm1 = bcmprepare(bcm1);
+[bcm1pred, bcm1var] = bcmfwd(bcm1, Xtest);
+
+% ----------------------------------------------------------------------
+fprintf('Starting to cluster the training data...\n');
+
+% Clustered BCM:
+kmeansopt = [1 1e-5 1e-4 0 0 0 0 0 0 0 0 0 0 30];
+r = randperm(size(Xtrain,1));
+[centres,opt,post] = kmeans(Xtrain(r(1:5),:),Xtrain,kmeansopt);
+[m,assignment] = max(post,[],2);
+for i = 1:5,
+  fprintf('Clustered BCM: Module %i has %i data points\n', i, nnz(assignment==i));
+end
+% ----------------------------------------------------------------------
+fprintf('Training the modules of the clustered Bayesian Committee Machine...\n');
+bcm5 = bcminit(bcm0, Xtrain, Ytrain, assignment);
+bcm5a = bcmtrain(bcm5, 'shared', 'scg', scgopt);
+bcm5a = bcmprepare(bcm5a);
+[bcm5apred, bcm5avar] = bcmfwd(bcm5a, Xtest);
+
+
+% ----------------------------------------------------------------------
+fprintf('\nEvaluating all models in terms of\n');
+fprintf('RMSE (root mean squared error)\n');
+fprintf('logProb (negative log probability of test data under the predictive distribution)\n\n');
+
+loss_negLogProb = inline('0.5*(log(2*pi*var) + ((pred-label).^2)./var)','label','pred','var');
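+% (This is the negative log density of label under a Gaussian with mean
+% pred and variance var; lower values indicate better calibrated error
+% bars.)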
+
+fprintf('Full GP model:\n');
+pred = fullpred; var = fullvar;
+fprintf('RMSE = %f, logProb = %f\n\n', sqrt(mean((pred-Ytest0).^2)), ...
+        mean(loss_negLogProb(Ytest0, pred, var)));
+fprintf('Full GP model on 300 data points:\n');
+pred = full1pred; var = full1var;
+fprintf('RMSE = %f, logProb = %f\n\n', sqrt(mean((pred-Ytest0).^2)), ...
+        mean(loss_negLogProb(Ytest0, pred, var)));
+fprintf('BCM model with shared hyperparams:\n');
+pred = bcm1pred; var = bcm1var;
+fprintf('RMSE = %f, logProb = %f\n\n', sqrt(mean((pred-Ytest0).^2)), ...
+        mean(loss_negLogProb(Ytest0, pred, var)));
+fprintf('Clustered BCM model with shared hyperparams:\n');
+pred = bcm5apred; var = bcm5avar;
+fprintf('RMSE = %f, logProb = %f\n\n', sqrt(mean((pred-Ytest0).^2)), ...
+        mean(loss_negLogProb(Ytest0, pred, var)));
+
+% $$$ figure(10);
+% $$$ clf;
+% $$$ val_errorbars(Ytest0', fullpred', sqrt(fullvar)');
+% $$$ set(gcf, 'Name', 'Full GP with ratquad');
+% $$$ figure(11);
+% $$$ clf;
+% $$$ val_errorbars(Ytest0', full1pred', sqrt(full1var)');
+% $$$ set(gcf, 'Name', 'Full GP on subset of 300 points');
+% $$$ figure(12);
+% $$$ clf;
+% $$$ val_errorbars(Ytest0', bcm1pred', sqrt(bcm1var)');
+% $$$ set(gcf, 'Name', 'BCM shared');
+% $$$ figure(15);
+% $$$ clf;
+% $$$ val_errorbars(Ytest0', bcm5apred', sqrt(bcm5avar)');
+% $$$ set(gcf, 'Name', 'Clustered BCM shared');
+
+return
+
+function [X, Y, Xbasis, Ybasis, Ynoisefree] = art_data(npoints, ndim, classification, noise, nbasis, Xbasis, Ybasis)
+% ART_DATA - Generate Volker's artificial data set
+% Set all random seeds to 0 before calling to reproduce the exact data set.
+%
+
+if nargin<6,
+  Xbasis = [];
+  Ybasis = [];
+end
+if nargin<5,
+  nbasis = 5;
+end
+if nargin<4,
+  noise = 0;
+end
+if nargin<3,
+  classification = 0;
+end
+if nargin<2,
+  ndim = 5;
+end
+
+[Nb, dimb] = size(Xbasis);
+if (Nb>0) & (~all(size(Ybasis) == [Nb, 1])),
+  error('Size of basis function matrices XBASIS and YBASIS does not match');
+end
+
+
+if Nb==0,
+  % No basis functions: generate new ones randomly
+  % X-Prototypes in range [-1...+1]
+  Xbasis = 2*rand(nbasis, ndim)-1;
+  % Target values of the prototypes
+  Ybasis = randn(nbasis, 1);
+  if classification & (ndim==5) & (nbasis==5),
+    % Volker's modification to generate a more balanced classification
+    % data-set, works only with random seed 0
+    Ybasis(2) = -Ybasis(2);
+    Ybasis(3) = -Ybasis(3);
+  end
+end
+
+% Kernel width: a quarter of the average distance between basis points
+avgDist = mean(mean(sqrt(dist2(Xbasis, Xbasis))));
+sig = avgDist/4;
+
+X = 2*rand(npoints, ndim)-1;
+ad = sqrt(dist2(X, Xbasis));
+Yex = exp(-1/(2*(sig*sig))*(ad.*ad));
+Ynoisefree = (Yex * Ybasis) ./ (Yex * ones(nbasis, 1));
+Y = Ynoisefree + noise * randn(npoints,1);
+if classification,
+  Y = sign(Y);
+end