diff --git a/README b/README
new file mode 100644
index 0000000..ce1cdaf
--- /dev/null
+++ b/README
@@ -0,0 +1,49 @@
+
+Bayesian Committee Machine
+Version 1.0, November 2005
+
+The Bayesian Committee Machine (BCM) is an approximation method for
+large-scale Gaussian process regression.
+
+What you should know beforehand:
+
+- The code is for Matlab.
+
+- It requires the Netlab toolbox. You can download Netlab from
+  http://www.ncrg.aston.ac.uk/netlab/
+
+- Install Netlab *before* trying to run any of the programs here.
+
+- To get started and to check your installation, try running 'dembcm.m'.
+
+- If you are looking for example code to run the BCM, have a look at
+  dembcm.m. All of the main features are used and explained there.
+
+
+Relevant publications:
+
+V. Tresp. A Bayesian committee machine. Neural Computation, 12, 2000
+
+A. Schwaighofer and V. Tresp. Transductive and inductive methods for
+approximate Gaussian process regression. In S. Becker, S. Thrun, and
+K. Obermayer, editors, Advances in Neural Information Processing Systems
+15. MIT Press, 2003
+
+============================================================
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the
+Free Software Foundation, Inc.,
+59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+
diff --git a/bcm.m b/bcm.m
new file mode 100644
index 0000000..90a11a1
--- /dev/null
+++ b/bcm.m
@@ -0,0 +1,62 @@
+function net = bcm(gpnet)
+% bcm - Bayesian Committee Machine
+%
+% Synopsis:
+%   net = bcm(gpnet)
+%
+% Arguments:
+%   gpnet: A Gaussian process template for BCM modules, as output by Netlab's
+%       function gp.m. Each module of the BCM will inherit its initial
+%       parameters from gpnet.
+%
+% Returns:
+%   net: Structure describing the BCM
+%
+% Description:
+%   The Bayesian Committee Machine (BCM) is an approximation method for
+%   large-scale Gaussian process regression. The training data is split
+%   into a number of blocks, for which individual Gaussian process
+%   predictors ("modules") are trained. The prediction of a BCM is a
+%   weighted combination of the predictions of the individual modules on
+%   the test data. Also, test data is processed in blocks, which leads
+%   to improved performance.
+%   The code here is a wrapper routine for Gaussian process routines
+%   provided by the Netlab toolbox. Netlab is thus required for this code
+%   to run.
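+%   As a sketch of the combination rule (following Tresp, 2000; m_i, C_i
+%   and Kss are notation for this comment only, not variables in the
+%   code): with module predictions m_i, module predictive covariances
+%   C_i, M modules, and test prior covariance Kss, a test block gets
+%     inv(C) = sum_i inv(C_i) - (M-1)*inv(Kss)
+%     mean   = C * sum_i inv(C_i)*m_i
+%   which is what bcmfwd.m accumulates module by module.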
+%
+% Examples:
+% Building a BCM for 7-dimensional input, where each module is a GP
+% with squared-exponential kernel:
+%   gpnet = gp(7, 'sqexp');
+%   net = bcm(gpnet);
+% Equip the BCM with its training data, split up into modules of size
+% 500:
+%   net = bcminit(net, Xtrain, Ytrain, 500);
+% Fit each module's hyperparameters, and pre-compute a few matrices:
+%   net = bcmtrain(net, 'individual');
+%   net = bcmprepare(net);
+% For increased performance: cluster the training data beforehand (10
+% clusters in the example below), then assign clusters to modules:
+%   options = [1 1e-5 1e-4 0 0 0 0 0 0 0 0 0 0 30];
+%   r = randperm(size(Xtrain,1));
+%   [centres,opt,post] = kmeans(Xtrain(r(1:10),:),Xtrain,options);
+%   [m,assignment] = max(post,[],2);
+%   net = bcminit(net, Xtrain, Ytrain, assignment);
+%   net = bcmprepare(net);
+% Now we can do prediction:
+%   [pred, errorBar] = bcmfwd(net, Xtest, 400);
+%
+% See also: bcminit,bcmprepare,bcmtrain,bcmfwd,bcmerr,bcmgrad,bcmpak,bcmunpak
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcm.m,v 1.1 2004/11/18 21:18:24 anton Exp $

+error(nargchk(1, 1, nargin));
+
+net = struct('type', 'bcm', 'gpnet', gpnet);
+net.nin = gpnet.nin;
+net.nout = 1;
+net.module = [];
+net.invPrior = {};
+net.weight = {};
diff --git a/bcmerr.m b/bcmerr.m
new file mode 100644
index 0000000..bba1fdb
--- /dev/null
+++ b/bcmerr.m
@@ -0,0 +1,114 @@
+function [e,edata,eprior] = bcmerr(net, x, t, Xtest)
+% bcmerr - Error function for Bayesian Committee Machine
+%
+% Synopsis:
+%   [e,edata,eprior] = bcmerr(net, x, t)
+%   e = bcmerr(net, [], [], Xtest)  (use with care, see below)
+%
+% Arguments:
+%   net: BCM structure
+%   x, t: Ignored; accepted only for compatibility with Netlab's error
+%       function interface
+%   Xtest: [Q net.nin] matrix of test data
+%
+% Returns:
+%   e: Value of the error function (negative log marginal likelihood)
+%   edata: Data contribution to e
+%   eprior: Prior contribution to e
+%
+% Description:
+%   This function returns the sum of the error functions of each module,
+%   that is, the unnormalized negative log-likelihood. The error function
+%   is computed on the basis of the pre-initialized data in each GP
+%   module, thus no data is required as input. Still, to be compatible
+%   with the standard Netlab error functions, bcmerr.m accepts input
+%   arguments in the form bcmerr(net, x, t).
+%
+%   In the second calling syntax, bcmerr(net, [], [], Xtest), the exact
+%   BCM evidence is returned. This is given by
+%     -1/2 log det C - 1/2 t' C^{-1} t + const
+%   with C given by
+%     C = BD[K] + sigma^2 I + (K_c K_t^{-1} K_c' - BD[K_c K_t^{-1} K_c'])
+%   where BD[...] denotes a block-diagonal approximation of the argument,
+%   K_c is the kernel matrix of all training points versus test points,
+%   K_t is the test point kernel matrix, and t is a vector of all
+%   training targets.
+%   C is a matrix of size [N N] for N training data, thus the exact BCM
+%   evidence can only be computed for cases where an exact GP solution
+%   could also be found. Use this feature only with moderately sized
+%   problems.
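+%
+% Examples:
+% A sketch of the two calling syntaxes (assuming net has been set up
+% with bcminit): the summed per-module evidence is obtained via
+%   e = bcmerr(net, [], []);
+% and the exact BCM evidence, given test inputs Xtest, via
+%   e = bcmerr(net, [], [], Xtest);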
+%
+%
+% See also: bcm,bcmtrain,bcminit
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmerr.m,v 1.2 2004/11/23 21:43:51 anton Exp $

+% Input arguments x and t are effectively ignored; in the exact-evidence
+% case, the training targets are reconstructed from the modules.

+error(nargchk(3, 4, nargin));
+if nargin<4,
+  Xtest = [];
+end
+
+if isempty(Xtest),
+  % No test data given: Default case of summing up individual modules' evidence
+  e = 0;
+  edata = 0;
+  eprior = 0;
+  for i = 1:length(net.module),
+    netI = net.module(i);
+    [a, b, c] = gperr(netI, netI.tr_in, netI.tr_targets);
+    e = e+a;
+    edata = edata+b;
+    eprior = eprior+c;
+  end
+else
+  % Compute the BCM error with the actual BCM covariance matrix. This
+  % matrix has size [N N] for N training points, thus we can typically
+  % not hold it in memory. Computing it requires knowledge of the test
+  % data.
+
+  % For the block diagonal approximation, we need to know the number of
+  % data in each module:
+  modSize = zeros(1, length(net.module));
+  for i = 1:length(net.module),
+    modSize(i) = length(net.module(i).tr_targets);
+  end
+  % Reconstruct the full training data and targets from the modules
+  N = sum(modSize);
+  Xtrain = zeros(N, net.nin);
+  t = zeros(N, 1);
+  ind = 1;
+  for i = 1:length(net.module),
+    netI = net.module(i);
+    Xtrain(ind:(ind+length(netI.tr_targets)-1),:) = netI.tr_in;
+    t(ind:(ind+length(netI.tr_targets)-1)) = netI.tr_targets;
+    ind = ind+length(netI.tr_targets);
+  end
+  % The major part of the overall kernel matrix is a form of Schur complement:
+  Kt = gpcovarp(net.gpnet, Xtest, Xtest);
+  Kc = gpcovarp(net.gpnet, Xtrain, Xtest);
+  smallEye = eps^(2/3)*speye(size(Kt));
+  C = Kc*inv(Kt+smallEye)*Kc';
+  % Overwrite diagonal blocks with the exact covariance matrix, meaning that
+  % the kernel matrix is exact for points within the same module
+  startInd = 1;
+  for i = 1:length(net.module),
+    ind = startInd:(startInd+modSize(i)-1);
+    netI = net.module(i);
+    % Use gpcovar here, so that the contribution of the noise variance is
+    % already taken into account
+    C(ind,ind) = gpcovar(net.gpnet, Xtrain(ind,:));
+    startInd = startInd+modSize(i);
+  end
+  % With this matrix C, we can compute evidence as usual:
+  C(isnan(C)) = realmax;
+  C(isinf(C)&(C<0)) = -realmax;
+  C(isinf(C)&(C>0)) = realmax;
+  eigC = eig(C, 'nobalance');
+  % Guard against possible tiny negative eigenvalues (e.g. in the Matern
+  % kernel with large values of nu)
+  if any(eigC<=0),
+    warning('Skipping some negative eigenvalues. Results may be inaccurate');
+  end
+  edata = 0.5*(sum(log(eigC(eigC>0)))+t'*inv(C)*t);
+  eprior = 0;
+  e = edata+eprior;
+end
diff --git a/bcmfwd.m b/bcmfwd.m
new file mode 100644
index 0000000..316db25
--- /dev/null
+++ b/bcmfwd.m
@@ -0,0 +1,134 @@
+function [Ypred, Yvar] = bcmfwd(net,Xtest,querySize,verbosity)
+% bcmfwd - Forward propagation in Bayesian Committee Machine
+%
+% Synopsis:
+%   Ypred = bcmfwd(net,Xtest)
+%   [Ypred,Yvar] = bcmfwd(net,Xtest,querySize,verbosity)
+%
+% Arguments:
+%   net: BCM structure
+%   Xtest: [Q d] matrix of test data, Q points in d dimensions
+%   querySize: Size of query set (prediction is based on blocks of test
+%       points of size querySize). Default value, if omitted: 500.
+%   verbosity: (optional) Use a value >0 to display progress information
+%
+% Returns:
+%   Ypred: [Q 1] vector of predictions (predictive mean) for each test point
+%   Yvar: [Q 1] vector of predictive variances (error bars) for each test
+%       point
+%
+% Description:
+%   Forward propagation in the Bayesian Committee Machine. The test data
+%   is split up into blocks of size querySize. For each block, all GP
+%   modules in the BCM make their predictions; these are then weighted
+%   by the inverse predictive covariance, summed, and normalized to give
+%   the BCM output.
+%   Typically, the performance of the BCM increases as querySize
+%   increases.
+%   Instead of passing querySize as a parameter, it can also be set in a
+%   field 'querySize' of the BCM structure.
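+%   As a rough cost sketch (assuming M modules with B training points
+%   each and query blocks of size Q): each block costs on the order of
+%   M*(Q*B^2 + Q^2*B + Q^3) operations, so querySize trades cubic cost
+%   per block against the quality of the approximation.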
+%
+% Examples:
+% Build a BCM with modules that contain 500 training points each:
+%   gp1 = gp(5, 'sqexp');
+%   bcm1 = bcm(gp1);
+%   bcm1.querySize = 500;
+%   bcm1 = bcminit(bcm1, Xtrain, Ytrain, 500);
+% Train the BCM, by maximizing the training data marginal likelihood
+% for each module individually:
+%   bcm1 = bcmtrain(bcm1,'individual','scg');
+% Compare the predictions of the BCM with different query set sizes:
+%   pred1 = bcmfwd(bcm1, Xtest, 10);
+%   pred2 = bcmfwd(bcm1, Xtest, 800);
+%
+%
+% See also: bcm,bcminit,bcmtrain,bcmprepare
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmfwd.m,v 1.2 2004/11/23 23:23:58 anton Exp $

+error(nargchk(2, 4, nargin));
+error(consist(net, 'bcm', Xtest));
+if nargin<3,
+  querySize = [];
+end
+if isempty(querySize),
+  if isfield(net, 'querySize'),
+    querySize = net.querySize;
+  else
+    querySize = 500;
+  end
+end
+if nargin<4,
+  verbosity = 0;
+end
+
+if isempty(net.invPrior) | isempty(net.weight),
+  net = bcmprepare(net);
+end
+P = size(Xtest, 1);
+% Number of query sets of maximum size querySize
+nQueries = ceil(P/querySize);
+nModules = length(net.module);
+Ypred = zeros([P 1]);
+Yvar = zeros([P 1]);
+
+if verbosity>0,
+  fprintf('\nStarting forward propagation (%i query sets).\n', nQueries);
+end
+if verbosity==1,
+  fprintf('Query set ');
+end
+t1 = cputime;
+for j = 1:nQueries,
+  if verbosity==1,
+    fprintf('%i ', j);
+  end
+  if verbosity==2,
+    fprintf('Query set %i: ', j);
+  end
+  ind1 = (1+(j-1)*querySize):min(P, j*querySize);
+  Xtest1 = Xtest(ind1, :);
+  % A small regularization matrix for inversions
+  smallEye = eps^(2/3)*speye(length(ind1));
+  % Prediction for the current query set
+  Ypred1 = zeros([length(ind1) 1]);
+  % Overall covariance matrix for current query set
+  Ycov1 = 0;
+  % The original BCM where all modules share the same hyperparameters:
+% $$$   K11 = gpcovarp(net.module(1), Xtest1, Xtest1);
+% $$$   Ycov1 = -(nModules-1)*inv(K11+smallEye);
+  startInd = 1;
+  for i = 1:length(net.module),
+    netI = net.module(i);
+    K11 = gpcovarp(netI, Xtest1, Xtest1);
+    K12 = gpcovarp(netI, Xtest1, netI.tr_in);
+    % Prediction of current module
+    Ypred2 = K12*net.weight{i};
+    % Covariance of current module
+    Ycov2 = K11-K12*net.invPrior{i}*K12';
+    invYcov2 = inv(Ycov2+smallEye);
+    % Add weighted prediction of the current module
+    Ypred1 = Ypred1+invYcov2*Ypred2;
+    % Update overall covariance matrix
+    Ycov1 = Ycov1+invYcov2;
+    % Instead of the (M-1)*inv(K11) term in the commented-out code
+    % above: subtract one prior covariance contribution for each module
+    % except one.
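+    % (Multiplying the M module posteriors over-counts the prior M
+    % times; the BCM divides by M-1 copies of the prior, which in
+    % inverse-covariance form means subtracting (M-1) times inv(K11).)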
+    % The last module is usually the smallest, so drop that one.
+    if i~=length(net.module),
+      Ycov1 = Ycov1 - inv(K11+smallEye);
+    end
+    if verbosity==2,
+      fprintf('.');
+    end
+  end
+  % Ycov1 is the *inverse* covariance of the overall prediction
+  Ycov1 = inv(Ycov1+smallEye);
+  % Rescale the sum of the modules' predictions and write into result
+  Ypred(ind1) = Ycov1*Ypred1;
+  Yvar(ind1) = diag(Ycov1);
+  if verbosity>0,
+    fprintf('\n');
+  end
+end
diff --git a/bcmgrad.m b/bcmgrad.m
new file mode 100644
index 0000000..d339589
--- /dev/null
+++ b/bcmgrad.m
@@ -0,0 +1,31 @@
+function g = bcmgrad(net, x, t)
+% bcmgrad - Error gradient for Bayesian Committee Machine
+%
+% Synopsis:
+%   g = bcmgrad(net)
+%
+% Arguments:
+%   net: BCM structure
+%
+% Returns:
+%   g: Gradient of the error function (negative log marginal likelihood)
+%       with respect to the kernel parameters
+%
+% Description:
+%   Error function and gradient are computed on the basis of the
+%   pre-initialized data in each GP module, thus no data is required as
+%   input (arguments x and t are ignored).
+%
+%
+% See also: bcm,bcmtrain,bcminit,bcmerr
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmgrad.m,v 1.1 2004/11/18 21:19:46 anton Exp $

+g = 0;
+for i = 1:length(net.module),
+  netI = net.module(i);
+  gI = gpgrad(netI, netI.tr_in, netI.tr_targets);
+  g = g+gI;
+end
diff --git a/bcminit.m b/bcminit.m
new file mode 100644
index 0000000..0228d6f
--- /dev/null
+++ b/bcminit.m
@@ -0,0 +1,96 @@
+function net = bcminit(net, Xtrain, Ytrain, assignment)
+% bcminit - Initialization for Bayesian Committee Machine (BCM)
+%
+% Synopsis:
+%   net = bcminit(net,Xtrain,Ytrain,assignment)
+%
+% Arguments:
+%   net: BCM structure, as output by bcm.m
+%   Xtrain: [N d] matrix of training data, N points in d dimensions
+%   Ytrain: [N 1] vector of training targets
+%   assignment: Scalar or [N 1] vector. Number of training data that are
+%       assigned to each module. If assignment is a scalar K, each module
+%       is assigned K points, possibly with the last module given fewer
+%       points. If assignment is a vector of length N, module I will be
+%       assigned all points J for which assignment(J)==I.
+%
+% Returns:
+%   net: BCM structure. net has the following newly added fields:
+%       .module: Structure array, containing the Netlab-like GP
+%           description for module I in net.module(I)
+%       .invPrior: Cell array, with the inverse covariance of module I's
+%           training data in net.invPrior{I} (initialized empty, filled
+%           by bcmprepare)
+%       .weight: Cell array, with the GP weight vector of module I in
+%           net.weight{I} (initialized empty, filled by bcmprepare)
+%
+% Description:
+%   The purpose of this routine is to split the training data among the
+%   individual GP modules. Each module is a replica of the template GP
+%   given in net.gpnet.
+%
+%
+% Examples:
+% Give each module an equal share of 500 training points:
+%   net = bcminit(net, Xtrain, Ytrain, 500);
+% For improved performance, use kmeans clustering to get modules that
+% are spatially separated.
+% Initialize kmeans with 5 random centres:
+%   options = [1 1e-5 1e-4 0 0 0 0 0 0 0 0 0 0 30];
+%   r = randperm(size(Xtrain,1));
+%   centres = Xtrain(r(1:5),:);
+%   [centres,opt,post] = kmeans(centres,Xtrain,options);
+% Extract the point/centres assignment and use it directly in bcminit:
+%   [m,assignment] = max(post,[],2);
+%   net = bcminit(net, Xtrain, Ytrain, assignment);
+%
+%
+% See also: bcm,bcmprepare,bcmtrain,bcmfwd
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcminit.m,v 1.1 2004/11/18 21:19:53 anton Exp $

+error(nargchk(4, 4, nargin));
+error(consist(net, 'bcm', Xtrain, Ytrain));
+
+[N, dim] = size(Xtrain);
+% Assignment given as a scalar:
+if prod(size(assignment))==1,
+  modulesize = assignment;
+  nModules = ceil(N/modulesize);
+  r = rem(N, modulesize);
+  % Each module gets an equal share of the training data
+  if r==0,
+    modulesize = repmat(modulesize, [1 nModules]);
+  else
+    modulesize = [repmat(modulesize, [1 nModules-1]) r];
+  end
+  % Generate the assignment vector: assignment(j)==i if point j goes to
+  % module i
+  assignment = zeros([N 1]);
+  start = 1;
+  for i = 1:length(modulesize),
+    ind = start:(start+modulesize(i)-1);
+    assignment(ind) = i;
+    start = ind(end)+1;
+  end
+else
+  if length(assignment)~=N,
+    error('Length of vector assignment must match the number of training data');
+  end
+  % Uniquify the whole thing, this gets rid of any nonsense data,
+  % wrong module numbers, and such
+  [B, dummy, assignment] = unique(assignment);
+  nModules = length(B);
+end
+
+% Initialize the GPs for each module with its data. The first module
+% overwrites net.module wholesale, since gpinit adds fields (tr_in,
+% tr_targets) that the template gpnet does not have.
+for i = 1:nModules,
+  netI = net.gpnet;
+  ind = (assignment==i);
+  netI = gpinit(netI, Xtrain(ind,:), Ytrain(ind,:));
+  if i==1,
+    net.module = netI;
+  else
+    net.module(i) = netI;
+  end
+end
+% Initialize empty data for inverse prior matrices and weight vectors
+net.invPrior = [];
+net.weight = [];
diff --git a/bcmpak.m b/bcmpak.m
new file mode 100644
index 0000000..2edc547
--- /dev/null
+++ b/bcmpak.m
@@ -0,0 +1,28 @@
+function w = bcmpak(net)
+% bcmpak - Combine kernel parameters of BCM into vector
+%
+% Synopsis:
+%   w = bcmpak(net)
+%
+% Arguments:
+%   net: BCM structure
+%
+% Returns:
+%   w: Vector of GP parameters taken from field net.gpnet
+%
+% Description:
+%   This routine is only meant to be used as a subroutine of
+%   bcmtrain.m. The routine returns the GP parameters taken from
+%   net.gpnet, which are assumed to be equal to all of the BCM's module
+%   parameters.
+%
+%
+% See also: bcm,bcmtrain,bcmunpak
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmpak.m,v 1.1 2004/11/18 21:20:47 anton Exp $

+error(nargchk(1, 1, nargin));
+error(consist(net, 'bcm'));
+w = gppak(net.gpnet);
diff --git a/bcmprepare.m b/bcmprepare.m
new file mode 100644
index 0000000..4b754bb
--- /dev/null
+++ b/bcmprepare.m
@@ -0,0 +1,54 @@
+function net = bcmprepare(net, verbosity)
+% bcmprepare - Pre-compute prior matrices for Bayesian Committee Machine (BCM)
+%
+% Synopsis:
+%   net = bcmprepare(net)
+%   net = bcmprepare(net,verbosity)
+%
+% Arguments:
+%   net: Initialized BCM structure, as output by bcminit.m (training data must
+%       already be assigned to each module)
+%   verbosity: (optional) Use a value >0 to display progress information
+%
+% Returns:
+%   net: Modified BCM structure, where now the fields net.invPrior and
+%       net.weight are computed
+%
+% Description:
+%   Pre-compute the matrices that are repeatedly used in BCM forward
+%   propagation, namely the inverse covariance matrix of each module's
+%   training data and the weight vector for GP predictions.
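+%   As a sketch of what is stored (matching the code below): for module
+%   i with training inputs X_i and targets y_i,
+%     net.invPrior{i} = inv(gpcovar(net.module(i), X_i))
+%     net.weight{i}   = net.invPrior{i} * y_i
+%   so that bcmfwd can form each module's mean prediction as
+%   K12*net.weight{i}.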
+%
+%
+% See also: bcm,bcminit
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmprepare.m,v 1.1 2004/11/18 21:20:55 anton Exp $

+error(nargchk(1, 2, nargin));
+error(consist(net, 'bcm'));
+if nargin<2,
+  verbosity = 0;
+end
+
+if verbosity>0,
+  fprintf('Pre-computing prior matrices for %i modules ', length(net.module));
+end
+for i = 1:length(net.module),
+  netI = net.module(i);
+  % gpcovar computes the kernel matrix of the given points, and also adds
+  % the measurement noise.
+  Kprior = gpcovar(netI, netI.tr_in);
+  net.invPrior{i} = inv(Kprior);
+  % Measurement noise is restricted to a minimum value of 1e-8 in the
+  % Netlab routines. Thus, the matrices should be so well conditioned
+  % that we can solve the linear system by inversion, instead of mldivide
+  net.weight{i} = net.invPrior{i} * netI.tr_targets;
+  if verbosity==2,
+    fprintf('.');
+  end
+end
+if verbosity==2,
+  fprintf('\n');
+end
diff --git a/bcmtrain.m b/bcmtrain.m
new file mode 100644
index 0000000..8c85456
--- /dev/null
+++ b/bcmtrain.m
@@ -0,0 +1,77 @@
+function net = bcmtrain(net,method,alg,options)
+% bcmtrain - Kernel parameter optimization for Bayesian Committee Machine
+%
+% Synopsis:
+%   net = bcmtrain(net,method,alg,options)
+%
+% Arguments:
+%   net: Initialized BCM structure, as output by bcminit.m
+%   method: String, one of 'shared', 'individual'. If method=='shared', all
+%       modules share the same hyperparameters, chosen such that the sum of
+%       all module marginal likelihoods is maximized. If
+%       method=='individual', kernel parameters are optimized for each
+%       module, such that each module's marginal likelihood is maximal.
+%   alg: Optimization routine to use for kernel parameters, e.g. 'scg'
+%   options: Options vector for the optimization routine
+%
+% Returns:
+%   net: Modified BCM structure
+%
+% Description:
+%   For reasons of efficiency, BCM hyperparameter selection is only
+%   implemented as a heuristic that considers the marginal likelihood of
+%   individual BCM modules. Two strategies are available:
+%   'shared': All BCM modules share the same hyperparameters (e.g.,
+%       kernel parameters or noise variance). Training is done by
+%       maximizing the sum of marginal likelihoods of all modules.
+%   'individual': Each BCM module has its own distinct set of
+%       hyperparameters. Training is done by maximizing the marginal
+%       likelihood of each module separately.
+%   In most cases, it seems that shared hyperparameters
+%   (method=='shared') lead to better performance than individual
+%   hyperparameters.
+%
+%
+% See also: bcm,bcminit,bcmprepare,bcmfwd
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmtrain.m,v 1.2 2004/11/23 22:49:37 anton Exp $

+error(nargchk(2, 4, nargin));
+if nargin<4,
+  options = zeros([1 18]);
+  options(1) = 0;
+  options(2) = 1e-4;
+  options(3) = 1e-6;
+  options(9) = 0;
+  options(14) = 50;
+end
+if nargin<3,
+  alg = 'scg';
+end
+
+% Invalidate any previously computed prior matrices
+net.invPrior = {};
+net.weight = {};
+
+
+net.method = method;
+switch method
+  case 'shared'
+    % Use default netopt. bcmerr and bcmgrad have been adapted such that
+    % they are appropriate for this type of kernel parameter
+    % optimization: bcmerr computes the sum of the individual module
+    % likelihoods, bcmgrad computes the sum of the gradients
+    net = netopt(net, options, [], [], alg);
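+    % (netopt operates on the single parameter vector returned by
+    % bcmpak; bcmunpak copies that vector back into net.gpnet and into
+    % every module, which is what keeps the hyperparameters shared.)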
+  case 'individual'
+    % This is easy: just loop over all modules and train them via
+    % standard evidence maximization
+    for i = 1:length(net.module),
+      netI = net.module(i);
+      netI = netopt(netI, options, netI.tr_in, netI.tr_targets, alg);
+      net.module(i) = netI;
+    end
+  otherwise
+    error('Invalid value for parameter ''method''');
+end
diff --git a/bcmunpak.m b/bcmunpak.m
new file mode 100644
index 0000000..6edea35
--- /dev/null
+++ b/bcmunpak.m
@@ -0,0 +1,33 @@
+function net = bcmunpak(net,w)
+% bcmunpak - Copy kernel parameters for BCM from vector
+%
+% Synopsis:
+%   net = bcmunpak(net,w)
+%
+% Arguments:
+%   net: BCM structure
+%   w: Vector of GP parameters
+%
+% Returns:
+%   net: Modified BCM structure, where the GP parameters for each module are
+%       set to the values given in vector w
+%
+% Description:
+%   This routine is only meant to be used as a subroutine of
+%   bcmtrain.m. The routine copies the given GP parameters into each GP
+%   module, as well as into the GP given in net.gpnet.
+%
+%
+% See also: bcmpak
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: bcmunpak.m,v 1.1 2004/11/18 21:21:11 anton Exp $

+error(nargchk(2, 2, nargin));
+error(consist(net, 'bcm'));
+
+net.gpnet = gpunpak(net.gpnet, w);
+for i = 1:length(net.module),
+  net.module(i) = gpunpak(net.module(i), w);
+end
diff --git a/dembcm.m b/dembcm.m
new file mode 100644
index 0000000..21063a9
--- /dev/null
+++ b/dembcm.m
@@ -0,0 +1,207 @@
+function dembcm()
+% dembcm - Demo program for BCM approximation for large scale GP regression
+%
+% Synopsis:
+%   dembcm;
+%
+% Description:
+%   This routine demonstrates how the provided routines for the Bayesian
+%   Committee Machine can be used for analyzing data.
+%   Basic steps are
+%   - Generate a data set (linear combination of some random basis
+%     functions) of 500 data points
+%   - Split the data into modules of 100 points each
+%   - Train Gaussian process models for each module
+%   - Use the BCM approximation to obtain a prediction
+%
+%   Also, the demo compares the prediction accuracy with
+%   - A Gaussian process model that is trained on all 500 points
+%   - A Gaussian process model trained on only 300 points
+%
+%   The Bayesian Committee Machine is used in two variants:
+%   - In the standard form, training data are assigned to modules at random
+%   - Alternatively, use k-means clustering, and assign the points from
+%     each cluster to a module.
+%
+% See also: bcm,gp,bcminit,bcmtrain,bcmprepare
+%

+% Author(s): Anton Schwaighofer, Nov 2004
+% $Id: dembcm.m,v 1.1 2005/11/16 17:12:41 anton Exp $

+% The demo also works for other random states, no worries ;-)
+randstate = 1;
+
+rand('state', randstate);
+randn('state', randstate);
+
+% ----------------------------------------------------------------------
+fprintf('Generating training and test data...\n');
+% 500 training data points, save basis points to later generate test
+% data. Generate low noise data
+noiselevel = 0.1;
+[Xtrain, Ytrain, Xbasis, Ybasis, Ytrain0] = art_data(500, 5, 0, noiselevel);
+% Generate 2000 test data from the same function. Use the "true" function
+% values Ytest0 (not the ones corrupted by noise) for testing
+[Xtest, Ytest, dummy1, dummy2, Ytest0] = art_data(2000, 5, 0, noiselevel, ...
+                                                  5, Xbasis, Ybasis);
+
+% Options for scg:
+scgopt = foptions;
+scgopt(1) = 1;
+scgopt(2) = 1e-4;
+scgopt(3) = 1e-4;
+scgopt(14) = 15;
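+% (In Netlab's foptions vector, index 1 toggles display of error values,
+% indices 2 and 3 are the parameter and objective convergence
+% tolerances, and index 14 is the maximum number of iterations.)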
+
+% ----------------------------------------------------------------------
+fprintf('Training a full GP model on all training data...\n');
+
+% Full gp model: Use the standard Netlab routines to train the thingy.
+% The rational quadratic kernel is much better than the sqexp kernel
+fullgp = gp(5, 'ratquad');
+fullgp = gpinit(fullgp, Xtrain, Ytrain);
+fullgp = netopt(fullgp, scgopt, Xtrain, Ytrain, 'scg');
+[fullpred,fullvar] = gpfwd(fullgp, Xtest);
+
+% ----------------------------------------------------------------------
+fprintf('Training a full GP model on 300 (out of the 500) training data...\n');
+
+% We also compare with a full GP trained on only 300 (out of 500) points
+full1gp = gp(5, 'ratquad');
+full1gp = gpinit(full1gp, Xtrain(1:300,:), Ytrain(1:300));
+full1gp = netopt(full1gp, scgopt, Xtrain(1:300,:), Ytrain(1:300), 'scg');
+[full1pred,full1var] = gpfwd(full1gp, Xtest);
+
+% ----------------------------------------------------------------------
+fprintf('Training the modules of the Bayesian Committee Machine...\n');
+
+% Now build the BCM model: start with defining a 'template' GP that is
+% the basis for each BCM module
+gp0 = gp(5, 'ratquad');
+% Build a BCM model from the template
+bcm0 = bcm(gp0);
+% Give the BCM its data. The training data will be split up such that
+% each module gets 100 points
+fprintf('BCM: Each module has 100 data points\n');
+bcm0 = bcminit(bcm0, Xtrain, Ytrain, 100);
+% Fit the BCM modules. Do this by optimizing evidence for each module
+% with shared hyperparameters
+bcm1 = bcmtrain(bcm0, 'shared', 'scg', scgopt);
+bcm1 = bcmprepare(bcm1);
+[bcm1pred, bcm1var] = bcmfwd(bcm1, Xtest);
+
+% ----------------------------------------------------------------------
+fprintf('Starting to cluster the training data...\n');
+
+% Clustered BCM:
+kmeansopt = [1 1e-5 1e-4 0 0 0 0 0 0 0 0 0 0 30];
+r = randperm(size(Xtrain,1));
+[centres,opt,post] = kmeans(Xtrain(r(1:5),:),Xtrain,kmeansopt);
+[m,assignment] = max(post,[],2);
+for i = 1:5,
+  fprintf('Clustered BCM: Module %i has %i data points\n', i, nnz(assignment==i));
+end
+% ----------------------------------------------------------------------
+fprintf('Training the modules of the clustered Bayesian Committee Machine...\n');
+bcm5 = bcminit(bcm0, Xtrain, Ytrain, assignment);
+bcm5a = bcmtrain(bcm5, 'shared', 'scg', scgopt);
+bcm5a = bcmprepare(bcm5a);
+[bcm5apred, bcm5avar] = bcmfwd(bcm5a, Xtest);
+
+
+% ----------------------------------------------------------------------
+fprintf('\nEvaluating all models in terms of\n');
+fprintf('RMSE (root mean squared error)\n');
+fprintf('logProb (negative log probability of test data under the predictive distribution)\n\n');
+
+loss_negLogProb = inline('0.5*(log(2*pi*var) + ((pred-label).^2)./var)','label','pred','var');
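+% (This is the negative log density of label under a Gaussian with mean
+% pred and variance var; lower values indicate better calibrated error
+% bars.)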
+
+fprintf('Full GP model:\n');
+pred = fullpred; var = fullvar;
+fprintf('RMSE = %f, logProb = %f\n\n', sqrt(mean((pred-Ytest0).^2)), ...
+        mean(loss_negLogProb(Ytest0, pred, var)));
+fprintf('Full GP model on 300 data points:\n');
+pred = full1pred; var = full1var;
+fprintf('RMSE = %f, logProb = %f\n\n', sqrt(mean((pred-Ytest0).^2)), ...
+        mean(loss_negLogProb(Ytest0, pred, var)));
+fprintf('BCM model with shared hyperparams:\n');
+pred = bcm1pred; var = bcm1var;
+fprintf('RMSE = %f, logProb = %f\n\n', sqrt(mean((pred-Ytest0).^2)), ...
+        mean(loss_negLogProb(Ytest0, pred, var)));
+fprintf('Clustered BCM model with shared hyperparams:\n');
+pred = bcm5apred; var = bcm5avar;
+fprintf('RMSE = %f, logProb = %f\n\n', sqrt(mean((pred-Ytest0).^2)), ...
+        mean(loss_negLogProb(Ytest0, pred, var)));
+
+% $$$ figure(10);
+% $$$ clf;
+% $$$ val_errorbars(Ytest0', fullpred', sqrt(fullvar)');
+% $$$ set(gcf, 'Name', 'Full GP with ratquad');
+% $$$ figure(11);
+% $$$ clf;
+% $$$ val_errorbars(Ytest0', full1pred', sqrt(full1var)');
+% $$$ set(gcf, 'Name', 'Full GP on subset of 300 points');
+% $$$ figure(12);
+% $$$ clf;
+% $$$ val_errorbars(Ytest0', bcm1pred', sqrt(bcm1var)');
+% $$$ set(gcf, 'Name', 'BCM shared');
+% $$$ figure(15);
+% $$$ clf;
+% $$$ val_errorbars(Ytest0', bcm5apred', sqrt(bcm5avar)');
+% $$$ set(gcf, 'Name', 'Clustered BCM shared');
+
+return
+
+function [X, Y, Xbasis, Ybasis, Ynoisefree] = art_data(npoints, ndim, classification, noise, nbasis, Xbasis, Ybasis)
+% ART_DATA - Generate Volker's artificial data set
+% Set all random seeds to 0 before calling to reproduce the exact data set.
+%
+
+if nargin<6,
+  Xbasis = [];
+  Ybasis = [];
+end
+if nargin<5,
+  nbasis = 5;
+end
+if nargin<4,
+  noise = 0;
+end
+if nargin<3,
+  classification = 0;
+end
+if nargin<2,
+  ndim = 5;
+end
+
+[Nb, dimb] = size(Xbasis);
+if (Nb>0) & (~all(size(Ybasis) == [Nb, 1])),
+  error('Size of basis function matrices XBASIS and YBASIS does not match');
+end
+
+
+if Nb==0,
+  % No basis functions: generate new ones randomly
+  % X-Prototypes in range [-1...+1]
+  Xbasis = 2*rand(nbasis, ndim)-1;
+  % Target values of the prototypes
+  Ybasis = randn(nbasis, 1);
+  if classification & (ndim==5) & (nbasis==5),
+    % Volker's modification to generate a more balanced classification
+    % data-set, works only with random seed 0
+    Ybasis(2) = -Ybasis(2);
+    Ybasis(3) = -Ybasis(3);
+  end
+end
+
+% Kernel width: a quarter of the average distance between basis points
+avgDist = mean(mean(sqrt(dist2(Xbasis, Xbasis))));
+sig = avgDist/4;
+
+X = 2*rand(npoints, ndim)-1;
+ad = sqrt(dist2(X, Xbasis));
+Yex = exp(-1/(2*(sig*sig))*(ad.*ad));
+Ynoisefree = (Yex * Ybasis) ./ (Yex * ones(nbasis, 1));
+Y = Ynoisefree + noise * randn(npoints,1);
+if classification,
+  Y = sign(Y);
+end