-
Notifications
You must be signed in to change notification settings - Fork 0
/
do_fitGPz.m
134 lines (97 loc) · 5.44 KB
/
do_fitGPz.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
function [other, test_specz, mse, test_photz] = do_fitGPz(dataPath, maxIter, Nexamples, params)
% DO_FITGPZ  Fit a GPz photo-z model to a catalogue and evaluate it on a test split.
%
% Computes photozs using the GPz algorithm available at
% https://github.com/troyraen/GPz (forked from OxfordML/GPz).
% Most of this function is taken from the file GPz/demo_photoz.m
%
% Inputs (all required):
%   dataPath  = string. Path to csv data file. Each row must have the layout
%                   m_1,m_2,..,m_k,e_1,e_2,...,e_k,z_spec
%               where m_i is the i-th magnitude, e_i is its associated
%               uncertainty and z_spec is the spectroscopic redshift.
%   maxIter   = int. Maximum number of training iterations.
%   Nexamples = vector of ints. # examples in datasets: [training,validation,testing].
%   params    = cell array {heteroscedastic, csl_method, maxAttempts, inputNoise, method}
%       heteroscedastic = logical. Learn a heteroscedastic noise process; set to
%                         false if only interested in point estimates.
%       csl_method      = string. Cost-sensitive learning option:
%                           'balanced':   weigh rare samples more heavily during training
%                           'normalized': error cost for each sample = 1/(z+1)
%                           'normal':     no weights, all samples equally important
%       maxAttempts     = int. Maximum iterations to attempt if there is no
%                         progress on the validation set.
%       inputNoise      = logical. true  = use mag errors as additional input noise,
%                                  false = use (log) mag errors as additional inputs.
%       method          = string. One of GL, VL, GD, VD, GC, VC.
%
% Outputs:
%   other      = [out10, diff_frout10]. out10 is the second output of
%                calc_zerrors (presumably the fraction of outliers with
%                |dz|/(1+z) > 0.10 -- confirm against calc_zerrors);
%                diff_frout10 is the difference between the outlier fraction
%                derived from GPz's metrics() and out10 (expected ~0).
%   test_specz = spectroscopic redshifts of the test set.
%   mse        = mean squared error between test_specz and test_photz.
%   test_photz = GPz point estimates (mu) for the test set.
%
% Side effects: adds ../GPz/GPz/ and ../GPz/minFunc_2012/ (relative to the
% current directory) to the MATLAB path and resets the random seed to 1.
%
% Example usage (from the matlab dir; split sizes are illustrative):
%   [other, test_specz, mse, test_photz] = do_fitGPz( ...
%       '../GPz/data/sdss_sample.csv', 500, [10000, 5000, 5000], ...
%       {true, 'normal', 50, true, 'VD'});
%

    %%%%%%%%%%%%%% Validate inputs %%%%%%%%%%%%%%
    % Fail early with a clear message instead of an indexing error deep inside.
    narginchk(4, 4);
    if ~iscell(params) || numel(params) < 5
        error('do_fitGPz:badParams', ...
            'params must be a cell array {heteroscedastic, csl_method, maxAttempts, inputNoise, method}.');
    end
    if numel(Nexamples) < 3
        error('do_fitGPz:badNexamples', ...
            'Nexamples must be a vector [training, validation, testing].');
    end

    addpath('../GPz/GPz/')
    addpath(genpath('../GPz/minFunc_2012/'))   % path to minFunc

    rng(1); % fix random seed

    %%%%%%%%%%%%%% Model options %%%%%%%%%%%%%%%%
    m = 100;                        % number of basis functions to use [required]
    heteroscedastic = params{1};    % e.g. true
    csl_method      = params{2};    % e.g. 'normal'
    maxAttempts     = params{3};    % e.g. 50
    inputNoise      = params{4};    % e.g. true
    method          = params{5};    % e.g. 'VD'
    normalize = true;               % subtract the means and divide by the standard deviations
    binWidth  = 0.1;                % bin width for 'balanced' cost-sensitive learning

    %%%%%%%%%%%%%% Prepare data %%%%%%%%%%%%%%
    disp('Preparing data ...')

    % read data from file; last column is z_spec, the rest are features
    X = csvread(dataPath);
    Y = X(:,end);
    X(:,end) = [];

    [n,d] = size(X);
    if mod(d, 2) ~= 0
        % k magnitudes + k errors must give an even number of feature columns
        error('do_fitGPz:badColumns', ...
            'Expected k magnitudes, k errors and z_spec per row; got %d columns.', d + 1);
    end
    filters = d/2;

    % select training, validation and testing sets of the requested sizes
    [training,validation,testing] = sample(n,Nexamples(1),Nexamples(2),Nexamples(3));

    % get the weights for cost-sensitive learning
    omega = getOmega(Y,csl_method,binWidth);

    if(inputNoise)
        % treat the mag-errors as input noise variance
        Psi = X(:,filters+1:end).^2;
        X(:,filters+1:end) = [];
    else
        % treat the mag-errors as additional inputs
        X(:,filters+1:end) = log(X(:,filters+1:end));
        Psi = [];
    end

    %%%%%%%%%%%%%% Fit the model %%%%%%%%%%%%%%
    disp('Fitting the Model ...')

    % initialize the model
    model = init(X,Y,method,m,'omega',omega,'training',training, ...
        'heteroscedastic',heteroscedastic,'normalize',normalize,'Psi',Psi);

    % train the model
    model = train(model,X,Y,'omega',omega,'training',training, ...
        'validation',validation,'maxIter',maxIter,'maxAttempts',maxAttempts,'Psi',Psi);

    %%%%%%%%%%%%%% Compute Metrics %%%%%%%%%%%%%%
    disp('Computing Metrics ...')

    % use the model to generate predictions for the test set
    %   mu    = the best point estimate
    %   sigma = total predictive variance (nu + beta_i + gamma in GPz's notation)
    % The three variance components are not needed here; the placeholders keep
    % the number of requested outputs the same as in demo_photoz.m.
    [mu,sigma,~,~,~] = predict(X,model,'Psi',Psi,'selection',testing);

    %%%%%%%%%%%%%% Set Output %%%%%%%%%%%%%%
    test_specz = Y(testing);
    test_photz = mu;

    % percentage of data where |z_spec-z_phot|/(1+z_spec)<0.10, as reported by
    % GPz; the last element is used below as the value over the full test set
    fr10 = metrics(test_specz,mu,sigma,@(y,mu,sigma) 100*(abs(y-mu)./(y+1)<0.10));

    %-------------------------------------------------------------
    % Above here is almost exclusively from GPz/demo_photoz.m.
    % Below here is my calculations.
    %-------------------------------------------------------------

    % Mean squared error over the whole test set, computed directly so that it
    % does not depend on the shape of what metrics() returns (summing a
    % per-sample cumulative vector would not be an MSE).
    mse = mean((test_specz(:) - test_photz(:)).^2);

    % Check whether my calculations give the same results as GPz
    zdev = calc_zdev(test_specz, test_photz);
    [~, out10] = calc_zerrors(zdev);
    diff_frout10 = (100-fr10(end))/100 - out10;
    other = [out10, diff_frout10];
    % out10 = 0.0260 on the default dataset, N train examples = 100000. Matches fr10.

end