-
Notifications
You must be signed in to change notification settings - Fork 11
/
gpmodel2struct.m
533 lines (436 loc) · 17.8 KB
/
gpmodel2struct.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
function gpmodel = gpmodel2struct(gp,ID,tbxStats,createSyms,modelStruc,fastSymMode)
%GPMODEL2STRUCT Create a struct describing a multigene regression model.
%
%   GPMODEL = GPMODEL2STRUCT(GP,ID) gets the multigene regression model
%   specified by the identifier ID from the GPTIPS struct GP and returns
%   model info and performance data as a struct GPMODEL.
%
%   ID may be:
%
%     - a positive integer index into the population GP.POP
%     - one of the strings 'best', 'valbest' or 'testbest' (see below)
%     - a cell array of encoded gene strings, e.g.
%       {'n(x1,f(x8),x11)','c(x2,x8)'}. In this case the gene weights are
%       computed by calling the fitness function GP.FITNESS.FITFUN.
%
%   The GPMODEL struct contains a variety of model information including
%   symbolic object versions of the multigene regression model and its
%   constituent genes as well as model performance metrics for the
%   training, validation and test data sets (if present). This struct - as
%   well as being a convenient store of model performance data - may also
%   be used as an input to several other GPTIPS model analysis functions,
%   e.g.
%
%   GPPRETTY(GP,GPMODEL)
%
%   GPMODELREPORT(GP,GPMODEL)
%
%   RUNTREE(GP,GPMODEL)
%
%   GPMODEL2SYM(GP,GPMODEL)
%
%   GPMODEL2MFILE(GP,GPMODEL)
%
%   GPMODEL2FUNC(GP,GPMODEL)
%
%   GPMODELGENES2MFILE(GP,GPMODEL)
%
%   DRAWTREES(GP,GPMODEL)
%
%   Note:
%
%   The GPMODEL struct is functionally identical to that returned by the
%   function GENES2GPMODEL wherein a 'new' multigene regression model may
%   be constructed from the unique genes in a population.
%
%   Additionally:
%
%   GPMODEL = GPMODEL2STRUCT(GP,'best') gets info on the 'best' model in
%   the population (as evaluated on training data).
%
%   GPMODEL = GPMODEL2STRUCT(GP,'valbest') gets info on the 'best' model
%   as evaluated on the validation data set (if this data exists).
%
%   GPMODEL = GPMODEL2STRUCT(GP,'testbest') gets info on the 'best' model
%   as evaluated on the test data set (if this data exists).
%
%   GPMODEL = GPMODEL2STRUCT(GP,ID,TBXSTATS) where TBXSTATS is TRUE does
%   the same but computes additional model performance stats (on the
%   training data) using the Statistics Toolbox (TBXSTATS default is
%   FALSE). These may be found in the field GPMODEL.TRAIN.TBXSTATS which
%   contains the output of the Statistics Toolbox REGSTATS function.
%
%   GPMODEL = GPMODEL2STRUCT(GP,ID,TBXSTATS,CREATESYMS) where CREATESYMS is
%   FALSE does the same but does not generate any symbolic math objects
%   using the Symbolic Math Toolbox (CREATESYMS default is TRUE).
%
%   GPMODEL = GPMODEL2STRUCT(GP,ID,TBXSTATS,CREATESYMS,MODELSTRUC) where
%   MODELSTRUC is FALSE skips the scan of the model genes for structural
%   information (the fields NUMNODES, EXPCOMPLEXITY, INPUTS, NUMINPUTS,
%   OUTPUT and MAXDEPTH are then not created). MODELSTRUC default is TRUE.
%
%   GPMODEL = GPMODEL2STRUCT(GP,ID,TBXSTATS,CREATESYMS,MODELSTRUC,
%   FASTSYMMODE) passes FASTSYMMODE through to GPMODEL2SYM when symbolic
%   objects are created (FASTSYMMODE default is FALSE).
%
%   An empty value ([]) for any optional argument selects its default.
%
%   Remarks:
%
%   This function always returns a GPMODEL struct even if the supplied
%   model identifier is invalid (e.g. if 'valbest' is specified but there
%   is no validation data) or if the model is invalid in some other way
%   (e.g. there were no gene weights computed for it due to non-finite gene
%   outputs on the training data). The field GPMODEL.VALID in the returned
%   struct is TRUE for valid models and FALSE otherwise. If the model is
%   invalid then the reason for the model's invalidity can always be found
%   in the field GPMODEL.INVALIDREASON. E.g. 'Invalid model index
%   supplied.'
%
%   Furthermore - for a valid model - if there was 'test' set data supplied
%   during the GP run then the field GPMODEL.TEST.WARNING will be TRUE if
%   there was a problem predicting the test set values and the reason for
%   the problem will be in the field GPMODEL.TEST.WARNINGREASON (e.g.
%   'Non-finite or non-real predictions on testing data.'). Similarly for
%   validation data, if there was a problem predicting it then the field
%   GPMODEL.VAL.WARNING is TRUE and the reason found in
%   GPMODEL.VAL.WARNINGREASON.
%
%   It is quite possible for a model to be formally 'valid' (i.e. predicts
%   OK on the training data) but fails on the test or validation data (e.g.
%   due to division by zero etc.)
%
%   This function computes additional model performance stats using the
%   sub-function REGRESSMULTI_FITFUN_FULL_STATS. So, if you change the
%   multigene regression fitness function REGRESSMULTI_FITFUN then you will
%   also need to change REGRESSMULTI_FITFUN_FULL_STATS.
%
%   An error is raised if GP.FITNESS.FITFUN is not a multigene regression
%   fitness function (i.e. its name does not start with 'regressmulti').
%
%   Copyright (c) 2009-2015 Dominic Searson
%
%   GPTIPS 2
%
%   See also GPMODEL2MFILE, GPMODEL2FUNC, GENES2GPMODEL, GPMODEL2SYM,
%   GPMODELREPORT, DRAWTREES, RUNTREE, GPMODELGENES2MFILE

if nargin < 2
    gpmodel.valid = false;
    gpmodel.invalidReason = 'The function GPMODEL2STRUCT requires at least 2 arguments, e.g. GPMODEL2STRUCT(GP,''BEST'')';
    return;
end

if ~strncmpi('regressmulti',func2str(gp.fitness.fitfun),12)
    error('GPMODEL2STRUCT may only be used to extract model data from a GP structure with a population containing multigene symbolic regression models.');
end

%passed through to GPMODEL2SYM (default = false)
if nargin < 6 || isempty(fastSymMode)
    fastSymMode = false;
end

%scan model genes for input frequency, depth, complexity? (default = yes).
%(setting to false is a bit quicker but doesn't compute the structural info)
if nargin < 5 || isempty(modelStruc)
    modelStruc = true;
end

%create symbolic objects of overall model and genes? (default = yes).
%(setting to false is significantly quicker but you don't get the symbolic
%math object for each gene)
if nargin < 4 || isempty(createSyms)
    createSyms = true;
end

%compute statistics toolbox stats? (default = no).
if nargin < 3 || isempty(tbxStats)
    tbxStats = false;
end

%set up default test, train and validation values
gpmodel.train.r2 = 0;
gpmodel.train.rmse = Inf;
gpmodel.train.mse = Inf;
gpmodel.train.sse = Inf;
gpmodel.train.mae = Inf;
gpmodel.train.maxe = Inf;
gpmodel.train.err = [];
gpmodel.train.ypred = [];
gpmodel.train.gene_outputs = [];
gpmodel.train.datapoints = [];
gpmodel.train.warning = false;
gpmodel.train.warningReason = '';

gpmodel.val.r2 = 0;
gpmodel.val.rmse = Inf;
gpmodel.val.mse = Inf;
gpmodel.val.sse = Inf;
gpmodel.val.mae = Inf;
gpmodel.val.maxe = Inf;
gpmodel.val.err = [];
gpmodel.val.ypred = [];
gpmodel.val.gene_outputs = [];
gpmodel.val.datapoints = [];
gpmodel.val.warning = false;
gpmodel.val.warningReason = '';

gpmodel.test.r2 = 0;
gpmodel.test.rmse = Inf;
gpmodel.test.mse = Inf;
gpmodel.test.sse = Inf;
gpmodel.test.mae = Inf;
gpmodel.test.maxe = Inf;
gpmodel.test.err = [];
gpmodel.test.ypred = [];
gpmodel.test.gene_outputs = [];
gpmodel.test.datapoints = [];
gpmodel.test.warning = false;
gpmodel.test.warningReason = '';

%encoded genes supplied in cell array
cellgenes = false;

%parse user input for supplied numerical model identifier
if isnumeric(ID) && numel(ID) == 1

    %the index must be a real, finite, positive integer within the
    %population. Testing isfinite first means NaN/Inf yield an 'invalid'
    %struct rather than a logical conversion error.
    if isreal(ID) && isfinite(ID) && ID == floor(ID) && ID > 0 && ID <= gp.runcontrol.pop_size

        %get encoded trees, eval trees and return values
        treestrs = gp.pop{ID};
        evaltreestrs = tree2evalstr(gp.pop{ID},gp);
        rtnVals = gp.fitness.returnvalues{ID};
    else
        gpmodel.valid = false;
        gpmodel.invalidReason = 'Invalid model index supplied.';
        return;
    end

elseif ischar(ID) && strcmpi(ID,'best') %or 'best' on training

    treestrs = gp.results.best.individual;
    evaltreestrs = gp.results.best.eval_individual;
    rtnVals = gp.results.best.returnvalues;

elseif ischar(ID) && strcmpi(ID,'valbest') %or 'best' on validation data

    % check that validation data is present
    if ~isfield(gp.results,'valbest')
        gpmodel.valid = false;
        gpmodel.invalidReason ='No validation data was found.';
        return;
    end

    treestrs = gp.results.valbest.individual;
    evaltreestrs = gp.results.valbest.eval_individual;
    rtnVals = gp.results.valbest.returnvalues;

elseif ischar(ID) && strcmpi(ID,'testbest') %or 'best' on test data

    % check that test data is present
    if ~isfield(gp.results,'testbest')
        gpmodel.valid = false;
        gpmodel.invalidReason ='No test data was found.';
        return;
    end

    treestrs = gp.results.testbest.individual;
    evaltreestrs = gp.results.testbest.eval_individual;
    rtnVals = gp.results.testbest.returnvalues;

%if user supplied list of genes in encoded form,e.g. {'n(x1,f(x8),x11)','c(x2,x8)'}
elseif iscell(ID)

    treestrs = ID;
    evaltreestrs = tree2evalstr(treestrs,gp);
    cellgenes = true;

    %return values need to be computed by the fitness function
    %(these settings only modify the local copy of GP)
    gp.state.force_compute_theta = true;
    gp.state.run_completed = false;
    gp.userdata.showgraphs = false;
    [fitness,gp,rtnVals] = feval(gp.fitness.fitfun,evaltreestrs,gp);

    if isinf(fitness)
        gpmodel.valid = false;
        gpmodel.invalidReason ='Supplied gene list gave non-finite values for predictions of training data';
        return
    end

elseif isstruct(ID)

    gpmodel.valid = false;
    gpmodel.invalidReason ='Models already in struct format unsupported by gpmodel2struct';
    return

else %otherwise user did not supply a valid multigene model selector

    gpmodel.valid = false;
    gpmodel.invalidReason ='Invalid model selector supplied.';
    return
end

%get multigene regression stats with warnings suppressed. The previous
%warning state is restored even if the stats computation throws, so a
%failure here cannot leave warnings globally disabled.
wstate = warning;
warning off;
try
    gpmodel = regressmulti_fitfun_full_stats(gpmodel,evaltreestrs,gp,rtnVals,tbxStats);
catch statsErr
    warning(wstate);
    rethrow(statsErr);
end
warning(wstate);

%compute some tree structural information
if modelStruc

    gpmodel.numNodes = getnumnodes(treestrs);
    gpmodel.expComplexity = getcomplexity(treestrs);
    inputVec = gpmodelvars(gp,ID);

    %inputs: start from an empty list so that the field exists (and
    %numInputs is 0) even when no entry of inputVec is positive
    gpmodel.inputs = {};
    count = 0;
    for i=1:numel(inputVec)
        if inputVec(i) > 0
            count = count + 1;

            %use the user supplied input name if there is one, otherwise
            %fall back to the generic name 'x<i>'
            if ~isempty(gp.nodes.inputs) && i <= numel(gp.nodes.inputs.names) && ~isempty(gp.nodes.inputs.names{i})
                gpmodel.inputs{count} = strtrim(gp.nodes.inputs.names{i});
            else
                gpmodel.inputs{count} = ['x' num2str(i)];
            end
        end
    end

    gpmodel.numInputs = numel(gpmodel.inputs);
    gpmodel.output = gp.nodes.output.name;

    %max depth over all genes
    maxDepth = 1;
    for i=1:numel(treestrs)
        maxDepth = max(maxDepth,getdepth(treestrs{i}));
    end
    gpmodel.maxDepth = maxDepth;
end

%create sym objects
if createSyms && gp.info.toolbox.symbolic

    %for a user supplied gene list the computed gene weights are appended
    %to the list before it is handed to GPMODEL2SYM
    if cellgenes
        ID = horzcat(ID,rtnVals);
    end

    [gpmodel.sym,gpmodel.genes.geneSyms] = gpmodel2sym(gp,ID,fastSymMode,true);
else
    gpmodel.sym = [];
    gpmodel.genes.geneSyms = [];
end

%original encoded genes
gpmodel.about = 'A struct representing a multigene regression model.';
gpmodel.genes.geneStrs = treestrs;
gpmodel.genes.geneWeights = rtnVals;
gpmodel = orderfields(gpmodel);

%added after ORDERFIELDS so that 'source' is always the last field
gpmodel.source = 'gpmodel2struct';

function gpmodel = regressmulti_fitfun_full_stats(gpmodel, evalstr,gp,theta,tbxStats)
%REGRESSMULTI_FITFUN_FULL_STATS updates GPMODEL struct with stats for an existing multigene regression model.
%
%   GPMODEL = REGRESSMULTI_FITFUN_FULL_STATS(GPMODEL,EVALSTR,GP,THETA,
%   TBXSTATS) updates a structure GPMODEL containing performance data about
%   the multigene symbolic regression individual represented by EVALSTR and
%   THETA.
%
%   Inputs:
%
%   GPMODEL  - struct to update (its existing fields are kept unless
%              overwritten below).
%   EVALSTR  - cell array of evaluable gene expression strings in which
%              input variable k appears as 'xk'.
%   GP       - GPTIPS struct; the fields GP.USERDATA.XTRAIN and
%              GP.USERDATA.YTRAIN are required, validation and test data
%              are used if present.
%   THETA    - gene weights with the bias weight first, i.e. it must have
%              NUMEL(EVALSTR)+1 elements (a cell containing the weights is
%              also accepted). It is used as a column vector.
%   TBXSTATS - (optional, default FALSE) if TRUE and GP.INFO.TOOLBOX.STATS
%              is TRUE then REGSTATS is run on the training data.
%
%   Remarks:
%
%   Utility function. Reproduces functionality of REGRESSMULTI_FITFUN in a
%   more useful (but slower) form for offline use. It calculates a variety
%   of performance statistics which can be accessed via the fields of
%   GPMODEL. Also sets warning flags for 'test' and 'validation' data sets
%   if the model fails to predict for them or if that data is not present.
%
%   The gene expressions are executed with EVAL, so EVALSTR must come from
%   a trusted source.

%statistics toolbox stats off by default (TBXSTATS is the 5th argument)
if nargin < 5 || isempty(tbxStats)
    tbxStats = false;
end

%ensure gene weights in matrix form
if iscell(theta)
    theta = theta{1};
end

gpmodel.valid = true;
gpmodel.invalidReason = '';

%if the gene weights vector is empty it means that during the
%fitness evaluation it could not be generated due to either
%non-finite or complex model outputs OR numerical problem in the
%SVD least squares computation of weights. This is a model "showstopper"
%and so the model is marked as 'invalid'
if numel(evalstr) ~= (numel(theta)-1)
    gpmodel.valid = false;
    gpmodel.invalidReason = 'Empty gene weights vector: probably because model output contained complex or non-finite values on training data.';
    return
end

%process evalstr with regex to allow direct access to data matrices
pat = 'x(\d+)';
evalstr = regexprep(evalstr,pat,'gp.userdata.xtrain(:,$1)');

numTrainData = numel(gp.userdata.ytrain);
numGenes = length(evalstr);
gpmodel.genes.num_genes = numGenes;

%set up a matrix to store the tree outputs plus a bias column of ones
geneOutputsTrain = ones(numTrainData,numGenes+1);

%eval each gene in the current individual on training data
%(the eval'd text refers to the local names 'geneOutputsTrain', 'ind' and
%'gp' so these must not be renamed)
for i=1:numGenes
    ind = i + 1;
    eval(['geneOutputsTrain(:,ind)=' evalstr{i} ';']);
end

%add raw gene outputs to struct (not bias term)
gpmodel.train.gene_outputs = geneOutputsTrain(:,2:end);
gpmodel.train.datapoints = numTrainData;

%check for nonsensical answers - exit with invalid model if any found
if any(any(~isfinite(geneOutputsTrain))) || any(any(~isreal(geneOutputsTrain)))
    gpmodel.valid = false;
    gpmodel.invalidReason = 'Non-finite or non-real gene output values on training data.';
    return
end

%calc contribution of weighted individual genes on training data
gpmodel.train.ypred_genes = geneOutputsTrain.*repmat(theta',numTrainData,1);

%calc overall prediction of training data
gpmodel.train.ypred = geneOutputsTrain*theta;

%error (training data)
errTrain = gp.userdata.ytrain - gpmodel.train.ypred;
gpmodel.train.err = errTrain;

%SSE (training data)
gpmodel.train.sse = (errTrain'*errTrain);

%MSE (training data)
gpmodel.train.mse = gpmodel.train.sse/numTrainData;

%RMS prediction error - training data
gpmodel.train.rmse = sqrt(gpmodel.train.mse);

%r2 for training data
r2train = 1 - (gpmodel.train.sse /sum( (gp.userdata.ytrain-mean(gp.userdata.ytrain)).^2 ) );
gpmodel.train.r2 = r2train;

%mean absolute error (training)
gpmodel.train.mae = mean(abs(errTrain));

%max abs error (training)
gpmodel.train.maxe = max(abs(errTrain));

%process validation data if present
if isfield(gp.results,'valbest')

    %point the gene expressions at the validation inputs
    evalstr = strrep(evalstr,'.xtrain','.xval');
    numValData = length(gp.userdata.yval);

    geneOutputsVal = zeros(numValData,numGenes+1);
    geneOutputsVal(:,1) = ones;

    for i=1:numGenes
        ind = i + 1;
        eval(['geneOutputsVal(:,ind)=' evalstr{i} ';']);
    end

    gpmodel.val.gene_outputs = geneOutputsVal(:,2:end);
    gpmodel.val.datapoints = numValData;

    %flag warning if non-finite or non-real predictions on validation data
    if any(any(~isfinite(geneOutputsVal))) || any(any(~isreal(geneOutputsVal)))
        gpmodel.val.rmse = Inf;
        gpmodel.val.warning = true;
        gpmodel.val.warningReason = 'Non-finite or non-real predictions on validation data.';
    end

    %compute model stats and predictions if no warning
    if ~gpmodel.val.warning

        %create the prediction on the validation data
        gpmodel.val.ypred = geneOutputsVal*theta;

        %error validation data
        errVal = gp.userdata.yval - gpmodel.val.ypred;
        gpmodel.val.err = errVal;

        %sse for validation data
        gpmodel.val.sse = errVal'*errVal;

        %MSE validation data
        gpmodel.val.mse = gpmodel.val.sse/numValData;

        %rmse validation data
        gpmodel.val.rmse = sqrt(gpmodel.val.mse);

        %R2 for validation data
        gpmodel.val.r2 = 1 - (gpmodel.val.sse/sum( (gp.userdata.yval - mean(gp.userdata.yval)).^2 ));

        %mean absolute error for validation data
        gpmodel.val.mae = mean(abs(errVal));

        %max abs error (validation)
        gpmodel.val.maxe = max(abs(errVal));
    end

    %restore the training data references for the test data step below
    evalstr = strrep(evalstr,'.xval','.xtrain');

else
    gpmodel.val.rmse = Inf;
    gpmodel.val.warning = true;
    gpmodel.val.warningReason = 'No validation data was found.';
end %end of validation data calcs

%process test data
if (isfield(gp.userdata,'xtest')) && (isfield(gp.userdata,'ytest')) && ...
        ~isempty(gp.userdata.xtest) && ~isempty(gp.userdata.ytest)

    %point the gene expressions at the test inputs
    evalstr = strrep(evalstr,'.xtrain','.xtest');
    numTestData = length(gp.userdata.ytest);

    geneOutputsTest = zeros(numTestData,numGenes+1);
    geneOutputsTest(:,1) = ones;

    for i=1:numGenes
        ind = i + 1;
        eval(['geneOutputsTest(:,ind)=' evalstr{i} ';']);
    end

    gpmodel.test.gene_outputs = geneOutputsTest(:,2:end);
    gpmodel.test.datapoints = numTestData;

    %flag warning if non-finite or non-real predictions on test data
    if any(any(~isfinite(geneOutputsTest))) || any(any(~isreal(geneOutputsTest)))
        gpmodel.test.rmse = Inf;
        gpmodel.test.warning = true;
        gpmodel.test.warningReason = 'Non-finite or non-real predictions on testing data.';
    end

    if ~gpmodel.test.warning

        gpmodel.test.ypred = geneOutputsTest*theta;

        %error test data
        errTest = gp.userdata.ytest - gpmodel.test.ypred;
        gpmodel.test.err = errTest;

        %sse for test data
        gpmodel.test.sse = errTest'*errTest;

        %MSE test
        gpmodel.test.mse = gpmodel.test.sse/numTestData;

        %RMSE test data
        gpmodel.test.rmse = sqrt(gpmodel.test.mse);

        %r2 test data
        gpmodel.test.r2 = 1- (gpmodel.test.sse/sum( (gp.userdata.ytest - mean(gp.userdata.ytest)).^2 ));

        %mean absolute error test data
        gpmodel.test.mae = mean(abs(errTest));

        %max abs error (testing)
        gpmodel.test.maxe = max(abs(errTest));
    end

else
    gpmodel.test.rmse = Inf;
    gpmodel.test.warning = true;
    gpmodel.test.warningReason = 'No test data was found.';
end

%calc statistical analysis of gene significance & other stats on training data
%(if stats toolbox is present)
if tbxStats && gp.info.toolbox.stats
    stats = regstats(gp.userdata.ytrain,geneOutputsTrain(:,2:end));
    gpmodel.train.pvals = stats.tstat.pval;
    gpmodel.train.tbxStats = stats;
else
    gpmodel.train.pvals = [];
    gpmodel.train.tbxStats = [];
end