-
Notifications
You must be signed in to change notification settings - Fork 39
/
LoadandFixUCRArchive.m
93 lines (68 loc) · 3.08 KB
/
LoadandFixUCRArchive.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
% This script modifies the new UCR Time-Series Archive to make it backwards
% compatible with previous versions and enable older code and scripts to
% run without modifications.
% dir_old contains the current archive directory
% dir_new contains new, corrected, archive directory
dir_old = 'DATASETSOLD';
dir_new = 'DATASETSNEW';
% first 2 values are '.' and '..' - UCR Archive 2018 version has 128 datasets
dir_struct = dir('DATASETSOLD');
dataset_names = {dir_struct(3:130).name};
for i = 1:length(dataset_names)
disp(['Dataset being processed: ', char(dataset_names(i))]);
% directory structure
dir_path_old = [dir_old,'/',char(dataset_names(i)),'/',char(dataset_names(i))];
% load training and test sets
TRAIN = load([dir_path_old,'_TRAIN']);
TEST = load([dir_path_old,'_TEST']);
% remove classes from time series
TRAIN_labels = TRAIN(:,1);
TRAIN(:,1) = [];
TEST_labels = TEST(:,1);
TEST(:,1) = [];
% fill missing values for DodgerLoopDay, DodgerLoopGame, and
% DodgerLoopWeekend datasets. Only three datasets have missing values.
if ( strcmp('DodgerLoopDay',char(dataset_names(i))) || strcmp('DodgerLoopGame',char(dataset_names(i))) || strcmp('DodgerLoopWeekend',char(dataset_names(i))) )
TRAIN = fillmissing(TRAIN, 'linear', 2, 'EndValues', 'nearest');
TEST = fillmissing(TEST, 'linear', 2, 'EndValues', 'nearest');
end
% for datasets with varying time-series length we resample the shorter time
% series to reach the length of the longest time series
TRAIN = ResampleToMaxLength(TRAIN);
TEST = ResampleToMaxLength(TEST);
% z-normalize time series
TEST = zscore(TEST,[],2);
TRAIN = zscore(TRAIN,[],2);
% fixing issues with class names
% Case 1: classes starting from 0 instead of 1
if (min(TEST_labels) == 0 || min(TRAIN_labels == 0))
TRAIN_labels = TRAIN_labels + 1;
TEST_labels = TEST_labels + 1;
end
% Case 2: binary classes are -1 or 1 instead of 1 and 2
if (min(TEST_labels) == -1 || min(TRAIN_labels == -1))
TRAIN_labels = round(TRAIN_labels/2 + 1.5);
TEST_labels = round(TEST_labels/2 + 1.5);
end
% Case 3: classes start from 3 instead of 1
if (min(TEST_labels) == 3 || min(TRAIN_labels == 3))
TRAIN_labels = TRAIN_labels - 2;
TEST_labels = TEST_labels - 2;
end
% create new train and test datasets (i.e., merge labels with data)
TRAIN = [TRAIN_labels,TRAIN];
TEST = [TEST_labels,TEST];
% save new datasets to new directory
dir_path_new = [dir_new,'/',char(dataset_names(i)),'/',char(dataset_names(i))];
dlmwrite([dir_path_new,'_TRAIN'],TRAIN, 'delimiter', ',','precision', 9);
dlmwrite([dir_path_new,'_TEST'],TEST, 'delimiter', ',','precision', 9);
end
function ResampledData = ResampleToMaxLength(Data)
[rows, cols] = size(Data);
ResampledData = zeros(rows, cols);
for i = 1:rows
singleTS = Data(i, :);
singleTS = singleTS(~isnan(singleTS));
ResampledData(i, :) = resample(singleTS, cols, length(singleTS));
end
end