-
Notifications
You must be signed in to change notification settings - Fork 239
/
fast_tsne.m
129 lines (117 loc) · 5.41 KB
/
fast_tsne.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
function mappedX = fast_tsne(X, no_dims, initial_dims, perplexity, theta, alg, max_iter)
%FAST_TSNE Runs the C++ implementation of Barnes-Hut t-SNE
%
% mappedX = fast_tsne(X, no_dims, initial_dims, perplexity, theta, alg)
%
% Runs the C++ implementation of Barnes-Hut-SNE. The high-dimensional
% datapoints are specified in the NxD matrix X. The dimensionality of the
% datapoints is reduced to initial_dims dimensions using PCA (default = 50)
% before t-SNE is performed. Next, t-SNE reduces the points to no_dims
% dimensions. The perplexity of the input similarities may be specified
% through the perplexity variable (default = 30). The variable theta sets
% the trade-off parameter between speed and accuracy: theta = 0 corresponds
% to standard, slow t-SNE, while theta = 1 makes very crude approximations.
% Appropriate values for theta are between 0.1 and 0.7 (default = 0.5).
% The variable alg determines the algorithm used for PCA. The default is set
% to 'svd'. Other options are 'eig' or 'als' (see 'doc pca' for more details).
% The function returns the two-dimensional data points in mappedX.
%
% NOTE: The function is designed to run on large (N > 5000) data sets. It
% may give poor performance on very small data sets (it is better to use a
% standard t-SNE implementation on such data).
% Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
% All rights reserved.
%
% Redistribution and use in source and binary forms, with or without
% modification, are permitted provided that the following conditions are met:
% 1. Redistributions of source code must retain the above copyright
% notice, this list of conditions and the following disclaimer.
% 2. Redistributions in binary form must reproduce the above copyright
% notice, this list of conditions and the following disclaimer in the
% documentation and/or other materials provided with the distribution.
% 3. All advertising materials mentioning features or use of this software
% must display the following acknowledgement:
% This product includes software developed by the Delft University of Technology.
% 4. Neither the name of the Delft University of Technology nor the names of
% its contributors may be used to endorse or promote products derived from
% this software without specific prior written permission.
%
% THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
% OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
% OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
% EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
% SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
% PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
% BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
% IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
% OF SUCH DAMAGE.
if ~exist('no_dims', 'var') || isempty(no_dims)
no_dims = 2;
end
if ~exist('initial_dims', 'var') || isempty(initial_dims)
initial_dims = 50;
end
if ~exist('perplexity', 'var') || isempty(perplexity)
perplexity = 30;
end
if ~exist('theta', 'var') || isempty(theta)
theta = 0.5;
end
if ~exist('alg', 'var') || isempty(alg)
alg = 'svd';
end
if ~exist('max_iter', 'var') || isempty(max_iter)
max_iter=1000;
end
% Perform the initial dimensionality reduction using PCA
X = double(X);
X = bsxfun(@minus, X, mean(X, 1));
M = pca(X,'NumComponents',initial_dims,'Algorithm',alg);
X = X * M;
tsne_path = which('fast_tsne');
tsne_path = fileparts(tsne_path);
% Compile t-SNE C code
if(~exist(fullfile(tsne_path,'./bh_tsne'),'file') && isunix)
system(sprintf('g++ %s %s -o %s -O2',...
fullfile(tsne_path,'./sptree.cpp'),...
fullfile(tsne_path,'./tsne.cpp'),...
fullfile(tsne_path,'./bh_tsne')));
end
% Run the fast diffusion SNE implementation
write_data(X, no_dims, theta, perplexity, max_iter);
tic
[flag, cmdout] = system(['"' fullfile(tsne_path,'./bh_tsne') '"']);
if(flag~=0)
error(cmdout);
end
toc
[mappedX, landmarks, costs] = read_data;
landmarks = landmarks + 1; % correct for Matlab indexing
delete('data.dat');
delete('result.dat');
end
% Writes the datafile for the fast t-SNE implementation
function write_data(X, no_dims, theta, perplexity, max_iter)
[n, d] = size(X);
h = fopen('data.dat', 'wb');
fwrite(h, n, 'integer*4');
fwrite(h, d, 'integer*4');
fwrite(h, theta, 'double');
fwrite(h, perplexity, 'double');
fwrite(h, no_dims, 'integer*4');
fwrite(h, max_iter, 'integer*4');
fwrite(h, X', 'double');
fclose(h);
end
% Reads the result file from the fast t-SNE implementation
function [X, landmarks, costs] = read_data
h = fopen('result.dat', 'rb');
n = fread(h, 1, 'integer*4');
d = fread(h, 1, 'integer*4');
X = fread(h, n * d, 'double');
landmarks = fread(h, n, 'integer*4');
costs = fread(h, n, 'double'); % this vector contains only zeros
X = reshape(X, [d n])';
fclose(h);
end