-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdata_gen.jl
executable file
·105 lines (94 loc) · 3.05 KB
/
data_gen.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
using LinearAlgebra
using MAT
"""
    gen_synthetic(d, sub_d, nspaces, npoints, noise; orthogonal=false)

Generate a union of subspaces under the random model: subspaces are drawn
independently in the ambient space, and data points are drawn independently on the
unit sphere of each subspace, perturbed by Gaussian noise and re-normalized.

Translated from https://github.com/ChongYou/subspace-clustering

# Arguments
- `d::Int`: dimension of the ambient space.
- `sub_d::Int`: dimension of each subspace (all subspaces have the same dimension).
- `nspaces::Int`: number of subspaces to generate.
- `npoints::Int`: number of data points drawn from each subspace.
- `noise::Number`: factor multiplying the standard Gaussian noise added to each point
  before the final normalization.

# Keywords
- `orthogonal::Bool=false`: when `true`, every basis is a distinct group of `sub_d`
  columns of one orthonormal factor, so the subspaces are mutually orthogonal.
  Requires `sub_d * nspaces <= d`.

# Returns
- `data`: `d × (nspaces * npoints)` matrix. Each *column* is one unit-norm data point;
  columns are grouped by subspace.
- `label`: `Vector{Int}` of length `nspaces * npoints`; `label[j]` is the index
  (`1:nspaces`) of the subspace that column `j` of `data` was drawn from.

# Throws
- `AssertionError` if `orthogonal` is `true` and `sub_d * nspaces > d`.
- `ArgumentError` if a basis with more columns than the ambient dimension is requested
  (i.e. `sub_d > d`).
"""
function gen_synthetic(d::Int, sub_d::Int, nspaces::Int, npoints::Int, noise::Number;
                       orthogonal::Bool=false)
    data = zeros(d, nspaces * npoints)
    label = zeros(Int, nspaces * npoints)

    # Orthogonal case: one thin Q factor supplies all bases at once, so that distinct
    # column groups are orthogonal to each other.
    shared_q = if orthogonal
        @assert sub_d * nspaces <= d "Subspaces too high dimensional to be orthogonal"
        _orthonormal_basis(d, sub_d * nspaces)
    else
        nothing
    end

    for i in 1:nspaces
        basis = if orthogonal
            view(shared_q, :, (1 + (i - 1) * sub_d):(i * sub_d))
        else
            # Independent case: a fresh random basis per subspace.
            _orthonormal_basis(d, sub_d)
        end
        cols = (1 + (i - 1) * npoints):(i * npoints)
        data[:, cols] = _sample_on_subspace(basis, npoints, noise)
        label[cols] .= i
    end
    return data, label
end

# Return a `d × k` matrix with orthonormal columns: the thin Q factor of a Gaussian
# matrix. `Matrix(...)` materializes the thin factor explicitly instead of relying on
# the implicit padding rules of the lazy `Q` object.
function _orthonormal_basis(d::Int, k::Int)
    k <= d || throw(ArgumentError(
        "cannot build $k orthonormal vectors in an ambient space of dimension $d"))
    return Matrix(qr(randn(d, k)).Q)
end

# Draw `npoints` unit-norm points from the span of `basis`, add Gaussian noise scaled
# by `noise`, and project the noisy points back onto the unit sphere.
function _sample_on_subspace(basis::AbstractMatrix, npoints::Int, noise::Number)
    coeff = _normalize_columns!(randn(size(basis, 2), npoints))
    samples = basis * coeff
    samples += randn(size(basis, 1), npoints) * noise
    return _normalize_columns!(samples)
end

# Scale every column of `X` to unit Euclidean norm in place and return `X`.
function _normalize_columns!(X::AbstractMatrix)
    for col in eachcol(X)
        normalize!(col)
    end
    return X
end
"""
    load_umist_resized()

Load the UMIST face dataset from `data/umist_resized.mat`.

The path is relative, so it is resolved against the current working directory. The
file's `"X"` entry is converted to a `Matrix{Float64}`; its `"label"` entry is flattened
to a vector and converted to `Int`.

# Returns
- `norm_X`: the data matrix with every column scaled to unit Euclidean norm.
- `nspaces`: hard-coded to `20` (presumably the number of classes in this dataset —
  confirm against the data file).
- `label`: `Vector{Int}` of labels read from the file (presumably one per column of
  `norm_X` — confirm against the data file).
"""
function load_umist_resized()
    nspaces = 20
    pathname = "data/umist_resized.mat"
    vars = matread(pathname)
    X = Matrix{Float64}(vars["X"])
    label = Array{Int}(vec(vars["label"]))
    norm_X = mapslices(normalize, X; dims=1)
    return norm_X, nspaces, label
end
"""
    load_scattered_coil(; nspaces=100)

Load the scattered COIL dataset from `data/COIL_scatter.mat`, keeping only the first
`nspaces * 72` samples.

The path is relative, so it is resolved against the current working directory. The
file's `"data"` entry is converted to a `Matrix{Float64}`; its `"label"` entry is
flattened to a vector and converted to `Int`.

# Keywords
- `nspaces=100`: number of classes to keep. The first `nspaces * 72` columns and labels
  are retained, which assumes the file stores samples grouped by class with 72 samples
  per class (TODO confirm against the data file).

# Returns
- `norm_X`: the retained columns, each scaled to unit Euclidean norm.
- `nspaces`: the keyword value, passed through unchanged.
- `label`: `Vector{Int}` with the first `nspaces * 72` labels.
"""
function load_scattered_coil(;nspaces=100)
    samples_per_class = 72
    pathname = "data/COIL_scatter.mat"
    vars = matread(pathname)
    X = Matrix{Float64}(vars["data"])
    label = Array{Int}(vec(vars["label"]))
    keep = 1:nspaces*samples_per_class
    # Normalization is column-wise, so slicing first gives the same result while
    # skipping the work on columns that would be discarded anyway.
    norm_X = mapslices(normalize, X[:, keep]; dims=1)
    return norm_X, nspaces, label[keep]
end
"""
    load_scattered_umist()

Load the scattered UMIST dataset from `data/umist_scatter.mat`.

The path is relative, so it is resolved against the current working directory. The
file's `"data"` entry is converted to a `Matrix{Float64}`; its `"label"` entry is
flattened to a vector and converted to `Int`.

# Returns
- `norm_X`: the data matrix with every column scaled to unit Euclidean norm.
- `nspaces`: hard-coded to `20` (presumably the number of classes in this dataset —
  confirm against the data file).
- `label`: `Vector{Int}` of labels read from the file (presumably one per column of
  `norm_X` — confirm against the data file).
"""
function load_scattered_umist()
    nspaces = 20
    pathname = "data/umist_scatter.mat"
    vars = matread(pathname)
    X = Matrix{Float64}(vars["data"])
    label = Array{Int}(vec(vars["label"]))
    norm_X = mapslices(normalize, X; dims=1)
    return norm_X, nspaces, label
end