Skip to content

Commit 2565c2c

Browse files
committed
Add CategoricalVector type
1 parent 69390fe commit 2565c2c

File tree

5 files changed

+123
-1
lines changed

5 files changed

+123
-1
lines changed

src/AxisArrays.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ include("intervals.jl")
1717
include("search.jl")
1818
include("indexing.jl")
1919
include("sortedvector.jl")
20+
include("categoricalvector.jl")
2021
include("combine.jl")
2122

2223
end

src/categoricalvector.jl

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
2+
export CategoricalVector
3+
4+
"""
5+
A CategoricalVector is an AbstractVector which is treated as a categorical axis regardless
6+
of the element type. Duplicate values are not allowed but are not filtered out.
7+
8+
A CategoricalVector axis can be indexed with a ClosedInterval, with a value, or with a
9+
vector of values. Use of a CategoricalVector{Tuple} axis allows indexing similar to the
10+
hierarchical index of the Python Pandas package or the R data.table package.
11+
12+
In general, indexing into a CategoricalVector will be much slower than the corresponding
13+
SortedVector or another sorted axis type, as linear search is required.
14+
15+
### Constructors
16+
17+
```julia
18+
CategoricalVector(x::AbstractVector)
19+
```
20+
21+
### Arguments
22+
23+
* `x::AbstractVector` : the wrapped vector
24+
25+
### Examples
26+
27+
```julia
28+
v = CategoricalVector(collect([1; 8; 10:15]))
29+
A = AxisArray(reshape(1:16, 8, 2), v, [:a, :b])
30+
A[Axis{:row}(1), :]
31+
A[Axis{:row}(10), :]
32+
A[Axis{:row}([1, 10]), :]
33+
34+
## Hierarchical index example with three key levels
35+
36+
data = reshape(1.:40., 20, 2)
37+
v = collect(zip([:a, :b, :c][rand(1:3,20)], [:x,:y][rand(1:2,20)], [:x,:y][rand(1:2,20)]))
38+
A = AxisArray(data, CategoricalVector(v), [:a, :b])
39+
A[:b, :]
40+
A[[:a,:c], :]
41+
A[(:a,:x), :]
42+
A[(:a,:x,:x), :]
43+
```
44+
"""
45+
immutable CategoricalVector{T} <: AbstractVector{T}
46+
# The wrapped vector; every array-interface method in this file forwards to it.
# NOTE(review): the field is declared with the abstract type `AbstractVector{T}`
# rather than a concrete type parameter. Presumably this costs type stability on
# each access to `data`. Changing it would add a second type parameter, and the
# other methods here dispatch on the one-parameter form `CategoricalVector{T}`,
# so confirm the impact before touching it.
data::AbstractVector{T}
47+
end
48+
49+
# AbstractVector interface: each query is forwarded to the wrapped `data` vector.

# Scalar indexing returns the stored element itself.
function Base.getindex(v::CategoricalVector, idx::Int)
    return v.data[idx]
end

# Vector indexing re-wraps the selection, so the result is again a CategoricalVector.
function Base.getindex(v::CategoricalVector, idx::AbstractVector)
    return CategoricalVector(v.data[idx])
end

function Base.length(v::CategoricalVector)
    return length(v.data)
end

function Base.size(v::CategoricalVector)
    return size(v.data)
end

function Base.size(v::CategoricalVector, i)
    return size(v.data, i)
end

function Base.indices(v::CategoricalVector)
    return indices(v.data)
end
56+
57+
# Trait declaration: a CategoricalVector axis is Categorical whatever its element type.
function axistrait{T}(::Type{CategoricalVector{T}})
    return Categorical
end

# A CategoricalVector axis is accepted as-is; no validation is performed on it.
function checkaxis(::CategoricalVector)
    return nothing
end
59+
60+
61+
## Add some special indexing for CategoricalVector{Tuple}'s to achieve something like
62+
## pandas' hierarchical indexing
63+
64+
# A bare (non-tuple) key is treated as a one-component prefix: it is wrapped in a
# 1-tuple and handed to the tuple method.
function axisindexes{T<:Tuple,S}(ax::Axis{S,CategoricalVector{T}}, idx)
    return axisindexes(ax, (idx,))
end
65+
66+
# Tuple key: return the positions of every axis label that `idx` matches, as decided
# by `_tuple_matches`. Each label is checked in turn.
function axisindexes{T<:Tuple,S}(ax::Axis{S,CategoricalVector{T}}, idx::Tuple)
    labels = ax.val
    positions = first(indices(labels))
    return [pos for pos in positions if _tuple_matches(labels[pos], idx)]
end
69+
70+
"""
    _tuple_matches(element::Tuple, idx::Tuple)

Return `true` when `idx` is a prefix of `element`, i.e. `idx` is no longer than
`element` and every component of `idx` equals the component of `element` at the
same position. An empty `idx` matches every `element`.
"""
function _tuple_matches(element::Tuple, idx::Tuple)
    n = length(idx)
    # A key with more components than the label can never be a prefix of it.
    n <= length(element) || return false

    # Only the first `n` components of the label take part in the comparison.
    for k in 1:n
        element[k] == idx[k] || return false
    end

    return true
end
79+
80+
# Array of keys: look up each key in turn and concatenate the matching positions in
# key order. Positions are accumulated with `append!` rather than by splatting a
# vector of results into `vcat`, so an empty `idx` yields an empty `Int` vector
# (not `Any[]`) and no arbitrary-length collection is splatted into a call.
function axisindexes{T<:Tuple,S}(ax::Axis{S,CategoricalVector{T}}, idx::AbstractArray)
    matches = Int[]
    for key in idx
        append!(matches, axisindexes(ax, key))
    end
    return matches
end

src/indexing.jl

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,17 @@ end
231231
ex = Expr(:tuple)
232232
n = 0
233233
for i=1:length(I)
234+
if axistrait(I[i]) <: Categorical && i <= length(Ax.parameters)
235+
if I[i] <: Axis
236+
push!(ex.args, :(axisindexes(A.axes[$i], I[$i].val)))
237+
else
238+
push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
239+
end
240+
n += 1
241+
242+
continue
243+
end
244+
234245
if I[i] <: Idx
235246
push!(ex.args, :(I[$i]))
236247
n += 1
@@ -243,7 +254,11 @@ end
243254
end
244255
n += length(I[i])
245256
elseif i <= length(Ax.parameters)
246-
push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
257+
if I[i] <: Axis
258+
push!(ex.args, :(axisindexes(A.axes[$i], I[$i].val)))
259+
else
260+
push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
261+
end
247262
n += 1
248263
else
249264
push!(ex.args, :(error("dimension ", $i, " does not have an axis to index")))

test/categoricalvector.jl

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Test CategoricalVector with a hierarchical index (indexed using Tuples)
2+
# Fixed seed: the row ranges asserted below depend on this exact sequence of draws.
srand(1234)
3+
data = reshape(1.:40., 20, 2)
4+
# Twenty random three-component labels.
v = collect(zip([:a, :b, :c][rand(1:3,20)], [:x,:y][rand(1:2,20)], [:x,:y][rand(1:2,20)]))
5+
# Sort the labels, and the data rows with them, so equal keys sit in contiguous rows.
idx = sortperm(v)
6+
A = AxisArray(data[idx,:], CategoricalVector(v[idx]), [:a, :b])
7+
# A bare symbol selects every row whose first label component matches.
@test A[:b, :] == A[5:12, :]
8+
# A vector of keys concatenates the rows selected by each key.
@test A[[:a,:c], :] == A[[1:4;13:end], :]
9+
# A shorter tuple acts as a prefix of the full label.
@test A[(:a,:y), :] == A[2:4, :]
10+
@test A[(:c,:y,:y), :] == A[16:end, :]
11+
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical
12+
13+
# Non-tuple element type: integer labels wrapped in a CategoricalVector.
v = CategoricalVector(collect([1; 8; 10:15]))
14+
# NOTE(review): `A` is not rebuilt until the next statement, so this repeats the
# check above on the hierarchical array. It was possibly meant to cover the new
# `v`; confirm.
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical
15+
A = AxisArray(reshape(1:16, 8, 2), v, [:a, :b])
16+
# 15 is the 8th label in `v`, hence row 8 of the underlying data.
@test A[Axis{:row}(CategoricalVector([15]))] == AxisArray(reshape(A.data[8, :], 1, 2), CategoricalVector([15]), [:a, :b])
17+
@test A[Axis{:row}(CategoricalVector([15])), 1] == AxisArray([A.data[8, 1]], CategoricalVector([15]))
18+
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical
19+
20+
# TODO: maybe make this work? Would require removing or modifying Base.getindex(A::AxisArray, idxs::Idx...)
21+
# @test A[CategoricalVector([15]), 1] == AxisArray([A.data[8, 1]], CategoricalVector([15]))

test/runtests.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ using Base.Test
2222
include("sortedvector.jl")
2323
end
2424

25+
@testset "CategoricalVector" begin
26+
include("categoricalvector.jl")
27+
end
28+
2529
@testset "Search" begin
2630
include("search.jl")
2731
end

0 commit comments

Comments
 (0)