-
Notifications
You must be signed in to change notification settings - Fork 43
Add CategoricalVector
and collapse
#88
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
6ca84d4
46982e2
0bc92de
85eda7c
5958549
3c29df0
b39560f
6ac9f9f
eb1a15d
e96a36a
4f4292a
e2796cb
42f5483
6575d4b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
julia 0.6 | ||
IntervalSets 0.1 | ||
IterTools | ||
RangeArrays |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
|
||
export CategoricalVector | ||
|
||
""" | ||
A `CategoricalVector` is an `AbstractVector` which is treated as a categorical axis regardless | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing backticks around types in this docstring. |
||
of the element type. Duplicate values are not allowed but are not filtered out. | ||
|
||
A `CategoricalVector` axis can be indexed with a `ClosedInterval`, with a value, or with a | ||
vector of values. Use of a `CategoricalVector{Tuple}` axis allows indexing similar to the | ||
hierarchical index of the Python Pandas package or the R data.table package. | ||
|
||
In general, indexing into a `CategoricalVector` will be much slower than the corresponding | ||
`SortedVector` or another sorted axis type, as linear search is required. | ||
|
||
### Constructors | ||
|
||
```julia | ||
CategoricalVector(x::AbstractVector) | ||
``` | ||
|
||
### Arguments | ||
|
||
* `x::AbstractVector` : the wrapped vector | ||
|
||
### Examples | ||
|
||
```julia | ||
v = CategoricalVector(collect([1; 8; 10:15])) | ||
A = AxisArray(reshape(1:16, 8, 2), v, [:a, :b]) | ||
A[Axis{:row}(1), :] | ||
A[Axis{:row}(10), :] | ||
A[Axis{:row}([1, 10]), :] | ||
|
||
## Hierarchical index example with three key levels | ||
|
||
data = reshape(1.:40., 20, 2) | ||
v = collect(zip([:a, :b, :c][rand(1:3,20)], [:x,:y][rand(1:2,20)], [:x,:y][rand(1:2,20)])) | ||
A = AxisArray(data, CategoricalVector(v), [:a, :b]) | ||
A[:b, :] | ||
A[[:a,:c], :] | ||
A[(:a,:x), :] | ||
A[(:a,:x,:x), :] | ||
``` | ||
""" | ||
struct CategoricalVector{T, A<:AbstractVector{T}} <: AbstractVector{T}
    data::A  # the wrapped vector, stored as given (duplicates are not filtered out)
end
|
||
# Convenience constructor: infer both type parameters (element type and storage type)
# from the vector being wrapped.
CategoricalVector(data::AbstractVector{T}) where {T} =
    CategoricalVector{T, typeof(data)}(data)
|
||
# Scalar lookup is forwarded to the wrapped vector.
function Base.getindex(v::CategoricalVector, idx::Int)
    return v.data[idx]
end

# Indexing with a vector of indices re-wraps the selection in a `CategoricalVector`.
function Base.getindex(v::CategoricalVector, idx::AbstractVector)
    selected = v.data[idx]
    return CategoricalVector(selected)
end
|
||
# Shape queries are all delegated to the wrapped vector.
function Base.length(v::CategoricalVector)
    return length(v.data)
end

function Base.size(v::CategoricalVector)
    return size(v.data)
end

function Base.size(v::CategoricalVector, i)
    return size(v.data, i)
end

function Base.indices(v::CategoricalVector)
    return indices(v.data)
end
|
||
# Every fully-parameterised `CategoricalVector` type carries the `Categorical` trait.
function axistrait(::Type{CategoricalVector{T,A}}) where {T,A}
    return Categorical
end

# No validation is performed on a `CategoricalVector` axis.
function checkaxis(::CategoricalVector)
    return nothing
end
|
||
|
||
## Add some special indexing for `CategoricalVector{Tuple}`s to achieve something like | ||
## Pandas' hierarchical indexing | ||
|
||
# Fallback for a single (non-tuple, non-array) key on a tuple-valued categorical axis:
# wrap the key in a one-element tuple and defer to the `idx::Tuple` method, so the key
# is compared against the leading component of each axis element.
function axisindexes(ax::Axis{S,CategoricalVector{T,A}}, idx) where {T<:Tuple,S,A}
    return axisindexes(ax, (idx,))
end
|
||
# Return the positions along `ax` whose tuple element matches the tuple key `idx`,
# as decided by `_tuple_matches`. Every position of the axis is tested in turn.
function axisindexes(ax::Axis{S,CategoricalVector{T,A}}, idx::Tuple) where {T<:Tuple,S,A}
    matches(ax_idx) = _tuple_matches(ax.val[ax_idx], idx)
    return collect(filter(matches, indices(ax.val)...))
end
|
||
# Return `true` when `idx` is a prefix of `element`: `idx` is no longer than `element`
# and each component of `idx` is `==` to the component of `element` at the same position.
function _tuple_matches(element::Tuple, idx::Tuple)
    # A key longer than the element can never be a prefix of it.
    length(idx) > length(element) && return false

    for k in 1:length(idx)
        element[k] == idx[k] || return false
    end

    return true
end
|
||
# Look up each key of `idx` in turn and concatenate the resulting positions, in the
# order the keys were given.
function axisindexes(
    ax::Axis{S,CategoricalVector{T,A}}, idx::AbstractArray
) where {T<:Tuple,S,A}
    # Seeding `vcat` with an empty `Int` vector keeps the result an integer index vector
    # even when `idx` is empty; a bare zero-argument `vcat()` would return `Any[]`.
    # NOTE(review): assumes axis positions are `Int`s — confirm for exotic axis storage.
    return vcat(Int[], [axisindexes(ax, i) for i in idx]...)
end
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -139,3 +139,145 @@ function Base.join{T,N,D,Ax}(As::AxisArray{T,N,D,Ax}...; fillvalue::T=zero(T), | |
return result | ||
|
||
end #join | ||
|
||
# Lazily build one flat-axis label per combination of the values of `array_axes`.
# Each label is a tuple starting with `array_name`, followed by that combination's values.
function _flatten_array_axes(array_name, array_axes...)
    axis_values = (ax.val for ax in array_axes)

    return (
        (array_name, (combo isa Tuple ? combo : (combo,))...)
        for combo in product(axis_values...)
    )
end
|
||
# Collect the flat-axis labels of every input array into a single vector: the labels
# of the first array come first, then those of the second, and so on.
function _flatten_axes(array_names, array_axes)
    per_array_labels = map(array_names, array_axes) do name, axs
        _flatten_array_axes(name, axs...)
    end

    return collect(Iterators.flatten(per_array_labels))
end
|
||
# Apply `Base.IteratorsMD.split(A, Val{N})` to every argument and return the results
# as a tuple, in argument order.
function _splitall(::Type{Val{N}}, As...) where {N}
    return map(A -> Base.IteratorsMD.split(A, Val{N}), As)
end
|
||
# Apply `reshape(A, Val{N})` to every argument and return the results as a tuple,
# in argument order.
function _reshapeall(::Type{Val{N}}, As...) where {N}
    return map(A -> reshape(A, Val{N}), As)
end
|
||
# Verify that every axis in `common_axis_tuple` has the same name as the first one.
# Throws an `ArgumentError` on a mismatch; returns `nothing` otherwise.
function _check_common_axes(common_axis_tuple)
    reference_name = axisname(first(common_axis_tuple))
    other_names = axisname.(common_axis_tuple[2:end])

    all(reference_name .=== other_names) ||
        throw(ArgumentError("Leading common axes must have the same name in each array"))

    return nothing
end
|
||
# Compute the element type of the flat axis. Each entry of `trailing_axes` contributes
# the tuple type `Tuple{LType, <eltype of each of its axes>...}`, and the result is the
# `typejoin` of those tuple types.
function _flat_axis_eltype(LType, trailing_axes)
    label_types = (Tuple{LType, eltype.(axs)...} for axs in trailing_axes)
    return typejoin(label_types...)
end
|
||
# `flatten` without explicit labels: each input array is labelled by its position
# (1, 2, ..., AN) in the argument list.
function flatten(::Type{Val{N}}, As::Vararg{AxisArray, AN}) where {N, AN}
    return flatten(Val{N}, ntuple(identity, Val{AN}), As...)
end
|
||
# `flatten` with an explicit underlying array type but no labels: each input array is
# labelled by its position (1, 2, ..., AN) in the argument list.
function flatten(
    ::Type{Val{N}}, ::Type{NewArrayType}, As::Vararg{AxisArray, AN}
) where {N, AN, NewArrayType<:AbstractArray}
    return flatten(Val{N}, NewArrayType, ntuple(identity, Val{AN}), As...)
end
|
||
# `flatten` with labels but no explicit array type: the result is backed by a plain
# `Array` whose element type is the promoted element type of all inputs and whose
# dimensionality is `N + 1` (the `N` common axes plus the flat axis).
#
# Being `@generated`, the code outside `quote` runs on the argument *types*.
@generated function flatten(
    ::Type{Val{N}}, labels::NTuple{AN, LType}, As::Vararg{AxisArray, AN}
) where {N, AN, LType}
    flat_dim_int = Int(N) + 1
    new_eltype = Base.promote_eltype(As...)

    quote
        flatten(Val{N}, Array{$new_eltype, $flat_dim_int}, labels, As...)
    end
end
|
||
""" | ||
flatten(::Type{Val{N}}, As::AxisArray...) -> AxisArray | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you add examples? I find it hard to understand what this function does. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Example which has been added:
|
||
flatten(::Type{Val{N}}, labels::Tuple, As::AxisArray...) -> AxisArray | ||
flatten(::Type{Val{N}}, ::Type{NewArrayType}, As::AxisArray...) -> AxisArray | ||
flatten(::Type{Val{N}}, ::Type{NewArrayType}, labels::Tuple, As::AxisArray...) -> AxisArray | ||
|
||
Concatenates `AxisArray`s with `N` equal leading axes into a single `AxisArray`. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Backquotes around type names (here and elsewhere). |
||
All additional axes in any of the arrays are flattened into a single additional | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. collapsed |
||
`CategoricalVector{Tuple}` axis. | ||
|
||
### Arguments | ||
|
||
* `::Type{Val{N}}`: the greatest common dimension to share between all input | ||
arrays. The remaining axes are flattened. All N axes must be common | ||
to each input array, at the same dimension. Values from 0 up to the | ||
minimum number of dimensions across all input arrays are allowed. | ||
* `labels::Tuple`: (optional) a label for each `AxisArray` in `As` which is used in the flat | ||
axis. | ||
* `::Type{NewArrayType<:AbstractArray{_, N+1}}`: (optional) the desired underlying array | ||
type for the returned AxisArray. | ||
* `As::AxisArray...`: `AxisArray`s to be flattened together. | ||
""" | ||
@generated function flatten(
    ::Type{Val{N}},
    ::Type{NewArrayType},
    labels::NTuple{AN, LType},
    As::Vararg{AxisArray, AN},
) where {N, AN, LType, NewArrayType<:AbstractArray}
    # Everything outside `quote` runs at code-generation time, where `As` holds the
    # argument *types*; the checks and type computations below work on types only.
    if N < 0
        throw(ArgumentError("flatten dimension N must be at least 0"))
    end

    # N may not exceed the dimensionality of the lowest-dimensional input.
    if N > minimum(ndims.(As))
        throw(ArgumentError(
            """
            flatten dimension N must not be greater than the minimum number of dimensions
            across all input arrays
            """
        ))
    end

    flat_dim = Val{N + 1}
    flat_dim_int = Int(N) + 1

    # Split each array's axis types into the N leading (common) ones and the rest.
    # NOTE(review): `axisparams` is defined elsewhere; presumably it yields the axis
    # types of an `AxisArray` type — confirm.
    common_axes, trailing_axes = zip(_splitall(Val{N}, axisparams.(As)...)...)

    # Per leading dimension, the axis names must agree across all input arrays.
    foreach(_check_common_axes, zip(common_axes...))

    # The first array's leading axis types are used for the result.
    new_common_axes = first(common_axes)
    flat_axis_eltype = _flat_axis_eltype(LType, trailing_axes)
    flat_axis_type = CategoricalVector{flat_axis_eltype, Vector{flat_axis_eltype}}

    new_axes_type = Tuple{new_common_axes..., Axis{:flat, flat_axis_type}}

    new_eltype = Base.promote_eltype(As...)

    quote
        # Run time: the same split, now on the actual axes of each array.
        common_axes, trailing_axes = zip(_splitall(Val{N}, axes.(As)...)...)

        # The leading axes must also carry identical values in every array.
        for common_axis_tuple in zip(common_axes...)
            if !isempty(common_axis_tuple)
                for common_axis in common_axis_tuple[2:end]
                    if !all(axisvalues(common_axis) .== axisvalues(common_axis_tuple[1]))
                        throw(ArgumentError(
                            """
                            Leading common axes must be identical across
                            all input arrays"""
                        ))
                    end
                end
            end
        end

        # Reshape every array to N + 1 dimensions and concatenate along the last one.
        array_data = cat($flat_dim, _reshapeall($flat_dim, As...)...)

        axis_array_type = AxisArray{
            $new_eltype,
            $flat_dim_int,
            $NewArrayType,
            $new_axes_type
        }

        new_axes = (
            first(common_axes)...,
            Axis{:flat, $flat_axis_type}(
                $flat_axis_type(_flatten_axes(labels, trailing_axes))
            ),
        )

        return axis_array_type(array_data, new_axes)
    end
end
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Test CategoricalVector with a hierarchical index (indexed using Tuples)
# The RNG is seeded before the three-level random keys are drawn.
srand(1234)
data = reshape(1.:40., 20, 2)
v = collect(zip([:a, :b, :c][rand(1:3,20)], [:x,:y][rand(1:2,20)], [:x,:y][rand(1:2,20)]))
# Sort the keys and reorder the data rows the same way, so equal keys occupy
# contiguous rows and can be compared against plain row ranges below.
idx = sortperm(v)
A = AxisArray(data[idx,:], CategoricalVector(v[idx]), [:a, :b])
# NOTE(review): the hard-coded row ranges presumably reflect the keys drawn under
# srand(1234) — they would need updating if the seed or the RNG stream changes.
@test A[:b, :] == A[5:12, :]

@test A[[:a,:c], :] == A[[1:4;13:end], :]
@test A[(:a,:y), :] == A[2:4, :]
@test A[(:c,:y,:y), :] == A[16:end, :]
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical

v = CategoricalVector(collect([1; 8; 10:15]))
# NOTE(review): `A` is still the array built above at this point, so this repeats the
# previous trait check rather than exercising the new `v` — confirm the intent.
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical
A = AxisArray(reshape(1:16, 8, 2), v, [:a, :b])
# Index the row axis with a CategoricalVector holding the single key 15 (row 8 of the data).
@test A[Axis{:row}(CategoricalVector([15]))] == AxisArray(reshape(A.data[8, :], 1, 2), CategoricalVector([15]), [:a, :b])
@test A[Axis{:row}(CategoricalVector([15])), 1] == AxisArray([A.data[8, 1]], CategoricalVector([15]))
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical

# TODO: maybe make this work? Would require removing or modifying Base.getindex(A::AxisArray, idxs::Idx...)
# @test A[CategoricalVector([15]), 1] == AxisArray([A.data[8, 1]], CategoricalVector([15]))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please use a different name for this, as `CategoricalVector` is already used in CategoricalArrays, which replaced `PooledDataArray` in DataTables (and soon in DataFrames). Why not `CategoricalAxis`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's not an `Axis`, and it mirrors the `SortedVector` type.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, IIUC its only purpose is to treat it as a categorical axis, isn't it? Ideas about other possible names? It would be too bad to have conflicts when loading both AxisArrays and DataTables/DataFrames.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could choose not to export it?
There aren't conflicts when you use both packages unless you also use `CategoricalVector`.
The nomenclature used within AxisArrays is `Categorical`, which is how `CategoricalVector` came up.
How about `DiscreteVector`?