diff --git a/REQUIRE b/REQUIRE
index 870ab04..81015dc 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -1,3 +1,4 @@
 julia 0.6
 IntervalSets 0.1
+IterTools
 RangeArrays
diff --git a/src/AxisArrays.jl b/src/AxisArrays.jl
index 90cfed1..2d023cb 100644
--- a/src/AxisArrays.jl
+++ b/src/AxisArrays.jl
@@ -3,9 +3,11 @@ __precompile__()
 module AxisArrays
 
 using Base: tail
+import Base.Iterators: repeated
 using RangeArrays, IntervalSets
+using IterTools
 
-export AxisArray, Axis, axisnames, axisvalues, axisdim, axes, atindex, atvalue
+export AxisArray, Axis, axisnames, axisvalues, axisdim, axes, atindex, atvalue, collapse
 
 # From IntervalSets:
 export ClosedInterval, ..
@@ -15,6 +17,7 @@ include("intervals.jl")
 include("search.jl")
 include("indexing.jl")
 include("sortedvector.jl")
+include("categoricalvector.jl")
 include("combine.jl")
 
 end
diff --git a/src/categoricalvector.jl b/src/categoricalvector.jl
new file mode 100644
index 0000000..e4a35ed
--- /dev/null
+++ b/src/categoricalvector.jl
@@ -0,0 +1,83 @@
+"""
+A CategoricalVector is an AbstractVector which is treated as a categorical axis regardless
+of the element type. Duplicate values are not allowed but are not filtered out.
+
+A CategoricalVector axis can be indexed with a ClosedInterval, with a value, or with a
+vector of values. Use of a CategoricalVector{Tuple} axis allows indexing similar to the
+hierarchical index of the Python Pandas package or the R data.table package.
+
+In general, indexing into a CategoricalVector will be much slower than the corresponding
+SortedVector or another sorted axis type, as linear search is required.
+
+### Constructors
+
+```julia
+CategoricalVector(x::AbstractVector)
+```
+
+### Arguments
+
+* `x::AbstractVector` : the wrapped vector
+
+### Examples
+
+```julia
+v = CategoricalVector(collect([1; 8; 10:15]))
+A = AxisArray(reshape(1:16, 8, 2), v, [:a, :b])
+A[Axis{:row}(1), :]
+A[Axis{:row}(10), :]
+A[Axis{:row}([1, 10]), :]
+
+## Hierarchical index example with three key levels
+
+data = reshape(1.:40., 20, 2)
+v = collect(zip([:a, :b, :c][rand(1:3,20)], [:x,:y][rand(1:2,20)], [:x,:y][rand(1:2,20)]))
+A = AxisArray(data, CategoricalVector(v), [:a, :b])
+A[:b, :]
+A[[:a,:c], :]
+A[(:a,:x), :]
+A[(:a,:x,:x), :]
+```
+"""
+immutable CategoricalVector{T, A<:AbstractVector{T}} <: AbstractVector{T}
+    data::A
+end
+
+function CategoricalVector(data::AbstractVector{T}) where T
+    CategoricalVector{T, typeof(data)}(data)
+end
+
+Base.getindex(v::CategoricalVector, idx::Int) = v.data[idx]
+Base.getindex(v::CategoricalVector, idx::AbstractVector) = CategoricalVector(v.data[idx])
+
+Base.length(v::CategoricalVector) = length(v.data)
+Base.size(v::CategoricalVector) = size(v.data)
+Base.size(v::CategoricalVector, i) = size(v.data, i)
+Base.indices(v::CategoricalVector) = indices(v.data)
+
+axistrait(::Type{CategoricalVector{T,A}}) where {T,A} = Categorical
+checkaxis(::CategoricalVector) = nothing
+
+
+## Add some special indexing for CategoricalVector{Tuple}'s to achieve something like
+## pandas' hierarchical indexing
+
+axisindexes{T<:Tuple,S,A}(ax::Axis{S,CategoricalVector{T,A}}, idx) = axisindexes(ax, (idx,))
+
+function axisindexes{T<:Tuple,S,A}(ax::Axis{S,CategoricalVector{T,A}}, idx::Tuple)
+    collect(filter(ax_idx->_tuple_matches(ax.val[ax_idx], idx), indices(ax.val)...))
+end
+
+# Returns true when `idx` is a leading prefix of `element`, compared element-wise with `==`.
+function _tuple_matches(element::Tuple, idx::Tuple)
+    length(idx) <= length(element) || return false
+
+    for (x, y) in zip(element, idx)
+        x == y || return false
+    end
+
+    return true
+end
+
+axisindexes{T<:Tuple,S,A}(ax::Axis{S,CategoricalVector{T,A}}, idx::AbstractArray) =
+    vcat([axisindexes(ax, i) for i in idx]...)
diff --git a/src/combine.jl b/src/combine.jl
index 45f2907..3452d5c 100644
--- a/src/combine.jl
+++ b/src/combine.jl
@@ -139,3 +139,207 @@ function Base.join{T,N,D,Ax}(As::AxisArray{T,N,D,Ax}...; fillvalue::T=zero(T),
     return result
 
 end #join
+
+# Lazily generate one index tuple `(array_name, values...)` for every combination of
+# axis values drawn from `array_axes`.
+function _collapse_array_axes(array_name, array_axes...)
+    ((array_name, (idx isa Tuple ? idx : (idx,))...) for idx in product((Ax.val for Ax in array_axes)...))
+end
+
+# Collect the index tuples of every array, in argument order, into a single vector.
+function _collapse_axes(array_names, array_axes)
+    collect(Iterators.flatten(map(array_names, array_axes) do tup_name, tup_array_axes
+        _collapse_array_axes(tup_name, tup_array_axes...)
+    end))
+end
+
+# Apply `Base.IteratorsMD.split(A, Val{N})` to every argument; results come back as a tuple.
+function _splitall{N}(::Type{Val{N}}, As...)
+    tuple((Base.IteratorsMD.split(A, Val{N}) for A in As)...)
+end
+
+# Reshape every array to `N` dimensions; results come back as a tuple.
+function _reshapeall{N}(::Type{Val{N}}, As...)
+    tuple((reshape(A, Val{N}) for A in As)...)
+end
+
+# Throw an ArgumentError unless every axis in the tuple has the same name as the first one.
+function _check_common_axes(common_axis_tuple)
+    if !all(axisname(first(common_axis_tuple)) .=== axisname.(common_axis_tuple[2:end]))
+        throw(ArgumentError("Leading common axes must have the same name in each array"))
+    end
+
+    return nothing
+end
+
+# `typejoin` of the per-array index tuple types `Tuple{LType, trailing eltypes...}`.
+function _collapsed_axis_eltype(LType, trailing_axes)
+    eltypes = map(trailing_axes) do array_trailing_axes
+        Tuple{LType, eltype.(array_trailing_axes)...}
+    end
+
+    return typejoin(eltypes...)
+end
+
+# Convenience methods: fill in default labels `(1, ..., AN)` and/or default `Array` storage.
+function collapse{N, AN}(::Type{Val{N}}, As::Vararg{AxisArray, AN})
+    collapse(Val{N}, ntuple(identity, Val{AN}), As...)
+end
+
+function collapse{N, AN, NewArrayType<:AbstractArray}(::Type{Val{N}}, ::Type{NewArrayType}, As::Vararg{AxisArray, AN})
+    collapse(Val{N}, NewArrayType, ntuple(identity, Val{AN}), As...)
+end
+
+@generated function collapse{N, AN, LType}(::Type{Val{N}}, labels::NTuple{AN, LType}, As::Vararg{AxisArray, AN})
+    collapsed_dim_int = Int(N) + 1
+    new_eltype = Base.promote_eltype(As...)
+
+    quote
+        collapse(Val{N}, Array{$new_eltype, $collapsed_dim_int}, labels, As...)
+    end
+end
+
+"""
+    collapse(::Type{Val{N}}, As::AxisArray...) -> AxisArray
+    collapse(::Type{Val{N}}, labels::Tuple, As::AxisArray...) -> AxisArray
+    collapse(::Type{Val{N}}, ::Type{NewArrayType}, As::AxisArray...) -> AxisArray
+    collapse(::Type{Val{N}}, ::Type{NewArrayType}, labels::Tuple, As::AxisArray...) -> AxisArray
+
+Collapses `AxisArray`s with `N` equal leading axes into a single `AxisArray`.
+All additional axes in any of the arrays are collapsed into a single additional
+axis of type `Axis{:collapsed, CategoricalVector{Tuple}}`.
+
+### Arguments
+
+* `::Type{Val{N}}`: the greatest common dimension to share between all input
+                    arrays. The remaining axes are collapsed. All `N` axes must be common
+                    to each input array, at the same dimension. Values from `0` up to the
+                    minimum number of dimensions across all input arrays are allowed.
+* `labels::Tuple`: (optional) an index for each array in `As` used as the leading element in
+                   the index tuples in the `:collapsed` axis. Defaults to `1:length(As)`.
+* `::Type{NewArrayType<:AbstractArray{_, N+1}}`: (optional) the desired underlying array
+                                                 type for the returned `AxisArray`.
+* `As::AxisArray...`: `AxisArray`s to be collapsed together.
+
+### Examples
+
+```
+julia> price_data = AxisArray(rand(10), Axis{:time}(Date(2016,01,01):Day(1):Date(2016,01,10)))
+1-dimensional AxisArray{Float64,1,...} with axes:
+    :time, 2016-01-01:1 day:2016-01-10
+And data, a 10-element Array{Float64,1}:
+ 0.885014
+ 0.418562
+ 0.609344
+ 0.72221
+ 0.43656
+ 0.840304
+ 0.455337
+ 0.65954
+ 0.393801
+ 0.260207
+
+julia> size_data = AxisArray(rand(10,2), Axis{:time}(Date(2016,01,01):Day(1):Date(2016,01,10)), Axis{:measure}([:area, :volume]))
+2-dimensional AxisArray{Float64,2,...} with axes:
+    :time, 2016-01-01:1 day:2016-01-10
+    :measure, Symbol[:area, :volume]
+And data, a 10×2 Array{Float64,2}:
+ 0.159434     0.456992
+ 0.344521     0.374623
+ 0.522077     0.313256
+ 0.994697     0.320953
+ 0.95104      0.900526
+ 0.921854     0.729311
+ 0.000922581  0.148822
+ 0.449128     0.761714
+ 0.650277     0.135061
+ 0.688773     0.513845
+
+julia> collapsed = collapse(Val{1}, (:price, :size), price_data, size_data)
+2-dimensional AxisArray{Float64,2,...} with axes:
+    :time, 2016-01-01:1 day:2016-01-10
+    :collapsed, Tuple{Symbol,Vararg{Symbol,N} where N}[(:price,), (:size, :area), (:size, :volume)]
+And data, a 10×3 Array{Float64,2}:
+ 0.885014  0.159434     0.456992
+ 0.418562  0.344521     0.374623
+ 0.609344  0.522077     0.313256
+ 0.72221   0.994697     0.320953
+ 0.43656   0.95104      0.900526
+ 0.840304  0.921854     0.729311
+ 0.455337  0.000922581  0.148822
+ 0.65954   0.449128     0.761714
+ 0.393801  0.650277     0.135061
+ 0.260207  0.688773     0.513845
+
+julia> collapsed[Axis{:collapsed}(:size)] == size_data
+true
+```
+
+"""
+@generated function collapse(::Type{Val{N}},
+                             ::Type{NewArrayType},
+                             labels::NTuple{AN, LType},
+                             As::Vararg{AxisArray, AN}) where {N, AN, LType, NewArrayType<:AbstractArray}
+    # In a generated function `As` holds the argument types, so these checks need no array values.
+    if N < 0
+        throw(ArgumentError("collapse dimension N must be at least 0"))
+    end
+
+    if N > minimum(ndims.(As))
+        throw(ArgumentError(
+            """
+            collapse dimension N must not be greater than the minimum number of dimensions
+            across all input arrays
+            """
+        ))
+    end
+
+    collapsed_dim = Val{N + 1}
+    collapsed_dim_int = Int(N) + 1
+
+    common_axes, trailing_axes = zip(_splitall(Val{N}, axisparams.(As)...)...)
+
+    foreach(_check_common_axes, zip(common_axes...))
+
+    new_common_axes = first(common_axes)
+    collapsed_axis_eltype = _collapsed_axis_eltype(LType, trailing_axes)
+    collapsed_axis_type = CategoricalVector{collapsed_axis_eltype, Vector{collapsed_axis_eltype}}
+
+    new_axes_type = Tuple{new_common_axes..., Axis{:collapsed, collapsed_axis_type}}
+    new_eltype = Base.promote_eltype(As...)
+
+    # The expression returned below runs on the actual arrays and compares the axis values.
+    quote
+        common_axes, trailing_axes = zip(_splitall(Val{N}, axes.(As)...)...)
+
+        for common_axis_tuple in zip(common_axes...)
+            if !isempty(common_axis_tuple)
+                for common_axis in common_axis_tuple[2:end]
+                    if !all(axisvalues(common_axis) .== axisvalues(common_axis_tuple[1]))
+                        throw(ArgumentError(
+                            """
+                            Leading common axes must be identical across
+                            all input arrays"""
+                        ))
+                    end
+                end
+            end
+        end
+
+        array_data = cat($collapsed_dim, _reshapeall($collapsed_dim, As...)...)
+
+        axis_array_type = AxisArray{
+            $new_eltype,
+            $collapsed_dim_int,
+            $NewArrayType,
+            $new_axes_type
+        }
+
+        new_axes = (
+            first(common_axes)...,
+            Axis{:collapsed, $collapsed_axis_type}($collapsed_axis_type(_collapse_axes(labels, trailing_axes))),
+        )
+
+        return axis_array_type(array_data, new_axes)
+    end
+end
diff --git a/src/core.jl b/src/core.jl
index 236bb80..d29847d 100644
--- a/src/core.jl
+++ b/src/core.jl
@@ -503,6 +503,15 @@ end
 axes(A::AbstractArray) = default_axes(A)
 axes(A::AbstractArray, dim::Int) = default_axes(A)[dim]
 
+"""
+    axisparams(::AxisArray) -> Vararg{::Type{Axis}}
+    axisparams(::Type{AxisArray}) -> Vararg{::Type{Axis}}
+
+Returns the axis parameters for an AxisArray.
+"""
+axisparams{T,N,D,Ax}(::AxisArray{T,N,D,Ax}) = (Ax.parameters...)
+axisparams{T,N,D,Ax}(::Type{AxisArray{T,N,D,Ax}}) = (Ax.parameters...)
+
 ### Axis traits ###
 abstract type AxisTrait end
 immutable Dimensional <: AxisTrait end
diff --git a/src/indexing.jl b/src/indexing.jl
index 42aff72..8d404ed 100644
--- a/src/indexing.jl
+++ b/src/indexing.jl
@@ -231,6 +231,17 @@ end
     ex = Expr(:tuple)
     n = 0
     for i=1:length(I)
+        if axistrait(I[i]) <: Categorical && i <= length(Ax.parameters)
+            if I[i] <: Axis
+                push!(ex.args, :(axisindexes(A.axes[$i], I[$i].val)))
+            else
+                push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
+            end
+            n += 1
+
+            continue
+        end
+
         if I[i] <: Idx
             push!(ex.args, :(I[$i]))
             n += 1
@@ -243,7 +254,11 @@ end
             end
             n += length(I[i])
         elseif i <= length(Ax.parameters)
-            push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
+            if I[i] <: Axis
+                push!(ex.args, :(axisindexes(A.axes[$i], I[$i].val)))
+            else
+                push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
+            end
             n += 1
         else
             push!(ex.args, :(error("dimension ", $i, " does not have an axis to index")))
diff --git a/test/categoricalvector.jl b/test/categoricalvector.jl
new file mode 100644
index 0000000..28cf6ce
--- /dev/null
+++ b/test/categoricalvector.jl
@@ -0,0 +1,21 @@
+# Test CategoricalVector with a hierarchical index (indexed using Tuples)
+srand(1234)
+data = reshape(1.:40., 20, 2)
+v = collect(zip([:a, :b, :c][rand(1:3,20)], [:x,:y][rand(1:2,20)], [:x,:y][rand(1:2,20)]))
+idx = sortperm(v)
+A = AxisArray(data[idx,:], AxisArrays.CategoricalVector(v[idx]), [:a, :b])
+@test A[:b, :] == A[5:12, :]
+@test A[[:a,:c], :] == A[[1:4;13:end], :]
+@test A[(:a,:y), :] == A[2:4, :]
+@test A[(:c,:y,:y), :] == A[16:end, :]
+@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical
+
+v = AxisArrays.CategoricalVector(collect([1; 8; 10:15]))
+@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical
+A = AxisArray(reshape(1:16, 8, 2), v, [:a, :b])
+@test A[Axis{:row}(AxisArrays.CategoricalVector([15]))] == AxisArray(reshape(A.data[8, :], 1, 2), AxisArrays.CategoricalVector([15]), [:a, :b])
+@test A[Axis{:row}(AxisArrays.CategoricalVector([15])), 1] == AxisArray([A.data[8, 1]], AxisArrays.CategoricalVector([15]))
+@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical
+
+# TODO: maybe make this work? Would require removing or modifying Base.getindex(A::AxisArray, idxs::Idx...)
+# @test A[AxisArrays.CategoricalVector([15]), 1] == AxisArray([A.data[8, 1]], AxisArrays.CategoricalVector([15]))
diff --git a/test/combine.jl b/test/combine.jl
index b8548ef..2334a40 100644
--- a/test/combine.jl
+++ b/test/combine.jl
@@ -47,3 +47,29 @@ ABdata[3:6,3:6,:,2] = Bdata
 @test join(A,B,method=:left) == AxisArray(ABdata[1:4, 1:4, :, :], A.axes...)
 @test join(A,B,method=:right) == AxisArray(ABdata[3:6, 3:6, :, :], B.axes...)
 @test join(A,B,method=:outer) == join(A,B)
+
+# collapse
+A1 = AxisArray(A1data, Axis{:X}(1:2), Axis{:Y}(1:2))
+A2 = AxisArray(reshape(A2data, size(A2data)..., 1), Axis{:X}(1:2), Axis{:Y}(1:2), Axis{:Z}([:foo]))
+
+@test @inferred(collapse(Val{2}, A1, A2)) == AxisArray(cat(3, A1data, A2data), Axis{:X}(1:2), Axis{:Y}(1:2), Axis{:collapsed}(AxisArrays.CategoricalVector([(1,), (2, :foo)])))
+@test @inferred(collapse(Val{2}, A1)) == AxisArray(reshape(A1, 2, 2, 1), Axis{:X}(1:2), Axis{:Y}(1:2), Axis{:collapsed}(AxisArrays.CategoricalVector([(1,)])))
+@test @inferred(collapse(Val{2}, A1)) == AxisArray(reshape(A1.data, size(A1)..., 1), axes(A1)..., Axis{:collapsed}(AxisArrays.CategoricalVector([(1,)])))
+
+@test @inferred(collapse(Val{2}, (:A1, :A2), A1, A2)) == AxisArray(cat(3, A1data, A2data), Axis{:X}(1:2), Axis{:Y}(1:2), Axis{:collapsed}(AxisArrays.CategoricalVector([(:A1,), (:A2, :foo)])))
+@test @inferred(collapse(Val{2}, (:foo,), A1)) == AxisArray(reshape(A1, 2, 2, 1), Axis{:X}(1:2), Axis{:Y}(1:2), Axis{:collapsed}(AxisArrays.CategoricalVector([(:foo,)])))
+@test @inferred(collapse(Val{2}, (:a,), A1)) == AxisArray(reshape(A1.data, size(A1)..., 1), axes(A1)..., Axis{:collapsed}(AxisArrays.CategoricalVector([(:a,)])))
+
+@test @inferred(collapse(Val{0}, A1)) == AxisArray(vec(A1data), Axis{:collapsed}(AxisArrays.CategoricalVector(collect(IterTools.product((1,), axisvalues(A1)...)))))
+@test @inferred(collapse(Val{1}, A1)) == AxisArray(A1data, Axis{:row}(1:2), Axis{:collapsed}(AxisArrays.CategoricalVector(collect(IterTools.product((1,), axisvalues(A1)[2])))))
+@test @inferred(collapse(Val{1}, (1,), A1)) == collapse(Val{1}, A1)
+@test @inferred(collapse(Val{1}, Array{Int, 2}, A1)) == collapse(Val{1}, A1)
+@test @inferred(collapse(Val{1}, Array{Int, 2}, (1,), A1)) == collapse(Val{1}, A1)
+
+@test_throws ArgumentError collapse(Val{-1}, A1)
+@test_throws ArgumentError collapse(Val{10}, A1)
+
+A1ᵀ = transpose(A1)
+@test_throws ArgumentError collapse(Val{-1}, A1, A1ᵀ)
+@test_throws ArgumentError collapse(Val{1}, A1, A1ᵀ)
+@test_throws ArgumentError collapse(Val{10}, A1, A1ᵀ)
diff --git a/test/core.jl b/test/core.jl
index ecd192c..fa1d6f0 100644
--- a/test/core.jl
+++ b/test/core.jl
@@ -240,8 +240,10 @@ map!(*, A2, A, A)
 # Reductions (issue #55)
 A = AxisArray(collect(reshape(1:15,3,5)), :y, :x)
 B = @inferred(AxisArray(collect(reshape(1:15,3,5)), Axis{:y}(0.1:0.1:0.3), Axis{:x}(10:10:50)))
-for C in (A, B)
-    for op in (sum, minimum)  # together, cover both reduced_indices and reduced_indices0
+arrays = (A, B)
+functions = (sum, minimum)
+for C in arrays
+    for op in functions  # together, cover both reduced_indices and reduced_indices0
         axv = axisvalues(C)
         C1 = @inferred(op(C, 1))
         @test typeof(C1) == typeof(C)
diff --git a/test/runtests.jl b/test/runtests.jl
index 161c89d..9e09eb4 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,5 +1,6 @@
 using AxisArrays
 using Base.Test
+import IterTools
 
 @testset "AxisArrays" begin
     @test isempty(detect_ambiguities(AxisArrays, Base, Core))
@@ -20,6 +21,10 @@ using Base.Test
         include("sortedvector.jl")
     end
 
+    @testset "CategoricalVector" begin
+        include("categoricalvector.jl")
+    end
+
     @testset "Search" begin
         include("search.jl")
     end