Skip to content

Commit

Permalink
Remove axis indices and nameddims (#126)
Browse files Browse the repository at this point in the history
Also some API changes
  • Loading branch information
kescobo committed Feb 20, 2022
1 parent ad5247e commit cb9941c
Show file tree
Hide file tree
Showing 9 changed files with 151 additions and 119 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ scratch*
docs/build/
docs/site/
Manifest.toml
.envrc
9 changes: 2 additions & 7 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,23 @@ keywords = ["microbiology", "microbiome", "biology"]
license = "MIT"
desc = "Functions and types for working with microbial community data"
authors = ["@kescobo <[email protected]>"]
version = "0.8.4"
version = "0.9.0"

[deps]
AxisIndices = "f52c9ee2-1b1c-4fd8-8546-6350938c7f11"
Dictionaries = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
EcoBase = "a58aae7d-b440-5a11-b283-399458f99aac"
MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
NamedDims = "356022a1-0364-5f58-8944-0da4b18d706f"
ReTest = "e0db7c4e-2690-44b9-bad6-7687da720f89"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
AxisIndices = "0.7"
Dictionaries = "0.3"
Distances = "0.10"
EcoBase = "0.1.4"
MultivariateStats = "0.7, 0.8"
NamedDims = "0.2"
MultivariateStats = "0.9"
ReTest = "0.3"
Tables = "1.2.1"
julia = "1.6"

22 changes: 7 additions & 15 deletions docs/src/profiles.md
Original file line number Diff line number Diff line change
Expand Up @@ -233,21 +233,15 @@ It is often inconvenient to find the numerical index
of a particular feature or sample.
Instead, you can use strings or regular expressions
to get slices of a `CommunityProfile`,
which will match on the `name` field of the features or samples.
This kind of indexing always returns a `CommunityProfile`,
even if it only has 1 value.
which will match on the `String` representation of the features or samples.
As with numerical indexing, if the index returns
a unique (feature, sample) pair,
it will return the abundance of that pair.
Otherwise, it will return a new CommunityProfile.

```jldoctest profiles
julia> comm["g1", "s1"]
CommunityProfile{Float64, Taxon, MicrobiomeSample} with 1 features in 1 samples
Feature names:
g1
Sample names:
s1
julia> comm["g__g1", "s1"]
0.0
julia> comm[r"[gs]1", "s1"]
CommunityProfile{Float64, Taxon, MicrobiomeSample} with 2 features in 1 samples
Expand All @@ -259,8 +253,6 @@ Sample names:
s1
```



## [Working with metadata](@id working-metadata)

The `metadata` dictionaries of `MicrobiomeSample`s can accessed
Expand Down
2 changes: 0 additions & 2 deletions src/Microbiome.jl
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,7 @@ export ginisimpson,
using Statistics
using SparseArrays
using EcoBase
using AxisIndices
using Dictionaries
using NamedDims
using Tables
using Distances
using MultivariateStats
Expand Down
8 changes: 4 additions & 4 deletions src/diversity.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ Otherwise, uses `set!`.
"""
function shannon!(abt::AbstractAbundanceTable; overwrite=false)
func! = overwrite ? set! : insert!
for s in samplenames(abt)
for s in samples(abt)
col = abt[:, s]
sh = shannon(abundances(col))
func!(samples(col)[1], :shannon, sh)
func!(s, :shannon, sh)
end
return abt
end
Expand Down Expand Up @@ -68,10 +68,10 @@ Otherwise, uses `set!`.
"""
function ginisimpson!(abt::AbstractAbundanceTable; overwrite=false)
func! = overwrite ? set! : insert!
for s in samplenames(abt)
for s in samples(abt)
col = abt[:, s]
sh = ginisimpson(abundances(col))
func!(samples(col)[1], :ginisimpson, sh)
func!(s, :ginisimpson, sh)
end
return abt
end
Expand Down
11 changes: 11 additions & 0 deletions src/features.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
name(as::AbstractFeature) = as.name
name(as::AbstractFeature) = as.name

const _ranks = (
domain = 0,
kingdom = 1,
Expand Down Expand Up @@ -165,6 +168,14 @@ hasrank(gf::GeneFunction) = hastaxon(gf) && !ismissing(taxrank(gf))

Base.String(gf::GeneFunction) = hastaxon(gf) ? string(name(gf), '|', String(taxon(gf))) : name(gf)

Base.:(==)(g1::GeneFunction, g2::GeneFunction) = String(g1) == String(g2)

@testset "GeneFunction Equality" begin
@test GeneFunction("test") == GeneFunction("test")
@test GeneFunction("test", Taxon("taxon")) == GeneFunction("test", Taxon("taxon"))
@test GeneFunction("test") != GeneFunction("test", Taxon("taxon"))
end

"""
genefunction(n::AbstractString)
Expand Down
180 changes: 100 additions & 80 deletions src/profiles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,41 +7,56 @@ end
CommunityProfile{T, F, S} <: AbstractAbundanceTable{T, F, S}
An `AbstractAssemblage` from [EcoBase.jl](https://github.com/EcoJulia/EcoBase.jl)
that uses an `AxisArray` of a `SparseMatrixCSC` under the hood.
that uses a `SparseMatrixCSC` under the hood.
`CommunityProfile`s are tables with `AbstractFeature`-indexed rows and
`AbstractSample`-indexed columns.
Note - we can use the `name` of samples and features to index.
"""
mutable struct CommunityProfile{T, F, S} <: AbstractAbundanceTable{T, F, S}
aa::NamedAxisArray

function CommunityProfile(aa::NamedAxisArray)
@assert dimnames(aa) == (:features, :samples)
T = eltype(parent(aa))
F = eltype(keys(axes(aa, 1)))
S = eltype(keys(axes(aa, 2)))
return new{T, F, S}(aa)
abundances::SparseMatrixCSC{T}
features::AbstractVector{F}
samples::AbstractVector{S}
fidx::Dictionary{String, Int}
sidx::Dictionary{String, Int}

function CommunityProfile(abunds::AbstractSparseMatrix,
feats::AbstractVector{<:AbstractFeature},
smpls::AbstractVector{<:AbstractSample})

length(feats) == size(abunds, 1) || throw(DimensionMismatch("Number of features must equal number of rows in matrix"))
length(smpls) == size(abunds, 2) || throw(DimensionMismatch("Number of samples must equal number columns in matrix"))
fidx = Dictionary(String.(feats), eachindex(feats))
sidx = Dictionary(String.(smpls), eachindex(smpls))

T = eltype(abunds)
F = eltype(feats)
S = eltype(smpls)
return new{T, F, S}(abunds, feats, smpls, fidx, sidx)
end
end

function CommunityProfile(tab::SparseMatrixCSC{<:Real},
features::AbstractVector{<:AbstractFeature},
samples::AbstractVector{<:AbstractSample})
return CommunityProfile(NamedAxisArray(tab, features=features, samples=samples))

function CommunityProfile(tab::AbstractVecOrMat,
feats::AbstractVector{<:AbstractFeature},
smpls::AbstractVector{<:AbstractSample})
return CommunityProfile(sparse(tab), feats, smpls)
end

function CommunityProfile{T, F, S}(tab::SparseMatrixCSC{<:T},
features::AbstractVector{F},
samples::AbstractVector{S}) where {T, F, S}
return CommunityProfile(tab, features, samples)
# single-column CommunityProfile
function CommunityProfile(tab::AbstractVecOrMat,
feats::AbstractVector{<:AbstractFeature},
smpl::AbstractSample)
return CommunityProfile(sparse(reshape(tab, size(tab,1), size(tab,2))), feats, [smpl])
end

function CommunityProfile(tab::AbstractMatrix,
features::AbstractVector{<:AbstractFeature},
samples::AbstractVector{<:AbstractSample})
return CommunityProfile(sparse(tab), features, samples)
# single-row CommunityProfile
function CommunityProfile(tab::AbstractVecOrMat,
feat::AbstractFeature,
smpls::AbstractVector{<:AbstractSample})
return CommunityProfile(sparse(reshape(tab, size(tab,1), size(tab,2))), [feat], smpls)
end

## -- Convienience functions -- ##

function ==(p1::CommunityProfile, p2::CommunityProfile)
Expand All @@ -50,19 +65,52 @@ function ==(p1::CommunityProfile, p2::CommunityProfile)
features(p1) == features(p2)
end

"""
taxonomicprofile(mat, features, samples)
"""
function taxonomicprofile(mat, features::AbstractVector{<:AbstractString}, samples::AbstractVector{<:AbstractString})
CommunityProfile(mat, Taxon.(features), MicrobiomeSample.(samples))
end

"""
functionalprofile(mat, features, samples)
"""
function functionalprofile(mat, features::AbstractVector{<:AbstractString}, samples::AbstractVector{<:AbstractString})
CommunityProfile(mat, GeneFunction.(features), MicrobiomeSample.(samples))
end

"""
metabolicprofile(mat, features, samples)
"""
function metabolicprofile(mat, features::AbstractVector{<:AbstractString}, samples::AbstractVector{<:AbstractString})
CommunityProfile(mat, Metabolite.(features), MicrobiomeSample.(samples))
end

@testset "String Constructors" begin
tp = taxonomicprofile([1 0; 0 1], ["feature1", "feature2"], ["sample1", "sample2"])
@test tp isa CommunityProfile
@test all(f-> f isa Taxon, features(tp))
fp = functionalprofile([1 0; 0 1], ["feature1", "feature2"], ["sample1", "sample2"])
@test fp isa CommunityProfile
@test all(f-> f isa GeneFunction, features(fp))
mp = metabolicprofile([1 0; 0 1], ["feature1", "feature2"], ["sample1", "sample2"])
@test mp isa CommunityProfile
@test all(f-> f isa Metabolite, features(mp))
end

"""
features(at::AbstractAbundanceTable)
Returns features in `at`. To get featurenames instead, use [`featurenames`](@ref).
"""
features(at::AbstractAbundanceTable) = axes(at.aa, 1) |> keys
features(at::AbstractAbundanceTable) = at.features

"""
samples(at::AbstractAbundanceTable)
Returns samples in `at`. To get samplenames instead, use [`samplenames`](@ref).
"""
samples(at::AbstractAbundanceTable) = axes(at.aa, 2) |> keys
samples(at::AbstractAbundanceTable) = at.samples

"""
samples(at::AbstractAbundanceTable, name::AbstractString)
Expand All @@ -73,76 +121,48 @@ function samples(at::AbstractAbundanceTable, name::AbstractString)
idx = findall(==(name), samplenames(at))
length(idx) == 0 && throw(IndexError("No samples called $name"))
length(idx) > 1 && throw(IndexError("More than one sample matches name $name"))
return samples(at)[axes(at.aa, 2)][first(idx)]
return samples(at)[axes(at.abundances, 2)][first(idx)]
end

profiletype(at::AbstractAbundanceTable) = eltype(features(at))
ranks(at::AbstractAbundanceTable) = taxrank.(features(at))

Base.size(at::AbstractAbundanceTable, dims...) = size(at.aa, dims...)
Base.size(at::AbstractAbundanceTable, dims...) = size(at.abundances, dims...)

Base.copy(at::AbstractAbundanceTable) = typeof(at)(copy(abundances(at)), copy(features(at)), deepcopy(samples(at)))
Base.copy(at::AbstractAbundanceTable) = CommunityProfile(copy(abundances(at)), copy(features(at)), deepcopy(samples(at)))

# -- Indexing -- #

function _index_profile(at, idx, inds)
# single value - return that value
ndims(idx) == 0 && return idx
# another table - return a new CommunityProfile with that table
ndims(idx) == 2 && return CommunityProfile(idx)
# a row or a column, figure out which, and make it 2D
if ndims(idx) == 1
dn = dimnames(idx)[1]
# if it's a row...
if dn == :samples
return at[[inds[1]], inds[2]]
# if it's a column
elseif dn == :features
return at[inds[1], [inds[2]]]
end
end
end

function _toinds(arr, inds::AbstractVector{Regex})
return findall(a-> any(ind-> contains(a, ind), inds), arr)
end

function _toinds(arr, inds::AbstractVector{<: Union{AbstractSample, AbstractFeature, AbstractString}})
return findall(a-> any(==(a), inds), arr)
end

# fall back ↑
_toinds(arr, ind::Union{AbstractSample, AbstractFeature, AbstractString, Regex}) = _toinds(arr, [ind])

# if inds are integers, just return them
_toinds(_, ind::Int) = ind
_toinds(_, inds::AbstractVector{Int}) = inds

function Base.getindex(at::CommunityProfile, inds...)
idx = at.aa[inds...]
function Base.getindex(at::AbstractAbundanceTable, rowind, colind)
rows = _toind(at.fidx, rowind)
cols = _toind(at.sidx, colind)

mat = copy(abundances(at)[rows, cols])

isempty(size(mat)) && return mat

_index_profile(at, idx, inds)
feat = copy(features(at))[rows]
smpl = deepcopy(samples(at))[cols]

feat isa AbstractFeature && (mat = reshape(mat, 1, length(mat)))
return CommunityProfile(mat, feat, smpl)
end

function Base.getindex(at::CommunityProfile, rowind::Union{T, AbstractVector{<:T}} where T<:Union{AbstractString,Regex}, colind)
rows = _toinds(featurenames(at), rowind)
idx = at.aa[rows, colind]
# For integers, or vectors of integers, just return them
_toind(_, ind) = ind
_toind(_, inds::AbstractVector) = inds

_index_profile(at, idx, (rows, colind))
end
# for strings and regex, look for matches
_toind(d, ind::AbstractString) = only((d[i] for i in findall(key-> key == ind, keys(d))))
_toind(d, ind::Regex) = [d[i] for i in findall(key-> contains(key, ind), keys(d))]

function Base.getindex(at::CommunityProfile, rowind, colind::Union{T, AbstractVector{<:T}} where T<:Union{AbstractString,Regex})
cols = _toinds(samplenames(at), colind)
idx = at.aa[rowind, cols]
_toind(d, inds::AbstractVector{<:AbstractString}) = [d[i] for i in findall(key-> any(ind-> key == ind, inds), keys(d))]
_toind(d, inds::AbstractVector{<:Regex}) = [d[i] for i in findall(key-> any(ind-> contains(key, ind), inds), keys(d))]

_index_profile(at, idx, (rowind, cols))
end
# For samples and features, look for string representation matches
_toind(d, ind::Union{AbstractSample, AbstractFeature}) = only((d[i] for i in findall(key-> key == String(ind), keys(d))))
_toind(d, inds::AbstractVector{<:Union{AbstractSample, AbstractFeature}}) = [d[i] for i in findall(key-> any(ind-> key == String(ind), inds), keys(d))]

function Base.getindex(at::CommunityProfile, rowind::Union{T, AbstractVector{<:T}} where T<:Union{AbstractString,Regex},
colind::Union{S, AbstractVector{<:S}} where S<:Union{AbstractString,Regex})
rows = _toinds(featurenames(at), rowind)
at[rows, colind]
end

## -- EcoBase Translations -- ##
# see src/ecobase.jl for Microbiome function names
Expand All @@ -152,7 +172,7 @@ end

EcoBase.thingnames(at::AbstractAbundanceTable) = name.(features(at))
EcoBase.placenames(at::AbstractAbundanceTable) = name.(samples(at))
EcoBase.occurrences(at::AbstractAbundanceTable) = parent(parent(at.aa)) # first parent is the unnamed AxisArray
EcoBase.occurrences(at::AbstractAbundanceTable) = at.abundances
EcoBase.nthings(at::AbstractAbundanceTable) = size(at, 1)
EcoBase.nplaces(at::AbstractAbundanceTable) = size(at, 2)
## todo
Expand Down Expand Up @@ -262,7 +282,7 @@ end
Like [`relativeabundance!`](@ref), but does not mutate original.
"""
function relativeabundance(at::AbstractAbundanceTable, kind::Symbol=:fraction)
comm = typeof(at)(float.(abundances(at)), deepcopy(features(at)), deepcopy(samples(at)))
comm = CommunityProfile(float.(abundances(at)), deepcopy(features(at)), deepcopy(samples(at)))
relativeabundance!(comm)
end

Expand All @@ -285,7 +305,7 @@ present(::Missing, m::Real=0.0) = missing
function present(at::AbstractAbundanceTable, minabundance::Real=0.0)
mat = spzeros(Bool, size(at)...)
for i in eachindex(mat)
mat[i] = present(at[i], minabundance)
mat[i] = present(at[Tuple(i)...], minabundance)
end
return mat
end
Expand Down
Loading

2 comments on commit cb9941c

@kescobo
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/55050

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.9.0 -m "<description of version>" cb9941ca31afc7b34a02eb67db25b1d7b91b1c10
git push origin v0.9.0

Please sign in to comment.