From cb9941ca31afc7b34a02eb67db25b1d7b91b1c10 Mon Sep 17 00:00:00 2001 From: Kevin Bonham Date: Sun, 20 Feb 2022 14:00:14 -0500 Subject: [PATCH] Remove axis indices and nameddims (#126) Also some API changes --- .gitignore | 1 + Project.toml | 9 +- docs/src/profiles.md | 22 ++--- src/Microbiome.jl | 2 - src/diversity.jl | 8 +- src/features.jl | 11 +++ src/profiles.jl | 180 ++++++++++++++++++++++------------------ src/samples.jl | 12 ++- test/MicrobiomeTests.jl | 25 +++--- 9 files changed, 151 insertions(+), 119 deletions(-) diff --git a/.gitignore b/.gitignore index 59a11099..2829ee3b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ scratch* docs/build/ docs/site/ Manifest.toml +.envrc \ No newline at end of file diff --git a/Project.toml b/Project.toml index a66ae4d5..f074bdf2 100644 --- a/Project.toml +++ b/Project.toml @@ -4,28 +4,23 @@ keywords = ["microbiology", "microbiome", "biology"] license = "MIT" desc = "Functions and types for working with microbial community data" authors = ["@kescobo "] -version = "0.8.4" +version = "0.9.0" [deps] -AxisIndices = "f52c9ee2-1b1c-4fd8-8546-6350938c7f11" Dictionaries = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" EcoBase = "a58aae7d-b440-5a11-b283-399458f99aac" MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411" -NamedDims = "356022a1-0364-5f58-8944-0da4b18d706f" ReTest = "e0db7c4e-2690-44b9-bad6-7687da720f89" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -AxisIndices = "0.7" Dictionaries = "0.3" Distances = "0.10" EcoBase = "0.1.4" -MultivariateStats = "0.7, 0.8" -NamedDims = "0.2" +MultivariateStats = "0.9" ReTest = "0.3" Tables = "1.2.1" julia = "1.6" - diff --git a/docs/src/profiles.md b/docs/src/profiles.md index 7f2499c3..71242884 100644 --- a/docs/src/profiles.md +++ b/docs/src/profiles.md @@ -233,21 +233,15 @@ It is often inconvenient to find the numerical index of a particular feature or sample. Instead, you can use strings or regular expressions to get slices of a `CommunityProfile`, -which will match on the `name` field of the features or samples. -This kind of indexing always returns a `CommunityProfile`, -even if it only has 1 value. +which will match on the `String` representation of the features or samples. +As with numerical indexing, if the index returns +a unique (feature, sample) pair, +it will return the abundance of that pair. +Otherwise, it will return a new CommunityProfile. ```jldoctest profiles -julia> comm["g1", "s1"] -CommunityProfile{Float64, Taxon, MicrobiomeSample} with 1 features in 1 samples - -Feature names: -g1 - -Sample names: -s1 - - +julia> comm["g__g1", "s1"] +0.0 julia> comm[r"[gs]1", "s1"] CommunityProfile{Float64, Taxon, MicrobiomeSample} with 2 features in 1 samples @@ -259,8 +253,6 @@ Sample names: s1 ``` - - ## [Working with metadata](@id working-metadata) The `metadata` dictionaries of `MicrobiomeSample`s can accessed diff --git a/src/Microbiome.jl b/src/Microbiome.jl index 70466d07..92926629 100644 --- a/src/Microbiome.jl +++ b/src/Microbiome.jl @@ -69,9 +69,7 @@ export ginisimpson, using Statistics using SparseArrays using EcoBase -using AxisIndices using Dictionaries -using NamedDims using Tables using Distances using MultivariateStats diff --git a/src/diversity.jl b/src/diversity.jl index 915afada..ce033bf0 100644 --- a/src/diversity.jl +++ b/src/diversity.jl @@ -30,10 +30,10 @@ Otherwise, uses `set!`. """ function shannon!(abt::AbstractAbundanceTable; overwrite=false) func! = overwrite ? set! : insert! - for s in samplenames(abt) + for s in samples(abt) col = abt[:, s] sh = shannon(abundances(col)) - func!(samples(col)[1], :shannon, sh) + func!(s, :shannon, sh) end return abt end @@ -68,10 +68,10 @@ Otherwise, uses `set!`. """ function ginisimpson!(abt::AbstractAbundanceTable; overwrite=false) func! = overwrite ? set! : insert! - for s in samplenames(abt) + for s in samples(abt) col = abt[:, s] sh = ginisimpson(abundances(col)) - func!(samples(col)[1], :ginisimpson, sh) + func!(s, :ginisimpson, sh) end return abt end diff --git a/src/features.jl b/src/features.jl index 6249d940..ada710d9 100644 --- a/src/features.jl +++ b/src/features.jl @@ -1,3 +1,6 @@ +name(as::AbstractFeature) = as.name +name(as::AbstractFeature) = as.name + const _ranks = ( domain = 0, kingdom = 1, @@ -165,6 +168,14 @@ hasrank(gf::GeneFunction) = hastaxon(gf) && !ismissing(taxrank(gf)) Base.String(gf::GeneFunction) = hastaxon(gf) ? string(name(gf), '|', String(taxon(gf))) : name(gf) +Base.:(==)(g1::GeneFunction, g2::GeneFunction) = String(g1) == String(g2) + +@testset "GeneFunction Equality" begin + @test GeneFunction("test") == GeneFunction("test") + @test GeneFunction("test", Taxon("taxon")) == GeneFunction("test", Taxon("taxon")) + @test GeneFunction("test") != GeneFunction("test", Taxon("taxon")) +end + """ genefunction(n::AbstractString) diff --git a/src/profiles.jl b/src/profiles.jl index 21f366b2..203cd410 100644 --- a/src/profiles.jl +++ b/src/profiles.jl @@ -7,41 +7,56 @@ end CommunityProfile{T, F, S} <: AbstractAbundanceTable{T, F, S} An `AbstractAssemblage` from [EcoBase.jl](https://github.com/EcoJulia/EcoBase.jl) -that uses an `AxisArray` of a `SparseMatrixCSC` under the hood. +that uses a `SparseMatrixCSC` under the hood. `CommunityProfile`s are tables with `AbstractFeature`-indexed rows and `AbstractSample`-indexed columns. Note - we can use the `name` of samples and features to index. """ mutable struct CommunityProfile{T, F, S} <: AbstractAbundanceTable{T, F, S} - aa::NamedAxisArray - - function CommunityProfile(aa::NamedAxisArray) - @assert dimnames(aa) == (:features, :samples) - T = eltype(parent(aa)) - F = eltype(keys(axes(aa, 1))) - S = eltype(keys(axes(aa, 2))) - return new{T, F, S}(aa) + abundances::SparseMatrixCSC{T} + features::AbstractVector{F} + samples::AbstractVector{S} + fidx::Dictionary{String, Int} + sidx::Dictionary{String, Int} + + function CommunityProfile(abunds::AbstractSparseMatrix, + feats::AbstractVector{<:AbstractFeature}, + smpls::AbstractVector{<:AbstractSample}) + + length(feats) == size(abunds, 1) || throw(DimensionMismatch("Number of features must equal number of rows in matrix")) + length(smpls) == size(abunds, 2) || throw(DimensionMismatch("Number of samples must equal number columns in matrix")) + fidx = Dictionary(String.(feats), eachindex(feats)) + sidx = Dictionary(String.(smpls), eachindex(smpls)) + + T = eltype(abunds) + F = eltype(feats) + S = eltype(smpls) + return new{T, F, S}(abunds, feats, smpls, fidx, sidx) end end -function CommunityProfile(tab::SparseMatrixCSC{<:Real}, - features::AbstractVector{<:AbstractFeature}, - samples::AbstractVector{<:AbstractSample}) - return CommunityProfile(NamedAxisArray(tab, features=features, samples=samples)) + +function CommunityProfile(tab::AbstractVecOrMat, + feats::AbstractVector{<:AbstractFeature}, + smpls::AbstractVector{<:AbstractSample}) + return CommunityProfile(sparse(tab), feats, smpls) end -function CommunityProfile{T, F, S}(tab::SparseMatrixCSC{<:T}, - features::AbstractVector{F}, - samples::AbstractVector{S}) where {T, F, S} - return CommunityProfile(tab, features, samples) +# single-column CommunityProfile +function CommunityProfile(tab::AbstractVecOrMat, + feats::AbstractVector{<:AbstractFeature}, + smpl::AbstractSample) + return CommunityProfile(sparse(reshape(tab, size(tab,1), size(tab,2))), feats, [smpl]) end -function CommunityProfile(tab::AbstractMatrix, - features::AbstractVector{<:AbstractFeature}, - samples::AbstractVector{<:AbstractSample}) - return CommunityProfile(sparse(tab), features, samples) +# single-row CommunityProfile +function CommunityProfile(tab::AbstractVecOrMat, + feat::AbstractFeature, + smpls::AbstractVector{<:AbstractSample}) + return CommunityProfile(sparse(reshape(tab, size(tab,1), size(tab,2))), [feat], smpls) end + ## -- Convienience functions -- ## function ==(p1::CommunityProfile, p2::CommunityProfile) @@ -50,19 +65,52 @@ function ==(p1::CommunityProfile, p2::CommunityProfile) features(p1) == features(p2) end +""" + taxonomicprofile(mat, features, samples) +""" +function taxonomicprofile(mat, features::AbstractVector{<:AbstractString}, samples::AbstractVector{<:AbstractString}) + CommunityProfile(mat, Taxon.(features), MicrobiomeSample.(samples)) +end + +""" + functionalprofile(mat, features, samples) +""" +function functionalprofile(mat, features::AbstractVector{<:AbstractString}, samples::AbstractVector{<:AbstractString}) + CommunityProfile(mat, GeneFunction.(features), MicrobiomeSample.(samples)) +end + +""" + metabolicprofile(mat, features, samples) +""" +function metabolicprofile(mat, features::AbstractVector{<:AbstractString}, samples::AbstractVector{<:AbstractString}) + CommunityProfile(mat, Metabolite.(features), MicrobiomeSample.(samples)) +end + +@testset "String Constructors" begin + tp = taxonomicprofile([1 0; 0 1], ["feature1", "feature2"], ["sample1", "sample2"]) + @test tp isa CommunityProfile + @test all(f-> f isa Taxon, features(tp)) + fp = functionalprofile([1 0; 0 1], ["feature1", "feature2"], ["sample1", "sample2"]) + @test fp isa CommunityProfile + @test all(f-> f isa GeneFunction, features(fp)) + mp = metabolicprofile([1 0; 0 1], ["feature1", "feature2"], ["sample1", "sample2"]) + @test mp isa CommunityProfile + @test all(f-> f isa Metabolite, features(mp)) +end + """ features(at::AbstractAbundanceTable) Returns features in `at`. To get featurenames instead, use [`featurenames`](@ref). """ -features(at::AbstractAbundanceTable) = axes(at.aa, 1) |> keys +features(at::AbstractAbundanceTable) = at.features """ samples(at::AbstractAbundanceTable) Returns samples in `at`. To get samplenames instead, use [`samplenames`](@ref). """ -samples(at::AbstractAbundanceTable) = axes(at.aa, 2) |> keys +samples(at::AbstractAbundanceTable) = at.samples """ samples(at::AbstractAbundanceTable, name::AbstractString) @@ -73,76 +121,48 @@ function samples(at::AbstractAbundanceTable, name::AbstractString) idx = findall(==(name), samplenames(at)) length(idx) == 0 && throw(IndexError("No samples called $name")) length(idx) > 1 && throw(IndexError("More than one sample matches name $name")) - return samples(at)[axes(at.aa, 2)][first(idx)] + return samples(at)[axes(at.abundances, 2)][first(idx)] end profiletype(at::AbstractAbundanceTable) = eltype(features(at)) ranks(at::AbstractAbundanceTable) = taxrank.(features(at)) -Base.size(at::AbstractAbundanceTable, dims...) = size(at.aa, dims...) +Base.size(at::AbstractAbundanceTable, dims...) = size(at.abundances, dims...) -Base.copy(at::AbstractAbundanceTable) = typeof(at)(copy(abundances(at)), copy(features(at)), deepcopy(samples(at))) +Base.copy(at::AbstractAbundanceTable) = CommunityProfile(copy(abundances(at)), copy(features(at)), deepcopy(samples(at))) # -- Indexing -- # -function _index_profile(at, idx, inds) - # single value - return that value - ndims(idx) == 0 && return idx - # another table - return a new CommunityProfile with that table - ndims(idx) == 2 && return CommunityProfile(idx) - # a row or a column, figure out which, and make it 2D - if ndims(idx) == 1 - dn = dimnames(idx)[1] - # if it's a row... - if dn == :samples - return at[[inds[1]], inds[2]] - # if it's a column - elseif dn == :features - return at[inds[1], [inds[2]]] - end - end -end - -function _toinds(arr, inds::AbstractVector{Regex}) - return findall(a-> any(ind-> contains(a, ind), inds), arr) -end - -function _toinds(arr, inds::AbstractVector{<: Union{AbstractSample, AbstractFeature, AbstractString}}) - return findall(a-> any(==(a), inds), arr) -end - -# fall back ↑ -_toinds(arr, ind::Union{AbstractSample, AbstractFeature, AbstractString, Regex}) = _toinds(arr, [ind]) - -# if inds are integers, just return them -_toinds(_, ind::Int) = ind -_toinds(_, inds::AbstractVector{Int}) = inds - -function Base.getindex(at::CommunityProfile, inds...) - idx = at.aa[inds...] +function Base.getindex(at::AbstractAbundanceTable, rowind, colind) + rows = _toind(at.fidx, rowind) + cols = _toind(at.sidx, colind) + + mat = copy(abundances(at)[rows, cols]) + + isempty(size(mat)) && return mat - _index_profile(at, idx, inds) + feat = copy(features(at))[rows] + smpl = deepcopy(samples(at))[cols] + + feat isa AbstractFeature && (mat = reshape(mat, 1, length(mat))) + return CommunityProfile(mat, feat, smpl) end -function Base.getindex(at::CommunityProfile, rowind::Union{T, AbstractVector{<:T}} where T<:Union{AbstractString,Regex}, colind) - rows = _toinds(featurenames(at), rowind) - idx = at.aa[rows, colind] +# For integers, or vectors of integers, just return them +_toind(_, ind) = ind +_toind(_, inds::AbstractVector) = inds - _index_profile(at, idx, (rows, colind)) -end +# for strings and regex, look for matches +_toind(d, ind::AbstractString) = only((d[i] for i in findall(key-> key == ind, keys(d)))) +_toind(d, ind::Regex) = [d[i] for i in findall(key-> contains(key, ind), keys(d))] -function Base.getindex(at::CommunityProfile, rowind, colind::Union{T, AbstractVector{<:T}} where T<:Union{AbstractString,Regex}) - cols = _toinds(samplenames(at), colind) - idx = at.aa[rowind, cols] +_toind(d, inds::AbstractVector{<:AbstractString}) = [d[i] for i in findall(key-> any(ind-> key == ind, inds), keys(d))] +_toind(d, inds::AbstractVector{<:Regex}) = [d[i] for i in findall(key-> any(ind-> contains(key, ind), inds), keys(d))] - _index_profile(at, idx, (rowind, cols)) -end +# For samples and features, look for string representation matches +_toind(d, ind::Union{AbstractSample, AbstractFeature}) = only((d[i] for i in findall(key-> key == String(ind), keys(d)))) +_toind(d, inds::AbstractVector{<:Union{AbstractSample, AbstractFeature}}) = [d[i] for i in findall(key-> any(ind-> key == String(ind), inds), keys(d))] -function Base.getindex(at::CommunityProfile, rowind::Union{T, AbstractVector{<:T}} where T<:Union{AbstractString,Regex}, - colind::Union{S, AbstractVector{<:S}} where S<:Union{AbstractString,Regex}) - rows = _toinds(featurenames(at), rowind) - at[rows, colind] -end ## -- EcoBase Translations -- ## # see src/ecobase.jl for Microbiome function names @@ -152,7 +172,7 @@ end EcoBase.thingnames(at::AbstractAbundanceTable) = name.(features(at)) EcoBase.placenames(at::AbstractAbundanceTable) = name.(samples(at)) -EcoBase.occurrences(at::AbstractAbundanceTable) = parent(parent(at.aa)) # first parent is the unnamed AxisArray +EcoBase.occurrences(at::AbstractAbundanceTable) = at.abundances EcoBase.nthings(at::AbstractAbundanceTable) = size(at, 1) EcoBase.nplaces(at::AbstractAbundanceTable) = size(at, 2) ## todo @@ -262,7 +282,7 @@ end Like [`relativeabundance!`](@ref), but does not mutate original. """ function relativeabundance(at::AbstractAbundanceTable, kind::Symbol=:fraction) - comm = typeof(at)(float.(abundances(at)), deepcopy(features(at)), deepcopy(samples(at))) + comm = CommunityProfile(float.(abundances(at)), deepcopy(features(at)), deepcopy(samples(at))) relativeabundance!(comm) end @@ -285,7 +305,7 @@ present(::Missing, m::Real=0.0) = missing function present(at::AbstractAbundanceTable, minabundance::Real=0.0) mat = spzeros(Bool, size(at)...) for i in eachindex(mat) - mat[i] = present(at[i], minabundance) + mat[i] = present(at[Tuple(i)...], minabundance) end return mat end diff --git a/src/samples.jl b/src/samples.jl index ffd7ce35..59e1573c 100644 --- a/src/samples.jl +++ b/src/samples.jl @@ -19,7 +19,17 @@ will update the parent `AbstractSample` as well. """ metadata(as::AbstractSample) = as.metadata -name(as::AbstractFeature) = as.name +Base.:(==)(as1::AbstractSample, as2::AbstractSample) = name(as1) == name(as2) + +@testset "Sample equality" begin + as1 = MicrobiomeSample("test") + as2 = deepcopy(as1) + as3 = MicrobiomeSample("test2") + set!(as1, :testvar, 1) + @test as1 == as2 + @test as1 != as3 + @test as2 != as3 +end Base.String(as::AbstractSample) = name(as) Base.String(af::AbstractFeature) = name(af) diff --git a/test/MicrobiomeTests.jl b/test/MicrobiomeTests.jl index 24454cd2..8d1a3e0b 100644 --- a/test/MicrobiomeTests.jl +++ b/test/MicrobiomeTests.jl @@ -6,7 +6,6 @@ using Microbiome.SparseArrays using Microbiome.Tables using Microbiome.Dictionaries import Microbiome.MultivariateStats: MDS -using Documenter @testset "Samples and Features" begin @testset "MicriobiomeSamples and metadata" begin @@ -131,7 +130,7 @@ end comm = CommunityProfile(mat, txs, mss) @testset "Profile operations" begin - @test CommunityProfile{Float64, Taxon, MicrobiomeSample}(mat, txs, mss) isa CommunityProfile + @test CommunityProfile(mat, txs, mss) isa CommunityProfile @test comm == CommunityProfile(dmat, txs, mss) @test nsamples(comm) == 5 @@ -185,9 +184,9 @@ end @test_throws ErrorException commjoin(comm, comm) let c3 = commjoin(comm[:,1:2], comm[:, 3:4], comm[:, 5]) - @test abundances(c3) == abundances(comm) - @test samples(c3) == samples(comm) - @test features(c3) == features(comm) + @test all(abundances(c3) .== abundances(comm)) + @test all(samples(c3) .== samples(comm)) + @test all(features(c3) .== features(comm)) end filtertest = CommunityProfile(sparse(Float64[3 2 1 # 0.66, assuming minabundance 2 @@ -219,8 +218,10 @@ end @test filter(hastaxon, strat) |> nfeatures == 2 @test filter(!hastaxon, strat) |> nfeatures == 2 - @test strat["gene1", :] |> nfeatures == 3 - @test strat[["gene1", "gene2"], :] |> nfeatures == 4 + @test strat["gene1", :] |> nfeatures == 1 + @test strat[["gene1", "gene2"], :] |> nfeatures == 2 + @test strat[r"gene1", :] |> nfeatures == 3 + @test strat[r"gene[12]", :] |> nfeatures == 4 @test strat[GeneFunction("gene1"), :] |> nfeatures == 1 end @@ -302,12 +303,16 @@ end for i in 1:5 @test abundances(comm[:, "sample$i"]) == mat[:, [i]] - @test abundances(comm["taxon$i", :]) == mat[[i], :] + @test abundances(comm["$(keys(Microbiome._shortranks)[i])__taxon$i", :]) == mat[[i], :] end - @test abundances(comm[r"taxon1", :]) == abundances(comm[["taxon1", "taxon10"], :]) == abundances(comm[[1,10], :]) + @test abundances(comm[r"taxon1", :]) == abundances(comm[["d__taxon1", "u__taxon10"], :]) == abundances(comm[[1,10], :]) @test abundances(comm[:, r"sample[13]"]) == abundances(comm[:,["sample1", "sample3"]]) == abundances(comm[:, [1,3]]) - @test abundances(comm[r"taxon1", r"sample[13]"]) == abundances(comm[["taxon1", "taxon10"],["sample1", "sample3"]]) == abundances(comm[[1,10], [1,3]]) + @test abundances(comm[r"taxon1", r"sample[13]"]) == + abundances(comm[["d__taxon1", "u__taxon10"],["sample1", "sample3"]]) == + abundances(comm[["d__taxon1", "u__taxon10"],[r"sample1", r"sample3"]]) == + abundances(comm[[1,10], [1,3]]) + for (i, col) in enumerate(Tables.columns(comm)) if i == 1