From cb9941ca31afc7b34a02eb67db25b1d7b91b1c10 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kevbonham@gmail.com>
Date: Sun, 20 Feb 2022 14:00:14 -0500
Subject: [PATCH] Remove axis indices and nameddims (#126)

Also some API changes
---
 .gitignore              |   1 +
 Project.toml            |   9 +-
 docs/src/profiles.md    |  22 ++---
 src/Microbiome.jl       |   2 -
 src/diversity.jl        |   8 +-
 src/features.jl         |  11 +++
 src/profiles.jl         | 180 ++++++++++++++++++++++------------------
 src/samples.jl          |  12 ++-
 test/MicrobiomeTests.jl |  25 +++---
 9 files changed, 151 insertions(+), 119 deletions(-)

diff --git a/.gitignore b/.gitignore
index 59a11099..2829ee3b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ scratch*
 docs/build/
 docs/site/
 Manifest.toml
+.envrc
\ No newline at end of file
diff --git a/Project.toml b/Project.toml
index a66ae4d5..f074bdf2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,28 +4,23 @@ keywords = ["microbiology", "microbiome", "biology"]
 license = "MIT"
 desc = "Functions and types for working with microbial community data"
 authors = ["@kescobo <kevbonham@gmail.com>"]
-version = "0.8.4"
+version = "0.9.0"
 
 [deps]
-AxisIndices = "f52c9ee2-1b1c-4fd8-8546-6350938c7f11"
 Dictionaries = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4"
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 EcoBase = "a58aae7d-b440-5a11-b283-399458f99aac"
 MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
-NamedDims = "356022a1-0364-5f58-8944-0da4b18d706f"
 ReTest = "e0db7c4e-2690-44b9-bad6-7687da720f89"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
-AxisIndices = "0.7"
 Dictionaries = "0.3"
 Distances = "0.10"
 EcoBase = "0.1.4"
-MultivariateStats = "0.7, 0.8"
-NamedDims = "0.2"
+MultivariateStats = "0.9"
 ReTest = "0.3"
 Tables = "1.2.1"
 julia = "1.6"
-
diff --git a/docs/src/profiles.md b/docs/src/profiles.md
index 7f2499c3..71242884 100644
--- a/docs/src/profiles.md
+++ b/docs/src/profiles.md
@@ -233,21 +233,15 @@ It is often inconvenient to find the numerical index
 of a particular feature or sample.
 Instead, you can use strings or regular expressions
 to get slices of a `CommunityProfile`,
-which will match on the `name` field of the features or samples.
-This kind of indexing always returns a `CommunityProfile`,
-even if it only has 1 value.
+which will match on the `String` representation of the features or samples.
+As with numerical indexing, if the index returns
+a unique (feature, sample) pair,
+it will return the abundance of that pair.
+Otherwise, it will return a new `CommunityProfile`.
 
 ```jldoctest profiles
-julia> comm["g1", "s1"]
-CommunityProfile{Float64, Taxon, MicrobiomeSample} with 1 features in 1 samples
-
-Feature names:
-g1
-
-Sample names:
-s1
-
-
+julia> comm["g__g1", "s1"]
+0.0
 
 julia> comm[r"[gs]1", "s1"]
 CommunityProfile{Float64, Taxon, MicrobiomeSample} with 2 features in 1 samples
@@ -259,8 +253,6 @@ Sample names:
 s1
 ```
 
-
-
 ## [Working with metadata](@id working-metadata)
 
 The `metadata` dictionaries of `MicrobiomeSample`s can accessed
diff --git a/src/Microbiome.jl b/src/Microbiome.jl
index 70466d07..92926629 100644
--- a/src/Microbiome.jl
+++ b/src/Microbiome.jl
@@ -69,9 +69,7 @@ export ginisimpson,
 using Statistics
 using SparseArrays
 using EcoBase
-using AxisIndices
 using Dictionaries
-using NamedDims
 using Tables
 using Distances
 using MultivariateStats
diff --git a/src/diversity.jl b/src/diversity.jl
index 915afada..ce033bf0 100644
--- a/src/diversity.jl
+++ b/src/diversity.jl
@@ -30,10 +30,10 @@ Otherwise, uses `set!`.
 """
 function shannon!(abt::AbstractAbundanceTable; overwrite=false)
     func! = overwrite ? set! : insert!
-    for s in samplenames(abt)
+    for s in samples(abt)
         col = abt[:, s]
         sh = shannon(abundances(col))
-        func!(samples(col)[1], :shannon, sh)
+        func!(s, :shannon, sh)
     end
     return abt
 end
@@ -68,10 +68,10 @@ Otherwise, uses `set!`.
 """
 function ginisimpson!(abt::AbstractAbundanceTable; overwrite=false)
     func! = overwrite ? set! : insert!
-    for s in samplenames(abt)
+    for s in samples(abt)
         col = abt[:, s]
         sh = ginisimpson(abundances(col))
-        func!(samples(col)[1], :ginisimpson, sh)
+        func!(s, :ginisimpson, sh)
     end
     return abt
 end
diff --git a/src/features.jl b/src/features.jl
index 6249d940..ada710d9 100644
--- a/src/features.jl
+++ b/src/features.jl
@@ -1,3 +1,6 @@
+# Accessor for a feature's name: returns the value stored in its `name` field.
+name(as::AbstractFeature) = as.name
+
 const _ranks = (
     domain     = 0,
     kingdom    = 1,
@@ -165,6 +168,14 @@ hasrank(gf::GeneFunction) = hastaxon(gf) && !ismissing(taxrank(gf))
 
 Base.String(gf::GeneFunction) = hastaxon(gf) ? string(name(gf), '|', String(taxon(gf))) : name(gf)
 
+Base.:(==)(g1::GeneFunction, g2::GeneFunction) = String(g1) == String(g2)
+
+@testset "GeneFunction Equality" begin
+    @test GeneFunction("test") == GeneFunction("test")
+    @test GeneFunction("test", Taxon("taxon")) == GeneFunction("test", Taxon("taxon"))
+    @test GeneFunction("test") != GeneFunction("test", Taxon("taxon"))
+end
+
 """
     genefunction(n::AbstractString)
 
diff --git a/src/profiles.jl b/src/profiles.jl
index 21f366b2..203cd410 100644
--- a/src/profiles.jl
+++ b/src/profiles.jl
@@ -7,41 +7,56 @@ end
     CommunityProfile{T, F, S} <: AbstractAbundanceTable{T, F, S}
 
 An `AbstractAssemblage` from [EcoBase.jl](https://github.com/EcoJulia/EcoBase.jl)
-that uses an `AxisArray` of a `SparseMatrixCSC` under the hood.
+that uses a `SparseMatrixCSC` under the hood.
 
 `CommunityProfile`s are tables with `AbstractFeature`-indexed rows and
 `AbstractSample`-indexed columns.
 Note - we can use the `name` of samples and features to index.
 """
 mutable struct CommunityProfile{T, F, S} <: AbstractAbundanceTable{T, F, S}
-    aa::NamedAxisArray
-
-    function CommunityProfile(aa::NamedAxisArray)
-        @assert dimnames(aa) == (:features, :samples)
-        T = eltype(parent(aa))
-        F = eltype(keys(axes(aa, 1)))
-        S = eltype(keys(axes(aa, 2)))
-        return new{T, F, S}(aa)
+    abundances::SparseMatrixCSC{T}
+    features::AbstractVector{F}
+    samples::AbstractVector{S}
+    fidx::Dictionary{String, Int}
+    sidx::Dictionary{String, Int}
+
+    function CommunityProfile(abunds::AbstractSparseMatrix,
+                              feats::AbstractVector{<:AbstractFeature},
+                              smpls::AbstractVector{<:AbstractSample})
+        # Check dimensions first so the String => position lookups built below line up with the matrix.
+        length(feats) == size(abunds, 1) || throw(DimensionMismatch("Number of features must equal number of rows in matrix"))
+        length(smpls) == size(abunds, 2) || throw(DimensionMismatch("Number of samples must equal number of columns in matrix"))
+        fidx = Dictionary(String.(feats), eachindex(feats))
+        sidx = Dictionary(String.(smpls), eachindex(smpls))
+
+        T = eltype(abunds)
+        F = eltype(feats)
+        S = eltype(smpls)
+        return new{T, F, S}(abunds, feats, smpls, fidx, sidx)
+    end
 end
 
-function CommunityProfile(tab::SparseMatrixCSC{<:Real}, 
-                          features::AbstractVector{<:AbstractFeature},
-                          samples::AbstractVector{<:AbstractSample})
-    return CommunityProfile(NamedAxisArray(tab, features=features, samples=samples))
+
+function CommunityProfile(tab::AbstractVecOrMat,
+                          feats::AbstractVector{<:AbstractFeature},
+                          smpls::AbstractVector{<:AbstractSample})
+    return CommunityProfile(sparse(tab), feats, smpls)
 end
 
-function CommunityProfile{T, F, S}(tab::SparseMatrixCSC{<:T},
-                                   features::AbstractVector{F}, 
-                                   samples::AbstractVector{S}) where {T, F, S}
-    return CommunityProfile(tab, features, samples)
+# single-column CommunityProfile
+function CommunityProfile(tab::AbstractVecOrMat,
+                          feats::AbstractVector{<:AbstractFeature},
+                          smpl::AbstractSample)
+    return CommunityProfile(sparse(reshape(tab, size(tab,1), size(tab,2))), feats, [smpl])
 end
 
-function CommunityProfile(tab::AbstractMatrix,
-                          features::AbstractVector{<:AbstractFeature},
-                          samples::AbstractVector{<:AbstractSample})
-    return CommunityProfile(sparse(tab), features, samples)
+# single-row CommunityProfile
+function CommunityProfile(tab::AbstractVecOrMat,
+                          feat::AbstractFeature,
+                          smpls::AbstractVector{<:AbstractSample})
+    return CommunityProfile(sparse(reshape(tab, size(tab,1), size(tab,2))), [feat], smpls)
 end
+
 ## -- Convienience functions -- ##
 
 function ==(p1::CommunityProfile, p2::CommunityProfile)
@@ -50,19 +65,52 @@ function ==(p1::CommunityProfile, p2::CommunityProfile)
            features(p1)   == features(p2)
 end
 
+"""
+    taxonomicprofile(mat, features, samples)
+"""
+function taxonomicprofile(mat, features::AbstractVector{<:AbstractString}, samples::AbstractVector{<:AbstractString})
+    CommunityProfile(mat, Taxon.(features), MicrobiomeSample.(samples))
+end
+
+"""
+    functionalprofile(mat, features, samples)
+"""
+function functionalprofile(mat, features::AbstractVector{<:AbstractString}, samples::AbstractVector{<:AbstractString})
+    CommunityProfile(mat, GeneFunction.(features), MicrobiomeSample.(samples))
+end
+
+"""
+    metabolicprofile(mat, features, samples)
+"""
+function metabolicprofile(mat, features::AbstractVector{<:AbstractString}, samples::AbstractVector{<:AbstractString})
+    CommunityProfile(mat, Metabolite.(features), MicrobiomeSample.(samples))
+end
+
+@testset "String Constructors" begin
+    tp = taxonomicprofile([1 0; 0 1], ["feature1", "feature2"], ["sample1", "sample2"])
+    @test tp isa CommunityProfile
+    @test all(f-> f isa Taxon, features(tp))
+    fp = functionalprofile([1 0; 0 1], ["feature1", "feature2"], ["sample1", "sample2"])
+    @test fp isa CommunityProfile
+    @test all(f-> f isa GeneFunction, features(fp))
+    mp = metabolicprofile([1 0; 0 1], ["feature1", "feature2"], ["sample1", "sample2"])
+    @test mp isa CommunityProfile
+    @test all(f-> f isa Metabolite, features(mp))
+end
+
 """
     features(at::AbstractAbundanceTable)
 
 Returns features in `at`. To get featurenames instead, use [`featurenames`](@ref).
 """
-features(at::AbstractAbundanceTable) = axes(at.aa, 1) |> keys
+features(at::AbstractAbundanceTable) = at.features
 
 """
     samples(at::AbstractAbundanceTable)
 
 Returns samples in `at`. To get samplenames instead, use [`samplenames`](@ref).
 """
-samples(at::AbstractAbundanceTable) = axes(at.aa, 2) |> keys
+samples(at::AbstractAbundanceTable) = at.samples
 
 """
     samples(at::AbstractAbundanceTable, name::AbstractString)
@@ -73,76 +121,48 @@ function samples(at::AbstractAbundanceTable, name::AbstractString)
     idx = findall(==(name), samplenames(at))
     length(idx) == 0 && throw(IndexError("No samples called $name"))
     length(idx) > 1 && throw(IndexError("More than one sample matches name $name"))
-    return samples(at)[axes(at.aa, 2)][first(idx)]
+    return samples(at)[axes(at.abundances, 2)][first(idx)]
 end
 
 profiletype(at::AbstractAbundanceTable) = eltype(features(at))
 ranks(at::AbstractAbundanceTable) = taxrank.(features(at))
 
-Base.size(at::AbstractAbundanceTable, dims...) = size(at.aa, dims...)
+Base.size(at::AbstractAbundanceTable, dims...) = size(at.abundances, dims...)
 
-Base.copy(at::AbstractAbundanceTable) = typeof(at)(copy(abundances(at)), copy(features(at)), deepcopy(samples(at)))
+Base.copy(at::AbstractAbundanceTable) = CommunityProfile(copy(abundances(at)), copy(features(at)), deepcopy(samples(at)))
 
 # -- Indexing -- #
 
-function _index_profile(at, idx, inds)
-    # single value - return that value
-    ndims(idx) == 0 && return idx 
-    # another table - return a new CommunityProfile with that table
-    ndims(idx) == 2 && return CommunityProfile(idx)
-    # a row or a column, figure out which, and make it 2D
-    if ndims(idx) == 1
-        dn = dimnames(idx)[1]
-        # if it's a row...
-        if dn == :samples
-            return at[[inds[1]], inds[2]]
-        # if it's a column
-        elseif dn == :features
-            return at[inds[1], [inds[2]]]
-        end
-    end
-end
-
-function _toinds(arr, inds::AbstractVector{Regex})
-    return findall(a-> any(ind-> contains(a, ind), inds), arr)
-end
-
-function _toinds(arr, inds::AbstractVector{<: Union{AbstractSample, AbstractFeature, AbstractString}})
-    return findall(a-> any(==(a), inds), arr)
-end
-
-# fall back ↑
-_toinds(arr, ind::Union{AbstractSample, AbstractFeature, AbstractString, Regex}) = _toinds(arr, [ind])
-
-# if inds are integers, just return them
-_toinds(_, ind::Int) = ind
-_toinds(_, inds::AbstractVector{Int}) = inds
-
-function Base.getindex(at::CommunityProfile, inds...)
-    idx = at.aa[inds...]
+function Base.getindex(at::AbstractAbundanceTable, rowind, colind)
+    rows = _toind(at.fidx, rowind)
+    cols = _toind(at.sidx, colind)
+    
+    mat = copy(abundances(at)[rows, cols])
+    
+    isempty(size(mat)) && return mat
     
-    _index_profile(at, idx, inds)
+    feat = copy(features(at))[rows]
+    smpl = deepcopy(samples(at))[cols]
+
+    feat isa AbstractFeature && (mat = reshape(mat, 1, length(mat)))
+    return CommunityProfile(mat, feat, smpl)
 end
 
-function Base.getindex(at::CommunityProfile, rowind::Union{T, AbstractVector{<:T}} where T<:Union{AbstractString,Regex}, colind)
-    rows = _toinds(featurenames(at), rowind)
-    idx = at.aa[rows, colind]
+# For integers, or vectors of integers, just return them
+_toind(_, ind) = ind
+_toind(_, inds::AbstractVector) = inds
 
-    _index_profile(at, idx, (rows, colind))
-end
+# for strings and regex, look for matches
+_toind(d, ind::AbstractString) = only((d[i] for i in findall(key-> key == ind, keys(d))))
+_toind(d, ind::Regex)          = [d[i] for i in findall(key-> contains(key, ind), keys(d))]
 
-function Base.getindex(at::CommunityProfile, rowind, colind::Union{T, AbstractVector{<:T}} where T<:Union{AbstractString,Regex})
-    cols = _toinds(samplenames(at), colind)
-    idx = at.aa[rowind, cols]
+_toind(d, inds::AbstractVector{<:AbstractString}) = [d[i] for i in findall(key-> any(ind-> key == ind, inds), keys(d))]
+_toind(d, inds::AbstractVector{<:Regex})          = [d[i] for i in findall(key-> any(ind-> contains(key, ind), inds), keys(d))]
 
-    _index_profile(at, idx, (rowind, cols))
-end
+# For samples and features, look for string representation matches
+_toind(d, ind::Union{AbstractSample, AbstractFeature}) = only((d[i] for i in findall(key-> key == String(ind), keys(d))))
+_toind(d, inds::AbstractVector{<:Union{AbstractSample, AbstractFeature}}) = [d[i] for i in findall(key-> any(ind-> key == String(ind), inds), keys(d))]
 
-function Base.getindex(at::CommunityProfile, rowind::Union{T, AbstractVector{<:T}} where T<:Union{AbstractString,Regex},
-                                             colind::Union{S, AbstractVector{<:S}} where S<:Union{AbstractString,Regex})
-    rows = _toinds(featurenames(at), rowind)
-    at[rows, colind]
-end
 
 ## -- EcoBase Translations -- ##
 # see src/ecobase.jl for Microbiome function names
@@ -152,7 +172,7 @@ end
 
 EcoBase.thingnames(at::AbstractAbundanceTable) = name.(features(at))
 EcoBase.placenames(at::AbstractAbundanceTable) = name.(samples(at))
-EcoBase.occurrences(at::AbstractAbundanceTable) = parent(parent(at.aa)) # first parent is the unnamed AxisArray
+EcoBase.occurrences(at::AbstractAbundanceTable) = at.abundances
 EcoBase.nthings(at::AbstractAbundanceTable) = size(at, 1)
 EcoBase.nplaces(at::AbstractAbundanceTable) = size(at, 2)
 ## todo
@@ -262,7 +282,7 @@ end
 Like [`relativeabundance!`](@ref), but does not mutate original.
 """
 function relativeabundance(at::AbstractAbundanceTable, kind::Symbol=:fraction)
-    comm = typeof(at)(float.(abundances(at)), deepcopy(features(at)), deepcopy(samples(at)))
+    comm = CommunityProfile(float.(abundances(at)), deepcopy(features(at)), deepcopy(samples(at)))
     relativeabundance!(comm)
 end
 
@@ -285,7 +305,7 @@ present(::Missing, m::Real=0.0) = missing
 function present(at::AbstractAbundanceTable, minabundance::Real=0.0)
     mat = spzeros(Bool, size(at)...)
     for i in eachindex(mat)
-        mat[i] = present(at[i], minabundance)
+        mat[i] = present(at[Tuple(i)...], minabundance)
     end
     return mat
 end
diff --git a/src/samples.jl b/src/samples.jl
index ffd7ce35..59e1573c 100644
--- a/src/samples.jl
+++ b/src/samples.jl
@@ -19,7 +19,17 @@ will update the parent `AbstractSample` as well.
 """
 metadata(as::AbstractSample) = as.metadata
 
-name(as::AbstractFeature) = as.name
+Base.:(==)(as1::AbstractSample, as2::AbstractSample) = name(as1) == name(as2)
+
+@testset "Sample equality" begin
+    as1 = MicrobiomeSample("test")
+    as2 = deepcopy(as1)
+    as3 = MicrobiomeSample("test2")
+    set!(as1, :testvar, 1)
+    @test as1 == as2
+    @test as1 != as3
+    @test as2 != as3
+end
 
 Base.String(as::AbstractSample) = name(as)
 Base.String(af::AbstractFeature) = name(af)
diff --git a/test/MicrobiomeTests.jl b/test/MicrobiomeTests.jl
index 24454cd2..8d1a3e0b 100644
--- a/test/MicrobiomeTests.jl
+++ b/test/MicrobiomeTests.jl
@@ -6,7 +6,6 @@ using Microbiome.SparseArrays
 using Microbiome.Tables
 using Microbiome.Dictionaries
 import Microbiome.MultivariateStats: MDS
-using Documenter
 
 @testset "Samples and Features" begin
     @testset "MicriobiomeSamples and metadata" begin
@@ -131,7 +130,7 @@ end
     comm = CommunityProfile(mat, txs, mss)
 
     @testset "Profile operations" begin
-        @test CommunityProfile{Float64, Taxon, MicrobiomeSample}(mat, txs, mss) isa CommunityProfile
+        @test CommunityProfile(mat, txs, mss) isa CommunityProfile
         @test comm == CommunityProfile(dmat, txs, mss)
         
         @test nsamples(comm) == 5
@@ -185,9 +184,9 @@ end
 
         @test_throws ErrorException commjoin(comm, comm)
         let c3 = commjoin(comm[:,1:2], comm[:, 3:4], comm[:, 5])
-            @test abundances(c3) == abundances(comm)
-            @test samples(c3) == samples(comm)
-            @test features(c3) == features(comm)
+            @test all(abundances(c3) .== abundances(comm))
+            @test all(samples(c3) .== samples(comm))
+            @test all(features(c3) .== features(comm))
         end
 
         filtertest = CommunityProfile(sparse(Float64[3 2 1 # 0.66, assuming minabundance 2
@@ -219,8 +218,10 @@ end
         
         @test filter(hastaxon, strat)         |> nfeatures == 2
         @test filter(!hastaxon, strat)        |> nfeatures == 2
-        @test strat["gene1", :]               |> nfeatures == 3
-        @test strat[["gene1", "gene2"], :]    |> nfeatures == 4
+        @test strat["gene1", :]               |> nfeatures == 1
+        @test strat[["gene1", "gene2"], :]    |> nfeatures == 2
+        @test strat[r"gene1", :]               |> nfeatures == 3
+        @test strat[r"gene[12]", :]    |> nfeatures == 4
         @test strat[GeneFunction("gene1"), :] |> nfeatures == 1
     end
 
@@ -302,12 +303,16 @@ end
         
         for i in 1:5
             @test abundances(comm[:, "sample$i"]) == mat[:, [i]]
-            @test abundances(comm["taxon$i", :]) == mat[[i], :]
+            @test abundances(comm["$(keys(Microbiome._shortranks)[i])__taxon$i", :]) == mat[[i], :]
         end
 
-        @test abundances(comm[r"taxon1", :]) == abundances(comm[["taxon1", "taxon10"], :]) == abundances(comm[[1,10], :])
+        @test abundances(comm[r"taxon1", :]) == abundances(comm[["d__taxon1", "u__taxon10"], :]) == abundances(comm[[1,10], :])
         @test abundances(comm[:, r"sample[13]"]) == abundances(comm[:,["sample1", "sample3"]]) == abundances(comm[:, [1,3]])
-        @test abundances(comm[r"taxon1", r"sample[13]"]) == abundances(comm[["taxon1", "taxon10"],["sample1", "sample3"]]) == abundances(comm[[1,10], [1,3]])
+        @test abundances(comm[r"taxon1", r"sample[13]"]) == 
+              abundances(comm[["d__taxon1", "u__taxon10"],["sample1", "sample3"]]) == 
+              abundances(comm[["d__taxon1", "u__taxon10"],[r"sample1", r"sample3"]]) == 
+              abundances(comm[[1,10], [1,3]])
+
 
         for (i, col) in enumerate(Tables.columns(comm))
             if i == 1