Merge pull request #460 from antagomir/aitchison

Aitchison
vegandevs · Feb 24, 2022 · 710877a · 710877a
2 parents 47e9fc3 + 734ed6f
commit 710877a
Show file tree

Hide file tree

Showing 5 changed files with 281 additions and 36 deletions.
diff --git a/R/decostand.R b/R/decostand.R
@@ -5,12 +5,12 @@
     x <- as.matrix(x)
     METHODS <- c("total", "max", "frequency", "normalize", "range", "rank",
                  "rrank", "standardize", "pa", "chi.square", "hellinger",
-                 "log", "clr", "rclr")
+                 "log", "clr", "rclr", "alr", "ilr")
     method <- match.arg(method, METHODS)
     if (any(x < 0, na.rm = na.rm)) {
         k <- min(x, na.rm = na.rm)
         if (method %in% c("total", "frequency", "pa", "chi.square", "rank",
-                          "rrank", "clr", "rclr")) {
+                          "rrank", "clr", "rclr", "alr", "ilr")) {
             warning("input data contains negative entries: result may be non-sense\n")
         }
     }
@@ -85,18 +85,31 @@
                     call. = FALSE)
         }
         x[x > 0 & !is.na(x)] <- log(x[x > 0 & !is.na(x)], base = logbase) + 1
+
+    }, alr = {
+        if (missing(MARGIN))
+	    MARGIN <- 1
+        if (MARGIN == 1) 
+          x <- .calc_alr(x, ...)
+	else x <- t(.calc_alr(t(x), ...))
+    }, ilr = {
+        if (missing(MARGIN))
+	    MARGIN <- 1
+        if (MARGIN == 1) 
+          x <- .calc_ilr(x, ...)
+	else x <- t(.calc_ilr(t(x), ...))
     }, clr = {
         if (missing(MARGIN))
 	    MARGIN <- 1
         if (MARGIN == 1) 
-            x <- t(.calc_clr(t(x), ...))
-	else x <- .calc_clr(x, ...)
+          x <- .calc_clr(x, ...)
+	else x <- t(.calc_clr(t(x), ...))
     }, rclr = {
         if (missing(MARGIN))
 	    MARGIN <- 1
         if (MARGIN == 1) 
-            x <- t(.calc_rclr(t(x)))
-	else x <- .calc_rclr(x)
+          x <- .calc_rclr(x, ...)
+	else x <- t(.calc_rclr(t(x), ...))
     })
     if (any(is.nan(x)))
         warning("result contains NaN, perhaps due to impossible mathematical 
@@ -109,56 +122,93 @@
 
 
 # Modified from the original version in mia R package
-.calc_clr <- function(x, pseudocount=0){
+.calc_clr <- function(x, pseudocount=0, na.rm=TRUE){
     # Add pseudocount
     x <- x + pseudocount
-    # Calculate relative abundance
-    x <- .calc_rel_abund(x)
-    # If there is negative values, gives an error.
+    # If there are negative values, gives an error.
     if (any(x <= 0, na.rm = TRUE)) {
         stop("Abundance table contains zero or negative values and ",
              "clr-transformation is being applied without (suitable) ",
-             "pseudocount. \n",
-             "Try to add pseudocount (default choice pseudocount = 1 for ",
-             "count assay; or pseudocount = min(x[x>0]) with relabundance ",
-             "assay).",
-             call. = FALSE)
+             "pseudocount. \n")
     }
     # In every sample, calculates the log of individual entries.
     # After that calculates
     # the sample-specific mean value and subtracts every entries'
     # value with that.
+    #clog <- t(log(x))
+    #t(clog - rowMeans(clog))
+
     clog <- log(x)
-    clogm <- colMeans(clog)
-    return(t(t(clog) - clogm))
+    clog - rowMeans(clog)
+
 }
 
 # Modified from the original version in mia R package
-.calc_rclr <- function(x){
+.calc_rclr <- function(x, na.rm=TRUE){
+    # If there are negative values, gives an error.
+    if (any(x < 0, na.rm = na.rm)) {
+        stop("Abundance table contains negative values. The 
+              rclr transformation assumes non-negative values.\n")
+    }
    # Log transform
-   log_x <- log(x)
+   clog <- log(x)
    # zeros are converted into infinite values in clr
    # They are converted to NAs for now
-   log_x[is.infinite(log_x)] <- NA
+   clog[is.infinite(clog)] <- NA
    # Calculates means for every sample, does not take NAs into account
-   mean_log_x <- colMeans(log_x, na.rm = TRUE)
+   mean_clog <- rowMeans(clog, na.rm = TRUE)
    # Calculates exponential values from means, i.e., geometric means
-   geometric_means_of_samples <- exp(mean_log_x)
+   geometric_means_of_samples <- exp(mean_clog)
    # Divides all values by their sample-wide geometric means
-   values_divided_by_geom_mean <- t(x)/geometric_means_of_samples
-   # Does logarithmic transform and transposes the table back to its original
+   # Then does logarithmic transform and transposes the table back to its original
    # form
-   return_x <- t(log(values_divided_by_geom_mean))
+   xx <- log(x/geometric_means_of_samples)
    # If there were zeros, there are infinite values after logarithmic transform.
    # They are converted to zero.
-   return_x[is.infinite(return_x)] <- 0
-   return_x
+   xx[is.infinite(xx)] <- 0
+   xx
 }
 
-# Modified from the original version in mia R package
-# Same as decostand method "total" but faster
-.calc_rel_abund <- function(x){
-    sweep(x, 2, colSums(x, na.rm = TRUE), "/")
+
+# Additive log-ratio (alr) transformation of the rows of 'x'.
+#
+# x            numeric matrix; every row is transformed separately and the
+#              columns are the parts (features) of the composition.
+# reference    the column used as the common denominator, given either as
+#              a column index or as a column name.
+# na.rm        passed to any() in the check for negative entries.
+# pseudocount  constant added to every entry before taking logarithms.
+#
+# Returns the matrix of log(x[, j] / x[, reference]) for all columns 'j'
+# except the reference, so the result has one column fewer than 'x'.
+.calc_alr <- function (x, reference = 1, na.rm=TRUE, pseudocount=0) {
+    # Add pseudocount
+    x <- x + pseudocount
+    # If there are negative values, give an error.
+    if (any(x < 0, na.rm = na.rm)) {
+        stop("Abundance table contains negative values and ",
+             "alr-transformation is being applied without (suitable) ",
+             "pseudocount. \n")
+    }
+    # The reference is a column of 'x': translate a name into its index
+    # (NA when there is no such column) ...
+    if (is.character(reference))
+        reference <- match(reference, colnames(x))
+    # ... and validate it against the number of columns, not rows.
+    if (length(reference) != 1 || is.na(reference) ||
+        reference < 1 || reference > ncol(x))
+        stop("The reference should be a feature name, or index between 1 to ",
+             ncol(x))
+    clog <- log(x)
+    # drop = FALSE keeps the result a matrix even when only one column is
+    # left; the reference column is recycled down the remaining columns.
+    clog[, -reference, drop = FALSE] - clog[, reference]
+}
+
+
+
+# Isometric log-ratio (ilr) transformation of the rows of 'x'.
+#
+# x            numeric matrix; every row is transformed separately.
+# pseudocount  constant added to every entry before taking logarithms.
+#
+# Returns a matrix with the row names of 'x' and one column fewer than
+# 'x'. Column j holds
+#     -sqrt(j/(j + 1)) * log(g(x[i, 1:j]) / x[i, j + 1])
+# where g() is the geometric mean of its arguments.
+.calc_ilr <- function (x, pseudocount=0) {
+    # Add pseudocount
+    x <- x + pseudocount
+    # If there are negative values, give an error.
+    if (any(x < 0, na.rm = TRUE)) {
+        stop("Abundance table contains negative values and ",
+             "ilr-transformation is being applied without (suitable) ",
+             "pseudocount. \n")
+    }
+
+    # For ideas on a more efficient implementation, check the packages
+    # compositions and philr for ilrBase
+    # The log of a geometric mean is the mean of the logs: working with
+    # logs avoids the overflow/underflow of prod() over many columns and
+    # lets each column be computed for all rows at once.
+    clog <- log(x)
+    x.ilr <- matrix(NA_real_, nrow(x), ncol(x)-1)
+    rownames(x.ilr) <- rownames(x)
+    # Running row sums of the logs of the first j columns
+    logsum <- numeric(nrow(x))
+    for (j in seq_len(ncol(x.ilr))) {
+        logsum <- logsum + clog[, j]
+        x.ilr[, j] <- -sqrt(j/(j + 1)) * (logsum/j - clog[, j + 1])
+    }
+
+    x.ilr
+
 }
 
 
diff --git a/R/vegdist.R b/R/vegdist.R
@@ -11,7 +11,7 @@
                  "kulczynski", "gower", "morisita", "horn", #8
                  "mountford", "jaccard", "raup", "binomial", "chao", #13
                  "altGower", "cao", "mahalanobis", "clark", "chisq", "chord", #19
-		 "aitchison", "aitchison_robust") # 21
+		 "Aitchison", "rAitchison") # 21
     method <- pmatch(method, METHODS)
     inm <- METHODS[method]
     if (is.na(method))
@@ -47,9 +47,9 @@
         x <- decostand(x, "chi.square")
     if (method == 19) # chord
         x <- decostand(x, "normalize")
-    if (method == 20)  # aitchison
+    if (method == 20)  # Aitchison
         x <- decostand(x, "clr", ...)  # dots to pass possible pseudocount
-    if (method == 21)  # aitchison_robust
+    if (method == 21)  # rAitchison
         x <- decostand(x, "rclr") # No pseudocount for rclr
     if (binary)
         x <- decostand(x, "pa")

diff --git a/man/decostand.Rd b/man/decostand.Rd
@@ -69,6 +69,30 @@ wisconsin(x)
      \code{\link{vegdist}}), but the standardization can be used 
      independently of distance indices.
 
+     \item \code{alr}: Additive log ratio ("alr") transformation
+     (Aitchison 1986) reduces data skewness and compositionality
+     bias. The transformation assumes positive values; pseudocounts
+     can be added with the argument \code{pseudocount}. One of the
+     features is used as a reference; this feature (name or index) can
+     be given by \code{reference}. The first feature is used by default
+     (\code{reference=1}).
+     Note that this transformation drops one
+     feature from the transformed output data. The \code{alr}
+     transformation is defined formally as follows: \deqn{alr =
+     [log\frac{x_1}{x_D}, ..., log\frac{x_{D-1}}{x_D}]} where the
+     denominator feature \eqn{x_D} can be chosen arbitrarily. This
+     transformation is often used with pH and other chemistry
+     measurements. It is also commonly used in multinomial logistic
+     regression.
+
+     \item \code{ilr}: Isometric log ratio ("ilr") transformation
+     (Aitchison 1986) reduces data skewness and compositionality
+     bias. The transformation assumes positive values, pseudocounts
+     can be added with the argument \code{pseudocount}. Note that this
+     transformation drops one feature from the transformed output
+     data. For a formal definition, see e.g. Egozcue et al. (2003). The
+     calculation may be slow for data sets with many samples or features.
+
      \item \code{clr}: centered log ratio ("clr") transformation proposed by
      Aitchison (1986) reduces data skewness and compositionality bias.
      This transformation has frequent applications in microbial ecology
@@ -98,7 +122,8 @@ wisconsin(x)
      \deqn{rclr = log_{10}\frac{x_{r}}{g(x_{r} > 0)}}{%
      rclr = log10(x_r/g(x_r > 0))}
      where \eqn{x_{r}} is a single relative value, and \eqn{g(x_{r} > 0)} is geometric 
-     mean of sample-wide relative values that are positive (over 0).  
+     mean of sample-wide relative values that are positive (over 0).
+
   }
 
   Standardization, as contrasted to transformation, means that the
@@ -125,7 +150,9 @@ wisconsin(x)
   \code{"method"}.
 }
 \author{Jari Oksanen, Etienne \enc{Laliberté}{Laliberte}
-  (\code{method = "log"}), Leo Lahti (\code{"clr"} and \code{"rclr"}).}
+  (\code{method = "log"}), Leo Lahti (\code{alr}, \code{ilr},
+  \code{"clr"} and \code{"rclr"}).}
+
 \note{Common transformations can be made with standard \R functions.}
 
 \references{ 
@@ -137,6 +164,11 @@ wisconsin(x)
   dispersion as a measure of beta diversity. \emph{Ecology Letters} 
   \strong{9}, 683--693.
 
+  Egozcue, J.J., Pawlowsky-Glahn, V., Mateu-Figueras, G.,
+  \enc{Barceló}{Barcelo}-Vidal, C. (2003) Isometric logratio transformations for
+  compositional data analysis. \emph{Mathematical Geology}
+  \strong{35}, 279--300.
+
   Gloor, G.B., Macklaim, J.M., Pawlowsky-Glahn, V. & Egozcue, J.J. (2017)
   Microbiome Datasets Are Compositional: And This Is Not Optional.
   \emph{Frontiers in Microbiology} \strong{8}, 2224. 
@@ -164,6 +196,9 @@ sptrans <- wisconsin(varespec)
 # CLR transformation for rows, with pseudocount
 varespec.clr <- decostand(varespec, "clr", pseudocount=1)
 
+# ALR transformation for rows, with pseudocount and reference sample
+varespec.alr <- decostand(varespec, "alr", pseudocount=1, reference=1)
+
 ## Chi-square: PCA similar but not identical to CA.
 ## Use wcmdscale for weighted analysis and identical results.
 sptrans <- decostand(varespec, "chi.square")

diff --git a/tests/aitchison-tests.R b/tests/aitchison-tests.R
@@ -0,0 +1,63 @@
+# Tests for the Aitchison and robust Aitchison distances in vegdist and
+# for the clr/rclr transformations in decostand they are built on.
+# Every comparison is wrapped in stopifnot() so that a mismatch fails the
+# test run instead of only printing FALSE.
+
+# Test data
+# data(varespec)
+# Fix the random number seed so that every run tests the same data
+set.seed(460)
+testdata <- matrix(round(runif(1000, 0, 100)), nrow=20)
+testdata <- testdata - 50
+testdata[testdata < 0] <- 0
+rownames(testdata) <- paste0("row", seq_len(nrow(testdata)))
+colnames(testdata) <- paste0("col", seq_len(ncol(testdata)))
+
+# Calculates relative abundance table
+relative <- vegan::decostand(testdata, "total")
+
+# Count and relative data with pseudocount
+testdata.with.pseudo <- testdata + 1
+relative.with.pseudo <- vegan::decostand(testdata+1, "total")
+
+# Aitchison distance equals Euclidean distance of CLR-transformed data
+# (pseudocount is necessary with clr)
+a1 <- vegan::vegdist(testdata+1, method = "Aitchison")
+a2 <- vegan::vegdist(vegan::decostand(testdata+1, "clr"), method = "euclidean")
+stopifnot(max(abs(a1-a2)) < 1e-6) # Tolerance
+
+# Robust Aitchison distance equals Euclidean distance of rCLR-transformed
+# data and works without pseudocount
+a1 <- vegan::vegdist(testdata, method = "rAitchison")
+a2 <- vegan::vegdist(vegan::decostand(testdata, "rclr"), method = "euclidean")
+stopifnot(max(abs(a1-a2)) < 1e-6) # Tolerance
+
+# Robust Aitchison and Aitchison are equal when there are no zeroes
+a1 <- vegan::vegdist(testdata.with.pseudo, method = "rAitchison")
+a2 <- vegan::vegdist(testdata.with.pseudo, method = "Aitchison")
+stopifnot(max(abs(a1-a2)) < 1e-6) # Tolerance
+
+# It is possible to pass pseudocount as a function argument to vegan::decostand
+a1 <- vegan::vegdist(testdata, method = "Aitchison", pseudocount=1)
+a2 <- vegan::vegdist(testdata+1, method = "Aitchison")
+stopifnot(max(abs(a1-a2)) < 1e-6) # Tolerance
+
+
+
+# Compare the outcomes with an external package that also provides compositional transformations
+# Adding these would demand adding Suggested packages in DESCRIPTION; skipped for now but can be
+# useful for manual testing.
+#skip <- TRUE
+#if (!skip) {
+#
+#    sum(compositions::ilr(testdata.with.pseudo) - vegan::decostand(testdata.with.pseudo, "ilr")) < 1e-6
+#    sum(compositions::ilr(testdata.with.pseudo) - vegan::decostand(testdata.with.pseudo, "ilr", MARGIN=1)) < 1e-6
+#    # rgr and compositions packages differ in sign; vegan::decostand is aligned with the "compositions" package
+#    sum(t(compositions::ilr(t(testdata.with.pseudo))) - (+vegan::decostand(testdata.with.pseudo, "ilr", MARGIN=2))) < 1e-6
+#    sum(t(rgr::ilr(t(testdata.with.pseudo))) - (-vegan::decostand(testdata.with.pseudo, "ilr", MARGIN=2))) < 1e-6    #
+#
+#    sum(compositions::clr(testdata.with.pseudo) - vegan::decostand(testdata.with.pseudo, "clr")) < 1e-6
+#    sum(compositions::clr(testdata.with.pseudo) - vegan::decostand(testdata.with.pseudo, "clr", MARGIN=1)) < 1e-6
+#    sum(t(compositions::clr(t(testdata.with.pseudo))) - vegan::decostand(testdata.with.pseudo, "clr", MARGIN=2)) < 1e-6
+#    sum(rgr::clr(testdata.with.pseudo) - vegan::decostand(testdata.with.pseudo, "clr"))<1e-6#
+#
+#    sum(compositions::alr(testdata.with.pseudo, ivar=1) - vegan::decostand(testdata.with.pseudo, "alr")) < 1e-6
+#    sum(compositions::alr(testdata.with.pseudo, ivar=1) - vegan::decostand(testdata.with.pseudo, "alr", MARGIN=1)) < 1e-6
+#    sum(t(compositions::alr(t(testdata.with.pseudo), ivar=1)) - vegan::decostand(testdata.with.pseudo, "alr", MARGIN=2)) < 1e-6
+#    sum(rgr::alr(testdata.with.pseudo, j=1) - vegan::decostand(testdata.with.pseudo, "alr", reference=1))<1e-6#
+#
+#}
+
+# --------------------------------------------------------------------------------------------------------------
+