From 413fee3970bd0bd840b63aa8c31c6ba84fe82368 Mon Sep 17 00:00:00 2001
From: TuomasBorman <tvborm@utu.fi>
Date: Wed, 13 Nov 2024 11:08:57 +0200
Subject: [PATCH] up

---
 vignettes/case_study.Rmd | 147 ++++++++++++++++++---------------------
 1 file changed, 66 insertions(+), 81 deletions(-)

diff --git a/vignettes/case_study.Rmd b/vignettes/case_study.Rmd
index ed37a9d..23a3c2e 100644
--- a/vignettes/case_study.Rmd
+++ b/vignettes/case_study.Rmd
@@ -42,7 +42,8 @@ Our main study questions are:
 
 * How does treatment influence the gut microbiota of salmon?
 * Do gut flora and fatty acids composition evolve over time?
-* Is there a relationship between gut microbiota and the fatty acid composition in muscle tissue?
+* Is there a relationship between gut microbiota and the fatty acid composition
+in muscle tissue?
 
 ```{r}
 #| label: start
@@ -127,7 +128,8 @@ head(salmon_sample_ids)
 The data returned above is a list of all sample accession numbers that are
 associated with all salmons. For example, metagenomic amplicon samples, such as
 [SAMEA112750580](https://www.holofooddata.org/sample/SAMEA112750580)
-or fatty acid samples, [SAMEA112950027](https://www.holofooddata.org/sample/SAMEA112950027).
+or fatty acid samples,
+[SAMEA112950027](https://www.holofooddata.org/sample/SAMEA112950027).
 
 We can use these accession numbers to fetch the data associated with each
 sample type and store them as `experiments` in a `MultiAssayExperiment` (`MAE`)
@@ -151,7 +153,7 @@ mae <- HoloFoodR::getResult(
 
 # Save salmon MAE This must be named differently
 path <- system.file(
-  "extdata", "salmon_mae_without_mgnify.RDS", package = "HoloFoodR")
+    "extdata", "salmon_mae_without_mgnify.RDS", package = "HoloFoodR")
 saveRDS(object = mae, file = path)
 ```
 
@@ -161,7 +163,7 @@ saveRDS(object = mae, file = path)
 
 # Read MAE to avoid retrieval
 path <- system.file(
-  "extdata", "salmon_mae_without_mgnify.RDS", package = "HoloFoodR")
+    "extdata", "salmon_mae_without_mgnify.RDS", package = "HoloFoodR")
 mae <- readRDS(file = path)
 ```
 
@@ -169,8 +171,10 @@ mae <- readRDS(file = path)
 
 HoloFood database does not include the data for metagenomic data. This data
 can be retrieved from the [MGnify portal](https://www.ebi.ac.uk/metagenomics).
-For this purpose, we will use [MGnifyR package](https://bioconductor.org/packages/release/bioc/html/MGnifyR.html),
-which in a similar fashion to HoloFoodR, allows simple interaction with MGnify API.
+For this purpose, we will use
+[MGnifyR package](https://bioconductor.org/packages/release/bioc/html/MGnifyR.html),
+which in a similar fashion to HoloFoodR, allows simple interaction with MGnify
+API.
 
 ```{r}
 #| label: get_metagenomic_samples
@@ -217,7 +221,7 @@ tse <- MGnifyR::getResult(
 
 # Save salmon metagenomic TreeSE
 path <- system.file(
-  "extdata", "salmon_metagenomic_tse.RDS", package = "HoloFoodR")
+    "extdata", "salmon_metagenomic_tse.RDS", package = "HoloFoodR")
 saveRDS(object = mae, file = path)
 ```
 
@@ -228,7 +232,7 @@ saveRDS(object = mae, file = path)
 
 # Read in salmon metagenomic TreeSE object
 path <- system.file(
-  "extdata", "salmon_metagenomic_tse.RDS", package = "HoloFoodR")
+    "extdata", "salmon_metagenomic_tse.RDS", package = "HoloFoodR")
 tse <- readRDS(file = path)
 ```
 
@@ -293,8 +297,8 @@ mae <- mae[, colData(mae)[["Trial code"]] == "SA", ]
 
 Some values of fatty acids are under detection thresholds. We assume them to be
 zeroes. Moreover, the data includes a feature that just states from where the
-fatty acids were collected. We remove this feature to ensure that the assay contains
-only numeric values.
+fatty acids were collected. We remove this feature to ensure that the assay
+contains only numeric values.
 
 ```{r}
 #| label: preprocess2
@@ -323,15 +327,17 @@ information.
 
 # Add time points
 timepoints <- colData(mae[[2]])
-timepoints <- timepoints[match(timepoints[["animal"]], rownames(colData(mae))), ]
+timepoints <- timepoints[
+    match(timepoints[["animal"]], rownames(colData(mae))), ]
 timepoints <- ifelse(timepoints[["trial.timepoint"]] == 0, "start", "end")
 timepoints <- factor(timepoints, levels = c("start", "end"))
 colData(mae)[["timepoint"]] <- timepoints
 
 # Add treatment groups
 colData(mae)[["study_group"]] <- ifelse(
-  colData(mae)[["Treatment concentration"]]>0, "treatment", "control")
-colData(mae)[colData(mae)[["timepoint"]] == "start" , "study_group"] <- "control"
+    colData(mae)[["Treatment concentration"]]>0, "treatment", "control")
+colData(mae)[colData(mae)[["timepoint"]] == "start" , "study_group"] <-
+    "control"
 
 # Add animal metadata to separate experiments
 mae[[1]] <- getWithColData(mae, 1)
@@ -344,7 +350,7 @@ Next, we can agglomerate features by prevalence to reduce the number of
 low-abundant taxa and contaminants.
 
 First, we visualize prevalence distribution of taxa with a histogram to decide
-the prevalence threshold to use. We use 0.1% detection level to filter out
+the prevalence threshold to use. We use 0.2% detection level to filter out
 extremely low-abundant genera.
 
 ```{r}
@@ -362,13 +368,13 @@ prevalence <- getPrevalence(
     assay.type = "relabundance",
     na.rm = TRUE,
     sort = TRUE,
-    detection = 0.1 / 100
+    detection = 0.2 / 100
 )
 
 # Exclude microbes with 0 prevalence
 prevalence <- prevalence[prevalence != 0]
 
-hist(prevalence, xlab = "Prevalence")
+hist(prevalence, main = "", xlab = "Prevalence")
 ```
 
 We can also look at the raw prevalence numbers.
@@ -381,7 +387,8 @@ sort(prevalence, decreasing = TRUE) |> head(10)
 ```
 
 _Mycoplasma_ is present in all samples, which is not surprising as this genus
-was found to be one of the most common in salmon intestine (see @zarkasiPyrosequencingbasedCharacterizationGastrointestinal2014).
+was found to be one of the most common in salmon intestine
+(see @zarkasiPyrosequencingbasedCharacterizationGastrointestinal2014).
 
 We then agglomerate our data by prevalence and by taxonomic rank to obtain group
 all genera which are below the specified thresholds to the "Other" group.
@@ -402,22 +409,6 @@ altExp(mae[[2]], "prev_genus") <- agglomerateByPrevalence(
 )
 ```
 
-Moreover, we agglomerate the data to include only 19 most abundant genera.
-
-```{r}
-#| label: select_core
-
-tse <- agglomerateByRank(mae[[2]], rank = "Genus")
-core <- getTop(tse, top = 19)
-# Get vector that tells whether taxa belongs to core taxa or not
-core_names <- rownames(tse)
-core_names[ !core_names %in% core ] <- "other"
-# Agglomerate based on core taxa
-rowData(tse)[["core"]] <- core_names
-altExp(mae[[2]], "core_genus") <- agglomerateByVariable(
-  tse, by = "rows", group = "core")
-```
-
 Due to the limited number of samples, we also filter the fatty acid data to
 include only those fatty acids that show variation within the dataset. The
 rationale is that if a fatty acid does not vary, it cannot exhibit differences
@@ -459,14 +450,14 @@ well-established biological relevance. These include:
 #| label: filter_fatty3
 
 relevant_fatty_acids <- c(
-  "Docosahexaenoic acid 22:6n-3 (DHA)",
-  "Eicosapentaenoic acid 20:5n-3 (EPA)",
-  "Alpha-Linolenic acid 18:3n-3",
-  "Arachidonic acid 20:4n-6 (ARA)",
-  "Linoleic acid 18:2n-6",
-  "Oleic acid 18:1n-9",
-  "Palmitic acid 16:0",
-  "Stearic acid 18:0"
+    "Docosahexaenoic acid 22:6n-3 (DHA)",
+    "Eicosapentaenoic acid 20:5n-3 (EPA)",
+    "Alpha-Linolenic acid 18:3n-3",
+    "Arachidonic acid 20:4n-6 (ARA)",
+    "Linoleic acid 18:2n-6",
+    "Oleic acid 18:1n-9",
+    "Palmitic acid 16:0",
+    "Stearic acid 18:0"
 )
 altExp(mae[[1]], "relevant") <- mae[[1]][
     rownames(mae[[1]]) %in% relevant_fatty_acids, ]
@@ -579,13 +570,6 @@ composition with relative abundance barplot.
 #| label: abundance_plot
 #| fig-cap: Relative abundance of core microbial genera across samples.
 
-p <- plotAbundance(
-    altExp(mae[[2]], "core_genus"),
-    assay.type = "relabundance",
-    col.var = c("study_group", "timepoint"),
-    facet.cols = TRUE, scales = "free_x"
-    )
-
 p <- plotAbundance(
     altExp(mae[[2]], "prev_genus"),
     assay.type = "relabundance",
@@ -593,7 +577,6 @@ p <- plotAbundance(
     facet.cols = TRUE, scales = "free_x"
     )
 p
-p
 ```
 
 Salmon gut seems to be dominated by either genus _Mycoplasma_ or
@@ -627,7 +610,7 @@ res
 
 Based on the results, we conclude that older salmon exhibit distinct microbial
 diversity compared to younger ones. Additionally, there appears to be a
-slight—though not statistically significant—effect of treatment on microbial
+slight — though not statistically significant — effect of treatment on microbial
 diversity.
 
 ```{r}
@@ -735,9 +718,9 @@ p <- p + geom_shadowtext(aes(label = df[["percentage"]]))
 p
 ```
 
-Factor 1 primarily captures variance within the metagenomics data, while Factor
-2 mostly represents variance in fatty acids, including some shared variability.
-Factors 3 and 4 capture shared variability between the metagenomic data and
+Factor 1 captures only variance within the metagenomics data, while over 2/3 of
+variance captured by Factor 2 represents variance in fatty acids.
+Factor 3 captures shared variability between the metagenomic data and
 fatty acids, reflecting interconnected patterns between the two datasets.
 
 Before exploring the shared variability, we first examine which metagenomic
@@ -746,13 +729,13 @@ variability is captured by Factor 1.
 ```{r}
 #| label: plot_factor1
 #| fig-width: 10
-#| fig-height: 8
+#| fig-height: 6
 #| fig-cap: Features with the highest loadings for Factor 1.
 
 p1 <- plot_top_weights(model, view = 1, factors = 1, nfeatures = 25) +
-  labs(title = "Fatty acids")
+    labs(title = "Fatty acids")
 p2 <- plot_top_weights(model, view = 2, factors = 1, nfeatures = 25) +
-  labs(title = "Microbiota")
+    labs(title = "Microbiota")
 
 p1 + p2
 ```
@@ -765,46 +748,48 @@ Let us then focus on loadings of factor 2.
 ```{r}
 #| label: plot_factor2
 #| fig-width: 10
-#| fig-height: 8
+#| fig-height: 6
 #| fig-cap: Features with the highest loadings for Factor 2.
 
 p1 <- plot_top_weights(model, view = 1, factors = 2, nfeatures = 25) +
-  labs(title = "Fatty acids")
+    labs(title = "Fatty acids")
 p2 <- plot_top_weights(model, view = 2, factors = 2, nfeatures = 25) +
-  labs(title = "Microbiota")
+    labs(title = "Microbiota")
 
 p1 + p2
 ```
 
-In the microbial data, particularly _Cetobacterium_ and _Photobacterium_ show a
-positive association with Factor 2. Additionally, many fatty acids display
-significant weights in this factor, though no single fatty acid can be
-specifically tied to these taxa. This suggests that as the abundances of these
-taxa rise, there is a corresponding increase in overall fatty acid levels.
+In the microbial data, particularly _Cetobacterium_, _Vibrio_, and _Aliivibrio_
+show a negative association with Factor 2. Additionally, many fatty acids
+display significant negative weights in this factor, though no single fatty
+acid can be specifically tied to these taxa. This suggests that as the
+abundances of these taxa rise (or decrease), there is a corresponding
+increase (or decrease) in overall fatty acid levels.
+
+Next, we visualize Factor 3, which captured variance more evenly between
+microbes and fatty acids.
 
 ```{r}
 #| label: plot_factor3
 #| fig-width: 10
-#| fig-height: 15
-#| fig-cap: Features with the highest loadings for Factors 3 and 4.
-
-p <- lapply(c(3, 4), function(x){
-    p1 <- plot_top_weights(model, view = 1, factors = x, nfeatures = 25) +
-        labs(title = paste0("Fatty acids vs factor ", x))
-    p2 <- plot_top_weights(model, view = 2, factors = x, nfeatures = 25) +
-        labs(title = paste0("Microbiota vs factor ", x))
-    res <- list(p1, p2)
-    return(res)
-})
-p <- unlist(p, recursive = FALSE)
+#| fig-height: 6
+#| fig-cap: Features with the highest loadings for Factor 3.
+
+p1 <- plot_top_weights(model, view = 1, factors = 3, nfeatures = 25) +
+    labs(title = "Fatty acids")
+p2 <- plot_top_weights(model, view = 2, factors = 3, nfeatures = 25) +
+    labs(title = "Microbiota")
 
-wrap_plots(p, ncol = 2)
+p1 + p2
 ```
 
-From the shared Factors 3 and 4, _Aliivibrio_,  _Cetobacterium_, _Vibrio_, and
-_Photobacterium_ emerge prominently. Their association with the 22:1n-11 fatty
-acid and omega-3 fatty acids suggests a potential biological link between these
-microbes and specific fatty acid profiles, hinting at a functional relationship.
+From the shared Factor 3, _Photobacterium_ emerges prominently. Similarly to
+Factor 2, no single fatty acid can be directly associated with
+_Photobacterium_.
+
+Worth noting is that, out of these 5 taxa, only _Mycoplasma_ does not appear to
+share any variability with fatty acids, as all its variability was captured by
+the first factor, which did not associate with fatty acids.
 
 ## Conclusions