Merge pull request #14 from zargham-ahmad/refactor

First iteration Refactoring for normFact and R2 function
RECETOX · Aug 1, 2024 · ce4d983 · ce4d983
2 parents b26c678 + 80f139a
commit ce4d983
Show file tree

Hide file tree

Showing 5 changed files with 221 additions and 188 deletions.
diff --git a/.github/workflows/r-conda.yml b/.github/workflows/r-conda.yml
@@ -20,22 +20,29 @@ jobs:
     strategy:
       fail-fast: false      
       matrix:
-        os: [ windows-latest, ubuntu-latest, macos-latest]
+        os: [ windows-latest, ubuntu-latest, macos-14]
+        experimental: [false]
+        include:
+          - os: macos-14-arm64
+            experimental: true
     runs-on: ${{ matrix.os }}
+    continue-on-error: ${{ matrix.experimental }}
     defaults:
       run:
         shell: bash -l {0}
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install wget
         if: ${{ matrix.os == 'windows-latest' }}
         run: choco install wget
-
+      - name: Sets env vars for osx-64
+        run: echo "CONDA_SUBDIR=osx-64" >> $GITHUB_ENV
+        if: ${{ matrix.os == 'macos-14' }}
       - name: Create conda environment
-        uses: conda-incubator/setup-miniconda@v2
+        uses: conda-incubator/setup-miniconda@v3
         with:
           activate-environment: recetox-waveica-dev
           auto-update-conda: true
@@ -45,11 +52,10 @@ jobs:
           conda init bash
           conda env list
       - name: Fetch input data
-        run: wget -P tests/testthat/test-data -i tests/remote-files/fetch_input_data.txt
-      - name: Fetch batchwise waveica results
-        run: wget -P tests/testthat/test-data/batchwise-correction -i tests/remote-files/fetch_waveica_batchwise.txt
-      - name: Fetch nonbatchwise waveica results
-        run: wget -P tests/testthat/test-data/nonbatchwise-correction -i tests/remote-files/fetch_waveica_nonbatchwise.txt
+        run: | 
+          wget -P tests/testthat/test-data -i tests/remote-files/fetch_input_data.txt
+          wget -P tests/testthat/test-data/batchwise-correction -i tests/remote-files/fetch_waveica_batchwise.txt
+          wget -P tests/testthat/test-data/nonbatchwise-correction -i tests/remote-files/fetch_waveica_nonbatchwise.txt
       - name: setup Rtools
         if: ${{ matrix.os == 'windows-latest' }}
         uses: r-windows/install-rtools@master

diff --git a/R/R2.R b/R/R2.R
@@ -1,131 +1,130 @@
-R2 <- function(poi, V, poiType, pval = T) {
-  #
-  #   R2(poi,V,poiType)
-  #
-  # Args:
-  # - V is a p*k matrix, where the rows corresponds to the samples
-  # - poi is a matrix p*l, representing the phenotypes of interest
-  # - poiType (1*l) is the types of poi: 'continuous' (then a linear
-  # regression is used) or 'categorical' (then the mean by class is used)
-  #
-  # Outputs:
-  # - R2(l), higher R^2 value between a column of V and poi(l)
-  # - idxCorr(l), index of the column of V giving the higher R^2 value (if many,
-  # takes the first one)
-  # - allR2(k,l),  R2 value for column k of V with poi l
-  #
-  #    IF pval =TRUE, return also:   #
-  # - pv(l) smaller p-value association between a column of V and poi(l)
-  # - idxcorr2(l) index of the column of V giving the smaller p-value (if many,
-  #                                                                          # takes the first one)
-  # - allpv(k,l),  p-value for column k of V with poi l
-  #
-  # if missing information in poi, remove the corresponding samples in the R2 computation
+linear_regression_estimation <- function(component_values, poi_values) {
+  coefs <- coef(lm(component_values ~ as.numeric(poi_values)))
+  estimated_values <- coefs[2] * as.numeric(poi_values) + coefs[1]
+  return(estimated_values)
+}
 
+class_mean_estimation <- function(component_values, poi_values) {
+  classes <- unique(poi_values)
+  estimated_values <- rep(NA, length(component_values))
+  for (class in classes) {
+    class_indices <- which(poi_values == class)
+    estimated_values[class_indices] <- mean(component_values[class_indices])
+  }
+  return(estimated_values)
+}
 
-  if (is.vector(V)) {
-    V <- matrix(V, ncol = 1)
+#' Compute R-squared and P-values between Phenotypes of Interest and Components
+#'
+#' This function computes the R-squared and optionally the p-values between phenotypes of interest (POI) and components.
+#'
+#' @param poi Matrix (p x l). Representing the phenotypes of interest.
+#' @param components Matrix (p x k). The components where the rows correspond to the samples.
+#' @param poi_types Character vector (length l). Types of POI: 'continuous' (for linear regression) or 'categorical' (for class mean).
+#' @param pval Logical. If TRUE, compute p-values in addition to R-squared values. Default is TRUE.
+#' @return A list containing:
+#' \item{R2}{Vector (length l). Highest R-squared value between a column of `components` and each POI.}
+#' \item{idxCorr}{Vector (length l). Index of the column of `components` giving the highest R-squared value.}
+#' \item{allR2}{Matrix (k x l). R-squared values for each column of `components` with each POI.}
+#' \item{pv}{Vector (length l). Smallest p-value association between a column of `components` and each POI (if `pval` is TRUE).}
+#' \item{idxCorr2}{Vector (length l). Index of the column of `components` giving the smallest p-value (if `pval` is TRUE).}
+#' \item{allpv}{Matrix (k x l). P-values for each column of `components` with each POI (if `pval` is TRUE).}
+#' @export
+R2 <- function(poi, components, poi_types, pval = TRUE) {
+  if (is.vector(components)) {
+    components <- matrix(components, ncol = 1)
   }
   if (is.vector(poi)) {
     poi <- matrix(poi, nrow = length(poi))
   }
 
-  p <- nrow(V) # number of samples
-  k <- ncol(V) # number of components
-  l <- length(poiType) # number of cf/poi to test
-  if (is.null(l)) {
-    stop("POI type(s) neeeded")
+  n_samples <- nrow(components)
+  n_components <- ncol(components)
+  n_poi <- length(poi_types)
+
+  if (is.null(n_poi)) {
+    stop("POI type(s) needed")
   }
-  p2 <- nrow(poi)
-  l2 <- ncol(poi)
 
-  if (l2 != l) { # checking poi and poiType dimensions compatiblity
-    if (p2 == l) { # if poi is transposed (l*p)
+  poi_rows <- nrow(poi)
+  poi_cols <- ncol(poi)
+
+  if (poi_cols != n_poi) {
+    if (poi_rows == n_poi) {
       poi <- t(poi)
-      warning("Transposing poi to match poiType dimension")
-      p2 <- nrow(poi)
+      warning("Transposing POI to match POI types dimension")
+      poi_rows <- nrow(poi)
     } else {
-      print(poi)
-      print(poiType)
-      stop("poi dimensions doesn't match poiType dimension")
+      stop("POI dimensions do not match POI types dimension")
     }
   }
 
-
-  if (p != p2) { # checking poi and V dimensions compatiblity
-    if (p2 == k) {
-      warnings("Transposing V to match poi dimension")
-      V <- t(V)
-      k <- p
-      p <- p2
+  if (n_samples != poi_rows) {
+    if (poi_rows == n_components) {
+      warning("Transposing components to match POI dimension")
+      components <- t(components)
+      n_components <- n_samples
+      n_samples <- poi_rows
     } else {
-      stop("poi and V dimensions incompatible")
+      stop("POI and components dimensions incompatible")
     }
   }
 
-
-
-
-
-
-  R2 <- rep(-1, l)
-  names(R2) <- colnames(poi)
-  idxcorr <- R2
-  R2_tmp <- matrix(rep(-1, k * l), k, l, dimnames = list(colnames(V), colnames(poi))) # r2_tmp(k,l) hold the R2 value for column k of V with poi l
+  R2_values <- rep(-1, n_poi)
+  names(R2_values) <- colnames(poi)
+  idx_corr <- R2_values
+  R2_tmp <- matrix(rep(-1, n_components * n_poi), n_components, n_poi, dimnames = list(colnames(components), colnames(poi)))
 
   if (pval) {
-    pv <- R2
-    idxcorr2 <- R2
-    pv_tmp <- R2_tmp # r2_tmp(k,l) hold the R2 value for column k of V with poi l
+    p_values <- R2_values
+    idx_corr2 <- R2_values
+    p_values_tmp <- R2_tmp
   }
 
-  for (cmpt in 1:k) { # for each column of V
-    cmpt2an <- V[, cmpt]
-    for (ipoi in 1:l) {
-      idx_finite <- is.finite(as.factor(poi[, ipoi]))
-      poi2an <- poi[idx_finite, ipoi]
-      cmpt2an_finite <- cmpt2an[idx_finite]
-      if (poiType[ipoi] == "continuous") { # estimation by linear regression
-        coefs <- coef(lm(cmpt2an_finite ~ as.numeric(poi2an)))
-        cmpt2an_est <- coefs[2] * as.numeric(poi2an) + coefs[1]
-        nc <- 2
-      } else if (poiType[ipoi] == "categorical") { # estimation by classe mean
-        classes <- unique(poi2an)
-        nc <- length(classes)
-        cmpt2an_est <- rep(NA, length(cmpt2an_finite))
-        for (icl in 1:length(classes)) {
-          idxClasse <- which(poi2an == classes[icl])
-          cmpt2an_est[idxClasse] <- mean(cmpt2an_finite[idxClasse])
-        }
+  for (component_idx in 1:n_components) {
+    component_values <- components[, component_idx]
+    for (poi_idx in 1:n_poi) {
+      finite_indices <- is.finite(as.factor(poi[, poi_idx]))
+      poi_values <- poi[finite_indices, poi_idx]
+      finite_component_values <- component_values[finite_indices]
+
+      if (poi_types[poi_idx] == "continuous") {
+        estimated_values <- linear_regression_estimation(finite_component_values, poi_values)
+        num_classes <- 2
+      } else if (poi_types[poi_idx] == "categorical") {
+        estimated_values <- class_mean_estimation(finite_component_values, poi_values)
+        num_classes <- length(unique(poi_values))
       } else {
-        stop("Incorrect poiType. Select 'continuous' or 'categorical'. ")
+        stop("Incorrect poi_type. Select 'continuous' or 'categorical'.")
       }
-      sse <- sum((cmpt2an_finite - cmpt2an_est)^2)
-      sst <- sum((cmpt2an_finite - mean(cmpt2an_finite))^2)
-      R2_tmp[cmpt, ipoi] <- 1 - sse / sst
+
+      sse <- sum((finite_component_values - estimated_values)^2)
+      sst <- sum((finite_component_values - mean(finite_component_values))^2)
+      R2_tmp[component_idx, poi_idx] <- 1 - sse / sst
+
       if (pval) {
-        F <- ((sst - sse) / (nc - 1)) / (sse / (p - nc))
-        pv_tmp[cmpt, ipoi] <- 1 - pf(F, nc - 1, p - nc)
-        if (!is.finite(pv_tmp[cmpt, ipoi])) {
-          warning(paste("Non finite p-value for component ", cmpt, " (pv=", pv_tmp[cmpt, ipoi], ", F=", F, "), assigning NA", sep = ""))
-          pv_tmp[cmpt, ipoi] <- NA
+        F_value <- ((sst - sse) / (num_classes - 1)) / (sse / (n_samples - num_classes))
+        p_values_tmp[component_idx, poi_idx] <- 1 - pf(F_value, num_classes - 1, n_samples - num_classes)
+        if (!is.finite(p_values_tmp[component_idx, poi_idx])) {
+          warning(sprintf("Non-finite p-value for component %d (pv=%g, F=%g), assigning NA", component_idx, p_values_tmp[component_idx, poi_idx], F_value))
+          p_values_tmp[component_idx, poi_idx] <- NA
         }
       }
     }
   }
 
-  for (ipoi in 1:l) {
+  for (poi_idx in 1:n_poi) {
     if (pval) {
-      pv[ipoi] <- min(pv_tmp[, ipoi])
-      idxcorr2[ipoi] <- which(pv_tmp[, ipoi] == pv[ipoi])[1] # if more than one component gives the best R2, takes the first one
+      p_values[poi_idx] <- min(p_values_tmp[, poi_idx])
+      idx_corr2[poi_idx] <- which(p_values_tmp[, poi_idx] == p_values[poi_idx])[1]
     }
-    R2[ipoi] <- max(R2_tmp[, ipoi])
-    idxcorr[ipoi] <- which(R2_tmp[, ipoi] == R2[ipoi])[1] # if more than one component gives the best R2, takes the first one
+    R2_values[poi_idx] <- max(R2_tmp[, poi_idx])
+    idx_corr[poi_idx] <- which(R2_tmp[, poi_idx] == R2_values[poi_idx])[1]
   }
 
   if (pval) {
-    return(list(R2 = R2, idxcorr = idxcorr, allR2 = R2_tmp, pv = pv, idxcorr2 = idxcorr2, allpv = pv_tmp))
+    return(list(R2 = R2_values, idxCorr = idx_corr, allR2 = R2_tmp, pv = p_values, idxCorr2 = idx_corr2, allpv = p_values_tmp))
   } else {
-    return(list(R2 = R2, idxcorr = idxcorr, allR2 = R2_tmp))
+    return(list(R2 = R2_values, idxCorr = idx_corr, allR2 = R2_tmp))
   }
 }
diff --git a/R/WaveICA.R b/R/WaveICA.R
@@ -107,10 +107,10 @@ waveica_nonbatchwise <- function(data, wf = "haar", injection_order, alpha = 0,
 #' @param wf String. Wavelet function, the default is "haar".
 #' @param batch Vector. Batch number of each sample.
 #' @param factorization String. Matrix factorization method, options are ["stICA", "SVD"]. The default is "stICA".
-#' @param group Vector, optional. Type of a sample (blank, sample, QC) numerically encoded to blank:0, sample:1, QC:2.
+#' @param group Vector, optional. Type of a sample (blank, sample, QC, standard) numerically encoded to blank:0, sample:1, QC:2, standard:3.
 #' @param K Integer. The maximal number of independent components (for ICA) or singular vectors (SVD). The default is 20.
-#' @param t Float between 0 and 1. The threshold to consider a component associate with the batch. The default is 0.05.
-#' @param t2 Float between 0 and 1. The threshold to consider a component associate with the group. The default is 0.05.
+#' @param batch_threshold Float between 0 and 1. The threshold to consider a component associate with the batch. The default is 0.05.
+#' @param group_threshold Float between 0 and 1. The threshold to consider a component associate with the group. The default is 0.05.
 #' @param alpha Float between 0 and 1. The trade-off value between the independence of samples and those
 #' of variables. The default is 0.
 #' @return Dataframe. Feature table with intensities corrected of batch effects.
@@ -121,8 +121,8 @@ waveica <- function(data,
                     factorization = "stICA",
                     group = NULL,
                     K = 20,
-                    t = 0.05,
-                    t2 = 0.05,
+                    batch_threshold = 0.05,
+                    group_threshold = 0.05,
                     alpha = 0) {
   if (!factorization %in% c("stICA", "SVD")) {
     stop("The factorization method should be 'stICA' or 'SVD'.")
@@ -137,8 +137,8 @@ waveica <- function(data,
   cat(paste("Performing matrix factorization...\n"))
   for (i in (1:index)) {
     data_coef <- coef[[i]]
-    data_coef_ICA <- normFact(fact = factorization, X = t(data_coef), ref = batch, refType = "categorical", k = K, t = t, ref2 = group, refType2 = "categorical", t2 = t2, alpha)
-    data_wave_ICA[[i]] <- t(data_coef_ICA$Xn)
+    data_coef_ICA <- normFact(factorization_method = factorization, data_matrix = t(data_coef), batch_vector = batch, batch_type = "categorical", rank = K, batch_threshold = batch_threshold, group_matrix = group, group_types = "categorical", group_threshold = group_threshold, alpha)
+    data_wave_ICA[[i]] <- t(data_coef_ICA$normalized_matrix)
   }
   data_wave <- wt_reconstruction(data, data_wave_ICA, wf)