diff --git a/.github/workflows/r-conda.yml b/.github/workflows/r-conda.yml index 050fb98..472f32f 100644 --- a/.github/workflows/r-conda.yml +++ b/.github/workflows/r-conda.yml @@ -20,8 +20,13 @@ jobs: strategy: fail-fast: false matrix: - os: [ windows-latest, ubuntu-latest, macos-latest] + os: [ windows-latest, ubuntu-latest, macos-14] + experimental: [false] + include: + - os: macos-14-arm64 + experimental: true runs-on: ${{ matrix.os }} + continue-on-error: ${{ matrix.experimental }} defaults: run: shell: bash -l {0} @@ -29,13 +34,15 @@ jobs: # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Install wget if: ${{ matrix.os == 'windows-latest' }} run: choco install wget - + - name: Sets env vars for osx-64 + run: echo "CONDA_SUBDIR=osx-64" >> $GITHUB_ENV + if: ${{ matrix.os == 'macos-14' }} - name: Create conda environment - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v3 with: activate-environment: recetox-waveica-dev auto-update-conda: true @@ -45,11 +52,10 @@ jobs: conda init bash conda env list - name: Fetch input data - run: wget -P tests/testthat/test-data -i tests/remote-files/fetch_input_data.txt - - name: Fetch batchwise waveica results - run: wget -P tests/testthat/test-data/batchwise-correction -i tests/remote-files/fetch_waveica_batchwise.txt - - name: Fetch nonbatchwise waveica results - run: wget -P tests/testthat/test-data/nonbatchwise-correction -i tests/remote-files/fetch_waveica_nonbatchwise.txt + run: | + wget -P tests/testthat/test-data -i tests/remote-files/fetch_input_data.txt + wget -P tests/testthat/test-data/batchwise-correction -i tests/remote-files/fetch_waveica_batchwise.txt + wget -P tests/testthat/test-data/nonbatchwise-correction -i tests/remote-files/fetch_waveica_nonbatchwise.txt - name: setup Rtools if: ${{ 
matrix.os == 'windows-latest' }} uses: r-windows/install-rtools@master diff --git a/R/R2.R b/R/R2.R index c639fc6..619fd7b 100644 --- a/R/R2.R +++ b/R/R2.R @@ -1,131 +1,130 @@ -R2 <- function(poi, V, poiType, pval = T) { - # - # R2(poi,V,poiType) - # - # Args: - # - V is a p*k matrix, where the rows corresponds to the samples - # - poi is a matrix p*l, representing the phenotypes of interest - # - poiType (1*l) is the types of poi: 'continuous' (then a linear - # regression is used) or 'categorical' (then the mean by class is used) - # - # Outputs: - # - R2(l), higher R^2 value between a column of V and poi(l) - # - idxCorr(l), index of the column of V giving the higher R^2 value (if many, - # takes the first one) - # - allR2(k,l), R2 value for column k of V with poi l - # - # IF pval =TRUE, return also: # - # - pv(l) smaller p-value association between a column of V and poi(l) - # - idxcorr2(l) index of the column of V giving the smaller p-value (if many, - # # takes the first one) - # - allpv(k,l), p-value for column k of V with poi l - # - # if missing information in poi, remove the corresponding samples in the R2 computation +linear_regression_estimation <- function(component_values, poi_values) { + coefs <- coef(lm(component_values ~ as.numeric(poi_values))) + estimated_values <- coefs[2] * as.numeric(poi_values) + coefs[1] + return(estimated_values) +} +class_mean_estimation <- function(component_values, poi_values) { + classes <- unique(poi_values) + estimated_values <- rep(NA, length(component_values)) + for (class in classes) { + class_indices <- which(poi_values == class) + estimated_values[class_indices] <- mean(component_values[class_indices]) + } + return(estimated_values) +} - if (is.vector(V)) { - V <- matrix(V, ncol = 1) +#' Compute R-squared and P-values between Phenotypes of Interest and Components +#' +#' This function computes the R-squared and optionally the p-values between phenotypes of interest (POI) and components. 
+#' +#' @param poi Matrix (p x l). Representing the phenotypes of interest. +#' @param components Matrix (p x k). The components where the rows correspond to the samples. +#' @param poi_types Character vector (length l). Types of POI: 'continuous' (for linear regression) or 'categorical' (for class mean). +#' @param pval Logical. If TRUE, compute p-values in addition to R-squared values. Default is TRUE. +#' @return A list containing: +#' \item{R2}{Vector (length l). Highest R-squared value between a column of `components` and each POI.} +#' \item{idxCorr}{Vector (length l). Index of the column of `components` giving the highest R-squared value.} +#' \item{allR2}{Matrix (k x l). R-squared values for each column of `components` with each POI.} +#' \item{pv}{Vector (length l). Smallest p-value association between a column of `components` and each POI (if `pval` is TRUE).} +#' \item{idxCorr2}{Vector (length l). Index of the column of `components` giving the smallest p-value (if `pval` is TRUE).} +#' \item{allpv}{Matrix (k x l). 
P-values for each column of `components` with each POI (if `pval` is TRUE).} +#' @export +R2 <- function(poi, components, poi_types, pval = TRUE) { + if (is.vector(components)) { + components <- matrix(components, ncol = 1) } if (is.vector(poi)) { poi <- matrix(poi, nrow = length(poi)) } - p <- nrow(V) # number of samples - k <- ncol(V) # number of components - l <- length(poiType) # number of cf/poi to test - if (is.null(l)) { - stop("POI type(s) neeeded") + n_samples <- nrow(components) + n_components <- ncol(components) + n_poi <- length(poi_types) + + if (is.null(n_poi)) { + stop("POI type(s) needed") } - p2 <- nrow(poi) - l2 <- ncol(poi) - if (l2 != l) { # checking poi and poiType dimensions compatiblity - if (p2 == l) { # if poi is transposed (l*p) + poi_rows <- nrow(poi) + poi_cols <- ncol(poi) + + if (poi_cols != n_poi) { + if (poi_rows == n_poi) { poi <- t(poi) - warning("Transposing poi to match poiType dimension") - p2 <- nrow(poi) + warning("Transposing POI to match POI types dimension") + poi_rows <- nrow(poi) } else { - print(poi) - print(poiType) - stop("poi dimensions doesn't match poiType dimension") + stop("POI dimensions do not match POI types dimension") } } - - if (p != p2) { # checking poi and V dimensions compatiblity - if (p2 == k) { - warnings("Transposing V to match poi dimension") - V <- t(V) - k <- p - p <- p2 + if (n_samples != poi_rows) { + if (poi_rows == n_components) { + warning("Transposing components to match POI dimension") + components <- t(components) + n_components <- n_samples + n_samples <- poi_rows } else { - stop("poi and V dimensions incompatible") + stop("POI and components dimensions incompatible") } } - - - - - - R2 <- rep(-1, l) - names(R2) <- colnames(poi) - idxcorr <- R2 - R2_tmp <- matrix(rep(-1, k * l), k, l, dimnames = list(colnames(V), colnames(poi))) # r2_tmp(k,l) hold the R2 value for column k of V with poi l + R2_values <- rep(-1, n_poi) + names(R2_values) <- colnames(poi) + idx_corr <- R2_values + R2_tmp <- 
matrix(rep(-1, n_components * n_poi), n_components, n_poi, dimnames = list(colnames(components), colnames(poi))) if (pval) { - pv <- R2 - idxcorr2 <- R2 - pv_tmp <- R2_tmp # r2_tmp(k,l) hold the R2 value for column k of V with poi l + p_values <- R2_values + idx_corr2 <- R2_values + p_values_tmp <- R2_tmp } - for (cmpt in 1:k) { # for each column of V - cmpt2an <- V[, cmpt] - for (ipoi in 1:l) { - idx_finite <- is.finite(as.factor(poi[, ipoi])) - poi2an <- poi[idx_finite, ipoi] - cmpt2an_finite <- cmpt2an[idx_finite] - if (poiType[ipoi] == "continuous") { # estimation by linear regression - coefs <- coef(lm(cmpt2an_finite ~ as.numeric(poi2an))) - cmpt2an_est <- coefs[2] * as.numeric(poi2an) + coefs[1] - nc <- 2 - } else if (poiType[ipoi] == "categorical") { # estimation by classe mean - classes <- unique(poi2an) - nc <- length(classes) - cmpt2an_est <- rep(NA, length(cmpt2an_finite)) - for (icl in 1:length(classes)) { - idxClasse <- which(poi2an == classes[icl]) - cmpt2an_est[idxClasse] <- mean(cmpt2an_finite[idxClasse]) - } + for (component_idx in 1:n_components) { + component_values <- components[, component_idx] + for (poi_idx in 1:n_poi) { + finite_indices <- is.finite(as.factor(poi[, poi_idx])) + poi_values <- poi[finite_indices, poi_idx] + finite_component_values <- component_values[finite_indices] + + if (poi_types[poi_idx] == "continuous") { + estimated_values <- linear_regression_estimation(finite_component_values, poi_values) + num_classes <- 2 + } else if (poi_types[poi_idx] == "categorical") { + estimated_values <- class_mean_estimation(finite_component_values, poi_values) + num_classes <- length(unique(poi_values)) } else { - stop("Incorrect poiType. Select 'continuous' or 'categorical'. ") + stop("Incorrect poi_type. 
Select 'continuous' or 'categorical'.") } - sse <- sum((cmpt2an_finite - cmpt2an_est)^2) - sst <- sum((cmpt2an_finite - mean(cmpt2an_finite))^2) - R2_tmp[cmpt, ipoi] <- 1 - sse / sst + + sse <- sum((finite_component_values - estimated_values)^2) + sst <- sum((finite_component_values - mean(finite_component_values))^2) + R2_tmp[component_idx, poi_idx] <- 1 - sse / sst + if (pval) { - F <- ((sst - sse) / (nc - 1)) / (sse / (p - nc)) - pv_tmp[cmpt, ipoi] <- 1 - pf(F, nc - 1, p - nc) - if (!is.finite(pv_tmp[cmpt, ipoi])) { - warning(paste("Non finite p-value for component ", cmpt, " (pv=", pv_tmp[cmpt, ipoi], ", F=", F, "), assigning NA", sep = "")) - pv_tmp[cmpt, ipoi] <- NA + F_value <- ((sst - sse) / (num_classes - 1)) / (sse / (n_samples - num_classes)) + p_values_tmp[component_idx, poi_idx] <- 1 - pf(F_value, num_classes - 1, n_samples - num_classes) + if (!is.finite(p_values_tmp[component_idx, poi_idx])) { + warning(sprintf("Non-finite p-value for component %d (pv=%g, F=%g), assigning NA", component_idx, p_values_tmp[component_idx, poi_idx], F_value)) + p_values_tmp[component_idx, poi_idx] <- NA } } } } - for (ipoi in 1:l) { + for (poi_idx in 1:n_poi) { if (pval) { - pv[ipoi] <- min(pv_tmp[, ipoi]) - idxcorr2[ipoi] <- which(pv_tmp[, ipoi] == pv[ipoi])[1] # if more than one component gives the best R2, takes the first one + p_values[poi_idx] <- min(p_values_tmp[, poi_idx]) + idx_corr2[poi_idx] <- which(p_values_tmp[, poi_idx] == p_values[poi_idx])[1] } - R2[ipoi] <- max(R2_tmp[, ipoi]) - idxcorr[ipoi] <- which(R2_tmp[, ipoi] == R2[ipoi])[1] # if more than one component gives the best R2, takes the first one + R2_values[poi_idx] <- max(R2_tmp[, poi_idx]) + idx_corr[poi_idx] <- which(R2_tmp[, poi_idx] == R2_values[poi_idx])[1] } if (pval) { - return(list(R2 = R2, idxcorr = idxcorr, allR2 = R2_tmp, pv = pv, idxcorr2 = idxcorr2, allpv = pv_tmp)) + return(list(R2 = R2_values, idxCorr = idx_corr, allR2 = R2_tmp, pv = p_values, idxCorr2 = idx_corr2, allpv = 
p_values_tmp)) } else { - return(list(R2 = R2, idxcorr = idxcorr, allR2 = R2_tmp)) + return(list(R2 = R2_values, idxCorr = idx_corr, allR2 = R2_tmp)) } } diff --git a/R/WaveICA.R b/R/WaveICA.R index 1b680ef..efec1ed 100644 --- a/R/WaveICA.R +++ b/R/WaveICA.R @@ -107,10 +107,10 @@ waveica_nonbatchwise <- function(data, wf = "haar", injection_order, alpha = 0, #' @param wf String. Wavelet function, the default is "haar". #' @param batch Vector. Batch number of each sample. #' @param factorization String. Matrix factorization method, options are ["stICA", "SVD"]. The default is "stICA". -#' @param group Vector, optional. Type of a sample (blank, sample, QC) numerically encoded to blank:0, sample:1, QC:2. +#' @param group Vector, optional. Type of a sample (blank, sample, QC, standard) numerically encoded to blank:0, sample:1, QC:2, standard:3. #' @param K Integer. The maximal number of independent components (for ICA) or singular vectors (SVD). The default is 20. -#' @param t Float between 0 and 1. The threshold to consider a component associate with the batch. The default is 0.05. -#' @param t2 Float between 0 and 1. The threshold to consider a component associate with the group. The default is 0.05. +#' @param batch_threshold Float between 0 and 1. P-value threshold below which a component is considered associated with the batch. The default is 0.05. +#' @param group_threshold Float between 0 and 1. P-value threshold below which a component is considered associated with the group. The default is 0.05. #' @param alpha Float between 0 and 1. The trade-off value between the independence of samples and those #' of variables. The default is 0. #' @return Dataframe. Feature table with intensities corrected of batch effects.
@@ -121,8 +121,8 @@ waveica <- function(data, factorization = "stICA", group = NULL, K = 20, - t = 0.05, - t2 = 0.05, + batch_threshold = 0.05, + group_threshold = 0.05, alpha = 0) { if (!factorization %in% c("stICA", "SVD")) { stop("The factorization method should be 'stICA' or 'SVD'.") @@ -137,8 +137,8 @@ waveica <- function(data, cat(paste("Performing matrix factorization...\n")) for (i in (1:index)) { data_coef <- coef[[i]] - data_coef_ICA <- normFact(fact = factorization, X = t(data_coef), ref = batch, refType = "categorical", k = K, t = t, ref2 = group, refType2 = "categorical", t2 = t2, alpha) - data_wave_ICA[[i]] <- t(data_coef_ICA$Xn) + data_coef_ICA <- normFact(factorization_method = factorization, data_matrix = t(data_coef), batch_vector = batch, batch_type = "categorical", rank = K, batch_threshold = batch_threshold, group_matrix = group, group_types = "categorical", group_threshold = group_threshold, alpha) + data_wave_ICA[[i]] <- t(data_coef_ICA$normalized_matrix) } data_wave <- wt_reconstruction(data, data_wave_ICA, wf) diff --git a/R/normFact.R b/R/normFact.R index 7f169d8..c9e6a63 100644 --- a/R/normFact.R +++ b/R/normFact.R @@ -1,92 +1,120 @@ -normFact <- function(fact, X, ref, refType, k = 20, t = 0.5, ref2 = NULL, refType2 = NULL, t2 = 0.5, alpha, ...) { - # - # Function to normalize data X by factorizing X=A*t(B) and removing components having a R2(ref) value higher than threshold t. - # If ref2 is defined, the components with R2(ref2) higher than threshold t2 are kept. 
- # - # Inputs: - # - fact : factorization method, 'SVD' or 'stICA' - # - X :(matrix n*p) samples*features matrix to normalize - # - ref: (vector n) variable representing the information we want to remove from X - # - refType : type of ref, 'categorical' or 'continuous' to indicates which linear model to use (class means or linear regression) - # - k: rank of the low-rank decomposition - # - t: scalar in [0,1], if R2(cmpt, ref)>t the cmpt is removed from X to normalize - # - ref2: (vector n*l) ref2[,i] represents the ith information we want to not remove from X - # - refType2: refType2[i] gives the type of ref2[,i] , 'categorical' or 'continuous' to indicates which linear model to use (class means or linear regression) - # - t2: (vector 1*l) scalar(s) in [0,1], if R2(cmpt, ref2[,i] )> t2[i] the cmpt is kept in X , if t2 is a scalar this threshold is considered for all ref2[,i] - # - ... values to pass to factorization method (typically, alpha value if facotorization by stICA) - # - # Outputs: - # - # - Xn : matrix n*p, normalized version of X - # - R2 : R2[k,l] gives the R2 between B[k,] and the ref l ( ref or ref2 ) - # - bestSV : components of B correlating with ref (but not with ref2 ), removed from X to normalize - # - A : A in the matrix factorization X=A*t(B) - # - B : B in the matrix factorization X=A*t(B) - # - # Renard E., Branders S. 
and Absil P.-A.: Independent Component Analysis to Remove Batch Effects from Merged Microarray Datasets (WABI2016) - - - - - - if (fact == "stICA") { - obj <- unbiased_stICA(X, k, alpha = alpha) - B <- obj$B - A <- obj$A - } else if (fact == "SVD") { - obj <- svd(X, nu = k, nv = k) - A <- obj$u %*% diag(obj$d[1:k], k) - B <- obj$v +perform_factorization <- function(method, data_matrix, rank, alpha) { + if (method == "stICA") { + result <- unbiased_stICA(data_matrix, rank, alpha = alpha) + } else if (method == "SVD") { + result <- svd(data_matrix, nu = rank, nv = rank) + result$A <- result$u %*% diag(result$d[1:rank], rank) + result$B <- result$v + } else { + stop("Unsupported factorization method") } + return(result) +} - - factR2 <- R2(ref, B, refType, pval = T) - - idx <- which(factR2$allpv < t) - - - - - if (t < 0 | t > 1) { - stop("t not in [0 1]") +get_batch_indices <- function(batch_R2, threshold) { + if (threshold < 0 || threshold > 1) { + stop("batch_threshold not in [0, 1]") } + batch_indices <- which(batch_R2$allpv < threshold) + return(batch_indices) +} - if (!is.null(ref2)) { - if (sum(t2 < 0 | t2 > 1)) { - stop("t2 not in [0 1]") - } - factR2_2 <- R2(ref2, B, refType2, pval = T) - idx_2 <- c() - if (length(t2) != length(refType2)) { - if (length(t2) == 1) { - t2 <- rep(t2, length(refType2)) - } else { - stop("length(t2) sould be equal to 1 or length(refType2)") - } - } - for (i in 1:length(refType2)) { - idx_2 <- c(idx_2, which(factR2_2$allpv[, i] < t2[i])) +get_group_indices <- function(group_R2, group_types, group_threshold) { + if (any(group_threshold < 0 | group_threshold > 1)) { + stop("group_threshold not in [0, 1]") + } + + group_indices <- integer(0) + + if (length(group_threshold) != length(group_types)) { + if (length(group_threshold) == 1) { + group_threshold <- rep(group_threshold, length(group_types)) + } else { + stop("length(group_threshold) should be equal to 1 or length(group_types)") } + } - - idx2keep <- intersect(idx, idx_2) - 
print(paste("Keeping", length(idx2keep), "cmpts with P value less than t2")) - idx <- setdiff(idx, idx2keep) + for (i in seq_along(group_types)) { + group_indices <- c(group_indices, which(group_R2$allpv[, i] < group_threshold[i])) } - bestcmptA <- A[, idx] - bestcmptB <- B[, idx] + return(group_indices) +} - print(paste("Removing", length(idx), "components with P value less than", t)) +remove_components <- function(data_matrix, factor_matrix_A, factor_matrix_B, batch_indices) { + components_to_remove_A <- factor_matrix_A[, batch_indices] + components_to_remove_B <- factor_matrix_B[, batch_indices] + print(paste("Removing", length(batch_indices), "components with P value less than batch_threshold")) + normalized_matrix <- data_matrix - components_to_remove_A %*% t(components_to_remove_B) + return(normalized_matrix) +} +combine_R2_results <- function(batch_R2, group_matrix, group_R2) { + combined_R2 <- batch_R2$allR2 + if (!is.null(group_matrix)) { + combined_R2 <- cbind(combined_R2, group_R2$allR2) + } + return(combined_R2) +} - Xn <- X - bestcmptA %*% t(bestcmptB) +#' Normalize Data by Removing Batch Effects +#' +#' Function to normalize data matrix `X` (`data_matrix`) by factorizing `X = A %*% t(B)` and removing components whose association with the batch variable has a p-value below `batch_threshold`. +#' Components that are also associated with a group variable (p-value below `group_threshold`) are retained. +#' +#' @param factorization_method Character. The factorization method, either 'SVD' or 'stICA'. +#' @param data_matrix Matrix (n x p). Samples x features matrix to normalize. +#' @param batch_vector Vector (n). Variable representing the batch information to remove from `data_matrix`. +#' @param batch_type Character. Type of `batch_vector`, either 'categorical' or 'continuous'. +#' @param rank Integer. Rank of the low-rank decomposition. Default is 20. +#' @param batch_threshold Numeric. P-value threshold in [0,1]; a component whose association with `batch_vector` has a p-value below `batch_threshold` is removed. Default is 0.5.
+#' @param group_matrix Matrix (n x l), optional. Each column represents a group variable to retain in `data_matrix`. +#' @param group_types Character vector. Each element corresponds to the type of `group_matrix` columns, either 'categorical' or 'continuous'. +#' @param group_threshold Numeric vector. P-value threshold(s) in [0,1]; a component whose association with group variable i has a p-value below group_threshold[i] is retained. If scalar, this threshold applies to all group variables. Default is 0.5. +#' @param alpha Numeric, optional. Parameter for `stICA` factorization. +#' @return A list with the following components: +#' \item{normalized_matrix}{Matrix (n x p). Normalized version of `data_matrix`.} +#' \item{R2_matrix}{Matrix. R² values between `B` components and batch/group variables.} +#' \item{removed_components}{Matrix. Components of `B` correlating with `batch_vector` but not with `group_matrix`, removed from `data_matrix`.} +#' \item{factor_matrix_A}{Matrix. `A` in the matrix factorization `X = A %*% t(B)`.} +#' \item{factor_matrix_B}{Matrix. `B` in the matrix factorization `X = A %*% t(B)`.} +#' @references Renard E., Branders S., Absil P.-A.: Independent Component Analysis to Remove Batch Effects from Merged Microarray Datasets (WABI2016).
+normFact <- function( + factorization_method, + data_matrix, + batch_vector, + batch_type, + rank = 20, + batch_threshold = 0.5, + group_matrix = NULL, + group_types = NULL, + group_threshold = 0.5, + alpha = NULL +) { + factorization_result <- perform_factorization(factorization_method, data_matrix, rank, alpha) + factor_matrix_A <- factorization_result$A + factor_matrix_B <- factorization_result$B + + batch_R2 <- R2(batch_vector, factor_matrix_B, batch_type, pval = TRUE) + batch_indices <- get_batch_indices(batch_R2, batch_threshold) + + if (!is.null(group_matrix)) { + group_R2 <- R2(group_matrix, factor_matrix_B, group_types, pval = TRUE) + group_indices <- get_group_indices(group_R2, group_types, group_threshold) + indices_to_keep <- intersect(batch_indices, group_indices) + print(paste("Keeping", length(indices_to_keep), "components with P value less than group_threshold")) + batch_indices <- setdiff(batch_indices, indices_to_keep) + } + normalized_matrix <- remove_components(data_matrix, factor_matrix_A, factor_matrix_B, batch_indices) - R2 <- factR2$allR2 - if (!is.null(ref2)) { - R2 <- cbind(R2, factR2_2$allR2) - } + combined_R2 <- combine_R2_results(batch_R2, group_matrix, group_R2) - return(list(Xn = Xn, R2 = R2, bestSV = bestcmptB, A = A, B = B)) + list( + normalized_matrix = normalized_matrix, + R2_matrix = combined_R2, + removed_components = factor_matrix_B[, batch_indices], + factor_matrix_A = factor_matrix_A, + factor_matrix_B = factor_matrix_B + ) } diff --git a/tests/testthat/test_batchwise_waveica.R b/tests/testthat/test_batchwise_waveica.R index 88b5874..daf534d 100644 --- a/tests/testthat/test_batchwise_waveica.R +++ b/tests/testthat/test_batchwise_waveica.R @@ -14,8 +14,8 @@ patrick::with_parameters_test_that( group = group, batch = batch, K = 20, - t = 0.05, - t2 = 0.05, + batch_threshold = 0.05, + group_threshold = 0.05, alpha = 0 )