Skip to content

Commit

Permalink
Merge pull request #14 from zargham-ahmad/refactor
Browse files Browse the repository at this point in the history
First iteration Refactoring for normFact and R2 function
  • Loading branch information
hechth authored Aug 1, 2024
2 parents b26c678 + 80f139a commit ce4d983
Show file tree
Hide file tree
Showing 5 changed files with 221 additions and 188 deletions.
24 changes: 15 additions & 9 deletions .github/workflows/r-conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,29 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ windows-latest, ubuntu-latest, macos-latest]
os: [ windows-latest, ubuntu-latest, macos-14]
experimental: [false]
include:
- os: macos-14-arm64
experimental: true
runs-on: ${{ matrix.os }}
continue-on-error: ${{ matrix.experimental }}
defaults:
run:
shell: bash -l {0}

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Install wget
if: ${{ matrix.os == 'windows-latest' }}
run: choco install wget

- name: Sets env vars for osx-64
run: echo "CONDA_SUBDIR=osx-64" >> $GITHUB_ENV
if: ${{ matrix.os == 'macos-14' }}
- name: Create conda environment
uses: conda-incubator/setup-miniconda@v2
uses: conda-incubator/setup-miniconda@v3
with:
activate-environment: recetox-waveica-dev
auto-update-conda: true
Expand All @@ -45,11 +52,10 @@ jobs:
conda init bash
conda env list
- name: Fetch input data
run: wget -P tests/testthat/test-data -i tests/remote-files/fetch_input_data.txt
- name: Fetch batchwise waveica results
run: wget -P tests/testthat/test-data/batchwise-correction -i tests/remote-files/fetch_waveica_batchwise.txt
- name: Fetch nonbatchwise waveica results
run: wget -P tests/testthat/test-data/nonbatchwise-correction -i tests/remote-files/fetch_waveica_nonbatchwise.txt
run: |
wget -P tests/testthat/test-data -i tests/remote-files/fetch_input_data.txt
wget -P tests/testthat/test-data/batchwise-correction -i tests/remote-files/fetch_waveica_batchwise.txt
wget -P tests/testthat/test-data/nonbatchwise-correction -i tests/remote-files/fetch_waveica_nonbatchwise.txt
- name: setup Rtools
if: ${{ matrix.os == 'windows-latest' }}
uses: r-windows/install-rtools@master
Expand Down
185 changes: 92 additions & 93 deletions R/R2.R
Original file line number Diff line number Diff line change
@@ -1,131 +1,130 @@
R2 <- function(poi, V, poiType, pval = T) {
#
# R2(poi,V,poiType)
#
# Args:
# - V is a p*k matrix, where the rows corresponds to the samples
# - poi is a matrix p*l, representing the phenotypes of interest
# - poiType (1*l) is the types of poi: 'continuous' (then a linear
# regression is used) or 'categorical' (then the mean by class is used)
#
# Outputs:
# - R2(l), higher R^2 value between a column of V and poi(l)
# - idxCorr(l), index of the column of V giving the higher R^2 value (if many,
# takes the first one)
# - allR2(k,l), R2 value for column k of V with poi l
#
# IF pval =TRUE, return also: #
# - pv(l) smaller p-value association between a column of V and poi(l)
# - idxcorr2(l) index of the column of V giving the smaller p-value (if many,
# # takes the first one)
# - allpv(k,l), p-value for column k of V with poi l
#
# if missing information in poi, remove the corresponding samples in the R2 computation
linear_regression_estimation <- function(component_values, poi_values) {
coefs <- coef(lm(component_values ~ as.numeric(poi_values)))
estimated_values <- coefs[2] * as.numeric(poi_values) + coefs[1]
return(estimated_values)
}

class_mean_estimation <- function(component_values, poi_values) {
classes <- unique(poi_values)
estimated_values <- rep(NA, length(component_values))
for (class in classes) {
class_indices <- which(poi_values == class)
estimated_values[class_indices] <- mean(component_values[class_indices])
}
return(estimated_values)
}

if (is.vector(V)) {
V <- matrix(V, ncol = 1)
#' Compute R-squared and P-values between Phenotypes of Interest and Components
#'
#' This function computes the R-squared and optionally the p-values between phenotypes of interest (POI) and components.
#'
#' @param poi Matrix (p x l). Representing the phenotypes of interest.
#' @param components Matrix (p x k). The components where the rows correspond to the samples.
#' @param poi_types Character vector (length l). Types of POI: 'continuous' (for linear regression) or 'categorical' (for class mean).
#' @param pval Logical. If TRUE, compute p-values in addition to R-squared values. Default is TRUE.
#' @return A list containing:
#' \item{R2}{Vector (length l). Highest R-squared value between a column of `components` and each POI.}
#' \item{idxCorr}{Vector (length l). Index of the column of `components` giving the highest R-squared value.}
#' \item{allR2}{Matrix (k x l). R-squared values for each column of `components` with each POI.}
#' \item{pv}{Vector (length l). Smallest p-value association between a column of `components` and each POI (if `pval` is TRUE).}
#' \item{idxCorr2}{Vector (length l). Index of the column of `components` giving the smallest p-value (if `pval` is TRUE).}
#' \item{allpv}{Matrix (k x l). P-values for each column of `components` with each POI (if `pval` is TRUE).}
#' @export
R2 <- function(poi, components, poi_types, pval = TRUE) {
if (is.vector(components)) {
components <- matrix(components, ncol = 1)
}
if (is.vector(poi)) {
poi <- matrix(poi, nrow = length(poi))
}

p <- nrow(V) # number of samples
k <- ncol(V) # number of components
l <- length(poiType) # number of cf/poi to test
if (is.null(l)) {
stop("POI type(s) neeeded")
n_samples <- nrow(components)
n_components <- ncol(components)
n_poi <- length(poi_types)

if (is.null(n_poi)) {
stop("POI type(s) needed")
}
p2 <- nrow(poi)
l2 <- ncol(poi)

if (l2 != l) { # checking poi and poiType dimensions compatiblity
if (p2 == l) { # if poi is transposed (l*p)
poi_rows <- nrow(poi)
poi_cols <- ncol(poi)

if (poi_cols != n_poi) {
if (poi_rows == n_poi) {
poi <- t(poi)
warning("Transposing poi to match poiType dimension")
p2 <- nrow(poi)
warning("Transposing POI to match POI types dimension")
poi_rows <- nrow(poi)
} else {
print(poi)
print(poiType)
stop("poi dimensions doesn't match poiType dimension")
stop("POI dimensions do not match POI types dimension")
}
}


if (p != p2) { # checking poi and V dimensions compatiblity
if (p2 == k) {
warnings("Transposing V to match poi dimension")
V <- t(V)
k <- p
p <- p2
if (n_samples != poi_rows) {
if (poi_rows == n_components) {
warning("Transposing components to match POI dimension")
components <- t(components)
n_components <- n_samples
n_samples <- poi_rows
} else {
stop("poi and V dimensions incompatible")
stop("POI and components dimensions incompatible")
}
}






R2 <- rep(-1, l)
names(R2) <- colnames(poi)
idxcorr <- R2
R2_tmp <- matrix(rep(-1, k * l), k, l, dimnames = list(colnames(V), colnames(poi))) # r2_tmp(k,l) hold the R2 value for column k of V with poi l
R2_values <- rep(-1, n_poi)
names(R2_values) <- colnames(poi)
idx_corr <- R2_values
R2_tmp <- matrix(rep(-1, n_components * n_poi), n_components, n_poi, dimnames = list(colnames(components), colnames(poi)))

if (pval) {
pv <- R2
idxcorr2 <- R2
pv_tmp <- R2_tmp # r2_tmp(k,l) hold the R2 value for column k of V with poi l
p_values <- R2_values
idx_corr2 <- R2_values
p_values_tmp <- R2_tmp
}

for (cmpt in 1:k) { # for each column of V
cmpt2an <- V[, cmpt]
for (ipoi in 1:l) {
idx_finite <- is.finite(as.factor(poi[, ipoi]))
poi2an <- poi[idx_finite, ipoi]
cmpt2an_finite <- cmpt2an[idx_finite]
if (poiType[ipoi] == "continuous") { # estimation by linear regression
coefs <- coef(lm(cmpt2an_finite ~ as.numeric(poi2an)))
cmpt2an_est <- coefs[2] * as.numeric(poi2an) + coefs[1]
nc <- 2
} else if (poiType[ipoi] == "categorical") { # estimation by classe mean
classes <- unique(poi2an)
nc <- length(classes)
cmpt2an_est <- rep(NA, length(cmpt2an_finite))
for (icl in 1:length(classes)) {
idxClasse <- which(poi2an == classes[icl])
cmpt2an_est[idxClasse] <- mean(cmpt2an_finite[idxClasse])
}
for (component_idx in 1:n_components) {
component_values <- components[, component_idx]
for (poi_idx in 1:n_poi) {
finite_indices <- is.finite(as.factor(poi[, poi_idx]))
poi_values <- poi[finite_indices, poi_idx]
finite_component_values <- component_values[finite_indices]

if (poi_types[poi_idx] == "continuous") {
estimated_values <- linear_regression_estimation(finite_component_values, poi_values)
num_classes <- 2
} else if (poi_types[poi_idx] == "categorical") {
estimated_values <- class_mean_estimation(finite_component_values, poi_values)
num_classes <- length(unique(poi_values))
} else {
stop("Incorrect poiType. Select 'continuous' or 'categorical'. ")
stop("Incorrect poi_type. Select 'continuous' or 'categorical'.")
}
sse <- sum((cmpt2an_finite - cmpt2an_est)^2)
sst <- sum((cmpt2an_finite - mean(cmpt2an_finite))^2)
R2_tmp[cmpt, ipoi] <- 1 - sse / sst

sse <- sum((finite_component_values - estimated_values)^2)
sst <- sum((finite_component_values - mean(finite_component_values))^2)
R2_tmp[component_idx, poi_idx] <- 1 - sse / sst

if (pval) {
F <- ((sst - sse) / (nc - 1)) / (sse / (p - nc))
pv_tmp[cmpt, ipoi] <- 1 - pf(F, nc - 1, p - nc)
if (!is.finite(pv_tmp[cmpt, ipoi])) {
warning(paste("Non finite p-value for component ", cmpt, " (pv=", pv_tmp[cmpt, ipoi], ", F=", F, "), assigning NA", sep = ""))
pv_tmp[cmpt, ipoi] <- NA
F_value <- ((sst - sse) / (num_classes - 1)) / (sse / (n_samples - num_classes))
p_values_tmp[component_idx, poi_idx] <- 1 - pf(F_value, num_classes - 1, n_samples - num_classes)
if (!is.finite(p_values_tmp[component_idx, poi_idx])) {
warning(sprintf("Non-finite p-value for component %d (pv=%g, F=%g), assigning NA", component_idx, p_values_tmp[component_idx, poi_idx], F_value))
p_values_tmp[component_idx, poi_idx] <- NA
}
}
}
}

for (ipoi in 1:l) {
for (poi_idx in 1:n_poi) {
if (pval) {
pv[ipoi] <- min(pv_tmp[, ipoi])
idxcorr2[ipoi] <- which(pv_tmp[, ipoi] == pv[ipoi])[1] # if more than one component gives the best R2, takes the first one
p_values[poi_idx] <- min(p_values_tmp[, poi_idx])
idx_corr2[poi_idx] <- which(p_values_tmp[, poi_idx] == p_values[poi_idx])[1]
}
R2[ipoi] <- max(R2_tmp[, ipoi])
idxcorr[ipoi] <- which(R2_tmp[, ipoi] == R2[ipoi])[1] # if more than one component gives the best R2, takes the first one
R2_values[poi_idx] <- max(R2_tmp[, poi_idx])
idx_corr[poi_idx] <- which(R2_tmp[, poi_idx] == R2_values[poi_idx])[1]
}

if (pval) {
return(list(R2 = R2, idxcorr = idxcorr, allR2 = R2_tmp, pv = pv, idxcorr2 = idxcorr2, allpv = pv_tmp))
return(list(R2 = R2_values, idxCorr = idx_corr, allR2 = R2_tmp, pv = p_values, idxCorr2 = idx_corr2, allpv = p_values_tmp))
} else {
return(list(R2 = R2, idxcorr = idxcorr, allR2 = R2_tmp))
return(list(R2 = R2_values, idxCorr = idx_corr, allR2 = R2_tmp))
}
}
14 changes: 7 additions & 7 deletions R/WaveICA.R
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,10 @@ waveica_nonbatchwise <- function(data, wf = "haar", injection_order, alpha = 0,
#' @param wf String. Wavelet function, the default is "haar".
#' @param batch Vector. Batch number of each sample.
#' @param factorization String. Matrix factorization method, options are ["stICA", "SVD"]. The default is "stICA".
#' @param group Vector, optional. Type of a sample (blank, sample, QC) numerically encoded to blank:0, sample:1, QC:2.
#' @param group Vector, optional. Type of a sample (blank, sample, QC, standard) numerically encoded to blank:0, sample:1, QC:2, standard:3.
#' @param K Integer. The maximal number of independent components (for ICA) or singular vectors (SVD). The default is 20.
#' @param t Float between 0 and 1. The threshold to consider a component associate with the batch. The default is 0.05.
#' @param t2 Float between 0 and 1. The threshold to consider a component associate with the group. The default is 0.05.
#' @param batch_threshold Float between 0 and 1. The threshold to consider a component associate with the batch. The default is 0.05.
#' @param group_threshold Float between 0 and 1. The threshold to consider a component associate with the group. The default is 0.05.
#' @param alpha Float between 0 and 1. The trade-off value between the independence of samples and those
#' of variables. The default is 0.
#' @return Dataframe. Feature table with intensities corrected of batch effects.
Expand All @@ -121,8 +121,8 @@ waveica <- function(data,
factorization = "stICA",
group = NULL,
K = 20,
t = 0.05,
t2 = 0.05,
batch_threshold = 0.05,
group_threshold = 0.05,
alpha = 0) {
if (!factorization %in% c("stICA", "SVD")) {
stop("The factorization method should be 'stICA' or 'SVD'.")
Expand All @@ -137,8 +137,8 @@ waveica <- function(data,
cat(paste("Performing matrix factorization...\n"))
for (i in (1:index)) {
data_coef <- coef[[i]]
data_coef_ICA <- normFact(fact = factorization, X = t(data_coef), ref = batch, refType = "categorical", k = K, t = t, ref2 = group, refType2 = "categorical", t2 = t2, alpha)
data_wave_ICA[[i]] <- t(data_coef_ICA$Xn)
data_coef_ICA <- normFact(factorization_method = factorization, data_matrix = t(data_coef), batch_vector = batch, batch_type = "categorical", rank = K, batch_threshold = batch_threshold, group_matrix = group, group_types = "categorical", group_threshold = group_threshold, alpha)
data_wave_ICA[[i]] <- t(data_coef_ICA$normalized_matrix)
}
data_wave <- wt_reconstruction(data, data_wave_ICA, wf)

Expand Down
Loading

0 comments on commit ce4d983

Please sign in to comment.