From 20f2cef84c409656e801ec352e2394299294cefb Mon Sep 17 00:00:00 2001 From: Julie Livingstone Date: Thu, 14 Dec 2023 10:13:10 -0800 Subject: [PATCH] lintr changes; update simulated_data_5method for patient removal iteration --- .../1.Outlier_Detection.R | 31 +++++---- .../2.Distribution_Identification.R | 18 ++--- .../3.Simulated_Data_generation_1.R | 14 ++-- .../4.Simulated_Data_generation_2.R | 11 +-- .../5.Simulated_Data_5method.R | 68 ++++++++++++------- 5 files changed, 82 insertions(+), 60 deletions(-) diff --git a/OutlierDetectionAlgorithm/1.Outlier_Detection.R b/OutlierDetectionAlgorithm/1.Outlier_Detection.R index db8bcf5..2de3716 100644 --- a/OutlierDetectionAlgorithm/1.Outlier_Detection.R +++ b/OutlierDetectionAlgorithm/1.Outlier_Detection.R @@ -97,11 +97,11 @@ molecular.data.filter <- fpkm.tumor.symbol.filter[, patient.part]; trim.sample <- function(x, trim = 0.05) { x <- sort(x); if (length(x) <= 10) { - patient.trim.value <- 2:(length(x)-1); - } else { + patient.trim.value <- 2:(length(x) - 1); + } else { trim.sample.number <- length(x) * trim; trim.sample.number.integer <- round(trim.sample.number); - patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer); + patient.trim.value <- (trim.sample.number.integer + 1):(length(x) - trim.sample.number.integer); } x[patient.trim.value]; } @@ -276,7 +276,7 @@ outlier.detection.cosine <- function(x, value.portion = 1) { }) return(decimal.numbers) }) - add.minimum.value <- 1 / 10^as.numeric(max(unlist(decimal.number.max))); + add.minimum.value <- 1 / 10 ^ as.numeric(max(unlist(decimal.number.max))); # function: Compute the cosine similarity of the largest data point cosine.similarity.large.value.percent <- function(x, y, large.value.percent) { @@ -312,7 +312,7 @@ outlier.detection.cosine <- function(x, value.portion = 1) { sample.fpkm.qq.nozero <- sample.fpkm.qq + add.minimum.value; # Trimmed samples -Trim 5% of each side - sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 0.05); + sample.trim.number <- trim.sample(x = seq(length(sample.fpkm.qq.nozero)), trim = 0.05); sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number]; sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value; @@ -349,7 +349,7 @@ outlier.detection.cosine <- function(x, value.portion = 1) { last.cos <- cosine.similarity.large.value.percent(exp.quantile, obs.quantile.exp, large.value.percent = value.portion); } else if (4 == distribution.fit) { - ### 4 gamma distribution + # 4 gamma distribution mean.gamma <- mean(sample.fpkm.qq.nozero.trim); sd.gamma <- sd(sample.fpkm.qq.nozero.trim); gamma.shape <- (mean.gamma / sd.gamma) ^ 2; @@ -378,7 +378,7 @@ clusterEvalQ( expr = library(gamlss) ) -# Define a minimum value +# Define a minimum value (should set a seed) random.col <- sample(patient.part, 1) decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), function(x) { decimal.numbers <- sapply(x, function(y) { @@ -390,11 +390,11 @@ add.minimum.value <- 1 / 10 ^ as.numeric(max(unlist(decimal.number.max))); bic.trim.distribution <- NULL; -# Use foreach to iterate over the rows of fpkm.tumor.symbol.filter in parallel +# Use foreach to iterate over the rows (genes) of fpkm.tumor.symbol.filter in parallel bic.trim.distribution <- foreach(j = 1:nrow(fpkm.tumor.symbol.filter), .combine = rbind) %dopar% { sample.fpkm.qq <- round(as.numeric(fpkm.tumor.symbol.filter[j,patient.part]), digits = 6); - sample.trim.number <- trim.sample(sample.number, 5); + sample.trim.number <- trim.sample(x = sample.number, trim = 5); sample.fpkm.qq.sort <- sort(sample.fpkm.qq)[sample.trim.number]; sample.fpkm.qq.nozero <- sample.fpkm.qq.sort + add.minimum.value; @@ -403,17 +403,18 @@ bic.trim.distribution <- foreach(j = 1:nrow(fpkm.tumor.symbol.filter), .combine glm.gamma <- gamlss(sample.fpkm.qq.nozero ~ 1, family = GA); glm.exp <- gamlss(sample.fpkm.qq.nozero ~ 1, family = EXP); - glm.bic <- c(glm.norm$sbc, - glm.lnorm$sbc, - glm.exp$sbc, - glm.gamma$sbc); + glm.bic <- c( + glm.norm$sbc, + glm.lnorm$sbc, + glm.exp$sbc, + glm.gamma$sbc + ); glm.bic; } stopCluster(cl = cl); -# Find the best fitted distribution -# - BIC +# Find the best fitted distribution - BIC rownames(bic.trim.distribution) <- rownames(fpkm.tumor.symbol.filter); bic.trim.distribution.fit <- apply(bic.trim.distribution, 1, which.min); diff --git a/OutlierDetectionAlgorithm/2.Distribution_Identification.R b/OutlierDetectionAlgorithm/2.Distribution_Identification.R index 6d7d493..94c6247 100644 --- a/OutlierDetectionAlgorithm/2.Distribution_Identification.R +++ b/OutlierDetectionAlgorithm/2.Distribution_Identification.R @@ -102,8 +102,8 @@ obs.residue.quantile <- foreach(i = 1:nrow(fpkm.tumor.symbol.filter), .combine = # 2. Log-normal distribution mean.log <- mean(sample.fpkm.qq.nozero.trim); sd.log <- sd(sample.fpkm.qq.nozero.trim); - m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); - sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); + m2 <- log(mean.log ^ 2 / sqrt(sd.log ^ 2 + mean.log ^ 2)); + sd2 <- sqrt(log(1 + (sd.log ^ 2 / mean.log ^ 2))); lnorm.quantile <- qlnorm(p, meanlog = m2, sdlog = sd2); obs.quantile.lnorm <- quantile(sample.fpkm.qq.nozero, prob = p); obs.residue.non.trim <- obs.quantile.lnorm - lnorm.quantile; @@ -118,7 +118,7 @@ obs.residue.quantile <- foreach(i = 1:nrow(fpkm.tumor.symbol.filter), .combine = } else if (4 == bic.trim.distribution.fit[i]) { - ### 4 gamma distribution + # 4 gamma distribution mean.gamma <- mean(sample.fpkm.qq.nozero.trim); sd.gamma <- sd(sample.fpkm.qq.nozero.trim); gamma.shape <- (mean.gamma / sd.gamma) ^ 2; @@ -139,7 +139,7 @@ rownames(obs.residue.quantile) <- rownames(fpkm.tumor.symbol.filter); obs.residue.quantile.trim <- apply( X = obs.residue.quantile, MARGIN = 1, - FUN - function(x) { + FUN = function(x) { sort(as.numeric(x)) } ); @@ -181,10 +181,12 @@ noise.min.off.bic.distribution <- foreach(j = 1:nrow(obs.residue.quantile.trim), glm.gamma <- gamlss(sample.fpkm.qq.nozero ~ 1, family = GA); glm.exp <- gamlss(sample.fpkm.qq.nozero ~ 1, family = EXP); - glm.bic <- c(glm.norm$sbc, - glm.lnorm$sbc, - glm.exp$sbc, - glm.gamma$sbc) + glm.bic <- c( + glm.norm$sbc, + glm.lnorm$sbc, + glm.exp$sbc, + glm.gamma$sbc + ) glm.bic; } diff --git a/OutlierDetectionAlgorithm/3.Simulated_Data_generation_1.R b/OutlierDetectionAlgorithm/3.Simulated_Data_generation_1.R index a2a282d..e6d2845 100644 --- a/OutlierDetectionAlgorithm/3.Simulated_Data_generation_1.R +++ b/OutlierDetectionAlgorithm/3.Simulated_Data_generation_1.R @@ -73,7 +73,7 @@ clusterEvalQ( expr = library(extraDistr) ) -simulated.generation.negative <- function(x = fpkm.tumor.symbol.filter.distribution.fit, distribution = bic.trim.distribution.fit, num.negative = 10000, sample.size = sample.number) { +simulated.generation.negative <- function(x = fpkm.tumor.symbol.filter.bic, distribution = bic.trim.distribution.fit, num.negative = 10000, sample.size = sample.number) { # Define a minimum value random.col <- sample(sample.size, 1) decimal.number.max <- lapply(na.omit(x[,random.col]), function(x) { @@ -98,7 +98,8 @@ simulated.generation.negative <- function(x = fpkm.tumor.symbol.filter.distribut if (!is.numeric(num.negative)) stop('num.negative should be numeric.') if (!is.numeric(sample.size)) stop('sample.size should be numeric.') - random.number.negative <- sample(length(distribution), num.negative, replace = TRUE); + # shuffle values and labels to create simulated data + random.number.negative <- sample(x = length(distribution), size = num.negative, replace = TRUE); names(random.number.negative) <- names(distribution)[random.number.negative] # use the foreach function to parallelize the sapply loop @@ -114,12 +115,13 @@ simulated.generation.negative <- function(x = fpkm.tumor.symbol.filter.distribut sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value; if (1 == distribution[i]) { - ### 1) Normal distribution + # 1. Normal distribution norm.mean <- mean(sample.fpkm.qq.nozero.trim); norm.sd <- sd(sample.fpkm.qq.nozero.trim); rtnorm(length(sample.size), mean = norm.mean, sd = norm.sd, a = 0); } else if (2 == distribution[i]) { + # 2. Log-normal distribution mean.log <- mean(sample.fpkm.qq.nozero.trim); sd.log <- sd(sample.fpkm.qq.nozero.trim); m2 <- log(mean.log ^ 2 / sqrt(sd.log ^ 2 + mean.log ^ 2)); @@ -127,12 +129,12 @@ simulated.generation.negative <- function(x = fpkm.tumor.symbol.filter.distribut rlnorm(n = length(sample.size), m2, sd2); } else if (3 == distribution[i]) { - ### 4) exponential distribution + # 3. Exponential distribution exp.rate <- 1 / mean(sample.fpkm.qq.nozero.trim); rexp(n = length(sample.size), rate = exp.rate); } else if (4 == distribution[i]) { - ### 5) gamma distribution + # 4. Gamma distribution mean.gamma <- mean(sample.fpkm.qq.nozero.trim); sd.gamma <- sd(sample.fpkm.qq.nozero.trim); gamma.shape <- (mean.gamma / sd.gamma) ^ 2; @@ -150,7 +152,7 @@ seeds <- round(runif(n = ntimes, min = 1, max = 10000)) for (i in 1:ntimes) { seed <- seeds[i] set.seed(seed) - print(paste0('run negative random number:', i)) + negative.random.number.bic <- simulated.generation.negative( x = fpkm.tumor.symbol.filter.bic, distribution = bic.trim.distribution.fit, diff --git a/OutlierDetectionAlgorithm/4.Simulated_Data_generation_2.R b/OutlierDetectionAlgorithm/4.Simulated_Data_generation_2.R index 8aa1be0..083a972 100644 --- a/OutlierDetectionAlgorithm/4.Simulated_Data_generation_2.R +++ b/OutlierDetectionAlgorithm/4.Simulated_Data_generation_2.R @@ -70,7 +70,7 @@ decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), fun }) add.minimum.value <- 1 / 10 ^ as.numeric(max(unlist(decimal.number.max))); -# 2. residual +# 2. residual ** using magic numbers residue.negative.random.number.bic <- obs.residue.quantile.trim[match(substr(rownames(negative.random.number.bic), 1, 15), substr(rownames(obs.residue.quantile.trim), 1, 15)),]; noise.min.off.bic.distribution.residue <- noise.min.off.bic.distribution.fit[match(substr(rownames(negative.random.number.bic), 1, 15), substr(names(noise.min.off.bic.distribution.fit), 1, 15))]; @@ -109,12 +109,13 @@ negative.random.number.noise.bic <- foreach(i = 1:nrow(residue.negative.random.n } if (1 == noise.min.off.bic.distribution.residue[i]) { - ### 1) Normal distribution + # 1. Normal distribution norm.mean <- mean(sample.fpkm.qq.nozero); norm.sd <- sd(sample.fpkm.qq.nozero); simulated.sample <- rtnorm(length(sample.number), mean = norm.mean, sd = norm.sd, a = 0); } else if (2 == noise.min.off.bic.distribution.residue[i]) { + # 2. Log-normal distribution mean.log <- mean(sample.fpkm.qq.nozero); sd.log <- sd(sample.fpkm.qq.nozero); m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); @@ -122,12 +123,12 @@ negative.random.number.noise.bic <- foreach(i = 1:nrow(residue.negative.random.n simulated.sample <- rlnorm(n = length(sample.number), m2, sd2); } else if (3 == noise.min.off.bic.distribution.residue[i]) { - ### 4) exponential distribution + # 3. Exponential distribution exp.rate <- 1 / mean(sample.fpkm.qq.nozero); simulated.sample <- rexp(n = length(sample.number), rate = exp.rate); } else if (4 == noise.min.off.bic.distribution.residue[i]) { - ### 5) gamma distribution + # 4. Gamma distribution mean.gamma <- mean(sample.fpkm.qq.nozero); sd.gamma <- sd(sample.fpkm.qq.nozero); gamma.shape <- (mean.gamma / sd.gamma) ^ 2; @@ -163,5 +164,5 @@ save( noise.min.off.bic.distribution.residue, negative.random.number.noise.bic, negative.simulated.sum, - file = generate.filename('Simulated_data_generation_2', paste(dataset.name, replicate, sep = '.'), 'rda') + file = generate.filename('Simulated_Data_generation_2', paste(dataset.name, replicate, sep = '.'), 'rda') ) diff --git a/OutlierDetectionAlgorithm/5.Simulated_Data_5method.R b/OutlierDetectionAlgorithm/5.Simulated_Data_5method.R index 4b54af3..d29819f 100644 --- a/OutlierDetectionAlgorithm/5.Simulated_Data_5method.R +++ b/OutlierDetectionAlgorithm/5.Simulated_Data_5method.R @@ -21,7 +21,8 @@ params <- matrix( 'dataset.name', 'd', '0', 'character', 'working.directory', 'w', '0', 'character', 'distribution.identification.file', 'o', '0', 'character', - 'simulated.data.file', 's', '0', 'character' + 'simulated.data.file', 's', '0', 'character', + 'patient.to.remove', 'p', '0', 'numeric' ), ncol = 4, byrow = TRUE @@ -32,11 +33,13 @@ dataset.name <- opt$dataset.name working.directory <- opt$working.directory distribution.identification.file <- opt$distribution.identification.file simulated.data.file <- opt$simulated.data.file +patient.to.remove <- opt$patient.to.remove #working.directory <- '/hot/users/jlivingstone/outlier/run_method' #dataset.name <- 'BRCA_EU' #distribution.identification.file <- '/hot/users/jlivingstone/outlier/run_method/2023-11-20_Distribution_Identification_short_BRCA_EU.rda' #simulated.data.file <- '/hot/users/jlivingstone/outlier/run_method/2023-11-21_Simulated_data_generation_2_BRCA_EU.1.rda' +#patient.to.remove <- 0 # Set the working directory setwd(working.directory); @@ -58,10 +61,18 @@ load( # sample size patient.part <- 1:ncol(fpkm.tumor.symbol.filter); sample.number <- 1:ncol(fpkm.tumor.symbol.filter); -molecular.data.filter <- fpkm.tumor.symbol.filter[, patient.part]; bic.trim.distribution.fit.obs <- bic.trim.distribution.fit; +# remove the number of patients stated for method iteration to identify outlier patients +if (patient.to.remove > 0) { + patient.part <- patient.part[1:(length(patient.part) - patient.to.remove)]; + sample.number <- patient.part + negative.simulated.sum <- negative.simulated.sum[, patient.part] + fpkm.tumor.symbol.filter <- fpkm.tumor.symbol.filter[,patient.part] + } + # Define a minimum value +set.seed(42) random.col <- sample(patient.part, 1) decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), function(x) { decimal.numbers <- sapply(x, function(y) { @@ -71,7 +82,6 @@ decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), fun }) add.minimum.value <- 1 / 10 ^ as.numeric(max(unlist(decimal.number.max))); - # function: Compute the cosine similarity of the largest data point cosine.similarity.large.value.percent <- function(x, y, large.value.percent) { @@ -102,7 +112,6 @@ cosine.similarity.large.value.percent <- function(x, y, large.value.percent) { cosine.large.value; } - # function: Trim 5% of samples from each side trim.sample <- function(x, trim.portion = 5) { if (length(x) <= 10) { @@ -130,7 +139,7 @@ outlier.detection.cosine <- function(x, value.portion = 1) { sample.fpkm.qq.nozero <- sample.fpkm.qq + add.minimum.value; # Trimmed samples -Trim 5% of each side - sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 5); + sample.trim.number <- trim.sample(x = seq(length(sample.fpkm.qq.nozero)), trim.portion = 5); sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number]; sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value; @@ -167,7 +176,7 @@ outlier.detection.cosine <- function(x, value.portion = 1) { last.cos <- cosine.similarity.large.value.percent(exp.quantile, obs.quantile.exp, large.value.percent = value.portion); } else if (4 == distribution.fit) { - ### 4 gamma distribution + # 4. gamma distribution mean.gamma <- mean(sample.fpkm.qq.nozero.trim); sd.gamma <- sd(sample.fpkm.qq.nozero.trim); gamma.shape <- (mean.gamma / sd.gamma) ^ 2; @@ -199,19 +208,25 @@ clusterEvalQ( # Define a minimum value random.col <- sample(patient.part, 1) -decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), function(x) { - decimal.numbers <- sapply(x, function(y) { - nchar(as.character(y)) - nchar(as.integer(y)) - 1 - }) - return(decimal.numbers) - }) +decimal.number.max <- lapply( + X = na.omit(fpkm.tumor.symbol.filter[,random.col]), + FUN = function(x) { + decimal.numbers <- sapply( + X = x, + FUN = function(y) { + nchar(as.character(y)) - nchar(as.integer(y)) - 1 + } + ) + return(decimal.numbers) + } + ) bic.trim.distribution <- NULL; # Use foreach to iterate over the rows of fpkm.tumor.symbol.filter in parallel bic.trim.distribution <- foreach(j = 1:nrow(negative.simulated.sum), .combine = rbind) %dopar% { sample.fpkm.qq <- round(as.numeric(negative.simulated.sum[j, sample.number]), digits = 6); - sample.trim.number <- trim.sample(sample.number, 5); + sample.trim.number <- trim.sample(x = sample.number, trim.portion = 5); sample.fpkm.qq.sort <- sort(sample.fpkm.qq)[sample.trim.number]; sample.fpkm.qq.nozero <- sample.fpkm.qq.sort + add.minimum.value; @@ -220,24 +235,27 @@ bic.trim.distribution <- foreach(j = 1:nrow(negative.simulated.sum), .combine = glm.gamma <- gamlss(sample.fpkm.qq.nozero ~ 1, family = GA); glm.exp <- gamlss(sample.fpkm.qq.nozero ~ 1, family = EXP); - glm.bic <- c(glm.norm$sbc, - glm.lnorm$sbc, - glm.exp$sbc, - glm.gamma$sbc); + glm.bic <- c( + glm.norm$sbc, + glm.lnorm$sbc, + glm.exp$sbc, + glm.gamma$sbc + ); glm.bic; } stopCluster(cl = cl); - # check the distribution again rownames(bic.trim.distribution) <- rownames(negative.simulated.sum); bic.trim.distribution.fit <- apply(bic.trim.distribution, 1, which.min); # Check the cosine similarity negative.simulated.sum.fit <- cbind(negative.simulated.sum, distribution = bic.trim.distribution.fit); + # run it parallel cl <- makeCluster(spec = detectCores() - 2); + # register the cluster with the parallel package registerDoParallel(cl = cl); clusterExport( @@ -251,20 +269,19 @@ clusterEvalQ( data.cosine.negative <- apply( X = negative.simulated.sum.fit, - MARGIN = 1, - FUN = outlier.detection.cosine, - value.portion = 0 + MARGIN = 1, + FUN = outlier.detection.cosine, + value.portion = 0 ) stopCluster(cl = cl); - data.cosine.negative.t <- t(data.cosine.negative); data.cosine.negative.t <- data.frame(data.cosine.negative.t); colnames(data.cosine.negative.t) <- c('cosine', 'distribution'); -# 1,2,3,4 +# 1, 2, 3, 4 quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALSE) { x.na <- na.omit(as.numeric(x)); if (methods == 'median') { @@ -305,7 +322,6 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS } } } - else { if (length(unique(as.numeric(x.na))) == 1) { kmean.matrix <- rep(NA, length(x.na)); @@ -435,7 +451,7 @@ save( gene.zrange.fraction.negative.simulated.sum.1M, data.cosine.negative.t, gene.zrange.fraction.negative.simulated.sum.bic.5method.1M, - file = generate.filename('Simulated_Data_5method', paste(dataset.name, replicate, 'short', sep = '.'), 'rda') + file = generate.filename('Simulated_Data_5method', paste(dataset.name, replicate, patient.to.remove, 'short', sep = '.'), 'rda') ); save( @@ -456,5 +472,5 @@ save( gene.zrange.fraction.negative.simulated.sum.1M, data.cosine.negative.t, gene.zrange.fraction.negative.simulated.sum.bic.5method.1M, - file = generate.filename('Simulated_Data_5method', paste(dataset.name, replicate, 'long', sep = '.'), 'rda') + file = generate.filename('Simulated_Data_5method', paste(dataset.name, replicate, patient.to.remove, 'long', sep = '.'), 'rda') )