From 8acbc09069baadad0be79f0d4f190b1966e114cb Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Sat, 2 Dec 2023 14:53:49 -0800 Subject: [PATCH 1/2] Refactor `quantify.outliers()` (#22) * Refactor function `trim.sample` Refactor function `trim.sample` in `1.Outlier_Detection.R`: - Move closer to top of script - Remove local definition in function `outlier.detection.cosine` and duplicate definition at top level - Rename parameter `trim.portion` -> `trim`, and change definition to match `base::mean`, i.e., proportion trimmed rather than percentage trimmed. * Return data, not indices in `trim.sample` Return the trimmed data, rather than the indices of the trimmed data, in `1.Outlier_Detection.R/trim.sample`. * Try using `trim.sample` in `quantify.outliers` Try using `trim.sample` in `quantify.outliers`, specifically in the block calculating the trimmed mean z-scores. A comparison of the output on a test sample suggests that there are slight differences between the old and new versions for `trim` > 0, suggesting that there's a slight difference in the indices being chosen. * Restore bespoke trimming code in quantify.outliers Restore bespoke trimming code in `1.Outlier_Detection.R/quantify.outliers`. * Remove extra as.numeric calls in quantify.outliers Remove extra calls to `as.numeric` in `1.Outlier_Detection.R/quantify.outliers`. * Correct use of `trim` in `trim.sample` Correct use of `trim` in call to `1.Outlier_Detection.R/trim.sample`. * Fix style: constants on left side of equality op Fix style: constants on left side of equality operator in `1.Outlier_Detection.R`. * Parameterize `nstart` for k-means clustering Parameterize `nstart` for k-means clustering in `1.Outlier_Detection.R/quantify.outliers`. The default value `1` is the same as that of the function `stats::kmeans`. * Ensure sample is sorted before trimming Ensure sample is sorted before trimming in `1.Outlier_Detection.R/trim.sample`. 
* Simplify trimming when testing distributions Simplify the code for trimming the sample when testing the fit of different distributions. --- .../1.Outlier_Detection.R | 99 ++++++++----------- 1 file changed, 43 insertions(+), 56 deletions(-) diff --git a/OutlierDetectionAlgorithm/1.Outlier_Detection.R b/OutlierDetectionAlgorithm/1.Outlier_Detection.R index 6b65e9f..1cfe30f 100644 --- a/OutlierDetectionAlgorithm/1.Outlier_Detection.R +++ b/OutlierDetectionAlgorithm/1.Outlier_Detection.R @@ -59,17 +59,29 @@ fpkm.tumor.symbol.filter <- fpkm.tumor.symbol[rownames(fpkm.tumor.symbol) %in% n molecular.data.filter <- fpkm.tumor.symbol.filter[, patient.part]; +### Trim sample +trim.sample <- function(x, trim = 0.05) { + x <- sort(x); + if (length(x) <= 10) { + patient.trim.value <- 2:(length(x)-1); + } else { + trim.sample.number <- length(x) * trim; + trim.sample.number.integer <- round(trim.sample.number); + patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer); + } + x[patient.trim.value]; + } ### Outlier detection function # Default : methods = 'mean', trim = 0 # 1. MEAN and SD : methods = 'mean', trim = 0 -# 2. TRIMMED MEAN and TRIMMED SD : methods = 'mean', trim = 5 +# 2. TRIMMED MEAN and TRIMMED SD : methods = 'mean', trim = 0.05 # 3. MEDIAN and MAD : methods = 'median' -# 4. KMEAN : methods = 'kmean' -quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALSE) { +# 4. 
KMEAN : methods = 'kmean', nstart = 1000 +quantify.outliers <- function(x, methods = 'mean', trim = 0, nstart = 1, exclude.zero = FALSE) { x.na <- na.omit(as.numeric(x)); - if (methods == 'median') { + if ('median' == methods) { if (exclude.zero) { x.nonzero <- x.na[0 != x.na]; data.median <- median(x.nonzero); @@ -83,25 +95,25 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS x[which(!is.na(x))] <- result.na; x; } - else if (methods == 'kmean') { + else if ('kmean' == methods) { if (exclude.zero) { - if (length(unique(as.numeric(x.na))) == 1) { + if (1 == length(unique(x.na))) { kmean.matrix <- rep(NA, length(x.na)); names(kmean.matrix) <- names(x.na); } else { data.order <- sort(x.na, decreasing = TRUE); non.zero <- data.order[data.order > 0]; - if (length(unique(as.numeric(non.zero))) <= 2) { + if (length(unique(non.zero)) <= 2) { na.matrix <- rep(NA, length(non.zero)); - cluster.zero <- c(na.matrix, rep(0, length(x.na[x.na == 0]))); + cluster.zero <- c(na.matrix, rep(0, length(x.na[0 == x.na]))); kmean.matrix <- cluster.zero[match(x.na, data.order)]; names(kmean.matrix) <- names(x.na); } else { - kmean <- kmeans(non.zero, 2, nstart = 1000); + kmean <- kmeans(non.zero, 2, nstart = nstart); cluster <- kmean$cluster; - cluster.zero <- c(cluster, rep(0, length(x[x == 0]))); + cluster.zero <- c(cluster, rep(0, length(x[0 == x]))); kmean.matrix <- cluster.zero[match(x.na, data.order)]; names(kmean.matrix) <- names(x.na); } @@ -109,12 +121,12 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS } else { - if (length(unique(as.numeric(x.na))) == 1) { + if (1 == length(unique(x.na))) { kmean.matrix <- rep(NA, length(x.na)); names(kmean.matrix) <- names(x.na); } else { - kmean <- kmeans(x.na, 2, nstart = 1000); + kmean <- kmeans(x.na, 2, nstart = nstart); cluster <- kmean$cluster; kmean.matrix <- cluster; names(kmean.matrix) <- names(x.na); @@ -127,16 +139,16 @@ quantify.outliers <- function(x, methods = 
'mean', trim = 0, exclude.zero = FALS else { gene.order <- x.na[order(x.na, decreasing = TRUE)]; if (exclude.zero) { - gene.order.nonzero <- gene.order[0 != gene.order]; - top.patient <- round(length(gene.order.nonzero) * (trim / 100), digit = 0); - low.patient <- round(length(gene.order.nonzero) * (1 - (trim / 100)), digit = 0); - data.mean <- mean(gene.order.nonzero, trim = (trim / 100)); + gene.order.nonzero <- gene.order[0 != gene.order]; + top.patient <- round(length(gene.order.nonzero) * trim, digit = 0); + low.patient <- round(length(gene.order.nonzero) * (1 - trim), digit = 0); + data.mean <- mean(gene.order.nonzero, trim = trim); data.sd <- sd(gene.order.nonzero[(top.patient+1):(low.patient)]); } else { - top.patient <- round(length(x.na) * (trim / 100), digit = 0); - low.patient <- round(length(x.na) * (1 - (trim / 100)), digit = 0); - data.mean <- mean(gene.order, trim = (trim / 100)); + top.patient <- round(length(x.na) * trim, digit = 0); + low.patient <- round(length(x.na) * (1 - trim), digit = 0); + data.mean <- mean(gene.order, trim = trim); data.sd <- sd(gene.order[(top.patient+1):(low.patient)]); } result.na <- (x.na - data.mean) / data.sd; @@ -158,7 +170,7 @@ data.mean <- foreach(i=1:nrow(molecular.data.filter), .combine = rbind) %dopar% data.mean <- data.frame(data.mean); # 2. TRIMMED MEAN and TRIMMED SD : method = 'mean', trim = 5 -data.trimmean <- foreach(i=1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], trim = 5); +data.trimmean <- foreach(i=1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], trim = 0.05); data.trimmean <- data.frame(data.trimmean); # 3. MEDIAN and MAD : method = 'median' @@ -166,7 +178,7 @@ data.median <- foreach(i=1:nrow(molecular.data.filter), .combine = rbind) %dopar data.median <- data.frame(data.median); # 4. 
KMEAN : method = 'kmean' -data.kmean <- foreach(i=1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], methods = 'kmean') +data.kmean <- foreach(i=1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], methods = 'kmean', nstart = 1000) data.kmean <- data.frame(data.kmean); stopCluster(cl) @@ -202,12 +214,12 @@ data.zrange.median.t <- data.frame(t(data.zrange.median)); ### Calculate the kmean fraction ##### # Function outlier.detection.kmean <- function(x) { - if (1== length(unique(as.numeric(x)))) { + if (1 == length(unique(as.numeric(x)))) { fraction <- NA; } else { - cluster.one <- length(x[x == 1]); - cluster.two <- length(x[x == 2]); + cluster.one <- length(x[1 == x]); + cluster.two <- length(x[2 == x]); cluster.sum <- cluster.one + cluster.two; smaller.value <- min(cluster.one, cluster.two); fraction <- round(smaller.value/cluster.sum, digit = 4); @@ -238,18 +250,6 @@ outlier.detection.cosine <- function (x, value.portion = 1) { }) add.minimum.value <- 1 / 10^as.numeric(max(unlist(decimal.number.max))); - - trim.sample <- function(x, trim.portion = 5) { - if (length(x) <= 10) { - patient.trim.value <- 2:(length(x)-1); - } else { - trim.sample.number <- length(x) * (trim.portion/100); - trim.sample.number.integer <- round(trim.sample.number, digits = 0); - patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer); - } - patient.trim.value; - } - # function: Compute the cosine similarity of the largest data point cosine.similarity.large.value.percent <- function(x, y, large.value.percent) { @@ -285,7 +285,7 @@ outlier.detection.cosine <- function (x, value.portion = 1) { # Trimmed samples -Trim 5% of each side - sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 5); + sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 0.05); sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number]; 
sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value; @@ -338,18 +338,6 @@ outlier.detection.cosine <- function (x, value.portion = 1) { } -# Trimming function -trim.sample <- function(x, trim.portion = 5) { - if (length(x) <= 10) { - patient.trim.value <- 2:(length(x)-1); - } else { - trim.sample.number <- length(x) * (trim.portion/100); - trim.sample.number.integer <- round(trim.sample.number, digits = 0); - patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer); - } - patient.trim.value; - } - # Determine the distribution # Find the best fitted distribution cl <- makeCluster(2); @@ -374,15 +362,14 @@ bic.trim.distribution <- NULL; # Use foreach to iterate over the rows of fpkm.tumor.symbol.filter in parallel bic.trim.distribution <- foreach(j = 1:nrow(fpkm.tumor.symbol.filter), .combine = rbind) %dopar% { sample.fpkm.qq <- round(as.numeric(fpkm.tumor.symbol.filter[j,patient.part]), digits = 6); - sample.trim.number <- trim.sample(sample.number, 5); - sample.fpkm.qq.sort <- sort(sample.fpkm.qq)[sample.trim.number]; - sample.fpkm.qq.nozero <- sample.fpkm.qq.sort + add.minimum.value; + sample.fpkm.qq.trimmed <- trim.sample(sample.fpkm.qq, 0.05) + sample.fpkm.qq.trimmed.nozero <- sample.fpkm.qq.trimmed + add.minimum.value; - glm.norm <- gamlss(sample.fpkm.qq.nozero ~ 1, family=NO); - glm.lnorm <- gamlss(sample.fpkm.qq.nozero ~ 1, family=LNO); - glm.gamma <- gamlss(sample.fpkm.qq.nozero ~ 1, family=GA); - glm.exp <- gamlss(sample.fpkm.qq.nozero ~ 1, family=EXP); + glm.norm <- gamlss(sample.fpkm.qq.trimmed.nozero ~ 1, family=NO); + glm.lnorm <- gamlss(sample.fpkm.qq.trimmed.nozero ~ 1, family=LNO); + glm.gamma <- gamlss(sample.fpkm.qq.trimmed.nozero ~ 1, family=GA); + glm.exp <- gamlss(sample.fpkm.qq.trimmed.nozero ~ 1, family=EXP); glm.bic <- c(glm.norm$sbc, glm.lnorm$sbc, From 2e74746a31ac8b45a5c93df15ed7deb659f7e691 Mon Sep 17 00:00:00 2001 From: jeeyunhan Date: Sat, 2 Dec 2023 20:06:46 -0800 Subject: [PATCH 
2/2] Algorithm update --- .../10.Number_Outlier_sample_5method.R | 425 +++++++++++++++ ...er_sample_Simulated_Data_5method_combine.R | 40 ++ ...ier_sample_Significant_Outlier_Detection.R | 156 ++++++ ...e_Significant_Outlier_Pvalue_Calculation.R | 45 ++ ...er_Outlier_sample_Simulated_Data_5method.R | 488 ++++++++++++++++++ 5 files changed, 1154 insertions(+) create mode 100644 OutlierDetectionAlgorithm/10.Number_Outlier_sample_5method.R create mode 100644 OutlierDetectionAlgorithm/11.Number_Outlier_sample_Simulated_Data_5method_combine.R create mode 100644 OutlierDetectionAlgorithm/12.Number_Outlier_sample_Significant_Outlier_Detection.R create mode 100644 OutlierDetectionAlgorithm/13.Number_Outlier_sample_Significant_Outlier_Pvalue_Calculation.R create mode 100644 OutlierDetectionAlgorithm/9.Number_Outlier_sample_Simulated_Data_5method.R diff --git a/OutlierDetectionAlgorithm/10.Number_Outlier_sample_5method.R b/OutlierDetectionAlgorithm/10.Number_Outlier_sample_5method.R new file mode 100644 index 0000000..9069e6a --- /dev/null +++ b/OutlierDetectionAlgorithm/10.Number_Outlier_sample_5method.R @@ -0,0 +1,425 @@ +#!/usr/bin/env Rscript + + +### 10.Number_Outlier_sample_5method.R #################################################### +# Basically, this is the same script as 1.Outlier_detection.R +# - exclude the largest value (exclude one patient) and run the outlier detection algorithm + + +# Set the working directory +setwd('RNA-seq/CCLE/four_zero/'); + +# Set the name of the dataset +dataset.name <- 'CCLE'; + +# Required R packages +# Install and load the 'gamlss' package +install.packages('gamlss', repo = 'http://cran.us.r-project.org'); +library(gamlss); +# Install and load the 'doParallel' package +install.packages('doParallel', repo = 'http://cran.us.r-project.org'); +library(doParallel); +# Install and load the 'foreach' package +install.packages('foreach', repo = 'http://cran.us.r-project.org'); +library(foreach); +# Install and load the 'parallel' package 
+install.packages('parallel', repo = 'http://cran.us.r-project.org'); +library(parallel); +# Install and load the 'extraDistr' package +install.packages('extraDistr', repo = 'http://cran.us.r-project.org'); +library(extraDistr); +# Install and load the 'truncnorm' package +install.packages('truncnorm', repo = 'http://cran.us.r-project.org'); +library(truncnorm); +# Install and load the 'lsa' package +install.packages('lsa', repo = 'http://cran.us.r-project.org'); +library(lsa); +# Install and load the 'SnowballC' package +install.packages('SnowballC', repo = 'http://cran.us.r-project.org'); +library(SnowballC); + + +# Load the R environment +# - 1. File from script 1: short version +load(file = '2023-09-18_CCLE_final_outlier_rank_bic.short.rda'); + + +# Set the number of patients to be excluded +# - First round should be '1' +args <- commandArgs(trailingOnly = TRUE); +args.num <- as.numeric(args); + +# Sample number +patient.part.arg <- patient.part[1:(length(patient.part)-args.num)]; +sample.number <- patient.part.arg; + +# Remove the largest value of each gene +fpkm.tumor.symbol.filter.arg <- NULL; +for(i in 1:nrow(fpkm.tumor.symbol.filter)) { + fpkm.sort.1 <- sort(as.numeric(fpkm.tumor.symbol.filter[i,patient.part]))[seq(length(patient.part.arg))]; + fpkm.tumor.symbol.filter.arg <- rbind(fpkm.tumor.symbol.filter.arg, fpkm.sort.1); + } +rownames(fpkm.tumor.symbol.filter.arg) <- rownames(fpkm.tumor.symbol.filter); + + + +# Same script from #1 +# function: Compute the cosine similarity of the largest data point +cosine.similarity.large.value.percent <- function(x, y, large.value.percent) { + + # rounding function + roundToInteger <- function(z) round(z, digits = 0) + + # check if large value percent is zero + if (0 == large.value.percent) { + large.value.number.integer <- 1; + } + else { + large.value.number <- length(x) * (large.value.percent/100); + large.value.number.integer <- roundToInteger(large.value.number); + } + + # subset the largest values + 
patient.larger.value <- (length(x)-large.value.number.integer + 1):length(x); + observed.value <- sort(y); + theoretical.value <- sort(x); + mid.value <- c(1, 1); + value.x.y <- data.frame(theoretical.value, observed.value); + + # calculate cosine similarity + cosine.large.value <- NULL; + cosine.large.value <- sapply(patient.larger.value, function(i) { + cosine(as.numeric(value.x.y[i,]), c(1, 1)) + }) + cosine.large.value; + } + + + + + +outlier.detection.cosine <- function (x, value.portion = 1) { + + # Define a minimum value + decimal.number.max <- lapply(na.omit(x), function(x) { + decimal.numbers <- sapply(x, function(y) { + nchar(as.character(y)) - nchar(as.integer(y)) - 1 + }) + return(decimal.numbers) + }) + add.minimum.value <- 1 / 10^as.numeric(max(unlist(decimal.number.max))); + + # function: Trim 5% of samples from each side + trim.sample <- function(x, trim.portion = 5) { + trim.sample.number <- length(x) * (trim.portion/100); + trim.sample.number.integer <- round(trim.sample.number, digits = 0); + patient.trimr.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer); + patient.trimr.value; + } + + sample.fpkm.qq <- as.numeric(x[sample.number]) + sample.fpkm.qq.nozero <- sample.fpkm.qq + add.minimum.value; + + + # Trimmed samples -Trim 5% of each side + sample.trim.number <- trim.sample(sample.number, 5); + sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number]; + sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value; + + + # Quantile + p <- ppoints(sample.fpkm.qq.nozero); + + # Distribution + distribution.fit <- as.numeric(x[length(x)]); + + if (1 == distribution.fit){ + # 1. 
Normal distribution + norm.mean <- mean(sample.fpkm.qq.nozero.trim); + norm.sd <- sd(sample.fpkm.qq.nozero.trim); + # Use truncated norm + norm.quantiles <- qtruncnorm(p, a=0, b=Inf, mean = norm.mean, sd = norm.sd); + obs.quantile.norm <- quantile(sample.fpkm.qq.nozero, prob = p); + last.cos <- cosine.similarity.large.value.percent(norm.quantiles, obs.quantile.norm, large.value.percent = value.portion); + } + else if (2 == distribution.fit) { + # 2. Log-normal distribution + mean.log <- mean(sample.fpkm.qq.nozero.trim); + sd.log <- sd(sample.fpkm.qq.nozero.trim); + m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); + sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); + lnorm.quantile <- qlnorm(p, meanlog = m2, sdlog = sd2); + obs.quantile.lnorm <- quantile(sample.fpkm.qq.nozero, prob = p); + last.cos <- cosine.similarity.large.value.percent(lnorm.quantile, obs.quantile.lnorm, large.value.percent = value.portion); + } + else if (3 == distribution.fit) { + # 3. Exponential distribution + exp.rate <- 1 / mean(sample.fpkm.qq.nozero.trim); + exp.quantile <- qexp(p, rate = exp.rate); + obs.quantile.exp <- quantile(sample.fpkm.qq.nozero, prob = p); + last.cos <- cosine.similarity.large.value.percent(exp.quantile, obs.quantile.exp, large.value.percent = value.portion); + } + else if (4 == distribution.fit) { + ### 4 gamma distribution + mean.gamma <- mean(sample.fpkm.qq.nozero.trim); + sd.gamma <- sd(sample.fpkm.qq.nozero.trim); + gamma.shape <- (mean.gamma/sd.gamma)^2; + gamma.rate <- mean.gamma/(sd.gamma^2); + gamma.quantile <- qgamma(p, shape = gamma.shape, rate = gamma.rate); + obs.quantile.gamma <- quantile(sample.fpkm.qq.nozero, prob = p); + last.cos <- cosine.similarity.large.value.percent(gamma.quantile, obs.quantile.gamma, large.value.percent = value.portion); + } + + cosine.sum.distribution.fit <- c(last.cos, distribution.fit); + cosine.sum.distribution.fit; + } + + +# Determine the distribution +# Find the best fitted distribution + +trim.sample <- function(x, 
trim.portion = 5) { + trim.sample.number <- length(x) * (trim.portion/100); + trim.sample.number.integer <- round(trim.sample.number, digits = 0); + patient.trimr.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer); + patient.trimr.value; + } + + + + + +cl <- makeCluster(25); +# register the cluster with the parallel package +registerDoParallel(cl); +clusterExport(cl, "outlier.detection.cosine"); +clusterEvalQ(cl, c(library(lsa), library(SnowballC))); + +fpkm.tumor.symbol.filter.bic.fit <- cbind(fpkm.tumor.symbol.filter.arg, distribution = bic.trim.distribution.fit); +data.cosine.bic <- apply(fpkm.tumor.symbol.filter.bic.fit, + 1, + outlier.detection.cosine, + value.portion = 0); + +stopImplicitCluster(); + + +data.cosine.bic.t.arg <- data.frame(t(data.cosine.bic)); +colnames(data.cosine.bic.t.arg) <- c('cosine', 'distribution'); + + + + +# 1,2,3,4 +quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALSE) { + x <- as.numeric(x); + if (methods == 'median') { + if (exclude.zero) { + x.nonzero <- x[0 != x]; + data.median <- median(x.nonzero); + data.mad <- mad(x.nonzero); + } + else { + data.median <- median(x); + data.mad <- mad(x); + } + (x - data.median) / data.mad; + } + else if (methods == 'kmean') { + if (exclude.zero) { + if (length(unique(as.numeric(x))) == 1) { + kmean.matrix <- rep(NA, length(x)); + names(kmean.matrix) <- names(x); + } + else { + data.order <- sort(x, decreasing = TRUE); + non.zero <- data.order[data.order > 0]; + if (length(unique(as.numeric(non.zero))) <= 2) { + na.matrix <- rep(NA, length(non.zero)); + cluster.zero <- c(na.matrix, rep(0, length(x[x == 0]))); + kmean.matrix <- cluster.zero[match(x, data.order)]; + names(kmean.matrix) <- names(x); + } + else { + kmean <- kmeans(non.zero, 2, nstart = 1000); + cluster <- kmean$cluster; + cluster.zero <- c(cluster, rep(0, length(x[x == 0]))); + kmean.matrix <- cluster.zero[match(x, data.order)]; + names(kmean.matrix) <- names(x); + } + } + 
} + + else { + if (length(unique(as.numeric(x))) == 1) { + kmean.matrix <- rep(NA, length(x)); + names(kmean.matrix) <- names(x); + } + else { + kmean <- kmeans(x, 2, nstart = 1000); + cluster <- kmean$cluster; + kmean.matrix <- cluster; + names(kmean.matrix) <- names(x); + } + } + kmean.matrix; + } + else { + gene.order <- x[order(x, decreasing = TRUE)]; + if (exclude.zero) { + gene.order.nonzero <- gene.order[0 != gene.order]; + top.patient <- round(length(gene.order.nonzero) * (trim / 100), digit = 0); + low.patient <- round(length(gene.order.nonzero) * (1 - (trim / 100)), digit = 0); + data.mean <- mean(gene.order.nonzero, trim = (trim / 100)); + data.sd <- sd(gene.order.nonzero[(top.patient+1):(low.patient)]); + } + else { + top.patient <- round(length(x) * (trim / 100), digit = 0); + low.patient <- round(length(x) * (1 - (trim / 100)), digit = 0); + data.mean <- mean(gene.order, trim = (trim / 100)); + data.sd <- sd(gene.order[(top.patient+1):(low.patient)]); + } + (x - data.mean) / data.sd; + } + } + + +# Parallel running +cl <- makeCluster(detectCores()-1); +# register the cluster with the parallel package +registerDoParallel(cl); + +# 1. MEAN and SD : method = 'mean', trim = 0 +data.mean <- foreach(i=1:nrow(fpkm.tumor.symbol.filter.arg[,patient.part.arg]), .combine = rbind) %dopar% quantify.outliers(fpkm.tumor.symbol.filter.arg[,patient.part.arg][i,]); +data.mean <- data.frame(data.mean); + +# 2. TRIMMED MEAN and TRIMMED SD : method = 'mean', trim = 5 +data.trimmean <- foreach(i=1:nrow(fpkm.tumor.symbol.filter.arg[,patient.part.arg]), .combine = rbind) %dopar% quantify.outliers(fpkm.tumor.symbol.filter.arg[,patient.part.arg][i,], trim = 5); +data.trimmean <- data.frame(data.trimmean); + +# 3. 
MEDIAN and MAD : method = 'median' +data.median <- foreach(i=1:nrow(fpkm.tumor.symbol.filter.arg[,patient.part.arg]), .combine = rbind) %dopar% quantify.outliers(fpkm.tumor.symbol.filter.arg[,patient.part.arg][i,], methods = 'median'); +data.median <- data.frame(data.median); + +# 4. KMEAN : method = 'kmean' +data.kmean <- foreach(i=1:nrow(fpkm.tumor.symbol.filter.arg[,patient.part.arg]), .combine = rbind) %dopar% quantify.outliers(fpkm.tumor.symbol.filter.arg[,patient.part.arg][i,], methods = 'kmean') +data.kmean <- data.frame(data.kmean); + +stopCluster(cl) + + + + + +outlier.detection.zrange <- function(x) { + zrange <- max(x) - min(x); + zrange.matrix <- c(x, zrange); + names(zrange.matrix) <- c(names(x), 'zrange'); + zrange.matrix; + } + + + + +# 1. MEAN and SD +data.zrange.mean <- apply(data.mean, 1, outlier.detection.zrange); +data.zrange.mean.t.arg <- data.frame(t(data.zrange.mean)); + +# 2. TRIMMED MEAN and TRIMMED SD +data.zrange.trimmean <- apply(data.trimmean, 1, outlier.detection.zrange); +data.zrange.trimmean.t.arg <- data.frame(t(data.zrange.trimmean)); + +# 3. MEDIAN and MAD +data.zrange.median <- apply(data.median, 1, outlier.detection.zrange); +data.zrange.median.t.arg <- data.frame(t(data.zrange.median)); + + + + +### Calculate the kmean fraction ##### +# Function +outlier.detection.kmean <- function(x) { + if (1== length(unique(as.numeric(x)))) { + fraction <- NA; + } + else { + cluster.one <- length(x[x == 1]); + cluster.two <- length(x[x == 2]); + cluster.sum <- cluster.one + cluster.two; + smaller.value <- min(cluster.one, cluster.two); + fraction <- round(smaller.value/cluster.sum, digit = 4); + } + fraction.matrix <- c(x, fraction); + names(fraction.matrix) <- c(names(x), 'fraction'); + fraction.matrix; + } + +# 4. 
KMEAN fraction +data.fraction.kmean <- apply(data.kmean, 1, outlier.detection.kmean); +data.fraction.kmean.t.arg <- data.frame(t(data.fraction.kmean)); + + + + +### Final gene-wise matrix ##### +### Final gene-wise matrix ##### +gene.zrange.fraction.cosine.last.point.bic.arg <- data.frame(cbind(data.zrange.mean.t.arg$zrange, + data.zrange.median.t.arg$zrange, + data.zrange.trimmean.t.arg$zrange, + data.fraction.kmean.t.arg$fraction, + data.cosine.bic.t.arg$cosine, + data.cosine.bic.t.arg$distribution)); +rownames(gene.zrange.fraction.cosine.last.point.bic.arg) <- rownames(fpkm.tumor.symbol.filter); +colnames(gene.zrange.fraction.cosine.last.point.bic.arg) <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine', 'distribution'); + + + +zrange.mean <- paste('data.zrange.mean.t.', args.num, sep = ''); +assign(zrange.mean, data.zrange.mean.t.arg); + +zrange.trimmean <- paste('data.zrange.trimmean.t.', args.num, sep = ''); +assign(zrange.trimmean, data.zrange.trimmean.t.arg); + +zrange.median <- paste('data.zrange.median.t.', args.num, sep = ''); +assign(zrange.median, data.zrange.median.t.arg); + +fraction.kmean <- paste('data.fraction.kmean.t.', args.num, sep = ''); +assign(fraction.kmean, data.fraction.kmean.t.arg); + +cosine.bic <- paste('data.cosine.bic.t.', args.num, sep = ''); +assign(cosine.bic, data.cosine.bic.t.arg); + +gene.zrange.fraction <- paste('gene.zrange.fraction.cosine.last.point.bic.', args.num, sep = ''); +assign(gene.zrange.fraction, gene.zrange.fraction.cosine.last.point.bic.arg); + + + + +save( + fpkm.tumor.symbol.filter, + patient.part, + sample.number, + bic.trim.distribution.fit, + list = paste0('gene.zrange.fraction.cosine.last.point.bic.', args.num, sep = ''), + file = paste0('10.Number_Outlier_sample_5method.', args.num, '.short.rda', sep = '') + ); + + +save( + fpkm.tumor.symbol.filter, + patient.part, + sample.number, + bic.trim.distribution.fit, + list = c( + paste0('data.zrange.mean.t.', args.num, sep = ''), + 
paste0('data.zrange.trimmean.t.', args.num, sep = ''), + paste0('data.zrange.median.t.', args.num, sep = ''), + paste0('data.fraction.kmean.t.', args.num, sep = ''), + paste0('data.cosine.bic.t.', args.num, sep = ''), + paste0('gene.zrange.fraction.cosine.last.point.bic.', args.num, sep = '')), + file = paste0('10.Number_Outlier_sample_5method.', args.num, '.long.rda', sep = '') + ); + + + + diff --git a/OutlierDetectionAlgorithm/11.Number_Outlier_sample_Simulated_Data_5method_combine.R b/OutlierDetectionAlgorithm/11.Number_Outlier_sample_Simulated_Data_5method_combine.R new file mode 100644 index 0000000..a8b2c03 --- /dev/null +++ b/OutlierDetectionAlgorithm/11.Number_Outlier_sample_Simulated_Data_5method_combine.R @@ -0,0 +1,40 @@ +#!/usr/bin/env Rscript + +### 11.Simulated_Data_5method_combine.R #################################################### +# Combine the 10 chunks of statistics results + + +# Set the working directory +setwd('RNA-seq/CCLE/four_zero/'); + +# Set the name of dataset +dataset.name <- 'CCLE'; + +# Combine all 10 chunks +args <- commandArgs(trailingOnly = TRUE) + + +for (i in 1:10) { + load(paste('9.Number_Outlier_sample_Simulated_Data_5method.', dataset.name, '.', i, '.', args, '.short.rda', sep = '')); + gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- get(paste('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', i, sep = '')); + all.statistics <- paste('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args, i, sep = ''); + assign(all.statistics, gene.zrange.fraction.negative.simulated.sum.bic.5method.1M); + } + +#1. 
residue.negative.random.number.bic +gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- NULL; +for (i in 1:10) { + p.value <- get(paste('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args, i, sep = '')); + gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- rbind(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M, p.value); + } + + +gene.zrange.fraction <- paste('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args, sep = ''); +assign(gene.zrange.fraction, gene.zrange.fraction.negative.simulated.sum.bic.5method.1M); + +save( + fpkm.tumor.symbol.filter, + bic.trim.distribution.fit, + list = paste0('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args, sep = ''), + file = paste('11.Number_Outlier_sample_Simulated_Data_5method_combine.', dataset.name, '.', args, '.rda', sep = '') + ); diff --git a/OutlierDetectionAlgorithm/12.Number_Outlier_sample_Significant_Outlier_Detection.R b/OutlierDetectionAlgorithm/12.Number_Outlier_sample_Significant_Outlier_Detection.R new file mode 100644 index 0000000..1bc7935 --- /dev/null +++ b/OutlierDetectionAlgorithm/12.Number_Outlier_sample_Significant_Outlier_Detection.R @@ -0,0 +1,156 @@ +#!/usr/bin/env Rscript + +### 12.Significant_Outlier_Detection.R #################################################### +# Compute p-values + + +# Set the working directory +setwd('RNA-seq/CCLE/four_zero/'); + +# Set the name of dataset +# dataset.name <- 'Matador'; + + +# Required R package +install.packages('parallel', repo = 'http://cran.us.r-project.org'); +install.packages('foreach', repo = 'http://cran.us.r-project.org'); +install.packages('doParallel', repo = 'http://cran.us.r-project.org'); +library(parallel); +library(foreach); +library(doParallel); + + +dataset.name <- 'CCLE'; + +# Run 1000 genes at once +# - array number should be ceiling(nrow(fpkm.tumor.symbol.filter)) +args <- commandArgs(trailingOnly = TRUE); +row.num.args <- as.numeric(args); + + +# This will be 
used to identify the number of outlier patients per gene +# - if '0', use all patients (first step) +# - if '1', use n-1 patients (exclude the patient having the largest value) +# - repeat this '2', '3', '4'... until there are no outlier genes +data.args <- 1; + + +# Load the R environment +load(file = paste('10.Number_Outlier_sample_5method.', data.args, '.short.rda', sep = '')); +load(file = paste('11.Number_Outlier_sample_Simulated_Data_5method_combine.', dataset.name, '.', data.args, '.rda', sep = '')); + + + +gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.new <- get(paste('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', data.args, sep = '')); +gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.new <- data.frame(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.new); + +### Rank each method ##### +# Function +outlier.rank <- function(x) { + methods <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine'); + rank.matrix <- NULL; + # Rank each method: the three z-score ranges in descending order (largest range = rank 1), the kmean fraction and cosine in ascending order + for (i in 1:3) { + methods.column <- methods[i]; + rank.methods <- rank(-x[,methods.column], ties.method = 'max', na.last = 'keep'); + rank.matrix <- cbind(rank.matrix, rank.methods); + } + for (i in 4:5) { + methods.column <- methods[i]; + rank.methods <- rank(x[,methods.column], ties.method = 'max', na.last = 'keep'); + rank.matrix <- cbind(rank.matrix, rank.methods); + } + rownames(rank.matrix) <-rownames(x); + colnames(rank.matrix) <- methods; + rank.matrix <- data.frame(rank.matrix); + } + + +### Rank product to determine Top ranked genes ##### +# Function +# x: ranked matrix +# NA.number: the rank product is NA unless more than NA.number methods have a non-NA rank +outlier.rank.product <- function(x, NA.number = 0) { + rank <- as.numeric(x[1:5]); + num <- length(which(!is.na(rank))); + if (NA.number >= num) { + NA; + } + else { + prod(rank, na.rm = TRUE)^(1/num); + } + } + + + +### Combine matrix +# - 
relabel the null data +gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relable <- cbind(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.new, + gene = rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.new)); +rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relable) <- paste(rep("ND", 1000000), c(1:1000000), sep = ''); + + + +gene.number.start.end.matrix <- NULL; +for (i in 1:ceiling(nrow(fpkm.tumor.symbol.filter)/1000)) { + if (i == ceiling(nrow(fpkm.tumor.symbol.filter)/1000)) { + range.number <- c((i-1)*1000 + 1, nrow(fpkm.tumor.symbol.filter)); + gene.number.start.end.matrix <- data.frame(rbind( + gene.number.start.end.matrix, + i = range.number + )); + } + else { + range.number<- c((i-1)*1000 + 1, i*1000) + gene.number.start.end.matrix <- data.frame(rbind( + gene.number.start.end.matrix, + i = range.number + )); + } + } + + + +cl <- makeCluster(20); +# register the cluster with the parallel package +registerDoParallel(cl); +clusterExport(cl, c("outlier.rank", "outlier.rank.product")); + +gene.zrange.fraction.fpkm.bic.5method.1M.data <- get(paste('gene.zrange.fraction.cosine.last.point.bic.', data.args, sep = '')); + +gene.rank.p.value.one.gene <- NULL; +gene.rank.p.value.one.gene <- foreach(i = as.numeric(gene.number.start.end.matrix[row.num.args,1]):as.numeric(gene.number.start.end.matrix[row.num.args,2]), .combine=rbind) %dopar% { + observed.gene <- gene.zrange.fraction.fpkm.bic.5method.1M.data[i,1:5]; + combine.matrix <- rbind(observed.gene, + gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relable[,1:5]); + # get ranks + data.rank.bic <- outlier.rank(combine.matrix); + rank.product.bic <- apply(data.rank.bic, 1, outlier.rank.product, NA.number = 3); + gene.rank.poduct.bic <- cbind(data.rank.bic, + rank.product.bic); + obs <- rank.product.bic[1] + null <- rank.product.bic[2:(nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relable) + 1)] + length.null <- 
nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relable); + obs.p.value <- (sum(obs >= null) + 1) / (length.null + 1) + + obs.p.value.rank <- cbind(gene.rank.poduct.bic[1,], obs.p.value); + p.value.one.gene <- data.frame(x = obs.p.value.rank, i = i); + p.value.one.gene; +} + + +p.value.one <- paste('gene.rank.p.value.one.gene.', data.args, sep = ''); +assign(p.value.one, gene.rank.p.value.one.gene); + + +stopImplicitCluster(); + + + + +save( + list = paste0('gene.rank.p.value.one.gene.', data.args, sep = ''), + file = paste('12.Number_Outlier_sample_Significant_Outlier_Detection.', dataset.name, '.', row.num.args, '.',data.args, '.rda', sep = '') + ); + + diff --git a/OutlierDetectionAlgorithm/13.Number_Outlier_sample_Significant_Outlier_Pvalue_Calculation.R b/OutlierDetectionAlgorithm/13.Number_Outlier_sample_Significant_Outlier_Pvalue_Calculation.R new file mode 100644 index 0000000..04213e4 --- /dev/null +++ b/OutlierDetectionAlgorithm/13.Number_Outlier_sample_Significant_Outlier_Pvalue_Calculation.R @@ -0,0 +1,45 @@ +#!/usr/bin/env Rscript + +### 13.Number_Outlier_sample_Significant_Outlier_Pvalue_Calculation.R #################################################### +# Compute p-values + + +# Set the working directory +setwd('RNA-seq/CCLE/four_zero/'); + +# Set the name of dataset +dataset.name <- 'CCLE'; + +# Number of excluded patients +args <- commandArgs(trailingOnly = TRUE) + + +# Manually enter 'ceiling(nrow(fpkm.tumor.symbol.filter))' +# - should be changed depending on the dataset +row.chunk.num <- 14; + + +for (i in 1:row.chunk.num) { + load(file = paste('12.Number_Outlier_sample_Significant_Outlier_Detection.', dataset.name, '.', i, '.', args, '.rda', sep = '')); + p.value.set <- paste('gene.rank.p.value.', i, sep = ''); + assign(p.value.set, get(paste('gene.rank.p.value.one.gene.', args, sep = ''))); + } + +#1. 
residue.negative.random.number.bic +gene.p.value.each.null <- NULL; +for (i in 1:row.chunk.num) { + p.value <- get(paste('gene.rank.p.value.', i, sep = '')); + gene.p.value.each.null <- rbind(gene.p.value.each.null, p.value); + } + + +p.value.all <- paste('gene.rank.p.value.one.gene.p', args, sep = ''); +assign(p.value.all, gene.p.value.each.null); + + +save( + list = paste0('gene.rank.p.value.one.gene.p', args, sep = ''), + file = paste('13.Number_Outlier_sample_Significant_Outlier_Pvalue_Calculation.', dataset.name, '.', args, '.rda', sep = '') +); + + diff --git a/OutlierDetectionAlgorithm/9.Number_Outlier_sample_Simulated_Data_5method.R b/OutlierDetectionAlgorithm/9.Number_Outlier_sample_Simulated_Data_5method.R new file mode 100644 index 0000000..4c3b41b --- /dev/null +++ b/OutlierDetectionAlgorithm/9.Number_Outlier_sample_Simulated_Data_5method.R @@ -0,0 +1,488 @@ +#!/usr/bin/env Rscript + +### 9.Number_Outlier_sample_Simulated_Data_5method.R #################################################### +# Compute the 5 statistics of simulated data with patient removal + +# Run parallel: 10 chucnks +args <- commandArgs(trailingOnly = TRUE) + +# Set the working directory +setwd('RNA-seq/CCLE/four_zero/'); + +# Set the name of dataset +dataset.name <- 'CCLE'; + +# load the R environment file saved from 4.Simulated_Data_generation_2.R and 2.Distribution_Identfication.R +load(file = paste('4.Simulated_Data_generation_2.', dataset.name, '.', args, '.rda', sep = '')); +load(file = paste('5.Simulated_Data_5method.', dataset.name, '.', args, '.short.rda', sep = '')); + + +# Required R package +install.packages('extraDistr', repo = 'http://cran.us.r-project.org'); +install.packages('truncnorm', repo = 'http://cran.us.r-project.org'); +install.packages('SnowballC', repo = 'http://cran.us.r-project.org'); +install.packages('lsa', repo = 'http://cran.us.r-project.org'); +library(extraDistr); +library(truncnorm); +library(SnowballC); +library(lsa); +install.packages('parallel', 
repo = 'http://cran.us.r-project.org'); +install.packages('foreach', repo = 'http://cran.us.r-project.org'); +install.packages('doParallel', repo = 'http://cran.us.r-project.org'); +library(parallel); +library(foreach); +library(doParallel); + + + + + + + + +# Manually set the number of patients to be excluded +# - First round should be '1' +patient.arg <- 1; +patient.part.arg <- patient.part[1:(length(patient.part)-patient.arg )]; +sample.number <- patient.part.arg; + +# Remove the patients from the simulated data +negative.simulated.sum.arg <- negative.simulated.sum[,patient.part.arg]; +rownames(negative.simulated.sum.arg) <- rownames(negative.simulated.sum); + +rm(negative.simulated.sum); + + + + + + + + +# Define a minimum value +random.col <- sample(patient.part, 1) +decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), function(x) { + decimal.numbers <- sapply(x, function(y) { + nchar(as.character(y)) - nchar(as.integer(y)) - 1 + }) + return(decimal.numbers) + }) +add.minimum.value <- 1 / 10^as.numeric(max(unlist(decimal.number.max))); + + + +# function: Compute the cosine similarity of the largest data point +cosine.similarity.large.value.percent <- function(x, y, large.value.percent) { + + # rounding function + roundToInteger <- function(z) round(z, digits = 0) + + # check if large value percent is zero + if (0 == large.value.percent) { + large.value.number.integer <- 1; + } + else { + large.value.number <- length(x) * (large.value.percent/100); + large.value.number.integer <- roundToInteger(large.value.number); + } + + # subset the largest values + patient.larger.value <- (length(x)-large.value.number.integer + 1):length(x); + observed.value <- sort(y); + theoretical.value <- sort(x); + mid.value <- c(1, 1); + value.x.y <- data.frame(theoretical.value, observed.value); + + # calculate cosine similarity + cosine.large.value <- NULL; + cosine.large.value <- sapply(patient.larger.value, function(i) { + cosine(as.numeric(value.x.y[i,]), 
c(1, 1)) + }) + cosine.large.value; + } + + + +# function: Trim 5% of samples from each side +trim.sample <- function(x, trim.portion = 5) { + if (length(x) <= 10) { + patient.trim.value <- 2:(length(x)-1); + } else { + trim.sample.number <- length(x) * (trim.portion/100); + trim.sample.number.integer <- round(trim.sample.number, digits = 0); + patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer); + } + patient.trim.value; + } + +outlier.detection.cosine <- function (x, value.portion = 1) { + + # Define a minimum value + decimal.number.max <- lapply(na.omit(x), function(x) { + decimal.numbers <- sapply(x, function(y) { + nchar(as.character(y)) - nchar(as.integer(y)) - 1 + }) + return(decimal.numbers) + }) + add.minimum.value <- 1 / 10^as.numeric(max(unlist(decimal.number.max))); + + + trim.sample <- function(x, trim.portion = 5) { + if (length(x) <= 10) { + patient.trim.value <- 2:(length(x)-1); + } else { + trim.sample.number <- length(x) * (trim.portion/100); + trim.sample.number.integer <- round(trim.sample.number, digits = 0); + patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer); + } + patient.trim.value; + } + + # function: Compute the cosine similarity of the largest data point + cosine.similarity.large.value.percent <- function(x, y, large.value.percent) { + + # rounding function + roundToInteger <- function(z) round(z, digits = 0) + + # check if large value percent is zero + if (0 == large.value.percent) { + large.value.number.integer <- 1; + } + else { + large.value.number <- length(x) * (large.value.percent/100); + large.value.number.integer <- roundToInteger(large.value.number); + } + + # subset the largest values + patient.larger.value <- (length(x)-large.value.number.integer + 1):length(x); + observed.value <- sort(y); + theoretical.value <- sort(x); + mid.value <- c(1, 1); + value.x.y <- data.frame(theoretical.value, observed.value); + + # calculate cosine similarity + 
cosine.large.value <- NULL; + cosine.large.value <- sapply(patient.larger.value, function(i) { + cosine(as.numeric(value.x.y[i,]), c(1, 1)) + }) + cosine.large.value; + } + + sample.fpkm.qq <- na.omit(as.numeric(x[sample.number])) + sample.fpkm.qq.nozero <- sample.fpkm.qq + add.minimum.value; + + + # Trimmed samples -Trim 5% of each side + sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 5); + sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number]; + sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value; + + + # Quantile + p <- ppoints(sample.fpkm.qq.nozero); + + # Distribution + distribution.fit <- as.numeric(x[length(x)]); + + if (1 == distribution.fit){ + # 1. Normal distribution + norm.mean <- mean(sample.fpkm.qq.nozero.trim); + norm.sd <- sd(sample.fpkm.qq.nozero.trim); + # Use truncated norm + norm.quantiles <- qtruncnorm(p, a=0, b=Inf, mean = norm.mean, sd = norm.sd); + obs.quantile.norm <- quantile(sample.fpkm.qq.nozero, prob = p); + last.cos <- cosine.similarity.large.value.percent(norm.quantiles, obs.quantile.norm, large.value.percent = value.portion); + } + else if (2 == distribution.fit) { + # 2. Log-normal distribution + mean.log <- mean(sample.fpkm.qq.nozero.trim); + sd.log <- sd(sample.fpkm.qq.nozero.trim); + m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); + sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); + lnorm.quantile <- qlnorm(p, meanlog = m2, sdlog = sd2); + obs.quantile.lnorm <- quantile(sample.fpkm.qq.nozero, prob = p); + last.cos <- cosine.similarity.large.value.percent(lnorm.quantile, obs.quantile.lnorm, large.value.percent = value.portion); + } + else if (3 == distribution.fit) { + # 3. 
Exponential distribution + exp.rate <- 1 / mean(sample.fpkm.qq.nozero.trim); + exp.quantile <- qexp(p, rate = exp.rate); + obs.quantile.exp <- quantile(sample.fpkm.qq.nozero, prob = p); + last.cos <- cosine.similarity.large.value.percent(exp.quantile, obs.quantile.exp, large.value.percent = value.portion); + } + else if (4 == distribution.fit) { + ### 4 gamma distribution + mean.gamma <- mean(sample.fpkm.qq.nozero.trim); + sd.gamma <- sd(sample.fpkm.qq.nozero.trim); + gamma.shape <- (mean.gamma/sd.gamma)^2; + gamma.rate <- mean.gamma/(sd.gamma^2); + gamma.quantile <- qgamma(p, shape = gamma.shape, rate = gamma.rate); + obs.quantile.gamma <- quantile(sample.fpkm.qq.nozero, prob = p); + last.cos <- cosine.similarity.large.value.percent(gamma.quantile, obs.quantile.gamma, large.value.percent = value.portion); + } + + cosine.sum.distribution.fit <- c(last.cos, distribution.fit); + cosine.sum.distribution.fit; + } + + + + +# Check the cosine similarity +negative.simulated.sum.fit <- cbind(negative.simulated.sum.arg, distribution = data.cosine.negative.t$distribution); +# run it parallel +cl <- makeCluster(20); +# register the cluster with the parallel package +registerDoParallel(cl); +clusterExport(cl, "outlier.detection.cosine"); +clusterEvalQ(cl, c(library(lsa), library(SnowballC))); + +data.cosine.negative <- apply(negative.simulated.sum.fit, + 1, + outlier.detection.cosine, + value.portion = 0); + +stopImplicitCluster(); + + +data.cosine.negative.t <- t(data.cosine.negative); +data.cosine.negative.t.arg <- data.frame(data.cosine.negative.t); +colnames(data.cosine.negative.t.arg) <- c('cosine', 'distribution'); + + + + +# 1,2,3,4 +quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALSE) { + x.na <- na.omit(as.numeric(x)); + if (methods == 'median') { + if (exclude.zero) { + x.nonzero <- x.na[0 != x.na]; + data.median <- median(x.nonzero); + data.mad <- mad(x.nonzero); + } + else { + data.median <- median(x.na); + data.mad <- mad(x.na); + } + 
result.na <- (x.na - data.median) / data.mad; + x[which(!is.na(x))] <- result.na; + x; + } + else if (methods == 'kmean') { + if (exclude.zero) { + if (length(unique(as.numeric(x.na))) == 1) { + kmean.matrix <- rep(NA, length(x.na)); + names(kmean.matrix) <- names(x.na); + } + else { + data.order <- sort(x.na, decreasing = TRUE); + non.zero <- data.order[data.order > 0]; + if (length(unique(as.numeric(non.zero))) <= 2) { + na.matrix <- rep(NA, length(non.zero)); + cluster.zero <- c(na.matrix, rep(0, length(x.na[x.na == 0]))); + kmean.matrix <- cluster.zero[match(x.na, data.order)]; + names(kmean.matrix) <- names(x.na); + } + else { + kmean <- kmeans(non.zero, 2, nstart = 1000); + cluster <- kmean$cluster; + cluster.zero <- c(cluster, rep(0, length(x[x == 0]))); + kmean.matrix <- cluster.zero[match(x.na, data.order)]; + names(kmean.matrix) <- names(x.na); + } + } + } + + else { + if (length(unique(as.numeric(x.na))) == 1) { + kmean.matrix <- rep(NA, length(x.na)); + names(kmean.matrix) <- names(x.na); + } + else { + kmean <- kmeans(x.na, 2, nstart = 1000); + cluster <- kmean$cluster; + kmean.matrix <- cluster; + names(kmean.matrix) <- names(x.na); + } + } + result.na <- kmean.matrix; + x[which(!is.na(x))] <- result.na; + x; + } + else { + gene.order <- x.na[order(x.na, decreasing = TRUE)]; + if (exclude.zero) { + gene.order.nonzero <- gene.order[0 != gene.order]; + top.patient <- round(length(gene.order.nonzero) * (trim / 100), digit = 0); + low.patient <- round(length(gene.order.nonzero) * (1 - (trim / 100)), digit = 0); + data.mean <- mean(gene.order.nonzero, trim = (trim / 100)); + data.sd <- sd(gene.order.nonzero[(top.patient+1):(low.patient)]); + } + else { + top.patient <- round(length(x.na) * (trim / 100), digit = 0); + low.patient <- round(length(x.na) * (1 - (trim / 100)), digit = 0); + data.mean <- mean(gene.order, trim = (trim / 100)); + data.sd <- sd(gene.order[(top.patient+1):(low.patient)]); + } + result.na <- (x.na - data.mean) / data.sd; + 
x[which(!is.na(x))] <- result.na; + x; + } + } + + + + + + +# Parallel running +cl <- makeCluster(20); +# register the cluster with the parallel package +registerDoParallel(cl); + +# 1. MEAN and SD : method = 'mean', trim = 0 +data.mean <- foreach(i=1:nrow(negative.simulated.sum.arg), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum.arg[i,]); +data.mean <- data.frame(data.mean); + +# 2. TRIMMED MEAN and TRIMMED SD : method = 'mean', trim = 5 +data.trimmean <- foreach(i=1:nrow(negative.simulated.sum.arg), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum.arg[i,], trim = 5); +data.trimmean <- data.frame(data.trimmean); + +# 3. MEDIAN and MAD : method = 'median' +data.median <- foreach(i=1:nrow(negative.simulated.sum.arg), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum.arg[i,], methods = 'median'); +data.median <- data.frame(data.median); + +# 4. KMEAN : method = 'kmean' +data.kmean <- foreach(i=1:nrow(negative.simulated.sum.arg), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum.arg[i,], methods = 'kmean') +data.kmean <- data.frame(data.kmean); + +stopCluster(cl) + + + +outlier.detection.zrange <- function(x) { + x.na <- na.omit(x) + zrange <- max(x.na) - min(x.na); + zrange.matrix <- c(x, zrange); + names(zrange.matrix) <- c(names(x), 'zrange'); + zrange.matrix; + } + + + + +# 1. MEAN and SD +data.zrange.mean <- apply(data.mean, 1, outlier.detection.zrange); +mean.simulated.negative.1M.arg <- data.frame(t(data.zrange.mean)); + +# 2. TRIMMED MEAN and TRIMMED SD +data.zrange.trimmean <- apply(data.trimmean, 1, outlier.detection.zrange); +trimmean.simulated.negative.1M.arg <- data.frame(t(data.zrange.trimmean)); + +# 4. 
MEDIAN and MAD +data.zrange.median <- apply(data.median, 1, outlier.detection.zrange); +median.simulated.negative.1M.arg<- data.frame(t(data.zrange.median)); + + + + +### Calculate the kmean fraction ##### +# Function +outlier.detection.kmean <- function(x) { + if (1== length(unique(as.numeric(x)))) { + fraction <- NA; + } + else { + cluster.one <- length(x[x == 1]); + cluster.two <- length(x[x == 2]); + cluster.sum <- cluster.one + cluster.two; + smaller.value <- min(cluster.one, cluster.two); + fraction <- round(smaller.value/cluster.sum, digit = 4); + } + fraction.matrix <- c(x, fraction); + names(fraction.matrix) <- c(names(x), 'fraction'); + fraction.matrix; + } + +# 4. KMEAN fraction +data.fraction.kmean <- apply(data.kmean, 1, outlier.detection.kmean); +kmean.simulated.negative.1M.arg <- data.frame(t(data.fraction.kmean)); + + + + +### Final gene-wise matrix ##### +gene.zrange.fraction.negative.simulated.sum.1M.arg <- cbind(mean.simulated.negative.1M.arg$zrange, + median.simulated.negative.1M.arg$zrange, + trimmean.simulated.negative.1M.arg$zrange, + kmean.simulated.negative.1M.arg$fraction); +rownames(gene.zrange.fraction.negative.simulated.sum.1M.arg) <- rownames(negative.simulated.sum.arg); +colnames(gene.zrange.fraction.negative.simulated.sum.1M.arg) <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean'); + + +# Final statistic matrix +gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.arg <- cbind(gene.zrange.fraction.negative.simulated.sum.1M.arg[,c(1,2,3,4)], + data.cosine.negative.t.arg$cosine, + data.cosine.negative.t.arg$distribution); +colnames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.arg) <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine', 'distribution'); + + + + + +zrange.mean <- paste('mean.simulated.negative.1M.', args, sep = ''); +assign(zrange.mean, mean.simulated.negative.1M.arg); + +zrange.trimmean <- paste('trimmean.simulated.negative.1M.', args, sep = ''); 
+assign(zrange.trimmean, trimmean.simulated.negative.1M.arg); + +zrange.median <- paste('median.simulated.negative.1M.', args, sep = ''); +assign(zrange.median, median.simulated.negative.1M.arg); + +fraction.kmean <- paste('kmean.simulated.negative.1M.', args, sep = ''); +assign(fraction.kmean, kmean.simulated.negative.1M.arg); + +cosine.bic <- paste('data.cosine.negative.t.', args, sep = ''); +assign(cosine.bic, data.cosine.negative.t.arg); + +gene.zrange.fraction <- paste('gene.zrange.fraction.negative.simulated.sum.1M.', args, sep = ''); +assign(gene.zrange.fraction, gene.zrange.fraction.negative.simulated.sum.1M.arg); + +gene.zrange.fraction.cosine <- paste('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args, sep = ''); +assign(gene.zrange.fraction.cosine, gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.arg); + + + + + +save( + fpkm.tumor.symbol.filter, + sample.number, + bic.trim.distribution.fit.obs, + bic.trim.distribution.fit, + list = c( + paste0('gene.zrange.fraction.negative.simulated.sum.1M.', args, sep = ''), + paste0('data.cosine.negative.t.', args, sep = ''), + paste0('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args, sep = '')), + file = paste('9.Number_Outlier_sample_Simulated_Data_5method.', dataset.name, '.', args, '.', patient.arg, '.short.rda', sep = '') + ); + + +save( + fpkm.tumor.symbol.filter, + patient.part, + bic.trim.distribution.fit.obs, + bic.trim.distribution.fit, + list = c( + paste0('mean.simulated.negative.1M.', args, sep = ''), + paste0('median.simulated.negative.1M.', args, sep = ''), + paste0('trimmean.simulated.negative.1M.', args, sep = ''), + paste0('kmean.simulated.negative.1M.', args, sep = ''), + paste0('gene.zrange.fraction.negative.simulated.sum.1M.', args, sep = ''), + paste0('data.cosine.negative.t.', args, sep = ''), + paste0('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args, sep = '')), + file = paste('9.Number_Outlier_sample_Simulated_Data_5method.', 
dataset.name, '.', args, '.', patient.arg, '.long.rda', sep = '') + ); + +