From 8acbc09069baadad0be79f0d4f190b1966e114cb Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Sat, 2 Dec 2023 14:53:49 -0800
Subject: [PATCH 1/2] Refactor `quantify.outliers()` (#22)

* Refactor function `trim.sample`

Refactor function `trim.sample` in `1.Outlier_Detection.R`:
- Move closer to top of script
- Remove local definition in function `outlier.detection.cosine` and
duplicate definition at top level
- Rename parameter `trim.portion` -> `trim`, and change definition to
match `base::mean`, i.e., proportion trimmed rather than percentage
trimmed.

* Return data, not indices in `trim.sample`

Return the trimmed data, rather than the indices of the trimmed data,
in `1.Outlier_Detection.R/trim.sample`.

* Try using `trim.sample` in `quantify.outliers`

Try using `trim.sample` in `quantify.outliers`, specifically in the
block calculating the trimmed mean z-scores.

A comparison of the output on a test sample shows slight differences
between the old and new versions for `trim` > 0, which suggests that
the two implementations select slightly different indices.

* Restore bespoke trimming code in quantify.outliers

Restore bespoke trimming code in
`1.Outlier_Detection.R/quantify.outliers`.

* Remove extra as.numeric calls in quantify.outliers

Remove extra calls to `as.numeric` in
`1.Outlier_Detection.R/quantify.outliers`.

* Correct use of `trim` in `trim.sample`

Correct use of `trim` in call to `1.Outlier_Detection.R/trim.sample`.

* Fix style: constants on left side of equality op

Fix style: constants on left side of equality operator in
`1.Outlier_Detection.R`.

* Parameterize `nstart` for k-means clustering

Parameterize `nstart` for k-means clustering in
`1.Outlier_Detection.R/quantify.outliers`.  The default value `1` is
the same as that of the function `stats::kmeans`.

* Ensure sample is sorted before trimming

Ensure sample is sorted before trimming in
`1.Outlier_Detection.R/trim.sample`.

* Simplify trimming when testing distributions

Simplify the code for trimming the sample when testing the fit of
different distributions.
---
 .../1.Outlier_Detection.R                     | 99 ++++++++-----------
 1 file changed, 43 insertions(+), 56 deletions(-)

diff --git a/OutlierDetectionAlgorithm/1.Outlier_Detection.R b/OutlierDetectionAlgorithm/1.Outlier_Detection.R
index 6b65e9f..1cfe30f 100644
--- a/OutlierDetectionAlgorithm/1.Outlier_Detection.R
+++ b/OutlierDetectionAlgorithm/1.Outlier_Detection.R
@@ -59,17 +59,29 @@ fpkm.tumor.symbol.filter <- fpkm.tumor.symbol[rownames(fpkm.tumor.symbol) %in% n
 molecular.data.filter <- fpkm.tumor.symbol.filter[, patient.part];
 
 
+### Trim sample
+trim.sample <- function(x, trim = 0.05) {
+    x <- sort(x);
+    if (length(x) <= 10) {
+        patient.trim.value <- 2:(length(x)-1);
+        } else {
+        trim.sample.number <- length(x) * trim;
+        trim.sample.number.integer <- round(trim.sample.number);
+        patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer);
+        }
+    x[patient.trim.value];
+    }
 
 
 ### Outlier detection function
 # Default : methods = 'mean', trim = 0
 # 1. MEAN and SD : methods = 'mean', trim = 0
-# 2. TRIMMED MEAN and TRIMMED SD : methods = 'mean', trim = 5
+# 2. TRIMMED MEAN and TRIMMED SD : methods = 'mean', trim = 0.05
 # 3. MEDIAN and MAD : methods = 'median'
-# 4. KMEAN : methods = 'kmean'
-quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALSE) {
+# 4. KMEAN : methods = 'kmean', nstart = 1000
+quantify.outliers <- function(x, methods = 'mean', trim = 0, nstart = 1, exclude.zero = FALSE) {
     x.na <- na.omit(as.numeric(x));
-    if (methods == 'median') {
+    if ('median' == methods) {
         if (exclude.zero) { 
             x.nonzero <- x.na[0 != x.na]; 
             data.median <- median(x.nonzero);
@@ -83,25 +95,25 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS
         x[which(!is.na(x))] <- result.na;
         x;
         }
-    else if (methods == 'kmean') {
+    else if ('kmean' == methods) {
         if (exclude.zero) {
-            if (length(unique(as.numeric(x.na))) == 1) {
+            if (1 == length(unique(x.na))) {
                 kmean.matrix <- rep(NA, length(x.na));
                 names(kmean.matrix) <- names(x.na);
                 } 
             else {
                 data.order <- sort(x.na, decreasing = TRUE);
                 non.zero <- data.order[data.order > 0];
-                if (length(unique(as.numeric(non.zero))) <= 2) {
+                if (length(unique(non.zero)) <= 2) {
                     na.matrix <- rep(NA, length(non.zero));
-                    cluster.zero <- c(na.matrix, rep(0, length(x.na[x.na == 0])));
+                    cluster.zero <- c(na.matrix, rep(0, length(x.na[0 == x.na])));
                     kmean.matrix <- cluster.zero[match(x.na, data.order)];
                     names(kmean.matrix) <- names(x.na);  
                     } 
                 else {
-                    kmean <- kmeans(non.zero, 2, nstart = 1000);
+                    kmean <- kmeans(non.zero, 2, nstart = nstart);
                     cluster <- kmean$cluster;
-                    cluster.zero <- c(cluster, rep(0, length(x[x == 0])));
+                    cluster.zero <- c(cluster, rep(0, length(x[0 == x])));
                     kmean.matrix <- cluster.zero[match(x.na, data.order)];
                     names(kmean.matrix) <- names(x.na);   
                     }
@@ -109,12 +121,12 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS
             } 
     
         else {
-            if (length(unique(as.numeric(x.na))) == 1) {
+            if (1 == length(unique(x.na))) {
                 kmean.matrix <- rep(NA, length(x.na));
                 names(kmean.matrix) <- names(x.na);  
                 } 
             else {
-                kmean <- kmeans(x.na, 2, nstart = 1000);
+                kmean <- kmeans(x.na, 2, nstart = nstart);
                 cluster <- kmean$cluster;
                 kmean.matrix <- cluster;
                 names(kmean.matrix) <- names(x.na);  
@@ -127,16 +139,16 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS
     else {
         gene.order <- x.na[order(x.na, decreasing = TRUE)];
         if (exclude.zero) { 
-            gene.order.nonzero <- gene.order[0 != gene.order]; 
-            top.patient <- round(length(gene.order.nonzero) * (trim / 100), digit = 0);
-            low.patient <- round(length(gene.order.nonzero) * (1 - (trim / 100)), digit = 0);
-            data.mean <- mean(gene.order.nonzero, trim = (trim / 100));
+            gene.order.nonzero <- gene.order[0 != gene.order];
+            top.patient <- round(length(gene.order.nonzero) * trim, digit = 0);
+            low.patient <- round(length(gene.order.nonzero) * (1 - trim), digit = 0);
+            data.mean <- mean(gene.order.nonzero, trim = trim);
             data.sd <- sd(gene.order.nonzero[(top.patient+1):(low.patient)]);
             } 
         else {
-            top.patient <- round(length(x.na) * (trim / 100), digit = 0);
-            low.patient <- round(length(x.na) * (1 - (trim / 100)), digit = 0);
-            data.mean <- mean(gene.order, trim = (trim / 100));
+            top.patient <- round(length(x.na) * trim, digit = 0);
+            low.patient <- round(length(x.na) * (1 - trim), digit = 0);
+            data.mean <- mean(gene.order, trim = trim);
             data.sd <- sd(gene.order[(top.patient+1):(low.patient)]);
             }
         result.na <- (x.na - data.mean) / data.sd;
@@ -158,7 +170,7 @@ data.mean <- foreach(i=1:nrow(molecular.data.filter), .combine = rbind) %dopar%
 data.mean <- data.frame(data.mean);
 
 # 2. TRIMMED MEAN and TRIMMED SD : method = 'mean', trim = 5
-data.trimmean <- foreach(i=1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], trim = 5);
+data.trimmean <- foreach(i=1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], trim = 0.05);
 data.trimmean <- data.frame(data.trimmean);
 
 # 3. MEDIAN and MAD : method = 'median'
@@ -166,7 +178,7 @@ data.median <- foreach(i=1:nrow(molecular.data.filter), .combine = rbind) %dopar
 data.median <- data.frame(data.median);
 
 # 4. KMEAN : method = 'kmean'
-data.kmean <- foreach(i=1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], methods = 'kmean')
+data.kmean <- foreach(i=1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], methods = 'kmean', nstart = 1000)
 data.kmean <- data.frame(data.kmean);
 
 stopCluster(cl)
@@ -202,12 +214,12 @@ data.zrange.median.t <- data.frame(t(data.zrange.median));
 ### Calculate the kmean fraction #####
 # Function
 outlier.detection.kmean <- function(x) {
-    if (1== length(unique(as.numeric(x)))) {
+    if (1 == length(unique(as.numeric(x)))) {
         fraction <- NA;
         }
     else {
-        cluster.one <- length(x[x == 1]);
-        cluster.two <- length(x[x == 2]);
+        cluster.one <- length(x[1 == x]);
+        cluster.two <- length(x[2 == x]);
         cluster.sum <- cluster.one + cluster.two;
         smaller.value <- min(cluster.one, cluster.two);
         fraction <- round(smaller.value/cluster.sum, digit = 4);
@@ -238,18 +250,6 @@ outlier.detection.cosine <- function (x, value.portion = 1) {
         })    
     add.minimum.value <- 1 / 10^as.numeric(max(unlist(decimal.number.max)));
 
-
-    trim.sample <- function(x, trim.portion = 5) {
-        if (length(x) <= 10) {
-            patient.trim.value <- 2:(length(x)-1);
-        } else {
-            trim.sample.number <- length(x) * (trim.portion/100);
-            trim.sample.number.integer <- round(trim.sample.number, digits = 0);
-            patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer);
-            }
-        patient.trim.value;
-        }
-
     # function: Compute the cosine similarity of the largest data point
     cosine.similarity.large.value.percent <- function(x, y, large.value.percent) {
 
@@ -285,7 +285,7 @@ outlier.detection.cosine <- function (x, value.portion = 1) {
     
 
     # Trimmed samples -Trim 5% of each side
-    sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 5);
+    sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 0.05);
     sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number];
     sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value;
 
@@ -338,18 +338,6 @@ outlier.detection.cosine <- function (x, value.portion = 1) {
     }
 
 
-# Trimming function
-trim.sample <- function(x, trim.portion = 5) {
-    if (length(x) <= 10) {
-        patient.trim.value <- 2:(length(x)-1);
-    } else {
-        trim.sample.number <- length(x) * (trim.portion/100);
-        trim.sample.number.integer <- round(trim.sample.number, digits = 0);
-        patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer);
-        }
-    patient.trim.value;
-    }
-
 # Determine the distribution
 # Find the best fitted distribution
 cl <- makeCluster(2);
@@ -374,15 +362,14 @@ bic.trim.distribution <- NULL;
 # Use foreach to iterate over the rows of fpkm.tumor.symbol.filter in parallel
 bic.trim.distribution <- foreach(j = 1:nrow(fpkm.tumor.symbol.filter), .combine = rbind) %dopar% {
     sample.fpkm.qq <- round(as.numeric(fpkm.tumor.symbol.filter[j,patient.part]), digits = 6);
-    sample.trim.number <- trim.sample(sample.number, 5);
-    sample.fpkm.qq.sort <- sort(sample.fpkm.qq)[sample.trim.number];
-    sample.fpkm.qq.nozero <- sample.fpkm.qq.sort + add.minimum.value;
+    sample.fpkm.qq.trimmed <- trim.sample(sample.fpkm.qq, 0.05)
+    sample.fpkm.qq.trimmed.nozero <- sample.fpkm.qq.trimmed + add.minimum.value;
     
     
-    glm.norm <- gamlss(sample.fpkm.qq.nozero ~ 1, family=NO);
-    glm.lnorm <- gamlss(sample.fpkm.qq.nozero ~ 1, family=LNO);
-    glm.gamma <- gamlss(sample.fpkm.qq.nozero ~ 1, family=GA);
-    glm.exp <- gamlss(sample.fpkm.qq.nozero ~ 1, family=EXP);
+    glm.norm <- gamlss(sample.fpkm.qq.trimmed.nozero ~ 1, family=NO);
+    glm.lnorm <- gamlss(sample.fpkm.qq.trimmed.nozero ~ 1, family=LNO);
+    glm.gamma <- gamlss(sample.fpkm.qq.trimmed.nozero ~ 1, family=GA);
+    glm.exp <- gamlss(sample.fpkm.qq.trimmed.nozero ~ 1, family=EXP);
 
     glm.bic <- c(glm.norm$sbc,
                  glm.lnorm$sbc,

From 2e74746a31ac8b45a5c93df15ed7deb659f7e691 Mon Sep 17 00:00:00 2001
From: jeeyunhan <jyhan@mednet.ucla.edu>
Date: Sat, 2 Dec 2023 20:06:46 -0800
Subject: [PATCH 2/2] Algorithm update

---
 .../10.Number_Outlier_sample_5method.R        | 425 +++++++++++++++
 ...er_sample_Simulated_Data_5method_combine.R |  40 ++
 ...ier_sample_Significant_Outlier_Detection.R | 156 ++++++
 ...e_Significant_Outlier_Pvalue_Calculation.R |  45 ++
 ...er_Outlier_sample_Simulated_Data_5method.R | 488 ++++++++++++++++++
 5 files changed, 1154 insertions(+)
 create mode 100644 OutlierDetectionAlgorithm/10.Number_Outlier_sample_5method.R
 create mode 100644 OutlierDetectionAlgorithm/11.Number_Outlier_sample_Simulated_Data_5method_combine.R
 create mode 100644 OutlierDetectionAlgorithm/12.Number_Outlier_sample_Significant_Outlier_Detection.R
 create mode 100644 OutlierDetectionAlgorithm/13.Number_Outlier_sample_Significant_Outlier_Pvalue_Calculation.R
 create mode 100644 OutlierDetectionAlgorithm/9.Number_Outlier_sample_Simulated_Data_5method.R

diff --git a/OutlierDetectionAlgorithm/10.Number_Outlier_sample_5method.R b/OutlierDetectionAlgorithm/10.Number_Outlier_sample_5method.R
new file mode 100644
index 0000000..9069e6a
--- /dev/null
+++ b/OutlierDetectionAlgorithm/10.Number_Outlier_sample_5method.R
@@ -0,0 +1,425 @@
+#!/usr/bin/env Rscript
+
+
+### 10.Number_Outlier_sample_5method.R ####################################################
+# Basically, this is the same script as 1.Outlier_Detection.R
+#   - exclude the largest value (exclude one patient) and run the outlier detection algorithm
+
+
+# Set the working directory
+setwd('RNA-seq/CCLE/four_zero/');
+
+# Set the name of the dataset
+dataset.name <- 'CCLE';
+
+# Required R packages
+# Install and load the 'gamlss' package
+install.packages('gamlss', repo = 'http://cran.us.r-project.org');
+library(gamlss);
+# Install and load the 'doParallel' package
+install.packages('doParallel', repo = 'http://cran.us.r-project.org');
+library(doParallel);
+# Install and load the 'foreach' package
+install.packages('foreach', repo = 'http://cran.us.r-project.org');
+library(foreach);
+# Install and load the 'parallel' package
+install.packages('parallel', repo = 'http://cran.us.r-project.org');
+library(parallel);
+# Install and load the 'extraDistr' package
+install.packages('extraDistr', repo = 'http://cran.us.r-project.org');
+library(extraDistr);
+# Install and load the 'truncnorm' package
+install.packages('truncnorm', repo = 'http://cran.us.r-project.org');
+library(truncnorm);
+# Install and load the 'lsa' package
+install.packages('lsa', repo = 'http://cran.us.r-project.org');
+library(lsa);
+# Install and load the 'SnowballC' package
+install.packages('SnowballC', repo = 'http://cran.us.r-project.org');
+library(SnowballC);
+
+
+# Load the R environment
+#   - 1. File from script 1: short version
+load(file = '2023-09-18_CCLE_final_outlier_rank_bic.short.rda');
+
+
+# Set the number of patients to be excluded
+#    - First round should be '1'
+args <- commandArgs(trailingOnly = TRUE);
+args.num <- as.numeric(args);
+
+# Sample number
+patient.part.arg <- patient.part[1:(length(patient.part)-args.num)];
+sample.number <- patient.part.arg;
+
+# Remove the largest value of each gene
+fpkm.tumor.symbol.filter.arg <- NULL;
+for(i in 1:nrow(fpkm.tumor.symbol.filter)) {
+    fpkm.sort.1 <- sort(as.numeric(fpkm.tumor.symbol.filter[i,patient.part]))[seq(length(patient.part.arg))];
+    fpkm.tumor.symbol.filter.arg <- rbind(fpkm.tumor.symbol.filter.arg, fpkm.sort.1);
+    }
+rownames(fpkm.tumor.symbol.filter.arg) <- rownames(fpkm.tumor.symbol.filter);
+
+
+
+# Same script from #1
+# function: Compute the cosine similarity of the largest data point
+cosine.similarity.large.value.percent <- function(x, y, large.value.percent) {
+
+    # rounding function
+    roundToInteger <- function(z) round(z, digits = 0)
+
+    # check if large value percent is zero
+    if (0 == large.value.percent) {
+        large.value.number.integer <- 1;
+        }
+    else {
+        large.value.number <- length(x) * (large.value.percent/100);
+        large.value.number.integer <- roundToInteger(large.value.number);
+        }
+    
+    # subset the largest values
+    patient.larger.value <- (length(x)-large.value.number.integer + 1):length(x);
+    observed.value <- sort(y);
+    theoretical.value <- sort(x);
+    mid.value <- c(1, 1);
+    value.x.y <- data.frame(theoretical.value, observed.value);
+
+    # calculate cosine similarity
+    cosine.large.value <- NULL;
+    cosine.large.value <- sapply(patient.larger.value, function(i) {
+        cosine(as.numeric(value.x.y[i,]), c(1, 1))
+        })
+    cosine.large.value;
+    }
+
+
+
+
+
+outlier.detection.cosine <- function (x, value.portion = 1) {
+
+        # Define a minimum value
+    decimal.number.max <- lapply(na.omit(x), function(x) {
+        decimal.numbers <- sapply(x, function(y) {
+            nchar(as.character(y)) - nchar(as.integer(y)) - 1
+            })
+        return(decimal.numbers)
+        })    
+    add.minimum.value <- 1 / 10^as.numeric(max(unlist(decimal.number.max)));
+    
+    # function: Trim 5% of samples from each side
+    trim.sample <- function(x, trim.portion = 5) {
+        trim.sample.number <- length(x) * (trim.portion/100);
+        trim.sample.number.integer <- round(trim.sample.number, digits = 0);
+        patient.trimr.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer);
+        patient.trimr.value;
+        }
+
+    sample.fpkm.qq <- as.numeric(x[sample.number])
+    sample.fpkm.qq.nozero <- sample.fpkm.qq + add.minimum.value;
+    
+
+    # Trimmed samples - trim 5% from each side
+    sample.trim.number <- trim.sample(sample.number, 5);
+    sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number];
+    sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value;
+
+    
+    # Quantile
+    p <- ppoints(sample.fpkm.qq.nozero);
+    
+    # Distribution
+    distribution.fit <- as.numeric(x[length(x)]);
+    
+    if (1 == distribution.fit){
+        # 1. Normal distribution
+        norm.mean <- mean(sample.fpkm.qq.nozero.trim);
+        norm.sd <- sd(sample.fpkm.qq.nozero.trim);
+        # Use truncated norm
+        norm.quantiles <- qtruncnorm(p, a=0, b=Inf, mean = norm.mean, sd = norm.sd);
+        obs.quantile.norm <- quantile(sample.fpkm.qq.nozero, prob = p);
+        last.cos <- cosine.similarity.large.value.percent(norm.quantiles, obs.quantile.norm, large.value.percent = value.portion);
+        }
+    else if (2 == distribution.fit) {
+        # 2. Log-normal distribution
+        mean.log <- mean(sample.fpkm.qq.nozero.trim);
+        sd.log <- sd(sample.fpkm.qq.nozero.trim);
+        m2 <-  log(mean.log^2 / sqrt(sd.log^2 + mean.log^2));
+        sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2)));
+        lnorm.quantile <- qlnorm(p, meanlog = m2, sdlog = sd2);
+        obs.quantile.lnorm <- quantile(sample.fpkm.qq.nozero, prob = p);
+        last.cos <- cosine.similarity.large.value.percent(lnorm.quantile, obs.quantile.lnorm, large.value.percent = value.portion);
+        }
+    else if (3 == distribution.fit) {
+        # 3. Exponential distribution
+        exp.rate <- 1 / mean(sample.fpkm.qq.nozero.trim);
+        exp.quantile <- qexp(p, rate = exp.rate);
+        obs.quantile.exp <- quantile(sample.fpkm.qq.nozero, prob = p);
+        last.cos <- cosine.similarity.large.value.percent(exp.quantile, obs.quantile.exp, large.value.percent = value.portion);
+        }
+    else if (4 == distribution.fit) {
+        ### 4 gamma distribution
+        mean.gamma <- mean(sample.fpkm.qq.nozero.trim);
+        sd.gamma <- sd(sample.fpkm.qq.nozero.trim);
+        gamma.shape <- (mean.gamma/sd.gamma)^2;
+        gamma.rate <- mean.gamma/(sd.gamma^2);
+        gamma.quantile <- qgamma(p, shape = gamma.shape, rate = gamma.rate);
+        obs.quantile.gamma <- quantile(sample.fpkm.qq.nozero, prob = p);
+        last.cos <- cosine.similarity.large.value.percent(gamma.quantile, obs.quantile.gamma, large.value.percent = value.portion);
+        }
+
+    cosine.sum.distribution.fit <- c(last.cos, distribution.fit);
+    cosine.sum.distribution.fit;
+    }
+
+
+# Determine the distribution
+# Find the best fitted distribution
+
+trim.sample <- function(x, trim.portion = 5) {
+    trim.sample.number <- length(x) * (trim.portion/100);
+    trim.sample.number.integer <- round(trim.sample.number, digits = 0);
+    patient.trimr.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer);
+    patient.trimr.value;
+    }
+
+
+
+
+
+cl <- makeCluster(25);
+# register the cluster with the parallel package
+registerDoParallel(cl);
+clusterExport(cl, "outlier.detection.cosine");
+clusterEvalQ(cl, c(library(lsa), library(SnowballC)));
+
+fpkm.tumor.symbol.filter.bic.fit <- cbind(fpkm.tumor.symbol.filter.arg, distribution = bic.trim.distribution.fit);
+data.cosine.bic <- apply(fpkm.tumor.symbol.filter.bic.fit, 
+                         1, 
+                         outlier.detection.cosine, 
+                         value.portion = 0);
+
+stopImplicitCluster();
+
+
+data.cosine.bic.t.arg <- data.frame(t(data.cosine.bic));
+colnames(data.cosine.bic.t.arg) <- c('cosine', 'distribution');
+
+
+
+
+# 1,2,3,4 
+quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALSE) {
+    x <- as.numeric(x);
+    if (methods == 'median') {
+        if (exclude.zero) { 
+            x.nonzero <- x[0 != x]; 
+            data.median <- median(x.nonzero);
+            data.mad <- mad(x.nonzero);
+            } 
+        else {
+            data.median <- median(x);
+            data.mad <- mad(x);
+            }
+        (x - data.median) / data.mad;
+        }
+    else if (methods == 'kmean') {
+        if (exclude.zero) {
+            if (length(unique(as.numeric(x))) == 1) {
+                kmean.matrix <- rep(NA, length(x));
+                names(kmean.matrix) <- names(x);
+                } 
+            else {
+                data.order <- sort(x, decreasing = TRUE);
+                non.zero <- data.order[data.order > 0];
+                if (length(unique(as.numeric(non.zero))) <= 2) {
+                    na.matrix <- rep(NA, length(non.zero));
+                    cluster.zero <- c(na.matrix, rep(0, length(x[x == 0])));
+                    kmean.matrix <- cluster.zero[match(x, data.order)];
+                    names(kmean.matrix) <- names(x);  
+                    } 
+                else {
+                    kmean <- kmeans(non.zero, 2, nstart = 1000);
+                    cluster <- kmean$cluster;
+                    cluster.zero <- c(cluster, rep(0, length(x[x == 0])));
+                    kmean.matrix <- cluster.zero[match(x, data.order)];
+                    names(kmean.matrix) <- names(x);   
+                    }
+                }
+            } 
+    
+        else {
+            if (length(unique(as.numeric(x))) == 1) {
+                kmean.matrix <- rep(NA, length(x));
+                names(kmean.matrix) <- names(x);  
+                } 
+            else {
+                kmean <- kmeans(x, 2, nstart = 1000);
+                cluster <- kmean$cluster;
+                kmean.matrix <- cluster;
+                names(kmean.matrix) <- names(x);  
+                }
+            }
+        kmean.matrix;
+        }
+    else {
+        gene.order <- x[order(x, decreasing = TRUE)];
+        if (exclude.zero) { 
+            gene.order.nonzero <- gene.order[0 != gene.order]; 
+            top.patient <- round(length(gene.order.nonzero) * (trim / 100), digit = 0);
+            low.patient <- round(length(gene.order.nonzero) * (1 - (trim / 100)), digit = 0);
+            data.mean <- mean(gene.order.nonzero, trim = (trim / 100));
+            data.sd <- sd(gene.order.nonzero[(top.patient+1):(low.patient)]);
+            } 
+        else {
+            top.patient <- round(length(x) * (trim / 100), digit = 0);
+            low.patient <- round(length(x) * (1 - (trim / 100)), digit = 0);
+            data.mean <- mean(gene.order, trim = (trim / 100));
+            data.sd <- sd(gene.order[(top.patient+1):(low.patient)]);
+            }
+        (x - data.mean) / data.sd;
+        }
+    }
+
+
+# Parallel running
+cl <- makeCluster(detectCores()-1);
+# register the cluster with the parallel package
+registerDoParallel(cl);
+
+# 1. MEAN and SD : method = 'mean', trim = 0
+data.mean <- foreach(i=1:nrow(fpkm.tumor.symbol.filter.arg[,patient.part.arg]), .combine = rbind) %dopar% quantify.outliers(fpkm.tumor.symbol.filter.arg[,patient.part.arg][i,]);
+data.mean <- data.frame(data.mean);
+
+# 2. TRIMMED MEAN and TRIMMED SD : method = 'mean', trim = 5
+data.trimmean <- foreach(i=1:nrow(fpkm.tumor.symbol.filter.arg[,patient.part.arg]), .combine = rbind) %dopar% quantify.outliers(fpkm.tumor.symbol.filter.arg[,patient.part.arg][i,], trim = 5);
+data.trimmean <- data.frame(data.trimmean);
+
+# 3. MEDIAN and MAD : method = 'median'
+data.median <- foreach(i=1:nrow(fpkm.tumor.symbol.filter.arg[,patient.part.arg]), .combine = rbind) %dopar% quantify.outliers(fpkm.tumor.symbol.filter.arg[,patient.part.arg][i,], methods = 'median');
+data.median <- data.frame(data.median);
+
+# 4. KMEAN : method = 'kmean'
+data.kmean <- foreach(i=1:nrow(fpkm.tumor.symbol.filter.arg[,patient.part.arg]), .combine = rbind) %dopar% quantify.outliers(fpkm.tumor.symbol.filter.arg[,patient.part.arg][i,], methods = 'kmean')
+data.kmean <- data.frame(data.kmean);
+
+stopCluster(cl)
+
+
+
+
+
+outlier.detection.zrange <- function(x) {
+    zrange <- max(x) - min(x);
+    zrange.matrix <- c(x, zrange);
+    names(zrange.matrix) <- c(names(x), 'zrange');
+    zrange.matrix;
+    }
+
+
+
+
+# 1. MEAN and SD
+data.zrange.mean <- apply(data.mean, 1, outlier.detection.zrange);
+data.zrange.mean.t.arg <- data.frame(t(data.zrange.mean));
+
+# 2. TRIMMED MEAN and TRIMMED SD
+data.zrange.trimmean <- apply(data.trimmean, 1, outlier.detection.zrange);
+data.zrange.trimmean.t.arg <- data.frame(t(data.zrange.trimmean));
+
+# 3. MEDIAN and MAD
+data.zrange.median <- apply(data.median, 1, outlier.detection.zrange);
+data.zrange.median.t.arg <- data.frame(t(data.zrange.median));
+
+
+
+
+### Calculate the kmean fraction #####
+# Function
+outlier.detection.kmean <- function(x) {
+    if (1== length(unique(as.numeric(x)))) {
+        fraction <- NA;
+        }
+    else {
+        cluster.one <- length(x[x == 1]);
+        cluster.two <- length(x[x == 2]);
+        cluster.sum <- cluster.one + cluster.two;
+        smaller.value <- min(cluster.one, cluster.two);
+        fraction <- round(smaller.value/cluster.sum, digit = 4);
+        }
+    fraction.matrix <- c(x, fraction);
+    names(fraction.matrix) <- c(names(x), 'fraction');
+    fraction.matrix;
+    }
+
+# 4. KMEAN fraction
+data.fraction.kmean <- apply(data.kmean, 1, outlier.detection.kmean);
+data.fraction.kmean.t.arg <- data.frame(t(data.fraction.kmean));
+
+
+
+
+### Final gene-wise matrix #####
+### Final gene-wise matrix #####
+gene.zrange.fraction.cosine.last.point.bic.arg <- data.frame(cbind(data.zrange.mean.t.arg$zrange,
+                     data.zrange.median.t.arg$zrange,
+                     data.zrange.trimmean.t.arg$zrange,
+                     data.fraction.kmean.t.arg$fraction,
+                     data.cosine.bic.t.arg$cosine,
+                     data.cosine.bic.t.arg$distribution));
+rownames(gene.zrange.fraction.cosine.last.point.bic.arg) <- rownames(fpkm.tumor.symbol.filter);
+colnames(gene.zrange.fraction.cosine.last.point.bic.arg) <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine', 'distribution');
+
+
+
+zrange.mean <- paste('data.zrange.mean.t.', args.num, sep = '');
+assign(zrange.mean, data.zrange.mean.t.arg);
+
+zrange.trimmean <- paste('data.zrange.trimmean.t.', args.num, sep = '');
+assign(zrange.trimmean, data.zrange.trimmean.t.arg);
+
+zrange.median <- paste('data.zrange.median.t.', args.num, sep = '');
+assign(zrange.median, data.zrange.median.t.arg);
+
+fraction.kmean <- paste('data.fraction.kmean.t.', args.num, sep = '');
+assign(fraction.kmean, data.fraction.kmean.t.arg);
+
+cosine.bic <- paste('data.cosine.bic.t.', args.num, sep = '');
+assign(cosine.bic, data.cosine.bic.t.arg);
+
+gene.zrange.fraction <- paste('gene.zrange.fraction.cosine.last.point.bic.', args.num, sep = '');
+assign(gene.zrange.fraction, gene.zrange.fraction.cosine.last.point.bic.arg);
+
+
+
+
+save(
+    fpkm.tumor.symbol.filter,
+    patient.part,
+    sample.number,
+    bic.trim.distribution.fit,
+    list = paste0('gene.zrange.fraction.cosine.last.point.bic.', args.num, sep = ''),
+    file = paste0('10.Number_Outlier_sample_5method.', args.num, '.short.rda', sep = '')
+    );
+
+
+save(
+    fpkm.tumor.symbol.filter,
+    patient.part,
+    sample.number,
+    bic.trim.distribution.fit,
+    list = c(
+        paste0('data.zrange.mean.t.', args.num, sep = ''),
+        paste0('data.zrange.trimmean.t.', args.num, sep = ''),
+        paste0('data.zrange.median.t.', args.num, sep = ''),
+        paste0('data.fraction.kmean.t.', args.num, sep = ''),
+        paste0('data.cosine.bic.t.', args.num, sep = ''),
+        paste0('gene.zrange.fraction.cosine.last.point.bic.', args.num, sep = '')),
+    file = paste0('10.Number_Outlier_sample_5method.', args.num, '.long.rda', sep = '')
+    );
+
+
+
+
diff --git a/OutlierDetectionAlgorithm/11.Number_Outlier_sample_Simulated_Data_5method_combine.R b/OutlierDetectionAlgorithm/11.Number_Outlier_sample_Simulated_Data_5method_combine.R
new file mode 100644
index 0000000..a8b2c03
--- /dev/null
+++ b/OutlierDetectionAlgorithm/11.Number_Outlier_sample_Simulated_Data_5method_combine.R
@@ -0,0 +1,40 @@
+#!/usr/bin/env Rscript
+
+### 11.Number_Outlier_sample_Simulated_Data_5method_combine.R ####################################################
+# Combine the 10 chunks of statistics results
+
+
+# Set the working directory
+setwd('RNA-seq/CCLE/four_zero/');
+
+# Set the name of dataset
+dataset.name <- 'CCLE';
+
+# Combine all 10 chunks
+args <- commandArgs(trailingOnly = TRUE)
+
+
+for (i in 1:10) {
+    load(paste('9.Number_Outlier_sample_Simulated_Data_5method.', dataset.name, '.', i, '.', args, '.short.rda', sep = ''));
+    gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- get(paste('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', i, sep = ''));
+    all.statistics <- paste('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args, i, sep = '');
+    assign(all.statistics, gene.zrange.fraction.negative.simulated.sum.bic.5method.1M);    
+    }
+
+#1. residue.negative.random.number.bic
+gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- NULL;
+for (i in 1:10) {
+    p.value <- get(paste('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args, i, sep = ''));
+    gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- rbind(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M, p.value);
+    }
+
+
+gene.zrange.fraction <- paste('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args, sep = '');
+assign(gene.zrange.fraction, gene.zrange.fraction.negative.simulated.sum.bic.5method.1M);
+
+save(
+    fpkm.tumor.symbol.filter,
+    bic.trim.distribution.fit,
+    list = paste0('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args, sep = ''),
+    file = paste('11.Number_Outlier_sample_Simulated_Data_5method_combine.', dataset.name, '.', args, '.rda', sep = '')
+    );
diff --git a/OutlierDetectionAlgorithm/12.Number_Outlier_sample_Significant_Outlier_Detection.R b/OutlierDetectionAlgorithm/12.Number_Outlier_sample_Significant_Outlier_Detection.R
new file mode 100644
index 0000000..1bc7935
--- /dev/null
+++ b/OutlierDetectionAlgorithm/12.Number_Outlier_sample_Significant_Outlier_Detection.R
@@ -0,0 +1,156 @@
+#!/usr/bin/env Rscript
+
+### 12.Significant_Outlier_Detection.R ####################################################
+# Compute p-values 
+
+
+# Set the working directory
+setwd('RNA-seq/CCLE/four_zero/');
+
+# Set the name of dataset
+# dataset.name <- 'Matador';
+
+
+# Required R packages ('parallel' ships with base R, so only the CRAN packages may need installing)
+for (pkg in c('foreach', 'doParallel')) {
+    if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg, repos = 'http://cran.us.r-project.org');
+    }
+library(parallel);
+library(foreach);
+library(doParallel);
+
+
+dataset.name <- 'CCLE';
+
+# Run 1000 genes at once
+#   - array number should be ceiling(nrow(fpkm.tumor.symbol.filter)/1000)
+args <- commandArgs(trailingOnly = TRUE);
+row.num.args <- as.numeric(args);
+
+
+# This will be used to identify the number of outlier patients per gene
+#   - if '0', use all patients (first step)
+#   - if '1', use n-1 patients (exclude the patient having the largest value)
+#   - repeat this with '2', '3', '4'... until there are no outlier genes
+data.args <- 1;
+
+
+# Load the R environment
+load(file = paste('10.Number_Outlier_sample_5method.', data.args, '.short.rda', sep = ''));
+load(file = paste('11.Number_Outlier_sample_Simulated_Data_5method_combine.', dataset.name, '.', data.args, '.rda', sep = ''));
+
+
+
+gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.new <- get(paste('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', data.args, sep = ''));
+gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.new <- data.frame(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.new);
+
+### Rank each methods #####
+# Function: rank the rows of x within each of the five statistics; returns (invisibly) a data frame of ranks
+outlier.rank <- function(x) {
+    methods <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine');
+    # TRUE: rank the negated statistic, so a larger z-score range gets a smaller rank
+    # FALSE: rank the statistic as is, so a smaller kmean fraction/cosine gets a smaller rank
+    negate <- c(TRUE, TRUE, TRUE, FALSE, FALSE);
+    rank.per.method <- lapply(seq_along(methods), function(j) {
+        statistic <- x[,methods[j]];
+        if (negate[j]) {
+            statistic <- -statistic;
+            }
+        rank(statistic, ties.method = 'max', na.last = 'keep');
+        });
+    rank.matrix <- do.call(cbind, rank.per.method);
+    rownames(rank.matrix) <- rownames(x);
+    colnames(rank.matrix) <- methods;
+    # one column of ranks per method, one row per row of x
+    invisible(data.frame(rank.matrix));
+    }
+
+
+### Rank product to determine Top ranked genes #####
+# Function: geometric mean of the non-missing ranks of one gene
+# x: one row of the ranked matrix (its first five entries are the per-method ranks)
+# NA.number: the result is NA unless more than this number of methods have a non-NA rank
+outlier.rank.product <- function(x, NA.number = 0) {
+    method.ranks <- as.numeric(x[1:5]);
+    available <- sum(!is.na(method.ranks));
+    # not enough methods produced a rank for this gene
+    if (available <= NA.number) {
+        return(NA);
+        }
+    # geometric mean over the methods that did produce a rank
+    prod(method.ranks, na.rm = TRUE)^(1/available);
+    }
+
+
+
+### Combine matrix
+# - relabel the null data as 'ND' + row index; the number of labels follows the data instead of being fixed at 1,000,000
+gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relable <- cbind(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.new,
+                                                                            gene = rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.new));
+rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relable) <- paste0('ND', seq_len(nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relable)));
+
+
+
+gene.number.start.end.matrix <- NULL;
+for (i in 1:ceiling(nrow(fpkm.tumor.symbol.filter)/1000)) {
+    if (i == ceiling(nrow(fpkm.tumor.symbol.filter)/1000)) {
+        range.number <- c((i-1)*1000 + 1, nrow(fpkm.tumor.symbol.filter));
+        gene.number.start.end.matrix <- data.frame(rbind(
+            gene.number.start.end.matrix,
+            i = range.number
+            ));
+        } 
+    else {
+        range.number<- c((i-1)*1000 + 1, i*1000)
+        gene.number.start.end.matrix <- data.frame(rbind(
+            gene.number.start.end.matrix,
+            i = range.number
+            ));
+        }
+    }
+
+
+
+cl <- makeCluster(20);
+# register the cluster with the parallel package
+registerDoParallel(cl);
+clusterExport(cl, c("outlier.rank", "outlier.rank.product"));
+
+gene.zrange.fraction.fpkm.bic.5method.1M.data <- get(paste('gene.zrange.fraction.cosine.last.point.bic.', data.args, sep = ''));
+
+gene.rank.p.value.one.gene <- NULL;
+gene.rank.p.value.one.gene <- foreach(i = as.numeric(gene.number.start.end.matrix[row.num.args,1]):as.numeric(gene.number.start.end.matrix[row.num.args,2]), .combine=rbind) %dopar% {
+  observed.gene <-  gene.zrange.fraction.fpkm.bic.5method.1M.data[i,1:5];
+  combine.matrix <- rbind(observed.gene,
+                          gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relable[,1:5]);
+  # get ranks
+  data.rank.bic <- outlier.rank(combine.matrix);
+  rank.product.bic <- apply(data.rank.bic, 1, outlier.rank.product, NA.number = 3);
+  gene.rank.poduct.bic <- cbind(data.rank.bic,
+                                rank.product.bic);
+  obs <- rank.product.bic[1]
+  null <- rank.product.bic[2:(nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relable) + 1)]
+  length.null <- nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relable);
+  obs.p.value <- (sum(obs >= null) + 1) / (length.null + 1)
+  
+  obs.p.value.rank <- cbind(gene.rank.poduct.bic[1,], obs.p.value);
+  p.value.one.gene <- data.frame(x = obs.p.value.rank, i = i);
+  p.value.one.gene;
+}
+
+
+p.value.one <- paste('gene.rank.p.value.one.gene.', data.args, sep = '');
+assign(p.value.one, gene.rank.p.value.one.gene);
+
+# `cl` was created with makeCluster(), so it must be stopped explicitly (stopImplicitCluster() would leave it running)
+stopCluster(cl);
+
+
+
+
+save(
+    list = paste0('gene.rank.p.value.one.gene.', data.args), # name of the object created by assign() above
+    file = paste('12.Number_Outlier_sample_Significant_Outlier_Detection.', dataset.name, '.', row.num.args, '.',data.args, '.rda', sep = '')
+    );
+
+
diff --git a/OutlierDetectionAlgorithm/13.Number_Outlier_sample_Significant_Outlier_Pvalue_Calculation.R b/OutlierDetectionAlgorithm/13.Number_Outlier_sample_Significant_Outlier_Pvalue_Calculation.R
new file mode 100644
index 0000000..04213e4
--- /dev/null
+++ b/OutlierDetectionAlgorithm/13.Number_Outlier_sample_Significant_Outlier_Pvalue_Calculation.R
@@ -0,0 +1,45 @@
+#!/usr/bin/env Rscript
+
+### 13.Number_Outlier_sample_Significant_Outlier_Pvalue_Calculation.R ####################################################
+# Compute p-values 
+
+
+# Set the working directory
+setwd('RNA-seq/CCLE/four_zero/');
+
+# Set the name of dataset
+dataset.name <- 'CCLE';
+
+# Number of excluded patients
+args <- commandArgs(trailingOnly = TRUE)
+
+
+# Manually enter 'ceiling(nrow(fpkm.tumor.symbol.filter)/1000)'
+#   - should be changed depending on the dataset
+row.chunk.num <- 14;
+
+
+for (i in 1:row.chunk.num) {
+    load(file = paste('12.Number_Outlier_sample_Significant_Outlier_Detection.', dataset.name, '.', i, '.', args, '.rda', sep = ''));
+    p.value.set <- paste('gene.rank.p.value.', i, sep = '');
+    assign(p.value.set, get(paste('gene.rank.p.value.one.gene.', args, sep = '')));    
+    }
+
+#1. residue.negative.random.number.bic
+gene.p.value.each.null <- NULL;
+for (i in 1:row.chunk.num) {
+    p.value <- get(paste('gene.rank.p.value.', i, sep = ''));
+    gene.p.value.each.null <- rbind(gene.p.value.each.null, p.value);
+    }
+
+
+p.value.all <- paste('gene.rank.p.value.one.gene.p', args, sep = '');
+assign(p.value.all, gene.p.value.each.null);
+
+
+save(
+  list = paste0('gene.rank.p.value.one.gene.p', args), # name of the combined object created by assign() above
+  file = paste('13.Number_Outlier_sample_Significant_Outlier_Pvalue_Calculation.', dataset.name, '.', args, '.rda', sep = '')
+);
+
+
diff --git a/OutlierDetectionAlgorithm/9.Number_Outlier_sample_Simulated_Data_5method.R b/OutlierDetectionAlgorithm/9.Number_Outlier_sample_Simulated_Data_5method.R
new file mode 100644
index 0000000..4c3b41b
--- /dev/null
+++ b/OutlierDetectionAlgorithm/9.Number_Outlier_sample_Simulated_Data_5method.R
@@ -0,0 +1,488 @@
+#!/usr/bin/env Rscript
+
+### 9.Number_Outlier_sample_Simulated_Data_5method.R ####################################################
+# Compute the 5 statistics of simulated data with patient removal
+
+# Run parallel: 10 chunks
+args <- commandArgs(trailingOnly = TRUE)
+
+# Set the working directory
+setwd('RNA-seq/CCLE/four_zero/');
+
+# Set the name of dataset
+dataset.name <- 'CCLE';
+
+# load the R environment file saved from 4.Simulated_Data_generation_2.R and 2.Distribution_Identfication.R
+load(file = paste('4.Simulated_Data_generation_2.', dataset.name, '.', args, '.rda', sep = ''));
+load(file = paste('5.Simulated_Data_5method.', dataset.name, '.', args, '.short.rda', sep = ''));
+
+
+# Required R packages: install only what is missing instead of re-downloading on every run
+#   - 'parallel' ships with base R, so it needs no installation
+cran.packages <- c('extraDistr', 'truncnorm', 'SnowballC', 'lsa', 'foreach', 'doParallel');
+for (pkg in cran.packages) {
+    if (!requireNamespace(pkg, quietly = TRUE)) {
+        install.packages(pkg, repos = 'http://cran.us.r-project.org');
+        }
+    }
+library(extraDistr);
+library(truncnorm);
+library(SnowballC);
+library(lsa);
+library(parallel);
+library(foreach);
+library(doParallel);
+
+
+
+
+
+
+
+
+# Manually set the number of patients to be excluded
+#    - First round should be '1'
+patient.arg <- 1;
+patient.part.arg <- patient.part[1:(length(patient.part)-patient.arg )];
+sample.number <- patient.part.arg;
+
+# Remove the patients from the simulated data
+negative.simulated.sum.arg <- negative.simulated.sum[,patient.part.arg];
+rownames(negative.simulated.sum.arg) <- rownames(negative.simulated.sum);
+
+rm(negative.simulated.sum);
+
+
+
+
+
+
+
+
+# Define a minimum value
+random.col <- sample(patient.part, 1)
+decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), function(x) {
+    decimal.numbers <- sapply(x, function(y) {
+        nchar(as.character(y)) - nchar(as.integer(y)) - 1
+        })
+    return(decimal.numbers)
+    })    
+add.minimum.value <- 1 / 10^as.numeric(max(unlist(decimal.number.max)));
+
+
+
+# function: Compute the cosine similarity of the largest data point
+#   - x: theoretical quantiles; y: observed quantiles (each is sorted before pairing)
+#   - large.value.percent: percentage of the largest points to score; 0 scores only the largest point
+#   - returns one cosine similarity per scored point
+cosine.similarity.large.value.percent <- function(x, y, large.value.percent) {
+    # number of largest points to score
+    if (0 == large.value.percent) {
+        large.value.number.integer <- 1;
+        }
+    else {
+        # a small percentage of a short vector rounds to 0, which would make the index
+        # range below run backwards; always keep at least the largest point
+        large.value.number.integer <- max(1, round(length(x) * (large.value.percent/100), digits = 0));
+        }
+
+    # subset the largest values
+    patient.larger.value <- (length(x)-large.value.number.integer + 1):length(x);
+    observed.value <- sort(y);
+    theoretical.value <- sort(x);
+    # direction of the identity line (observed == theoretical)
+    mid.value <- c(1, 1);
+    value.x.y <- data.frame(theoretical.value, observed.value);
+
+    # calculate cosine similarity
+    cosine.large.value <- sapply(patient.larger.value, function(i) {
+        cosine(as.numeric(value.x.y[i,]), mid.value)
+        })
+    cosine.large.value;
+    }
+
+
+
+# function: Trim 5% of samples from each side
+#   - returns the positions that remain after trimming (not the values themselves)
+trim.sample <- function(x, trim.portion = 5) {
+    sample.size <- length(x);
+    # small samples (10 or fewer): drop exactly one position from each side
+    number.to.drop <- 1;
+    if (sample.size > 10) {
+        number.to.drop <- round(sample.size * (trim.portion/100), digits = 0);
+        }
+    (number.to.drop + 1):(sample.size - number.to.drop);
+    }
+
+outlier.detection.cosine <- function (x, value.portion = 1) {
+    # x: values of one gene followed by its distribution code (last element); returns the cosine similarities of the largest points followed by that code
+    # Define a minimum value
+    decimal.number.max <- lapply(na.omit(x), function(x) {
+        decimal.numbers <- sapply(x, function(y) {
+            nchar(as.character(y)) - nchar(as.integer(y)) - 1
+            })
+        return(decimal.numbers)
+        })
+    add.minimum.value <- 1 / 10^as.numeric(max(unlist(decimal.number.max)));
+
+    # function: positions kept after trimming `trim.portion` percent (one sample if n <= 10) from each side
+    trim.sample <- function(x, trim.portion = 5) {
+        if (length(x) <= 10) {
+            patient.trim.value <- 2:(length(x)-1);
+        } else {
+            trim.sample.number <- length(x) * (trim.portion/100);
+            trim.sample.number.integer <- round(trim.sample.number, digits = 0);
+            patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer);
+            }
+        patient.trim.value;
+        }
+
+    # function: Compute the cosine similarity of the largest data point
+    cosine.similarity.large.value.percent <- function(x, y, large.value.percent) {
+
+        # rounding function
+        roundToInteger <- function(z) round(z, digits = 0)
+
+        # check if large value percent is zero
+        if (0 == large.value.percent) {
+            large.value.number.integer <- 1;
+            }
+        else {
+            large.value.number <- length(x) * (large.value.percent/100);
+            large.value.number.integer <- max(1, roundToInteger(large.value.number)); # rounding to 0 would reverse the index range below
+            }
+
+        # subset the largest values
+        patient.larger.value <- (length(x)-large.value.number.integer + 1):length(x);
+        observed.value <- sort(y);
+        theoretical.value <- sort(x);
+        mid.value <- c(1, 1);
+        value.x.y <- data.frame(theoretical.value, observed.value);
+
+        # calculate cosine similarity
+        # (each selected point is compared with the direction of the identity line)
+        cosine.large.value <- sapply(patient.larger.value, function(i) {
+            cosine(as.numeric(value.x.y[i,]), mid.value)
+            })
+        cosine.large.value;
+        }
+
+    sample.fpkm.qq <- na.omit(as.numeric(x[sample.number])) # `sample.number` is not an argument; it is looked up outside this function
+    sample.fpkm.qq.nozero <- sample.fpkm.qq + add.minimum.value;
+
+
+    # Trimmed samples -Trim 5% of each side
+    sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 5);
+    sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number];
+    sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value;
+
+
+    # Quantile
+    p <- ppoints(sample.fpkm.qq.nozero);
+
+    # Distribution
+    distribution.fit <- as.numeric(x[length(x)]);
+    if (!(distribution.fit %in% 1:4)) stop('Unknown distribution code: ', distribution.fit);
+    if (1 == distribution.fit){
+        # 1. Normal distribution
+        norm.mean <- mean(sample.fpkm.qq.nozero.trim);
+        norm.sd <- sd(sample.fpkm.qq.nozero.trim);
+        # Use truncated norm
+        norm.quantiles <- qtruncnorm(p, a=0, b=Inf, mean = norm.mean, sd = norm.sd);
+        obs.quantile.norm <- quantile(sample.fpkm.qq.nozero, probs = p);
+        last.cos <- cosine.similarity.large.value.percent(norm.quantiles, obs.quantile.norm, large.value.percent = value.portion);
+        }
+    else if (2 == distribution.fit) {
+        # 2. Log-normal distribution
+        mean.log <- mean(sample.fpkm.qq.nozero.trim);
+        sd.log <- sd(sample.fpkm.qq.nozero.trim);
+        m2 <-  log(mean.log^2 / sqrt(sd.log^2 + mean.log^2));
+        sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2)));
+        lnorm.quantile <- qlnorm(p, meanlog = m2, sdlog = sd2);
+        obs.quantile.lnorm <- quantile(sample.fpkm.qq.nozero, probs = p);
+        last.cos <- cosine.similarity.large.value.percent(lnorm.quantile, obs.quantile.lnorm, large.value.percent = value.portion);
+        }
+    else if (3 == distribution.fit) {
+        # 3. Exponential distribution
+        exp.rate <- 1 / mean(sample.fpkm.qq.nozero.trim);
+        exp.quantile <- qexp(p, rate = exp.rate);
+        obs.quantile.exp <- quantile(sample.fpkm.qq.nozero, probs = p);
+        last.cos <- cosine.similarity.large.value.percent(exp.quantile, obs.quantile.exp, large.value.percent = value.portion);
+        }
+    else if (4 == distribution.fit) {
+        ### 4 gamma distribution
+        mean.gamma <- mean(sample.fpkm.qq.nozero.trim);
+        sd.gamma <- sd(sample.fpkm.qq.nozero.trim);
+        gamma.shape <- (mean.gamma/sd.gamma)^2;
+        gamma.rate <- mean.gamma/(sd.gamma^2);
+        gamma.quantile <- qgamma(p, shape = gamma.shape, rate = gamma.rate);
+        obs.quantile.gamma <- quantile(sample.fpkm.qq.nozero, probs = p);
+        last.cos <- cosine.similarity.large.value.percent(gamma.quantile, obs.quantile.gamma, large.value.percent = value.portion);
+        }
+
+    cosine.sum.distribution.fit <- c(last.cos, distribution.fit);
+    cosine.sum.distribution.fit;
+    }
+
+
+
+
+# Check the cosine similarity
+negative.simulated.sum.fit <- cbind(negative.simulated.sum.arg, distribution = data.cosine.negative.t$distribution);
+# NOTE: apply() below runs serially in this process; the cluster is set up but not used by it
+cl <- makeCluster(20);
+# register the cluster with the parallel package
+registerDoParallel(cl);
+clusterExport(cl, "outlier.detection.cosine");
+clusterEvalQ(cl, c(library(lsa), library(SnowballC)));
+
+data.cosine.negative <- apply(negative.simulated.sum.fit,
+                         1,
+                         outlier.detection.cosine,
+                         value.portion = 0);
+# stopImplicitCluster() only stops clusters created by registerDoParallel() itself, so stop `cl` explicitly
+stopCluster(cl);
+
+
+data.cosine.negative.t <- t(data.cosine.negative);
+data.cosine.negative.t.arg <- data.frame(data.cosine.negative.t);
+colnames(data.cosine.negative.t.arg) <- c('cosine', 'distribution');
+
+
+
+
+# Methods 1-4: per-gene z-scores ('mean' with optional `trim` percent, 'median') or k-means cluster labels ('kmean'); NA positions of x are kept
+quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALSE) {
+    x.na <- na.omit(as.numeric(x)); # non-missing values; results are written back into the non-NA slots of x
+    if (methods == 'median') {
+        if (exclude.zero) {
+            x.nonzero <- x.na[0 != x.na];
+            data.median <- median(x.nonzero);
+            data.mad <- mad(x.nonzero);
+            }
+        else {
+            data.median <- median(x.na);
+            data.mad <- mad(x.na);
+            }
+        result.na <- (x.na - data.median) / data.mad;
+        x[which(!is.na(x))] <- result.na;
+        x;
+        }
+    else if (methods == 'kmean') {
+        if (exclude.zero) {
+            if (length(unique(as.numeric(x.na))) == 1) {
+                kmean.matrix <- rep(NA, length(x.na));
+                names(kmean.matrix) <- names(x.na);
+                }
+            else {
+                data.order <- sort(x.na, decreasing = TRUE);
+                non.zero <- data.order[data.order > 0];
+                if (length(unique(as.numeric(non.zero))) <= 2) {
+                    na.matrix <- rep(NA, length(non.zero));
+                    cluster.zero <- c(na.matrix, rep(0, length(x.na[x.na == 0])));
+                    kmean.matrix <- cluster.zero[match(x.na, data.order)]; # back to the input order
+                    names(kmean.matrix) <- names(x.na);
+                    }
+                else {
+                    kmean <- kmeans(non.zero, 2, nstart = 1000);
+                    cluster <- kmean$cluster;
+                    cluster.zero <- c(cluster, rep(0, length(x.na[x.na == 0]))); # zeros counted on the NA-free vector, as in the branch above
+                    kmean.matrix <- cluster.zero[match(x.na, data.order)]; # back to the input order
+                    names(kmean.matrix) <- names(x.na);
+                    }
+                }
+            }
+
+        else {
+            if (length(unique(as.numeric(x.na))) == 1) {
+                kmean.matrix <- rep(NA, length(x.na));
+                names(kmean.matrix) <- names(x.na);
+                }
+            else {
+                kmean <- kmeans(x.na, 2, nstart = 1000);
+                cluster <- kmean$cluster;
+                kmean.matrix <- cluster;
+                names(kmean.matrix) <- names(x.na);
+                }
+            }
+        result.na <- kmean.matrix;
+        x[which(!is.na(x))] <- result.na;
+        x;
+        }
+    else {
+        gene.order <- x.na[order(x.na, decreasing = TRUE)];
+        if (exclude.zero) {
+            gene.order.nonzero <- gene.order[0 != gene.order];
+            top.patient <- round(length(gene.order.nonzero) * (trim / 100), digits = 0);
+            low.patient <- round(length(gene.order.nonzero) * (1 - (trim / 100)), digits = 0);
+            data.mean <- mean(gene.order.nonzero, trim = (trim / 100));
+            data.sd <- sd(gene.order.nonzero[(top.patient+1):(low.patient)]); # sd of the values left after trimming both ends
+            }
+        else {
+            top.patient <- round(length(x.na) * (trim / 100), digits = 0);
+            low.patient <- round(length(x.na) * (1 - (trim / 100)), digits = 0);
+            data.mean <- mean(gene.order, trim = (trim / 100));
+            data.sd <- sd(gene.order[(top.patient+1):(low.patient)]); # sd of the values left after trimming both ends
+            }
+        result.na <- (x.na - data.mean) / data.sd;
+        x[which(!is.na(x))] <- result.na;
+        x;
+        }
+    }
+
+
+
+
+
+
+# Parallel running
+cl <- makeCluster(20);
+# register the cluster with the parallel package
+registerDoParallel(cl);
+
+# 1. MEAN and SD : method = 'mean', trim = 0
+data.mean <- foreach(i=1:nrow(negative.simulated.sum.arg), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum.arg[i,]);
+data.mean <- data.frame(data.mean);
+
+# 2. TRIMMED MEAN and TRIMMED SD : method = 'mean', trim = 5
+data.trimmean <- foreach(i=1:nrow(negative.simulated.sum.arg), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum.arg[i,], trim = 5);
+data.trimmean <- data.frame(data.trimmean);
+
+# 3. MEDIAN and MAD : method = 'median'
+data.median <- foreach(i=1:nrow(negative.simulated.sum.arg), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum.arg[i,], methods = 'median');
+data.median <- data.frame(data.median);
+
+# 4. KMEAN : method = 'kmean'
+data.kmean <- foreach(i=1:nrow(negative.simulated.sum.arg), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum.arg[i,], methods = 'kmean')
+data.kmean <- data.frame(data.kmean);
+
+stopCluster(cl)
+
+
+
+outlier.detection.zrange <- function(x) {
+    # spread of the non-missing z-scores of one gene
+    zrange <- max(x, na.rm = TRUE) - min(x, na.rm = TRUE);
+    # keep the original values and append the range as the last element, named 'zrange'
+    with.zrange <- c(x, zrange);
+    setNames(with.zrange, c(names(x), 'zrange'));
+    }
+
+
+
+
+# 1. MEAN and SD
+data.zrange.mean <- apply(data.mean, 1, outlier.detection.zrange);
+mean.simulated.negative.1M.arg <- data.frame(t(data.zrange.mean));
+
+# 2. TRIMMED MEAN and TRIMMED SD
+data.zrange.trimmean <- apply(data.trimmean, 1, outlier.detection.zrange);
+trimmean.simulated.negative.1M.arg <- data.frame(t(data.zrange.trimmean));
+
+# 3. MEDIAN and MAD
+data.zrange.median <- apply(data.median, 1, outlier.detection.zrange);
+median.simulated.negative.1M.arg<- data.frame(t(data.zrange.median));
+
+
+
+
+### Calculate the kmean fraction #####
+# Function: append the share of clustered samples that fall in the smaller k-means cluster as 'fraction'
+outlier.detection.kmean <- function(x) {
+    # count the members of each cluster; NA labels must not be counted for either cluster
+    cluster.one <- sum(x == 1, na.rm = TRUE);
+    cluster.two <- sum(x == 2, na.rm = TRUE);
+    cluster.sum <- cluster.one + cluster.two;
+    if (1 == length(unique(as.numeric(x))) || 0 == cluster.sum) {
+        fraction <- NA; # a single label (e.g. all NA) or no clustered sample: undefined
+        }
+    else {
+        fraction <- round(min(cluster.one, cluster.two)/cluster.sum, digits = 4);
+        }
+    fraction.matrix <- c(x, fraction);
+    names(fraction.matrix) <- c(names(x), 'fraction');
+    fraction.matrix;
+    }
+
+# 4. KMEAN fraction
+data.fraction.kmean <- apply(data.kmean, 1, outlier.detection.kmean);
+kmean.simulated.negative.1M.arg <- data.frame(t(data.fraction.kmean));
+
+
+
+
+### Final gene-wise matrix #####
+gene.zrange.fraction.negative.simulated.sum.1M.arg <- cbind(mean.simulated.negative.1M.arg$zrange,
+                     median.simulated.negative.1M.arg$zrange,
+                     trimmean.simulated.negative.1M.arg$zrange,
+                     kmean.simulated.negative.1M.arg$fraction);
+rownames(gene.zrange.fraction.negative.simulated.sum.1M.arg) <- rownames(negative.simulated.sum.arg);
+colnames(gene.zrange.fraction.negative.simulated.sum.1M.arg) <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean');
+
+
+# Final statistic matrix
+gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.arg <- cbind(gene.zrange.fraction.negative.simulated.sum.1M.arg[,c(1,2,3,4)],
+                                                    data.cosine.negative.t.arg$cosine,
+                                                    data.cosine.negative.t.arg$distribution);
+colnames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.arg) <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine', 'distribution');
+
+
+
+
+
+zrange.mean <- paste('mean.simulated.negative.1M.', args, sep = '');
+assign(zrange.mean, mean.simulated.negative.1M.arg);
+
+zrange.trimmean <- paste('trimmean.simulated.negative.1M.', args, sep = '');
+assign(zrange.trimmean, trimmean.simulated.negative.1M.arg);
+
+zrange.median <- paste('median.simulated.negative.1M.', args, sep = '');
+assign(zrange.median, median.simulated.negative.1M.arg);
+
+fraction.kmean <- paste('kmean.simulated.negative.1M.', args, sep = '');
+assign(fraction.kmean, kmean.simulated.negative.1M.arg);
+
+cosine.bic <- paste('data.cosine.negative.t.', args, sep = '');
+assign(cosine.bic, data.cosine.negative.t.arg);
+
+gene.zrange.fraction <- paste('gene.zrange.fraction.negative.simulated.sum.1M.', args, sep = '');
+assign(gene.zrange.fraction, gene.zrange.fraction.negative.simulated.sum.1M.arg);
+
+gene.zrange.fraction.cosine <- paste('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args, sep = '');
+assign(gene.zrange.fraction.cosine, gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.arg);
+
+
+
+
+
+save(
+    fpkm.tumor.symbol.filter,
+    sample.number,
+    bic.trim.distribution.fit.obs,
+    bic.trim.distribution.fit,
+    list = c( # names of the per-chunk objects created by assign() above
+        paste0('gene.zrange.fraction.negative.simulated.sum.1M.', args),
+        paste0('data.cosine.negative.t.', args),
+        paste0('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args)),
+    file = paste('9.Number_Outlier_sample_Simulated_Data_5method.', dataset.name, '.', args, '.', patient.arg, '.short.rda', sep = '')
+    );
+
+
+save(
+    fpkm.tumor.symbol.filter,
+    patient.part,
+    bic.trim.distribution.fit.obs,
+    bic.trim.distribution.fit,
+    list = c( # names of the per-chunk objects created by assign() above
+        paste0('mean.simulated.negative.1M.', args),
+        paste0('median.simulated.negative.1M.', args),
+        paste0('trimmean.simulated.negative.1M.', args),
+        paste0('kmean.simulated.negative.1M.', args),
+        paste0('gene.zrange.fraction.negative.simulated.sum.1M.', args),
+        paste0('data.cosine.negative.t.', args),
+        paste0('gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.', args)),
+    file = paste('9.Number_Outlier_sample_Simulated_Data_5method.', dataset.name, '.', args, '.', patient.arg, '.long.rda', sep = '')
+    );
+
+