fix merge conflict with jsahrmann-refactor-quantify-outliers

uclahs-cds · Dec 6, 2023 · 5efd555 · 5efd555
2 parents b55dac9 + 3ad0938
commit 5efd555
Show file tree

Hide file tree

Showing 6 changed files with 1,186 additions and 50 deletions.
diff --git a/OutlierDetectionAlgorithm/1.Outlier_Detection.R b/OutlierDetectionAlgorithm/1.Outlier_Detection.R
@@ -93,21 +93,32 @@ annot.filter <- annot[which(0.01 > zero.portion), ]
 
 molecular.data.filter <- fpkm.tumor.symbol.filter[, patient.part];
 
+### Trim sample
+trim.sample <- function(x, trim = 0.05) {
+    x <- sort(x);
+    if (length(x) <= 10) {
+        patient.trim.value <- 2:(length(x)-1);
+        } else {
+        trim.sample.number <- length(x) * trim;
+        trim.sample.number.integer <- round(trim.sample.number);
+        patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer);
+        }
+    x[patient.trim.value];
+    }
+
 # Would this be faster if they were separate functions instead of if statements ?
 
 ### Outlier detection function
 # Default : methods = 'mean', trim = 0
 # 1. MEAN and SD : methods = 'mean', trim = 0
-# 2. TRIMMED MEAN and TRIMMED SD : methods = 'mean', trim = 5
+# 2. TRIMMED MEAN and TRIMMED SD : methods = 'mean', trim = 0.05
 # 3. MEDIAN and MAD : methods = 'median'
-# 4. KMEAN : methods = 'kmean'
-
-quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALSE) {
-    # a better name might be x.complete, x.na makes me thingk it includes NAs
+# 4. KMEAN : methods = 'kmean', nstart = 1000
+quantify.outliers <- function(x, methods = 'mean', trim = 0, nstart = 1, exclude.zero = FALSE) {
     x.na <- na.omit(as.numeric(x));
-    if (methods == 'median') {
-        if (exclude.zero) {
-            x.nonzero <- x.na[0 != x.na];
+    if ('median' == methods) {
+        if (exclude.zero) { 
+            x.nonzero <- x.na[0 != x.na]; 
             data.median <- median(x.nonzero);
             data.mad <- mad(x.nonzero);
             }
@@ -119,37 +130,37 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS
         x[which(!is.na(x))] <- result.na;
         x;
         }
-    else if (methods == 'kmean') {
+    else if ('kmean' == methods) {
         if (exclude.zero) {
-            if (length(unique(as.numeric(x.na))) == 1) {
+            if (1 == length(unique(x.na))) {
                 kmean.matrix <- rep(NA, length(x.na));
                 names(kmean.matrix) <- names(x.na);
             }
             else {
                 data.order <- sort(x.na, decreasing = TRUE);
                 non.zero <- data.order[data.order > 0];
-                if (length(unique(as.numeric(non.zero))) <= 2) {
+                if (length(unique(non.zero)) <= 2) {
                     na.matrix <- rep(NA, length(non.zero));
-                    cluster.zero <- c(na.matrix, rep(0, length(x.na[x.na == 0])));
+                    cluster.zero <- c(na.matrix, rep(0, length(x.na[0 == x.na])));
                     kmean.matrix <- cluster.zero[match(x.na, data.order)];
                     names(kmean.matrix) <- names(x.na);
                     }
                 else {
-                    kmean <- kmeans(non.zero, 2, nstart = 100);
+                    kmean <- kmeans(non.zero, 2, nstart = nstart);
                     cluster <- kmean$cluster;
-                    cluster.zero <- c(cluster, rep(0, length(x[x == 0])));
+                    cluster.zero <- c(cluster, rep(0, length(x[0 == x])));
                     kmean.matrix <- cluster.zero[match(x.na, data.order)];
                     names(kmean.matrix) <- names(x.na);
                     }
                 }
             }
         else {
-            if (length(unique(as.numeric(x.na))) == 1) {
+            if (1 == length(unique(x.na))) {
                 kmean.matrix <- rep(NA, length(x.na));
                 names(kmean.matrix) <- names(x.na);
                 }
             else {
-                kmean <- kmeans(x.na, 2, nstart = 100);
+                kmean <- kmeans(x.na, 2, nstart = nstart);
                 cluster <- kmean$cluster;
                 kmean.matrix <- cluster;
                 names(kmean.matrix) <- names(x.na);
@@ -192,8 +203,6 @@ data.mean <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar
 data.mean <- data.frame(data.mean);
 
 # 2. TRIMMED MEAN and TRIMMED SD : method = 'mean', trim = 5
-print('Calculating using TRIMMED MEAN and TRIMMED SD')
-print(Sys.time())
 data.trimmean <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], trim = 5);
 data.trimmean <- data.frame(data.trimmean);
 
@@ -204,9 +213,7 @@ data.median <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dop
 data.median <- data.frame(data.median);
 
 # 4. KMEAN : method = 'kmean'
-print('Calculating using KMEANS')
-print(Sys.time())
-data.kmean <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], methods = 'kmean')
+data.kmean <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], methods = 'kmean', nstart = 1000)
 data.kmean <- data.frame(data.kmean);
 
 stopCluster(cl = cl)
@@ -235,16 +242,15 @@ data.zrange.trimmean.t <- data.frame(t(data.zrange.trimmean));
 data.zrange.median <- apply(data.median, 1, outlier.detection.zrange);
 data.zrange.median.t <- data.frame(t(data.zrange.median));
 
-
 ### Calculate the kmean fraction #####
 # Function
 outlier.detection.kmean <- function(x) {
     if (1 == length(unique(as.numeric(x)))) {
         fraction <- NA;
         }
     else {
-        cluster.one <- length(x[x == 1]);
-        cluster.two <- length(x[x == 2]);
+        cluster.one <- length(x[1 == x]);
+        cluster.two <- length(x[2 == x]);
         cluster.sum <- cluster.one + cluster.two;
         smaller.value <- min(cluster.one, cluster.two);
         fraction <- round(smaller.value / cluster.sum, digit = 4);
@@ -259,7 +265,6 @@ print('Outlier detection kmeans')
 data.fraction.kmean <- apply(data.kmean, 1, outlier.detection.kmean);
 data.fraction.kmean.t <- data.frame(t(data.fraction.kmean));
 
-
 # 5. Cosine similarity
 # function: Compute the cosine similarity of the largest data point
 outlier.detection.cosine <- function(x, value.portion = 1) {
@@ -273,17 +278,6 @@ outlier.detection.cosine <- function(x, value.portion = 1) {
         })
     add.minimum.value <- 1 / 10^as.numeric(max(unlist(decimal.number.max)));
 
-    trim.sample <- function(x, trim.portion = 5) {
-        if (length(x) <= 10) {
-            patient.trim.value <- 2:(length(x) - 1);
-        } else {
-            trim.sample.number <- length(x) * (trim.portion / 100);
-            trim.sample.number.integer <- round(trim.sample.number, digits = 0);
-            patient.trim.value <- (trim.sample.number.integer + 1):(length(x) - trim.sample.number.integer);
-            }
-        patient.trim.value;
-        }
-
     # function: Compute the cosine similarity of the largest data point
     cosine.similarity.large.value.percent <- function(x, y, large.value.percent) {
 
@@ -318,7 +312,7 @@ outlier.detection.cosine <- function(x, value.portion = 1) {
     sample.fpkm.qq.nozero <- sample.fpkm.qq + add.minimum.value;
 
     # Trimmed samples -Trim 5% of each side
-    sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 5);
+    sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 0.05);
     sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number];
     sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value;
 
@@ -369,18 +363,6 @@ outlier.detection.cosine <- function(x, value.portion = 1) {
     cosine.sum.distribution.fit;
     }
 
-# Trimming function
-trim.sample <- function(x, trim.portion = 5) {
-    if (length(x) <= 10) {
-        patient.trim.value <- 2:(length(x) - 1);
-    } else {
-        trim.sample.number <- length(x) * (trim.portion / 100);
-        trim.sample.number.integer <- round(trim.sample.number, digits = 0);
-        patient.trim.value <- (trim.sample.number.integer + 1):(length(x) - trim.sample.number.integer);
-        }
-    patient.trim.value;
-    }
-
 print('Determine the distribution')
 # Determine the distribution
 # Find the best fitted distribution
@@ -411,6 +393,7 @@ bic.trim.distribution <- NULL;
 # Use foreach to iterate over the rows of fpkm.tumor.symbol.filter in parallel
 bic.trim.distribution <- foreach(j = 1:nrow(fpkm.tumor.symbol.filter), .combine = rbind) %dopar% {
     sample.fpkm.qq <- round(as.numeric(fpkm.tumor.symbol.filter[j,patient.part]), digits = 6);
+
     sample.trim.number <- trim.sample(sample.number, 5);
     sample.fpkm.qq.sort <- sort(sample.fpkm.qq)[sample.trim.number];
     sample.fpkm.qq.nozero <- sample.fpkm.qq.sort + add.minimum.value;
@@ -477,7 +460,6 @@ gene.zrange.fraction.cosine.last.point.bic <- data.frame(
 rownames(gene.zrange.fraction.cosine.last.point.bic) <- rownames(fpkm.tumor.symbol.filter);
 colnames(gene.zrange.fraction.cosine.last.point.bic) <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine', 'distribution');
 
-
 print('Rank each method')
 ### Rank each methods #####
 # Function