update scripts 7 and 8

uclahs-cds · Dec 6, 2023 · 1c976f2 · 1c976f2
1 parent 6de0f30
commit 1c976f2
Show file tree

Hide file tree

Showing 2 changed files with 107 additions and 95 deletions.
diff --git a/OutlierDetectionAlgorithm/7.Significant_Outlier_Detection.R b/OutlierDetectionAlgorithm/7.Significant_Outlier_Detection.R
@@ -1,20 +1,29 @@
 #!/usr/bin/env Rscript
 
+# Rscript 7.Significant_Outlier_Detection.R --dataset.name BRCA_EU --working.directory /hot/user/jlivingstone/outlier/run_method \
+# --rank.file /hot/users/jlivingstone/outlier/run_method/2023-11-20_BRCA-EU_final_outlier_rank_bic.short.rda \
+# --combined.file /hot/users/jlivingstone/outlier/run_method/2023-11-30_Simulated_Data_5method_combine_BRCA_EU.rda \
+# --row.chunks 1000 --matrix.chunk 1 --method.iteration 0
+
 ### 7.Significant_Outlier_Detection.R ####################################################
 # Compute p-values 
 
 # Required R package
-library(parallel);
-library(foreach);
+library(BoutrosLab.utilities)
 library(doParallel);
+library(foreach);
 library(getopt)
+library(parallel);
 
 params <- matrix(
     data = c(
-	'dataset.name', 'd', '0', 'character',
+        'dataset.name', 'd', '0', 'character',
         'working.directory', 'w', '0', 'character',
-        'rank.file.', 'r', '0', 'character',
-	'combined.file', 'c', '0', 'character'
+        'rank.file.', 'f', '0', 'character',
+        'combined.file', 'c', '0', 'character',
+        'row.chunks', 'r', '0', 'character',
+	'matrix.chunk', 'm', '0', 'character',
+	'method.iteration', 'i', '0', 'character'
         ),
     ncol = 4,
     byrow = TRUE
@@ -25,32 +34,28 @@ dataset.name <- opt$dataset.name
 working.directory <- opt$working.directory
 rank.file <- opt$rank.file
 combined.file <- opt$combined.file
+row.chunks <- as.numeric(opt$row.chunks)
+matrix.chunk <- as.numeric(opt$matrix.chunk)
+
+# This will be used to identify the number of outlier patients per gene
+#   - if '0', use all patients (first step)
+#   - if '1', use n-1 patients (exclude the patient having the largest value)
+#   - repeat with '2', '3', '4'... until there are no outlier genes
+method.iteration <- opt$method.iteration
 
 # Set the working directory
 setwd(working.directory);
 
 # Load the R environment
 #   - 1. File from script 1: short version
 load(
-	file = rank.file
-	)
+    file = rank.file
+    )
+
 #   - 2. File from script 6
 load(
-	file = combined.file
-	)
-
-#why?
-#gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- data.frame(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M);
-
-# Run 1000 simulated genes at once
-#   - array number should be ceiling(nrow(fpkm.tumor.symbol.filter))
-row.num.args <- as.numeric(args);
-
-# This will be used identify the number of outlier patients per gene
-#   - if '0', use whole patients (first step)
-#   - if '1', use n-1 patients (exclude the patient having the largest value)
-#   - repeat this '2', '3', '4'... until there is no outlier genes
-data.args <- 0;
+    file = combined.file
+    )
 
 ### Rank each methods #####
 # Function
@@ -61,10 +66,10 @@ outlier.rank <- function(outlier.matrix) {
     # Give rank for each methods based on z-score range/fraction of kmean
     for (i in 1:length(methods)) {
         rank.methods <- rank(
-		x = outlier.matrix[,i] * direction[i],
-		ties.method = 'max',
-		na.last = 'keep'
-		)
+            x = outlier.matrix[,i] * direction[i],
+            ties.method = 'max',
+            na.last = 'keep'
+            )
         rank.matrix <- cbind(rank.matrix, rank.methods);
         }
     rownames(rank.matrix) <- rownames(outlier.matrix);
@@ -86,22 +91,24 @@ outlier.rank.product <- function(data.rank, NA.number = 0) {
         }
     }
 
-
 ### Combine matrix
 # - relabel the null data
 gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel <- data.frame(
-	gene.zrange.fraction.negative.simulated.sum.bic.5method.1M,
-	gene = rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M)
-	)
-rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel) <- paste0('ND', 1:nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M)
+    gene.zrange.fraction.negative.simulated.sum.bic.5method.1M,
+    gene = rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M)
+    )
+rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel) <- paste0('ND', 1:nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M))
+
+# gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- data.frame(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M);
 
 # Assign the row number from start to end
 gene.number.start.end.matrix <- data.frame(
-	start = numeric(),
-	end = numeric()
-	stringsAsFactors = FALSE
-	)
-number.of.rows <- ceiling(nrow(fpkm.tumor.symbol.filter) / row.num.args)
+    start = numeric(),
+    end = numeric(),
+    stringsAsFactors = FALSE
+    )
+
+number.of.rows <- ceiling(nrow(fpkm.tumor.symbol.filter) / row.chunks)
 for (i in 1:number.of.rows) {
     gene.number.start.end.matrix[i,'start'] <- (i - 1) * 1000 + 1
     if (i == number.of.rows) {
@@ -116,48 +123,44 @@ cl <- makeCluster(spec = detectCores() - 2);
 # register the cluster with the parallel package
 registerDoParallel(cl = cl);
 clusterExport(
-	cl = cl,
-	varlist = c('outlier.rank', 'outlier.rank.product')
-	)
-
-#what is the point of this ?
-#gene.zrange.fraction.fpkm.bic.5method.1M.data <- get(paste('gene.zrange.fraction.cosine.last.point.bic', sep = ''));
+    cl = cl,
+    varlist = c('outlier.rank', 'outlier.rank.product')
+    )
 
 gene.rank.p.value.one.gene <- NULL;
-gene.rank.p.value.one.gene <- foreach (i = gene.number.start.end.matrix[i,'start']):gene.number.start.end.matrix[i,'end']), .combine=rbind) %dopar% {
-  methods <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine')
-  observed.gene <- gene.zrange.fraction.fpkm.bic.5method.1M.data[i,methods];
-  combine.matrix <- rbind(
-	observed.gene,
-	gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel[,methods]
-	)
-  # get ranks
-  data.rank.bic <- outlier.rank(outlier.matrix = combine.matrix);
-  rank.product.bic <- apply(data.rank.bic, 1, outlier.rank.product, NA.number = 3);
-  gene.rank.poduct.bic <- data.frame(
-	data.rank.bic,
-	rank.product.bic
-	)
-  obs <- rank.product.bic[1]
-  null <- rank.product.bic[2:(nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel) + 1)]
-  length.null <- nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel);
-  obs.p.value <- (sum(obs >= null) + 1) / (length.null + 1)
-
-  obs.p.value.rank <- cbind(gene.rank.poduct.bic[1,], obs.p.value);
-  p.value.one.gene <- data.frame(x = obs.p.value.rank, i = i);
-  p.value.one.gene;
-}
-
-
-p.value.one <- paste0('gene.rank.p.value.one.gene.', data.args);
-assign(p.value.one, gene.rank.p.value.one.gene);
-
+gene.rank.p.value.one.gene <- foreach (i = gene.number.start.end.matrix[matrix.chunk,'start']:gene.number.start.end.matrix[matrix.chunk,'end'], .combine = rbind) %dopar% {
+    methods <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine')
+    observed.gene <- gene.zrange.fraction.cosine.last.point.bic[i, methods];
+    combine.matrix <- rbind(
+        observed.gene,
+        gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel[, methods]
+        )
+    # get ranks
+    data.rank.bic <- outlier.rank(outlier.matrix = combine.matrix);
+    rank.product.bic <- apply(data.rank.bic, 1, outlier.rank.product, NA.number = 3);
+    gene.rank.poduct.bic <- data.frame(
+        data.rank.bic,
+        rank.product.bic
+        )
+    # gene we are testing is always first
+    obs <- rank.product.bic[1]
+    # all the rest is simulated data
+    null <- rank.product.bic[2:nrow(gene.rank.poduct.bic)]
+    length.null <- nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel);
+    # permutation test
+    obs.p.value <- (sum(obs >= null) + 1) / (length.null + 1)
+
+    p.value.one.gene <- data.frame(gene.rank.poduct.bic[1,], obs.p.value, i = i, gene = rownames(observed.gene));
+    p.value.one.gene;
+    }
 
 stopCluster(cl = cl);
 
+# to update the object name with the iteration number
+p.value.one <- paste0('gene.rank.p.value.one.gene.', method.iteration);
+assign(p.value.one, gene.rank.p.value.one.gene);
+
 save(
-    list = paste0('gene.rank.p.value.one.gene.', data.args),
-    file = generate.filename('Significant_Outlier_Detection.', paste(dataset.name, row.num.args, data.args, sep = '.'), 'rda')
+    list = paste0('gene.rank.p.value.one.gene.', method.iteration),
+    file = generate.filename('Significant_Outlier_Detection', paste(dataset.name, matrix.chunk, method.iteration, sep = '.'), 'rda')
     );
-
-
diff --git a/OutlierDetectionAlgorithm/8.Significant_Outlier_Pvalue_Calculation.R b/OutlierDetectionAlgorithm/8.Significant_Outlier_Pvalue_Calculation.R
@@ -1,6 +1,9 @@
 #!/usr/bin/env Rscript
 
-# Rscript 8.Significant_Outlier_Pvalue_Calculation.R --dataset.name BRCA_EU --working.directory /hot/user/jlivingstone/outlier/run_method --row.chunk 18
+# Rscript 8.Significant_Outlier_Pvalue_Calculation.R --dataset.name BRCA_EU \
+# --working.directory /hot/user/jlivingstone/outlier/run_method \
+# --method.iteration 0
+
 ### 8.Significant_Outlier_Pvalue_Calculation.R ####################################################
 # Compute p-values 
 library(BoutrosLab.utilities)
@@ -10,7 +13,7 @@ params <- matrix(
     data = c(
 	'dataset.name', 'd', '0', 'character',
         'working.directory', 'w', '0', 'character',
-	'row.chunk', 'r', '0', 'character'
+	'method.iteration', 'i', '0', 'character'
         ),
     ncol = 4,
     byrow = TRUE
@@ -19,33 +22,39 @@ params <- matrix(
 opt <- getopt(params);
 dataset.name <- opt$dataset.name
 working.directory <- opt$working.directory
-row.chunk.num <- opt$row.chunk
+method.iteration <- opt$method.iteration
 
 # Set the working directory
 setwd(working.directory)
 
+files <- list.files(
+	pattern = 'Significant_Outlier_Detection'
+	)
 
-for (i in 1:row.chunk.num) {
+p.value.all <- NULL
+for (i in 1:length(files)) {
     load(
-	file = paste('Significant_Outlier_Detection', dataset.name, i, '0', 'rda', sep = '.')
+	file = files[i]
+	)
+    assign(
+	x = 'variable.name',
+	value = paste('gene.rank.p.value.one.gene', method.iteration, sep = '.')
+	)
+    p.value.all <- rbind(
+	p.value.all,
+	get(x = variable.name)
 	)
-    p.value.set <- paste('gene.rank.p.value', i, sep = '.');
-    assign(p.value.set, get(paste('gene.rank.p.value.one.gene.', '0', sep = '')));    
-    }
-
-#1. residue.negative.random.number.bic
-gene.p.value.each.null <- NULL;
-for (i in 1:row.chunk.num) {
-    p.value <- get(paste('gene.rank.p.value.', i, sep = ''));
-    gene.p.value.each.null <- rbind(gene.p.value.each.null, p.value);
     }
+p.value.all <- p.value.all[order(p.value.all$i),]
+p.value.all$q.value <- p.adjust(
+	p = p.value.all$obs.p.value,
+	method = 'fdr'
+	)
 
-p.value.all <- paste('gene.rank.p.value.one.gene.p', '0', sep = '');
-assign(p.value.all, gene.p.value.each.null);
+# assign back to original variable name
+assign(x = variable.name, value = p.value.all)
 
 save(
-  list = paste0('gene.rank.p.value.one.gene.p', '0', sep = ''),
-  file = generate.filename('Significant_Outlier_Pvalue_Calculation', paste(dataset.name, '0', sep = '.'), 'rda')
-  )
-
-
+    list = paste('gene.rank.p.value.one.gene', method.iteration, sep = '.'),
+    file = generate.filename('Significant_Outlier_Pvalue_Calculation', paste(dataset.name, method.iteration, sep = '.'), 'rda')
+    )