From 1c976f2689724114f42c2e0fef668d958b2fc1d1 Mon Sep 17 00:00:00 2001 From: Julie Livingstone Date: Wed, 6 Dec 2023 10:47:22 -0800 Subject: [PATCH] update scripts 7 and 8 --- .../7.Significant_Outlier_Detection.R | 151 +++++++++--------- ...8.Significant_Outlier_Pvalue_Calculation.R | 51 +++--- 2 files changed, 107 insertions(+), 95 deletions(-) diff --git a/OutlierDetectionAlgorithm/7.Significant_Outlier_Detection.R b/OutlierDetectionAlgorithm/7.Significant_Outlier_Detection.R index fb058b1..d2183ee 100644 --- a/OutlierDetectionAlgorithm/7.Significant_Outlier_Detection.R +++ b/OutlierDetectionAlgorithm/7.Significant_Outlier_Detection.R @@ -1,20 +1,29 @@ #!/usr/bin/env Rscript +# Rscript 7.Significant_Outlier_Detection.R --dataset.name BRCA_EU --working.directory /hot/user/jlivingstone/outlier/run_method \ +# --rank.file /hot/users/jlivingstone/outlier/run_method/2023-11-20_BRCA-EU_final_outlier_rank_bic.short.rda \ +# --combined.file /hot/users/jlivingstone/outlier/run_method/2023-11-30_Simulated_Data_5method_combine_BRCA_EU.rda \ +# --row.chunks 1000 --matrix.chunk 1 --method.iteration 0 + ### 7.Significant_Outlier_Detection.R #################################################### # Compute p-values # Required R package -library(parallel); -library(foreach); +library(BoutrosLab.utilities) library(doParallel); +library(foreach); library(getopt) +library(parallel); params <- matrix( data = c( - 'dataset.name', 'd', '0', 'character', + 'dataset.name', 'd', '0', 'character', 'working.directory', 'w', '0', 'character', - 'rank.file.', 'r', '0', 'character', - 'combined.file', 'c', '0', 'character' + 'rank.file.', 'f', '0', 'character', + 'combined.file', 'c', '0', 'character', + 'row.chunks', 'r', '0', 'character', + 'matrix.chunk', 'm', '0', 'character', + 'method.iteration', 'i', '0', 'character' ), ncol = 4, byrow = TRUE @@ -25,6 +34,14 @@ dataset.name <- opt$dataset.name working.directory <- opt$working.directory rank.file <- opt$rank.file combined.file <- 
opt$combined.file +row.chunks <- as.numeric(opt$row.chunks) +matrix.chunk <- as.numeric(opt$matrix.chunk) + +# This will be used to identify the number of outlier patients per gene +# - if '0', use whole patients (first step) +# - if '1', use n-1 patients (exclude the patient having the largest value) +# - repeat this '2', '3', '4'... until there are no outlier genes +method.iteration <- opt$method.iteration # Set the working directory setwd(working.directory); @@ -32,25 +49,13 @@ setwd(working.directory); # Load the R environment # - 1. File from script 1: short version load( - file = rank.file - ) + file = rank.file + ) + # - 2. File from script 6 load( - file = combined.file - ) - -#why? -#gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- data.frame(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M); - -# Run 1000 simulated genes at once -# - array number should be ceiling(nrow(fpkm.tumor.symbol.filter)) -row.num.args <- as.numeric(args); - -# This will be used identify the number of outlier patients per gene -# - if '0', use whole patients (first step) -# - if '1', use n-1 patients (exclude the patient having the largest value) -# - repeat this '2', '3', '4'... 
until there is no outlier genes -data.args <- 0; + file = combined.file + ) ### Rank each methods ##### # Function @@ -61,10 +66,10 @@ outlier.rank <- function(outlier.matrix) { # Give rank for each methods based on z-score range/fraction of kmean for (i in 1:length(methods)) { rank.methods <- rank( - x = outlier.matrix[,i] * direction[i], - ties.method = 'max', - na.last = 'keep' - ) + x = outlier.matrix[,i] * direction[i], + ties.method = 'max', + na.last = 'keep' + ) rank.matrix <- cbind(rank.matrix, rank.methods); } rownames(rank.matrix) <- rownames(outlier.matrix); @@ -86,22 +91,24 @@ outlier.rank.product <- function(data.rank, NA.number = 0) { } } - ### Combine matrix # - relabel the null data gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel <- data.frame( - gene.zrange.fraction.negative.simulated.sum.bic.5method.1M, - gene = rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M) - ) -rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel) <- paste0('ND', 1:nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M) + gene.zrange.fraction.negative.simulated.sum.bic.5method.1M, + gene = rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M) + ) +rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel) <- paste0('ND', 1:nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M)) + +# gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- data.frame(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M); # Assign the row number from start to end gene.number.start.end.matrix <- data.frame( - start = numeric(), - end = numeric() - stringsAsFactors = FALSE - ) -number.of.rows <- ceiling(nrow(fpkm.tumor.symbol.filter) / row.num.args) + start = numeric(), + end = numeric(), + stringsAsFactors = FALSE + ) + +number.of.rows <- ceiling(nrow(fpkm.tumor.symbol.filter) / row.chunks) for (i in 1:number.of.rows) { gene.number.start.end.matrix[i,'start'] <- (i - 1) * 
1000 + 1 if (i == number.of.rows) { @@ -116,48 +123,44 @@ cl <- makeCluster(spec = detectCores() - 2); # register the cluster with the parallel package registerDoParallel(cl = cl); clusterExport( - cl = cl, - varlist = c('outlier.rank', 'outlier.rank.product') - ) - -#what is the point of this ? -#gene.zrange.fraction.fpkm.bic.5method.1M.data <- get(paste('gene.zrange.fraction.cosine.last.point.bic', sep = '')); + cl = cl, + varlist = c('outlier.rank', 'outlier.rank.product') + ) gene.rank.p.value.one.gene <- NULL; -gene.rank.p.value.one.gene <- foreach (i = gene.number.start.end.matrix[i,'start']):gene.number.start.end.matrix[i,'end']), .combine=rbind) %dopar% { - methods <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine') - observed.gene <- gene.zrange.fraction.fpkm.bic.5method.1M.data[i,methods]; - combine.matrix <- rbind( - observed.gene, - gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel[,methods] - ) - # get ranks - data.rank.bic <- outlier.rank(outlier.matrix = combine.matrix); - rank.product.bic <- apply(data.rank.bic, 1, outlier.rank.product, NA.number = 3); - gene.rank.poduct.bic <- data.frame( - data.rank.bic, - rank.product.bic - ) - obs <- rank.product.bic[1] - null <- rank.product.bic[2:(nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel) + 1)] - length.null <- nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel); - obs.p.value <- (sum(obs >= null) + 1) / (length.null + 1) - - obs.p.value.rank <- cbind(gene.rank.poduct.bic[1,], obs.p.value); - p.value.one.gene <- data.frame(x = obs.p.value.rank, i = i); - p.value.one.gene; -} - - -p.value.one <- paste0('gene.rank.p.value.one.gene.', data.args); -assign(p.value.one, gene.rank.p.value.one.gene); - +gene.rank.p.value.one.gene <- foreach (i = gene.number.start.end.matrix[matrix.chunk,'start']:gene.number.start.end.matrix[matrix.chunk,'end'], .combine = rbind) %dopar% { + methods <- c('zrange.mean', 'zrange.median', 
'zrange.trimmean', 'fraction.kmean', 'cosine') + observed.gene <- gene.zrange.fraction.cosine.last.point.bic[i, methods]; + combine.matrix <- rbind( + observed.gene, + gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel[, methods] + ) + # get ranks + data.rank.bic <- outlier.rank(outlier.matrix = combine.matrix); + rank.product.bic <- apply(data.rank.bic, 1, outlier.rank.product, NA.number = 3); + gene.rank.poduct.bic <- data.frame( + data.rank.bic, + rank.product.bic + ) + # gene we are testing is always first + obs <- rank.product.bic[1] + # all the rest is simulated data + null <- rank.product.bic[2:nrow(gene.rank.poduct.bic)] + length.null <- nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel); + # permutation test + obs.p.value <- (sum(obs >= null) + 1) / (length.null + 1) + + p.value.one.gene <- data.frame(gene.rank.poduct.bic[1,], obs.p.value, i = i, gene = rownames(observed.gene)); + p.value.one.gene; + } stopCluster(cl = cl); +# to update the object name with the iteration number +p.value.one <- paste0('gene.rank.p.value.one.gene.', method.iteration); +assign(p.value.one, gene.rank.p.value.one.gene); + save( - list = paste0('gene.rank.p.value.one.gene.', data.args), - file = generate.filename('Significant_Outlier_Detection.', paste(dataset.name, row.num.args, data.args, sep = '.'), 'rda') + list = paste0('gene.rank.p.value.one.gene.', method.iteration), + file = generate.filename('Significant_Outlier_Detection', paste(dataset.name, matrix.chunk, method.iteration, sep = '.'), 'rda') ); - - diff --git a/OutlierDetectionAlgorithm/8.Significant_Outlier_Pvalue_Calculation.R b/OutlierDetectionAlgorithm/8.Significant_Outlier_Pvalue_Calculation.R index 1469a35..448911b 100644 --- a/OutlierDetectionAlgorithm/8.Significant_Outlier_Pvalue_Calculation.R +++ b/OutlierDetectionAlgorithm/8.Significant_Outlier_Pvalue_Calculation.R @@ -1,6 +1,9 @@ #!/usr/bin/env Rscript -# Rscript 8.Significant_Outlier_Pvalue_Calculation.R 
--dataset.name BRCA_EU --working.directory /hot/user/jlivingstone/outlier/run_method --row.chunk 18 +# Rscript 8.Significant_Outlier_Pvalue_Calculation.R --dataset.name BRCA_EU \ +# --working.directory /hot/user/jlivingstone/outlier/run_method \ +# --method.iteration 0 + ### 8.Significant_Outlier_Pvalue_Calculation.R #################################################### # Compute p-values library(BoutrosLab.utilities) @@ -10,7 +13,7 @@ params <- matrix( data = c( 'dataset.name', 'd', '0', 'character', 'working.directory', 'w', '0', 'character', - 'row.chunk', 'r', '0', 'character' + 'method.iteration', 'i', '0', 'character' ), ncol = 4, byrow = TRUE @@ -19,33 +22,39 @@ params <- matrix( opt <- getopt(params); dataset.name <- opt$dataset.name working.directory <- opt$working.directory -row.chunk.num <- opt$row.chunk +method.iteration <- opt$method.iteration # Set the working directory setwd(working.directory) +files <- list.files( + pattern = 'Significant_Outlier_Detection' + ) -for (i in 1:row.chunk.num) { +p.value.all <- NULL +for (i in 1:length(files)) { load( - file = paste('Significant_Outlier_Detection', dataset.name, i, '0', 'rda', sep = '.') + file = files[i] + ) + assign( + x = 'variable.name', + value = paste('gene.rank.p.value.one.gene', method.iteration, sep = '.') + ) + p.value.all <- rbind( + p.value.all, + get(x = variable.name) ) - p.value.set <- paste('gene.rank.p.value', i, sep = '.'); - assign(p.value.set, get(paste('gene.rank.p.value.one.gene.', '0', sep = ''))); - } - -#1. 
residue.negative.random.number.bic -gene.p.value.each.null <- NULL; -for (i in 1:row.chunk.num) { - p.value <- get(paste('gene.rank.p.value.', i, sep = '')); - gene.p.value.each.null <- rbind(gene.p.value.each.null, p.value); } +p.value.all <- p.value.all[order(p.value.all$i),] +p.value.all$q.value <- p.adjust( + p = p.value.all$obs.p.value, + method = 'fdr' + ) -p.value.all <- paste('gene.rank.p.value.one.gene.p', '0', sep = ''); -assign(p.value.all, gene.p.value.each.null); +# assign back to original variable name +assign(x = variable.name, value = p.value.all) save( - list = paste0('gene.rank.p.value.one.gene.p', '0', sep = ''), - file = generate.filename('Significant_Outlier_Pvalue_Calculation', paste(dataset.name, '0', sep = '.'), 'rda') - ) - - + list = paste('gene.rank.p.value.one.gene', method.iteration, sep = '.'), + file = generate.filename('Significant_Outlier_Pvalue_Calculation', paste(dataset.name, method.iteration, sep = '.'), 'rda') + )