Skip to content

Commit

Permalink
update scripts 7 and 8
Browse files Browse the repository at this point in the history
  • Loading branch information
jmlivingstone committed Dec 6, 2023
1 parent 6de0f30 commit 1c976f2
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 95 deletions.
151 changes: 77 additions & 74 deletions OutlierDetectionAlgorithm/7.Significant_Outlier_Detection.R
Original file line number Diff line number Diff line change
@@ -1,20 +1,29 @@
#!/usr/bin/env Rscript

# Rscript 7.Significant_Outlier_Detection.R --dataset.name BRCA_EU --working.directory /hot/user/jlivingstone/outlier/run_method \
# --rank.file /hot/users/jlivingstone/outlier/run_method/2023-11-20_BRCA-EU_final_outlier_rank_bic.short.rda \
# --combined.file /hot/users/jlivingstone/outlier/run_method/2023-11-30_Simulated_Data_5method_combine_BRCA_EU.rda \
# --row.chunks 1000 --matrix.chunk 1 --method.iteration 0

### 7.Significant_Outlier_Detection.R ####################################################
# Compute p-values

# Required R packages
library(parallel);
library(foreach);
library(BoutrosLab.utilities)
library(doParallel);
library(foreach);
library(getopt)
library(parallel);

params <- matrix(
data = c(
'dataset.name', 'd', '0', 'character',
'dataset.name', 'd', '0', 'character',
'working.directory', 'w', '0', 'character',
'rank.file.', 'r', '0', 'character',
'combined.file', 'c', '0', 'character'
'rank.file.', 'f', '0', 'character',
'combined.file', 'c', '0', 'character',
'row.chunks', 'r', '0', 'character',
'matrix.chunk', 'm', '0', 'character',
'method.iteration', 'i', '0', 'character'
),
ncol = 4,
byrow = TRUE
Expand All @@ -25,32 +34,28 @@ dataset.name <- opt$dataset.name
working.directory <- opt$working.directory
rank.file <- opt$rank.file
combined.file <- opt$combined.file
row.chunks <- as.numeric(opt$row.chunks)
matrix.chunk <- as.numeric(opt$matrix.chunk)

# This will be used to identify the number of outlier patients per gene
# - if '0', use all patients (first step)
# - if '1', use n-1 patients (exclude the patient having the largest value)
# - repeat this with '2', '3', '4'... until there are no outlier genes
method.iteration <- opt$method.iteration

# Set the working directory
setwd(working.directory);

# Load the R environment
# - 1. File from script 1: short version
load(
file = rank.file
)
file = rank.file
)

# - 2. File from script 6
load(
file = combined.file
)

#why?
#gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- data.frame(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M);

# Run 1000 simulated genes at once
# - array number should be ceiling(nrow(fpkm.tumor.symbol.filter))
row.num.args <- as.numeric(args);

# This will be used to identify the number of outlier patients per gene
# - if '0', use all patients (first step)
# - if '1', use n-1 patients (exclude the patient having the largest value)
# - repeat this with '2', '3', '4'... until there are no outlier genes
data.args <- 0;
file = combined.file
)

### Rank each method #####
# Function
Expand All @@ -61,10 +66,10 @@ outlier.rank <- function(outlier.matrix) {
# Give rank for each methods based on z-score range/fraction of kmean
for (i in 1:length(methods)) {
rank.methods <- rank(
x = outlier.matrix[,i] * direction[i],
ties.method = 'max',
na.last = 'keep'
)
x = outlier.matrix[,i] * direction[i],
ties.method = 'max',
na.last = 'keep'
)
rank.matrix <- cbind(rank.matrix, rank.methods);
}
rownames(rank.matrix) <- rownames(outlier.matrix);
Expand All @@ -86,22 +91,24 @@ outlier.rank.product <- function(data.rank, NA.number = 0) {
}
}


### Combine matrix
# - relabel the null data
gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel <- data.frame(
gene.zrange.fraction.negative.simulated.sum.bic.5method.1M,
gene = rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M)
)
rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel) <- paste0('ND', 1:nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M)
gene.zrange.fraction.negative.simulated.sum.bic.5method.1M,
gene = rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M)
)
rownames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel) <- paste0('ND', 1:nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M))

# gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- data.frame(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M);

# Assign the row number from start to end
gene.number.start.end.matrix <- data.frame(
start = numeric(),
end = numeric()
stringsAsFactors = FALSE
)
number.of.rows <- ceiling(nrow(fpkm.tumor.symbol.filter) / row.num.args)
start = numeric(),
end = numeric(),
stringsAsFactors = FALSE
)

number.of.rows <- ceiling(nrow(fpkm.tumor.symbol.filter) / row.chunks)
for (i in 1:number.of.rows) {
gene.number.start.end.matrix[i,'start'] <- (i - 1) * 1000 + 1
if (i == number.of.rows) {
Expand All @@ -116,48 +123,44 @@ cl <- makeCluster(spec = detectCores() - 2);
# register the cluster with the parallel package
registerDoParallel(cl = cl);
clusterExport(
cl = cl,
varlist = c('outlier.rank', 'outlier.rank.product')
)

#what is the point of this ?
#gene.zrange.fraction.fpkm.bic.5method.1M.data <- get(paste('gene.zrange.fraction.cosine.last.point.bic', sep = ''));
cl = cl,
varlist = c('outlier.rank', 'outlier.rank.product')
)

gene.rank.p.value.one.gene <- NULL;
gene.rank.p.value.one.gene <- foreach (i = gene.number.start.end.matrix[i,'start']):gene.number.start.end.matrix[i,'end']), .combine=rbind) %dopar% {
methods <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine')
observed.gene <- gene.zrange.fraction.fpkm.bic.5method.1M.data[i,methods];
combine.matrix <- rbind(
observed.gene,
gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel[,methods]
)
# get ranks
data.rank.bic <- outlier.rank(outlier.matrix = combine.matrix);
rank.product.bic <- apply(data.rank.bic, 1, outlier.rank.product, NA.number = 3);
gene.rank.poduct.bic <- data.frame(
data.rank.bic,
rank.product.bic
)
obs <- rank.product.bic[1]
null <- rank.product.bic[2:(nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel) + 1)]
length.null <- nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel);
obs.p.value <- (sum(obs >= null) + 1) / (length.null + 1)

obs.p.value.rank <- cbind(gene.rank.poduct.bic[1,], obs.p.value);
p.value.one.gene <- data.frame(x = obs.p.value.rank, i = i);
p.value.one.gene;
}


p.value.one <- paste0('gene.rank.p.value.one.gene.', data.args);
assign(p.value.one, gene.rank.p.value.one.gene);

# For each row index i in the chunk selected by `matrix.chunk` (the 'start' to
# 'end' range stored in gene.number.start.end.matrix), rank the observed gene
# against the relabelled simulated rows and compute an empirical p-value.
# Each iteration returns a one-row data.frame; rows are stacked with rbind.
# NOTE(review): gene.zrange.fraction.cosine.last.point.bic and the simulated
# matrix are presumably provided by the load() calls on rank.file and
# combined.file - confirm against scripts 1 and 6.
gene.rank.p.value.one.gene <- foreach (i = gene.number.start.end.matrix[matrix.chunk,'start']:gene.number.start.end.matrix[matrix.chunk,'end'], .combine = rbind) %dopar% {
# columns (one per method) that are ranked and combined
methods <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine')
# single observed row for gene i, restricted to the method columns
observed.gene <- gene.zrange.fraction.cosine.last.point.bic[i, methods];
# observed gene goes in row 1, followed by every simulated (null) row
combine.matrix <- rbind(
observed.gene,
gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel[, methods]
)
# get ranks
data.rank.bic <- outlier.rank(outlier.matrix = combine.matrix);
# row-wise rank product; NA.number is passed through to outlier.rank.product
rank.product.bic <- apply(data.rank.bic, 1, outlier.rank.product, NA.number = 3);
# per-method ranks alongside the rank product for every row
gene.rank.poduct.bic <- data.frame(
data.rank.bic,
rank.product.bic
)
# gene we are testing is always first
obs <- rank.product.bic[1]
# all the rest is simulated data
null <- rank.product.bic[2:nrow(gene.rank.poduct.bic)]
# number of simulated rows, used as the denominator below
length.null <- nrow(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.relabel);
# permutation test
# count of null rank products <= observed, with 1 added to both the
# numerator and the denominator
obs.p.value <- (sum(obs >= null) + 1) / (length.null + 1)

# one output row: ranks and rank product of the observed gene, its p-value,
# the row index i, and the gene name taken from the row name of observed.gene
p.value.one.gene <- data.frame(gene.rank.poduct.bic[1,], obs.p.value, i = i, gene = rownames(observed.gene));
p.value.one.gene;
}

stopCluster(cl = cl);

# update the object name with the iteration number
p.value.one <- paste0('gene.rank.p.value.one.gene.', method.iteration);
assign(p.value.one, gene.rank.p.value.one.gene);

save(
list = paste0('gene.rank.p.value.one.gene.', data.args),
file = generate.filename('Significant_Outlier_Detection.', paste(dataset.name, row.num.args, data.args, sep = '.'), 'rda')
list = paste0('gene.rank.p.value.one.gene.', method.iteration),
file = generate.filename('Significant_Outlier_Detection', paste(dataset.name, matrix.chunk, method.iteration, sep = '.'), 'rda')
);


Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#!/usr/bin/env Rscript

# Rscript 8.Significant_Outlier_Pvalue_Calculation.R --dataset.name BRCA_EU --working.directory /hot/user/jlivingstone/outlier/run_method --row.chunk 18
# Rscript 8.Significant_Outlier_Pvalue_Calculation.R --dataset.name BRCA_EU \
# --working.directory /hot/user/jlivingstone/outlier/run_method \
# --method.iteration 0

### 8.Significant_Outlier_Pvalue_Calculation.R ####################################################
# Compute p-values
library(BoutrosLab.utilities)
Expand All @@ -10,7 +13,7 @@ params <- matrix(
data = c(
'dataset.name', 'd', '0', 'character',
'working.directory', 'w', '0', 'character',
'row.chunk', 'r', '0', 'character'
'method.iteration', 'i', '0', 'character'
),
ncol = 4,
byrow = TRUE
Expand All @@ -19,33 +22,39 @@ params <- matrix(
opt <- getopt(params);
dataset.name <- opt$dataset.name
working.directory <- opt$working.directory
row.chunk.num <- opt$row.chunk
method.iteration <- opt$method.iteration

# Set the working directory
setwd(working.directory)

files <- list.files(
pattern = 'Significant_Outlier_Detection'
)

for (i in 1:row.chunk.num) {
p.value.all <- NULL
for (i in 1:length(files)) {
load(
file = paste('Significant_Outlier_Detection', dataset.name, i, '0', 'rda', sep = '.')
file = files[i]
)
assign(
x = 'variable.name',
value = paste('gene.rank.p.value.one.gene', method.iteration, sep = '.')
)
p.value.all <- rbind(
p.value.all,
get(x = variable.name)
)
p.value.set <- paste('gene.rank.p.value', i, sep = '.');
assign(p.value.set, get(paste('gene.rank.p.value.one.gene.', '0', sep = '')));
}

#1. residue.negative.random.number.bic
gene.p.value.each.null <- NULL;
for (i in 1:row.chunk.num) {
p.value <- get(paste('gene.rank.p.value.', i, sep = ''));
gene.p.value.each.null <- rbind(gene.p.value.each.null, p.value);
}
p.value.all <- p.value.all[order(p.value.all$i),]
p.value.all$q.value <- p.adjust(
p = p.value.all$obs.p.value,
method = 'fdr'
)

p.value.all <- paste('gene.rank.p.value.one.gene.p', '0', sep = '');
assign(p.value.all, gene.p.value.each.null);
# assign back to original variable name
assign(x = variable.name, value = p.value.all)

save(
list = paste0('gene.rank.p.value.one.gene.p', '0', sep = ''),
file = generate.filename('Significant_Outlier_Pvalue_Calculation', paste(dataset.name, '0', sep = '.'), 'rda')
)


list = paste('gene.rank.p.value.one.gene', method.iteration, sep = '.'),
file = generate.filename('Significant_Outlier_Pvalue_Calculation', paste(dataset.name, method.iteration, sep = '.'), 'rda')
)

0 comments on commit 1c976f2

Please sign in to comment.