diff --git a/.github/workflows/cicd-base.yaml b/.github/workflows/cicd-base.yaml new file mode 100644 index 0000000..a7d5af7 --- /dev/null +++ b/.github/workflows/cicd-base.yaml @@ -0,0 +1,33 @@ +--- +name: CICD-base + +on: + push: + branches: + - master + - jlivingstone-run-method + pull_request: + branches: + - master + - jlivingstone-run-method + +jobs: + CICD-base: + runs-on: ubuntu-latest + + timeout-minutes: 15 + + steps: + # Checkout codebase + - name: Checkout + uses: actions/checkout@v2 + + # Run cicd-base + - name: CICD-base + uses: docker://ghcr.io/uclahs-cds/cicd-base:latest + env: + VALIDATE_PYTHON: false + VALIDATE_YAML: false + VALIDATE_SHELL: false + VALIDATE_PERL: false + VALIDATE_DOCKERFILE: false diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e0a1def --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +# System files +.DS_Store + +# R +.Rhistory +.Rapp.history +.Rproj.user +.RData diff --git a/OutlierDetectionAlgorithm/1.Outlier_Detection.R b/OutlierDetectionAlgorithm/1.Outlier_Detection.R index da9e06e..0b4cf4b 100644 --- a/OutlierDetectionAlgorithm/1.Outlier_Detection.R +++ b/OutlierDetectionAlgorithm/1.Outlier_Detection.R @@ -2,7 +2,7 @@ # Rscript 1.Outlier_Detection.R --dataset.name BRCA_EU --working.directory /hot/users/jlivingstone/outlier/run_method --data.matrix.file /hot/users/jlivingstone/outlier/NikZainal_2016/original/SupplementaryTable7Transcriptomic342.txt ### 1.Outlier_Detection.R #################################################### -# this should utlimately be handled in the R package setup but for now +# this should utlimately be handled in the R package setup but for now needed.packages <- c('gamlss', 'foreach', 'extraDistr', 'truncnorm', 'lsa', 'SnowballC', 'getopt') packages <- rownames(installed.packages()) @@ -13,8 +13,8 @@ install.packages(to.install, repo = 'http://cran.us.r-project.org') # Required R packages # Install and load the 'gamlss' package #if (!require('gamlss')) { -# 
install.packages('gamlss', repo = 'http://cran.us.r-project.org') -# } +# install.packages('gamlss', repo = 'http://cran.us.r-project.org') +# } library(BoutrosLab.utilities) library(doParallel) library(extraDistr) @@ -57,37 +57,37 @@ setwd(working.directory) # fpkm.tumor.symbol: gene x sample matrix # example: fpkm.tumor.log <- read.csv( - file = data.matrix.file, - check.names = FALSE, - stringsAsFactors = FALSE, - sep = '\t', - row.names = 1 - ) + file = data.matrix.file, + check.names = FALSE, + stringsAsFactors = FALSE, + sep = '\t', + row.names = 1 + ) cols.to.remove <- c('Ensembl', 'Source', 'Name', 'loc') annot <- fpkm.tumor.log[, match(cols.to.remove, colnames(fpkm.tumor.log))] fpkm.tumor.symbol.log <- fpkm.tumor.log[, -match(cols.to.remove, colnames(fpkm.tumor.log))] -# - make it non log format +# - make it non log format fpkm.tumor.symbol <- 2 ^ fpkm.tumor.symbol.log; # change NAs to 0 fpkm.tumor.symbol[is.na(fpkm.tumor.symbol)] <- 0 # Number of samples -# - if the last column has symbol, it should be 1:(ncol(fpkm.tumor.symbol))-1) +# - if the last column has symbol, it should be 1:(ncol(fpkm.tumor.symbol))-1) patient.part <- 1:ncol(fpkm.tumor.symbol); sample.number <- 1:ncol(fpkm.tumor.symbol); # Get the genes with less than 1% of zero values # - excludes genes which have zero values more than 99% zero.portion <- apply( - X = fpkm.tumor.symbol[, patient.part], - MARGIN = 1, - FUN = function(x) { - length(x[0 == x]) / length(patient.part) - } - ); + X = fpkm.tumor.symbol[, patient.part], + MARGIN = 1, + FUN = function(x) { + length(x[0 == x]) / length(patient.part) + } + ); fpkm.tumor.symbol.filter <- fpkm.tumor.symbol[which(0.01 > zero.portion), ]; annot.filter <- annot[which(0.01 > zero.portion), ] @@ -124,7 +124,7 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS if (length(unique(as.numeric(x.na))) == 1) { kmean.matrix <- rep(NA, length(x.na)); names(kmean.matrix) <- names(x.na); - } + } else { data.order <- 
sort(x.na, decreasing = TRUE); non.zero <- data.order[data.order > 0]; @@ -132,27 +132,27 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS na.matrix <- rep(NA, length(non.zero)); cluster.zero <- c(na.matrix, rep(0, length(x.na[x.na == 0]))); kmean.matrix <- cluster.zero[match(x.na, data.order)]; - names(kmean.matrix) <- names(x.na); - } + names(kmean.matrix) <- names(x.na); + } else { kmean <- kmeans(non.zero, 2, nstart = 100); cluster <- kmean$cluster; cluster.zero <- c(cluster, rep(0, length(x[x == 0]))); kmean.matrix <- cluster.zero[match(x.na, data.order)]; - names(kmean.matrix) <- names(x.na); + names(kmean.matrix) <- names(x.na); } } - } + } else { if (length(unique(as.numeric(x.na))) == 1) { kmean.matrix <- rep(NA, length(x.na)); - names(kmean.matrix) <- names(x.na); - } + names(kmean.matrix) <- names(x.na); + } else { kmean <- kmeans(x.na, 2, nstart = 100); cluster <- kmean$cluster; kmean.matrix <- cluster; - names(kmean.matrix) <- names(x.na); + names(kmean.matrix) <- names(x.na); } } result.na <- kmean.matrix; @@ -161,18 +161,18 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS } else { gene.order <- x.na[order(x.na, decreasing = TRUE)]; - if (exclude.zero) { - gene.order.nonzero <- gene.order[0 != gene.order]; + if (exclude.zero) { + gene.order.nonzero <- gene.order[0 != gene.order]; top.patient <- round(length(gene.order.nonzero) * (trim / 100), digit = 0); low.patient <- round(length(gene.order.nonzero) * (1 - (trim / 100)), digit = 0); data.mean <- mean(gene.order.nonzero, trim = (trim / 100)); - data.sd <- sd(gene.order.nonzero[(top.patient+1):(low.patient)]); - } + data.sd <- sd(gene.order.nonzero[(top.patient + 1):(low.patient)]); + } else { top.patient <- round(length(x.na) * (trim / 100), digit = 0); low.patient <- round(length(x.na) * (1 - (trim / 100)), digit = 0); data.mean <- mean(gene.order, trim = (trim / 100)); - data.sd <- sd(gene.order[(top.patient+1):(low.patient)]); + 
data.sd <- sd(gene.order[(top.patient + 1):(low.patient)]); } result.na <- (x.na - data.mean) / data.sd; x[which(!is.na(x))] <- result.na; @@ -188,25 +188,25 @@ registerDoParallel(cl); # 1. MEAN and SD : method = 'mean', trim = 0 print('Calculating using MEAN and SD') print(Sys.time()) -data.mean <- foreach (i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i, ]); +data.mean <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i, ]); data.mean <- data.frame(data.mean); # 2. TRIMMED MEAN and TRIMMED SD : method = 'mean', trim = 5 print('Calculating using TRIMMED MEAN and TRIMMED SD') print(Sys.time()) -data.trimmean <- foreach (i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], trim = 5); +data.trimmean <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], trim = 5); data.trimmean <- data.frame(data.trimmean); # 3. MEDIAN and MAD : method = 'median' print('Calculating using MEDIAN and MAD') print(Sys.time()) -data.median <- foreach (i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], methods = 'median'); +data.median <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], methods = 'median'); data.median <- data.frame(data.median); # 4. 
KMEAN : method = 'kmean' print('Calculating using KMEANS') print(Sys.time()) -data.kmean <- foreach (i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], methods = 'kmean') +data.kmean <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], methods = 'kmean') data.kmean <- data.frame(data.kmean); stopCluster(cl = cl) @@ -239,7 +239,7 @@ data.zrange.median.t <- data.frame(t(data.zrange.median)); ### Calculate the kmean fraction ##### # Function outlier.detection.kmean <- function(x) { - if (1== length(unique(as.numeric(x)))) { + if (1 == length(unique(as.numeric(x)))) { fraction <- NA; } else { @@ -247,7 +247,7 @@ outlier.detection.kmean <- function(x) { cluster.two <- length(x[x == 2]); cluster.sum <- cluster.one + cluster.two; smaller.value <- min(cluster.one, cluster.two); - fraction <- round(smaller.value/cluster.sum, digit = 4); + fraction <- round(smaller.value / cluster.sum, digit = 4); } fraction.matrix <- c(x, fraction); names(fraction.matrix) <- c(names(x), 'fraction'); @@ -262,7 +262,7 @@ data.fraction.kmean.t <- data.frame(t(data.fraction.kmean)); # 5. 
Cosine similarity # function: Compute the cosine similarity of the largest data point -outlier.detection.cosine <- function (x, value.portion = 1) { +outlier.detection.cosine <- function(x, value.portion = 1) { # Define a minimum value decimal.number.max <- lapply(na.omit(x), function(x) { @@ -270,14 +270,14 @@ outlier.detection.cosine <- function (x, value.portion = 1) { nchar(as.character(y)) - nchar(as.integer(y)) - 1 }) return(decimal.numbers) - }) + }) add.minimum.value <- 1 / 10^as.numeric(max(unlist(decimal.number.max))); trim.sample <- function(x, trim.portion = 5) { if (length(x) <= 10) { - patient.trim.value <- 2:(length(x)-1); + patient.trim.value <- 2:(length(x) - 1); } else { - trim.sample.number <- length(x) * (trim.portion/100); + trim.sample.number <- length(x) * (trim.portion / 100); trim.sample.number.integer <- round(trim.sample.number, digits = 0); patient.trim.value <- (trim.sample.number.integer + 1):(length(x) - trim.sample.number.integer); } @@ -295,10 +295,10 @@ outlier.detection.cosine <- function (x, value.portion = 1) { large.value.number.integer <- 1; } else { - large.value.number <- length(x) * (large.value.percent/100); + large.value.number <- length(x) * (large.value.percent / 100); large.value.number.integer <- roundToInteger(large.value.number); } - + # subset the largest values patient.larger.value <- (length(x) - large.value.number.integer + 1):length(x); observed.value <- sort(y); @@ -316,7 +316,7 @@ outlier.detection.cosine <- function (x, value.portion = 1) { sample.fpkm.qq <- na.omit(as.numeric(x[sample.number])) sample.fpkm.qq.nozero <- sample.fpkm.qq + add.minimum.value; - + # Trimmed samples -Trim 5% of each side sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 5); sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number]; @@ -324,16 +324,16 @@ outlier.detection.cosine <- function (x, value.portion = 1) { # Quantile p <- ppoints(sample.fpkm.qq.nozero); - + # Distribution distribution.fit <- 
as.numeric(x[length(x)]); - - if (1 == distribution.fit){ + + if (1 == distribution.fit) { # 1. Normal distribution norm.mean <- mean(sample.fpkm.qq.nozero.trim); norm.sd <- sd(sample.fpkm.qq.nozero.trim); # Use truncated norm - norm.quantiles <- qtruncnorm(p, a=0, b=Inf, mean = norm.mean, sd = norm.sd); + norm.quantiles <- qtruncnorm(p, a = 0, b = Inf, mean = norm.mean, sd = norm.sd); obs.quantile.norm <- quantile(sample.fpkm.qq.nozero, prob = p); last.cos <- cosine.similarity.large.value.percent(norm.quantiles, obs.quantile.norm, large.value.percent = value.portion); } @@ -341,8 +341,8 @@ outlier.detection.cosine <- function (x, value.portion = 1) { # 2. Log-normal distribution mean.log <- mean(sample.fpkm.qq.nozero.trim); sd.log <- sd(sample.fpkm.qq.nozero.trim); - m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); - sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); + m2 <- log(mean.log ^ 2 / sqrt(sd.log ^ 2 + mean.log^2)); + sd2 <- sqrt(log(1 + (sd.log ^ 2 / mean.log ^ 2))); lnorm.quantile <- qlnorm(p, meanlog = m2, sdlog = sd2); obs.quantile.lnorm <- quantile(sample.fpkm.qq.nozero, prob = p); last.cos <- cosine.similarity.large.value.percent(lnorm.quantile, obs.quantile.lnorm, large.value.percent = value.portion); @@ -358,8 +358,8 @@ outlier.detection.cosine <- function (x, value.portion = 1) { ### 4 gamma distribution mean.gamma <- mean(sample.fpkm.qq.nozero.trim); sd.gamma <- sd(sample.fpkm.qq.nozero.trim); - gamma.shape <- (mean.gamma/sd.gamma)^2; - gamma.rate <- mean.gamma/(sd.gamma^2); + gamma.shape <- (mean.gamma / sd.gamma) ^ 2; + gamma.rate <- mean.gamma / (sd.gamma ^ 2); gamma.quantile <- qgamma(p, shape = gamma.shape, rate = gamma.rate); obs.quantile.gamma <- quantile(sample.fpkm.qq.nozero, prob = p); last.cos <- cosine.similarity.large.value.percent(gamma.quantile, obs.quantile.gamma, large.value.percent = value.portion); @@ -369,7 +369,6 @@ outlier.detection.cosine <- function (x, value.portion = 1) { cosine.sum.distribution.fit; } - # Trimming 
function trim.sample <- function(x, trim.portion = 5) { if (length(x) <= 10) { @@ -389,13 +388,13 @@ cl <- makeCluster(spec = detectCores() - 2); # register the cluster with the parallel package registerDoParallel(cl); clusterExport( - cl = cl, - varlist = 'trim.sample' - ) + cl = cl, + varlist = 'trim.sample' + ) clusterEvalQ( - cl = cl, - expr = library(gamlss) - ) + cl = cl, + expr = library(gamlss) + ) # Define a minimum value random.col <- sample(patient.part, 1) @@ -430,7 +429,6 @@ bic.trim.distribution <- foreach(j = 1:nrow(fpkm.tumor.symbol.filter), .combine stopCluster(cl = cl); - # Find the best fitted distribution # - BIC rownames(bic.trim.distribution) <- rownames(fpkm.tumor.symbol.filter); @@ -444,21 +442,21 @@ cl <- makeCluster(spec = detectCores() - 2); # register the cluster with the parallel package registerDoParallel(cl = cl); clusterExport( - cl = cl, - varlist = 'outlier.detection.cosine' - ); + cl = cl, + varlist = 'outlier.detection.cosine' + ); clusterEvalQ( - cl = cl, - expr = c(library(lsa), library(SnowballC)) - ); + cl = cl, + expr = c(library(lsa), library(SnowballC)) + ); print('Calculate Cosine') data.cosine.bic <- apply( - X = fpkm.tumor.symbol.filter.bic.fit, - MARGIN = 1, - FUN = outlier.detection.cosine, - value.portion = 0 - ); + X = fpkm.tumor.symbol.filter.bic.fit, + MARGIN = 1, + FUN = outlier.detection.cosine, + value.portion = 0 + ); stopCluster(cl = cl); @@ -498,7 +496,7 @@ outlier.rank <- function(x) { rank.methods <- rank(x[,methods.column], ties.method = 'max', na.last = 'keep'); rank.matrix <- cbind(rank.matrix, rank.methods); } - rownames(rank.matrix) <-rownames(x); + rownames(rank.matrix) <- rownames(x); colnames(rank.matrix) <- methods; rank.matrix <- data.frame(rank.matrix); } @@ -525,11 +523,11 @@ data.rank.bic <- outlier.rank(gene.zrange.fraction.cosine.last.point.bic); rank.product.bic <- apply(data.rank.bic, 1, outlier.rank.product, NA.number = 3); gene.rank.poduct.bic <- cbind( - annot.filter, - data.rank.bic, - 
rank.product.bic, - distribution = gene.zrange.fraction.cosine.last.point.bic$distribution - ); + annot.filter, + data.rank.bic, + rank.product.bic, + distribution = gene.zrange.fraction.cosine.last.point.bic$distribution + ); gene.rank.order.5method.cosine.last.point.bic <- gene.rank.poduct.bic[order(gene.rank.poduct.bic$rank.product, decreasing = FALSE),]; print('Saving results') diff --git a/OutlierDetectionAlgorithm/2.Distribution_Identification.R b/OutlierDetectionAlgorithm/2.Distribution_Identification.R index c8e6b9b..6d7d493 100644 --- a/OutlierDetectionAlgorithm/2.Distribution_Identification.R +++ b/OutlierDetectionAlgorithm/2.Distribution_Identification.R @@ -35,8 +35,8 @@ setwd(working.directory); # load the R environment file saved from 1.Outlier_detection_5method.R load( - file = outlier.rank.file - ) + file = outlier.rank.file + ) ### Function ### # Define a minimum value @@ -47,14 +47,14 @@ decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), fun nchar(as.character(y)) - nchar(as.integer(y)) - 1 }) return(decimal.numbers) - }) + }) add.minimum.value <- 1 / 10 ^ as.numeric(max(unlist(decimal.number.max))); # function: Trim 5% of samples from each side trim.sample <- function(x, trim.portion = 5) { - trim.sample.number <- length(x) * (trim.portion/100); + trim.sample.number <- length(x) * (trim.portion / 100); trim.sample.number.integer <- round(trim.sample.number, digits = 0); - patient.trimr.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer); + patient.trimr.value <- (trim.sample.number.integer + 1):(length(x) - trim.sample.number.integer); patient.trimr.value; } @@ -67,65 +67,65 @@ cl <- makeCluster(spec = detectCores() - 2); # register the cluster with the parallel package registerDoParallel(cl = cl); clusterExport( - cl = cl, - varlist = 'trim.sample' - ) + cl = cl, + varlist = 'trim.sample' + ) clusterEvalQ( - cl = cl, - expr = library(gamlss) - ) + cl = cl, + expr = library(gamlss) + ) 
-obs.residue.quantile <- foreach (i = 1:nrow(fpkm.tumor.symbol.filter), .combine = rbind) %dopar% { +obs.residue.quantile <- foreach(i = 1:nrow(fpkm.tumor.symbol.filter), .combine = rbind) %dopar% { sample.fpkm.qq <- round(as.numeric(fpkm.tumor.symbol.filter[i,patient.part]), digits = 6); sample.fpkm.qq.sort <- sort(sample.fpkm.qq); sample.fpkm.qq.nozero <- sample.fpkm.qq.sort + add.minimum.value; - + sample.trim.number <- trim.sample(seq(sample.fpkm.qq.sort), 5); sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number]; sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value; - + # Quantile # p <- seq(0.001, 0.patient.number, 0.001); p <- ppoints(length(patient.part)); - + if (1 == bic.trim.distribution.fit[i]) { - # 1. Normal distribution - norm.mean <- mean(sample.fpkm.qq.nozero.trim); - norm.sd <- sd(sample.fpkm.qq.nozero.trim); - norm.quantiles <- qnorm(p, mean = norm.mean, sd = norm.sd); - obs.quantile.norm <- quantile(sample.fpkm.qq.nozero, prob = p); - obs.residue.non.trim <- obs.quantile.norm - norm.quantiles; + # 1. Normal distribution + norm.mean <- mean(sample.fpkm.qq.nozero.trim); + norm.sd <- sd(sample.fpkm.qq.nozero.trim); + norm.quantiles <- qnorm(p, mean = norm.mean, sd = norm.sd); + obs.quantile.norm <- quantile(sample.fpkm.qq.nozero, prob = p); + obs.residue.non.trim <- obs.quantile.norm - norm.quantiles; } else if (2 == bic.trim.distribution.fit[i]) { - # 2. Log-normal distribution - mean.log <- mean(sample.fpkm.qq.nozero.trim); - sd.log <- sd(sample.fpkm.qq.nozero.trim); - m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); - sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); - lnorm.quantile <- qlnorm(p, meanlog = m2, sdlog = sd2); - obs.quantile.lnorm <- quantile(sample.fpkm.qq.nozero, prob = p); - obs.residue.non.trim <- obs.quantile.lnorm - lnorm.quantile; + # 2. 
Log-normal distribution + mean.log <- mean(sample.fpkm.qq.nozero.trim); + sd.log <- sd(sample.fpkm.qq.nozero.trim); + m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); + sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); + lnorm.quantile <- qlnorm(p, meanlog = m2, sdlog = sd2); + obs.quantile.lnorm <- quantile(sample.fpkm.qq.nozero, prob = p); + obs.residue.non.trim <- obs.quantile.lnorm - lnorm.quantile; } - + else if (3 == bic.trim.distribution.fit[i]) { - # 3. Exponential distribution - exp.rate <- 1 / mean(sample.fpkm.qq.nozero.trim); - exp.quantile <- qexp(p, rate = exp.rate); - obs.quantile.exp <- quantile(sample.fpkm.qq.nozero, prob = p); - obs.residue.non.trim <- obs.quantile.exp - exp.quantile; + # 3. Exponential distribution + exp.rate <- 1 / mean(sample.fpkm.qq.nozero.trim); + exp.quantile <- qexp(p, rate = exp.rate); + obs.quantile.exp <- quantile(sample.fpkm.qq.nozero, prob = p); + obs.residue.non.trim <- obs.quantile.exp - exp.quantile; } - + else if (4 == bic.trim.distribution.fit[i]) { - ### 4 gamma distribution - mean.gamma <- mean(sample.fpkm.qq.nozero.trim); - sd.gamma <- sd(sample.fpkm.qq.nozero.trim); - gamma.shape <- (mean.gamma/sd.gamma)^2; - gamma.rate <- mean.gamma/(sd.gamma^2); - gamma.quantile <- qgamma(p, shape = gamma.shape, rate = gamma.rate); - obs.quantile.gamma <- quantile(sample.fpkm.qq.nozero, prob = p); - obs.residue.non.trim <- obs.quantile.gamma - gamma.quantile; + ### 4 gamma distribution + mean.gamma <- mean(sample.fpkm.qq.nozero.trim); + sd.gamma <- sd(sample.fpkm.qq.nozero.trim); + gamma.shape <- (mean.gamma / sd.gamma) ^ 2; + gamma.rate <- mean.gamma / (sd.gamma ^ 2); + gamma.quantile <- qgamma(p, shape = gamma.shape, rate = gamma.rate); + obs.quantile.gamma <- quantile(sample.fpkm.qq.nozero, prob = p); + obs.residue.non.trim <- obs.quantile.gamma - gamma.quantile; } obs.residue.non.trim @@ -136,7 +136,13 @@ stopCluster(cl = cl) obs.residue.quantile <- data.frame(obs.residue.quantile); rownames(obs.residue.quantile) <- 
rownames(fpkm.tumor.symbol.filter); # Trim each 5% -obs.residue.quantile.trim <- apply(obs.residue.quantile, 1, function(x) {sort(as.numeric(x))}); +obs.residue.quantile.trim <- apply( + X = obs.residue.quantile, + MARGIN = 1, + FUN = function(x) { + sort(as.numeric(x)) + } + ); obs.residue.quantile.trim <- data.frame(t(obs.residue.quantile.trim)); sample.trim.number <- trim.sample(patient.part, 5); @@ -151,30 +157,30 @@ cl <- makeCluster(spec = detectCores() - 2) registerDoParallel(cl = cl) clusterExport( - cl = cl, - varlist = 'trim.sample' - ) + cl = cl, + varlist = 'trim.sample' + ) clusterEvalQ( - cl = cl, - expr = library(gamlss) - ) + cl = cl, + expr = library(gamlss) + ) noise.min.off.bic.distribution <- NULL -noise.min.off.bic.distribution <- foreach (j = 1:nrow(obs.residue.quantile.trim), .combine = rbind) %dopar% { +noise.min.off.bic.distribution <- foreach(j = 1:nrow(obs.residue.quantile.trim), .combine = rbind) %dopar% { sample.fpkm.qq <- round(as.numeric(obs.residue.quantile.trim[j,]), digits = 6); sample.fpkm.qq.sort <- sort(sample.fpkm.qq); if (min(sample.fpkm.qq.sort) < 0) { sample.fpkm.qq.nozero <- sample.fpkm.qq.sort - min(sample.fpkm.qq.sort) + add.minimum.value; } else { sample.fpkm.qq.nozero <- sample.fpkm.qq.sort + add.minimum.value ; - } + } glm.norm <- gamlss(sample.fpkm.qq.nozero ~ 1, family = NO); glm.lnorm <- gamlss(sample.fpkm.qq.nozero ~ 1, family = LNO); glm.gamma <- gamlss(sample.fpkm.qq.nozero ~ 1, family = GA); glm.exp <- gamlss(sample.fpkm.qq.nozero ~ 1, family = EXP); - + glm.bic <- c(glm.norm$sbc, glm.lnorm$sbc, glm.exp$sbc, diff --git a/OutlierDetectionAlgorithm/3.Simulated_Data_generation_1.R b/OutlierDetectionAlgorithm/3.Simulated_Data_generation_1.R index c6c7765..a2a282d 100644 --- a/OutlierDetectionAlgorithm/3.Simulated_Data_generation_1.R +++ b/OutlierDetectionAlgorithm/3.Simulated_Data_generation_1.R @@ -21,7 +21,7 @@ params <- matrix( 'dataset.name', 'd', '0', 'character', 'working.directory', 'w', '0', 'character', 
'outlier.rank.file', 'f', '0', 'character', - 'ntimes', 'n', '0', 'character' + 'ntimes', 'n', '0', 'character' ), ncol = 4, byrow = TRUE @@ -36,20 +36,20 @@ ntimes <- opt$ntimes working.directory <- '/hot/users/jlivingstone/outlier/run_method' dataset.name <- 'BRCA_EU' outlier.rank.file <- '/hot/users/jlivingstone/outlier/run_method/2023-11-20_BRCA-EU_final_outlier_rank_bic.long.rda' -ntimes = 10 +ntimes <- 10 # Set the working directory setwd(working.directory); # load the R environment file saved from 2.Distribution_Identification.R load( - file = outlier.rank.file - ) + file = outlier.rank.file + ) # Run parallel: 10 chucnks #args <- commandArgs( -# trailingOnly = TRUE -# ) +# trailingOnly = TRUE +# ) # sample size patient.part <- 1:ncol(fpkm.tumor.symbol.filter); @@ -69,9 +69,9 @@ cl <- makeCluster(spec = detectCores() - 2) registerDoParallel(cl = cl) clusterEvalQ( - cl = cl, - expr = library(extraDistr) - ) + cl = cl, + expr = library(extraDistr) + ) simulated.generation.negative <- function(x = fpkm.tumor.symbol.filter.distribution.fit, distribution = bic.trim.distribution.fit, num.negative = 10000, sample.size = sample.number) { # Define a minimum value @@ -81,9 +81,9 @@ simulated.generation.negative <- function(x = fpkm.tumor.symbol.filter.distribut nchar(as.character(y)) - nchar(as.integer(y)) - 1 }) return(decimal.numbers) - }) + }) add.minimum.value <- 1 / 10 ^ as.numeric(max(unlist(decimal.number.max))); - + # function: Trim 5% of samples from each side trim.sample <- function(x, trim.portion = 5) { trim.sample.number <- length(x) * (trim.portion / 100); @@ -92,47 +92,46 @@ simulated.generation.negative <- function(x = fpkm.tumor.symbol.filter.distribut patient.trimr.value; } - # Validate input parameters - if (!is.data.frame(x)) stop("x should be a data frame.") - if (!is.numeric(distribution)) stop("distribution should be numeric.") - if (!is.numeric(num.negative)) stop("num.negative should be numeric.") - if (!is.numeric(sample.size)) 
stop("sample.size should be numeric.") - + if (!is.data.frame(x)) stop('x should be a data frame.') + if (!is.numeric(distribution)) stop('distribution should be numeric.') + if (!is.numeric(num.negative)) stop('num.negative should be numeric.') + if (!is.numeric(sample.size)) stop('sample.size should be numeric.') + random.number.negative <- sample(length(distribution), num.negative, replace = TRUE); names(random.number.negative) <- names(distribution)[random.number.negative] - + # use the foreach function to parallelize the sapply loop - simulated.negative <- foreach (i = random.number.negative, .combine = rbind) %dopar% { - + simulated.negative <- foreach(i = random.number.negative, .combine = rbind) %dopar% { + sample.fpkm <- x[names(distribution)[i], sample.size]; sample.fpkm.qq <- na.omit(as.numeric(sample.fpkm)); sample.fpkm.qq.nozero <- sample.fpkm.qq + add.minimum.value; - + # Trimmed samples -Trim 5% of each side sample.trim.number <- trim.sample(sample.fpkm.qq.nozero, 5); sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number]; sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value; - - if (1 == distribution[i]){ + + if (1 == distribution[i]) { ### 1) Normal distribution norm.mean <- mean(sample.fpkm.qq.nozero.trim); norm.sd <- sd(sample.fpkm.qq.nozero.trim); rtnorm(length(sample.size), mean = norm.mean, sd = norm.sd, a = 0); } - else if (2 == distribution[i]){ + else if (2 == distribution[i]) { mean.log <- mean(sample.fpkm.qq.nozero.trim); sd.log <- sd(sample.fpkm.qq.nozero.trim); m2 <- log(mean.log ^ 2 / sqrt(sd.log ^ 2 + mean.log ^ 2)); sd2 <- sqrt(log(1 + (sd.log ^ 2 / mean.log ^ 2))); rlnorm(n = length(sample.size), m2, sd2); } - else if (3 == distribution[i]){ + else if (3 == distribution[i]) { ### 4) exponential distribution exp.rate <- 1 / mean(sample.fpkm.qq.nozero.trim); rexp(n = length(sample.size), rate = exp.rate); } - else if (4 == distribution[i]){ + else if (4 == distribution[i]) { ### 5) gamma distribution 
mean.gamma <- mean(sample.fpkm.qq.nozero.trim); sd.gamma <- sd(sample.fpkm.qq.nozero.trim); @@ -149,26 +148,26 @@ simulated.generation.negative <- function(x = fpkm.tumor.symbol.filter.distribut seeds <- round(runif(n = ntimes, min = 1, max = 10000)) for (i in 1:ntimes) { - seed <- seeds[i] - set.seed(seed) - print(paste0('run negative random number:', i)) - negative.random.number.bic <- simulated.generation.negative( - x = fpkm.tumor.symbol.filter.bic, - distribution = bic.trim.distribution.fit, - num.negative = 100000, - sample.size = sample.number - ) - - # save the R environment - save( - seed, - fpkm.tumor.symbol.filter, - patient.part, - sample.number, - bic.trim.distribution.fit, - negative.random.number.bic, - file = generate.filename('Simulated_data_generation_1', paste(dataset.name, i, sep = '.'), 'rda') - ) - } + seed <- seeds[i] + set.seed(seed) + print(paste0('run negative random number:', i)) + negative.random.number.bic <- simulated.generation.negative( + x = fpkm.tumor.symbol.filter.bic, + distribution = bic.trim.distribution.fit, + num.negative = 100000, + sample.size = sample.number + ) + + # save the R environment + save( + seed, + fpkm.tumor.symbol.filter, + patient.part, + sample.number, + bic.trim.distribution.fit, + negative.random.number.bic, + file = generate.filename('Simulated_data_generation_1', paste(dataset.name, i, sep = '.'), 'rda') + ) + } stopCluster(cl = cl) diff --git a/OutlierDetectionAlgorithm/4.Simulated_Data_generation_2.R b/OutlierDetectionAlgorithm/4.Simulated_Data_generation_2.R index 7af613c..8aa1be0 100644 --- a/OutlierDetectionAlgorithm/4.Simulated_Data_generation_2.R +++ b/OutlierDetectionAlgorithm/4.Simulated_Data_generation_2.R @@ -19,7 +19,7 @@ params <- matrix( 'dataset.name', 'd', '0', 'character', 'working.directory', 'w', '0', 'character', 'distribution.identification.file', 'o', '0', 'character', - 'simulated.data.file', 's', '0', 'character' + 'simulated.data.file', 's', '0', 'character' ), ncol = 4, byrow = 
TRUE @@ -37,7 +37,7 @@ simulated.data.file <- opt$simulated.data.file #simulated.data.file <- '/hot/users/jlivingstone/outlier/run_method/2023-11-21_Simulated_data_generation_1_BRCA_EU.1.rda' # replicate number is parsed from the input file -pattern <- "\\d+" +pattern <- '\\d+' parsed.file <- substr(simulated.data.file, nchar(simulated.data.file) - 5, nchar(simulated.data.file)) index <- gregexpr(pattern = pattern, text = parsed.file) replicate <- regmatches(parsed.file, index)[[1]] @@ -47,12 +47,12 @@ setwd(working.directory); # load the R environment file saved from 2.Distribution_Identification.R and 3.Simulated_Data_generation_1.R load( - file = distribution.identification.file - ) + file = distribution.identification.file + ) load( - file = simulated.data.file - ) + file = simulated.data.file + ) # sample size patient.part <- 1:ncol(fpkm.tumor.symbol.filter); @@ -67,7 +67,7 @@ decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), fun nchar(as.character(y)) - nchar(as.integer(y)) - 1 }) return(decimal.numbers) - }) + }) add.minimum.value <- 1 / 10 ^ as.numeric(max(unlist(decimal.number.max))); # 2. 
residual @@ -80,13 +80,13 @@ cl <- makeCluster(spec = detectCores() - 2) # register the cluster with the parallel package registerDoParallel(cl = cl); clusterExport( - cl = cl, - varlist = 'add.minimum.value' - ) + cl = cl, + varlist = 'add.minimum.value' + ) clusterEvalQ( - cl = cl, - expr = library(extraDistr) - ) + cl = cl, + expr = library(extraDistr) + ) random.col <- sample(patient.part, 1) decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), function(x) { @@ -94,10 +94,10 @@ decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), fun nchar(as.character(y)) - nchar(as.integer(y)) - 1 }) return(decimal.numbers) - }) + }) add.minimum.value <- 1 / 10 ^ as.numeric(max(unlist(decimal.number.max))); -negative.random.number.noise.bic <- foreach (i = 1:nrow(residue.negative.random.number.bic), .combine = 'rbind') %dopar% { +negative.random.number.noise.bic <- foreach(i = 1:nrow(residue.negative.random.number.bic), .combine = 'rbind') %dopar% { sample.fpkm.qq <- as.numeric(residue.negative.random.number.bic[i,]); sample.fpkm.qq.sort <- sort(sample.fpkm.qq); @@ -106,45 +106,45 @@ negative.random.number.noise.bic <- foreach (i = 1:nrow(residue.negative.random. 
} else { sample.fpkm.qq.nozero <- sample.fpkm.qq.sort + add.minimum.value; - } + } - if (1 == noise.min.off.bic.distribution.residue[i]){ + if (1 == noise.min.off.bic.distribution.residue[i]) { ### 1) Normal distribution norm.mean <- mean(sample.fpkm.qq.nozero); norm.sd <- sd(sample.fpkm.qq.nozero); simulated.sample <- rtnorm(length(sample.number), mean = norm.mean, sd = norm.sd, a = 0); } - else if (2 == noise.min.off.bic.distribution.residue[i]){ + else if (2 == noise.min.off.bic.distribution.residue[i]) { mean.log <- mean(sample.fpkm.qq.nozero); sd.log <- sd(sample.fpkm.qq.nozero); m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); - simulated.sample <- rlnorm(n=length(sample.number), m2, sd2); + simulated.sample <- rlnorm(n = length(sample.number), m2, sd2); } - else if (3 == noise.min.off.bic.distribution.residue[i]){ + else if (3 == noise.min.off.bic.distribution.residue[i]) { ### 4) exponential distribution exp.rate <- 1 / mean(sample.fpkm.qq.nozero); - simulated.sample <- rexp(n=length(sample.number), rate = exp.rate); + simulated.sample <- rexp(n = length(sample.number), rate = exp.rate); } - else if (4 == noise.min.off.bic.distribution.residue[i]){ + else if (4 == noise.min.off.bic.distribution.residue[i]) { ### 5) gamma distribution mean.gamma <- mean(sample.fpkm.qq.nozero); sd.gamma <- sd(sample.fpkm.qq.nozero); - gamma.shape <- (mean.gamma/sd.gamma)^2; - gamma.rate <- mean.gamma/(sd.gamma^2); - simulated.sample <- rgamma(n=length(sample.number), gamma.shape, gamma.rate); + gamma.shape <- (mean.gamma / sd.gamma) ^ 2; + gamma.rate <- mean.gamma / (sd.gamma ^ 2) + simulated.sample <- rgamma(n = length(sample.number), gamma.shape, gamma.rate); } - + if (min(sample.fpkm.qq.sort) < 0) { simulated.sample.min <- simulated.sample + min(sample.fpkm.qq.sort) - add.minimum.value; } else { - simulated.sample.min <- simulated.sample - add.minimum.value; + simulated.sample.min <- simulated.sample - add.minimum.value; } 
simulated.sample.min; } -rownames(negative.random.number.noise.bic) <- rownames(residue.negative.random.number.bic); +rownames(negative.random.number.noise.bic) <- rownames(residue.negative.random.number.bic); stopCluster(cl = cl) diff --git a/OutlierDetectionAlgorithm/5.Simulated_Data_5method.R b/OutlierDetectionAlgorithm/5.Simulated_Data_5method.R index 24a94e4..4b54af3 100644 --- a/OutlierDetectionAlgorithm/5.Simulated_Data_5method.R +++ b/OutlierDetectionAlgorithm/5.Simulated_Data_5method.R @@ -41,19 +41,19 @@ simulated.data.file <- opt$simulated.data.file # Set the working directory setwd(working.directory); -pattern <- "\\d+" +pattern <- '\\d+' parsed.file <- substr(simulated.data.file, nchar(simulated.data.file) - 5, nchar(simulated.data.file)) index <- gregexpr(pattern = pattern, text = parsed.file) replicate <- regmatches(parsed.file, index)[[1]] # load the R environment file saved from 4.Simulated_Data_generation_2.R and 2.Distribution_Identfication.R load( - file = simulated.data.file - ) + file = simulated.data.file + ) load( - file = distribution.identification.file - ) + file = distribution.identification.file + ) # sample size patient.part <- 1:ncol(fpkm.tumor.symbol.filter); @@ -68,7 +68,7 @@ decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), fun nchar(as.character(y)) - nchar(as.integer(y)) - 1 }) return(decimal.numbers) - }) + }) add.minimum.value <- 1 / 10 ^ as.numeric(max(unlist(decimal.number.max))); @@ -83,10 +83,10 @@ cosine.similarity.large.value.percent <- function(x, y, large.value.percent) { large.value.number.integer <- 1; } else { - large.value.number <- length(x) * (large.value.percent/100); + large.value.number <- length(x) * (large.value.percent / 100); large.value.number.integer <- roundToInteger(large.value.number); } - + # subset the largest values patient.larger.value <- (length(x) - large.value.number.integer + 1):length(x); observed.value <- sort(y); @@ -110,12 +110,12 @@ trim.sample <- function(x, 
trim.portion = 5) { } else { trim.sample.number <- length(x) * (trim.portion / 100); trim.sample.number.integer <- round(trim.sample.number, digits = 0); - patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer); + patient.trim.value <- (trim.sample.number.integer + 1):(length(x) - trim.sample.number.integer); } patient.trim.value; } -outlier.detection.cosine <- function (x, value.portion = 1) { +outlier.detection.cosine <- function(x, value.portion = 1) { # Define a minimum value decimal.number.max <- lapply(na.omit(x), function(x) { @@ -123,7 +123,7 @@ outlier.detection.cosine <- function (x, value.portion = 1) { nchar(as.character(y)) - nchar(as.integer(y)) - 1 }) return(decimal.numbers) - }) + }) add.minimum.value <- 1 / 10 ^ as.numeric(max(unlist(decimal.number.max))); sample.fpkm.qq <- na.omit(as.numeric(x[sample.number])) @@ -133,14 +133,14 @@ outlier.detection.cosine <- function (x, value.portion = 1) { sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 5); sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number]; sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value; - + # Quantile p <- ppoints(sample.fpkm.qq.nozero); - + # Distribution distribution.fit <- as.numeric(x[length(x)]); - - if (1 == distribution.fit){ + + if (1 == distribution.fit) { # 1. 
Normal distribution norm.mean <- mean(sample.fpkm.qq.nozero.trim); norm.sd <- sd(sample.fpkm.qq.nozero.trim); @@ -189,13 +189,13 @@ cl <- makeCluster(spec = detectCores() - 2); # register the cluster with the parallel package registerDoParallel(cl = cl); clusterExport( - cl = cl, - varlist = 'trim.sample' - ) + cl = cl, + varlist = 'trim.sample' + ) clusterEvalQ( - cl = cl, - expr = library(gamlss) - ) + cl = cl, + expr = library(gamlss) + ) # Define a minimum value random.col <- sample(patient.part, 1) @@ -204,18 +204,17 @@ decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), fun nchar(as.character(y)) - nchar(as.integer(y)) - 1 }) return(decimal.numbers) - }) + }) bic.trim.distribution <- NULL; # Use foreach to iterate over the rows of fpkm.tumor.symbol.filter in parallel -bic.trim.distribution <- foreach (j = 1:nrow(negative.simulated.sum), .combine = rbind) %dopar% { - sample.fpkm.qq <- round(as.numeric(negative.simulated.sum[j,sample.number]), digits = 6); +bic.trim.distribution <- foreach(j = 1:nrow(negative.simulated.sum), .combine = rbind) %dopar% { + sample.fpkm.qq <- round(as.numeric(negative.simulated.sum[j, sample.number]), digits = 6); sample.trim.number <- trim.sample(sample.number, 5); sample.fpkm.qq.sort <- sort(sample.fpkm.qq)[sample.trim.number]; sample.fpkm.qq.nozero <- sample.fpkm.qq.sort + add.minimum.value; - - + glm.norm <- gamlss(sample.fpkm.qq.nozero ~ 1, family = NO); glm.lnorm <- gamlss(sample.fpkm.qq.nozero ~ 1, family = LNO); glm.gamma <- gamlss(sample.fpkm.qq.nozero ~ 1, family = GA); @@ -242,20 +241,20 @@ cl <- makeCluster(spec = detectCores() - 2); # register the cluster with the parallel package registerDoParallel(cl = cl); clusterExport( - cl = cl, - varlist = 'outlier.detection.cosine' - ); + cl = cl, + varlist = 'outlier.detection.cosine' + ); clusterEvalQ( - cl = cl, - expr = c(library(lsa), library(SnowballC)) - ) + cl = cl, + expr = c(library(lsa), library(SnowballC)) + ) data.cosine.negative <- apply( 
- X = negative.simulated.sum.fit, + X = negative.simulated.sum.fit, MARGIN = 1, FUN = outlier.detection.cosine, value.portion = 0 - ) + ) stopCluster(cl = cl); @@ -265,15 +264,15 @@ data.cosine.negative.t <- data.frame(data.cosine.negative.t); colnames(data.cosine.negative.t) <- c('cosine', 'distribution'); -# 1,2,3,4 +# 1,2,3,4 quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALSE) { x.na <- na.omit(as.numeric(x)); if (methods == 'median') { - if (exclude.zero) { - x.nonzero <- x.na[0 != x.na]; + if (exclude.zero) { + x.nonzero <- x.na[0 != x.na]; data.median <- median(x.nonzero); data.mad <- mad(x.nonzero); - } + } else { data.median <- median(x.na); data.mad <- mad(x.na); @@ -287,7 +286,7 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS if (length(unique(as.numeric(x.na))) == 1) { kmean.matrix <- rep(NA, length(x.na)); names(kmean.matrix) <- names(x.na); - } + } else { data.order <- sort(x.na, decreasing = TRUE); non.zero <- data.order[data.order > 0]; @@ -295,28 +294,28 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS na.matrix <- rep(NA, length(non.zero)); cluster.zero <- c(na.matrix, rep(0, length(x.na[x.na == 0]))); kmean.matrix <- cluster.zero[match(x.na, data.order)]; - names(kmean.matrix) <- names(x.na); - } + names(kmean.matrix) <- names(x.na); + } else { kmean <- kmeans(non.zero, 2, nstart = 1000); cluster <- kmean$cluster; cluster.zero <- c(cluster, rep(0, length(x[x == 0]))); kmean.matrix <- cluster.zero[match(x.na, data.order)]; - names(kmean.matrix) <- names(x.na); + names(kmean.matrix) <- names(x.na); } } - } - + } + else { if (length(unique(as.numeric(x.na))) == 1) { kmean.matrix <- rep(NA, length(x.na)); - names(kmean.matrix) <- names(x.na); - } + names(kmean.matrix) <- names(x.na); + } else { kmean <- kmeans(x.na, 2, nstart = 1000); cluster <- kmean$cluster; kmean.matrix <- cluster; - names(kmean.matrix) <- names(x.na); + names(kmean.matrix) <- 
names(x.na); } } result.na <- kmean.matrix; @@ -325,13 +324,13 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS } else { gene.order <- x.na[order(x.na, decreasing = TRUE)]; - if (exclude.zero) { - gene.order.nonzero <- gene.order[0 != gene.order]; + if (exclude.zero) { + gene.order.nonzero <- gene.order[0 != gene.order]; top.patient <- round(length(gene.order.nonzero) * (trim / 100), digit = 0); low.patient <- round(length(gene.order.nonzero) * (1 - (trim / 100)), digit = 0); data.mean <- mean(gene.order.nonzero, trim = (trim / 100)); - data.sd <- sd(gene.order.nonzero[(top.patient+1):(low.patient)]); - } + data.sd <- sd(gene.order.nonzero[(top.patient + 1):(low.patient)]); + } else { top.patient <- round(length(x.na) * (trim / 100), digit = 0); low.patient <- round(length(x.na) * (1 - (trim / 100)), digit = 0); @@ -350,19 +349,19 @@ cl <- makeCluster(detectCores() - 2); registerDoParallel(cl = cl); # 1. MEAN and SD : method = 'mean', trim = 0 -data.mean <- foreach (i = 1:nrow(negative.simulated.sum), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum[i,]); +data.mean <- foreach(i = 1:nrow(negative.simulated.sum), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum[i,]); data.mean <- data.frame(data.mean); # 2. TRIMMED MEAN and TRIMMED SD : method = 'mean', trim = 5 -data.trimmean <- foreach (i = 1:nrow(negative.simulated.sum), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum[i,], trim = 5); +data.trimmean <- foreach(i = 1:nrow(negative.simulated.sum), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum[i,], trim = 5); data.trimmean <- data.frame(data.trimmean); # 3. 
MEDIAN and MAD : method = 'median' -data.median <- foreach (i = 1:nrow(negative.simulated.sum), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum[i,], methods = 'median'); +data.median <- foreach(i = 1:nrow(negative.simulated.sum), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum[i,], methods = 'median'); data.median <- data.frame(data.median); # 4. KMEAN : method = 'kmean' -data.kmean <- foreach (i = 1:nrow(negative.simulated.sum), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum[i,], methods = 'kmean') +data.kmean <- foreach(i = 1:nrow(negative.simulated.sum), .combine = rbind) %dopar% quantify.outliers(negative.simulated.sum[i,], methods = 'kmean') data.kmean <- data.frame(data.kmean); stopCluster(cl = cl) @@ -411,21 +410,20 @@ kmean.simulated.negative.1M <- data.frame(t(data.fraction.kmean)); ### Final gene-wise matrix ##### gene.zrange.fraction.negative.simulated.sum.1M <- cbind( - mean.simulated.negative.1M$zrange, + mean.simulated.negative.1M$zrange, median.simulated.negative.1M$zrange, trimmean.simulated.negative.1M$zrange, kmean.simulated.negative.1M$fraction - ) + ) rownames(gene.zrange.fraction.negative.simulated.sum.1M) <- rownames(negative.simulated.sum); colnames(gene.zrange.fraction.negative.simulated.sum.1M) <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean'); - # Final statistic matrix gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- cbind( - gene.zrange.fraction.negative.simulated.sum.1M[,c(1, 2, 3, 4)], - data.cosine.negative.t$cosine, - data.cosine.negative.t$distribution - ) + gene.zrange.fraction.negative.simulated.sum.1M[,c(1, 2, 3, 4)], + data.cosine.negative.t$cosine, + data.cosine.negative.t$distribution + ) colnames(gene.zrange.fraction.negative.simulated.sum.bic.5method.1M) <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine', 'distribution'); save( diff --git a/OutlierDetectionAlgorithm/6.Simulated_Data_5method_combine.R 
b/OutlierDetectionAlgorithm/6.Simulated_Data_5method_combine.R index 30fcd48..d29e84c 100644 --- a/OutlierDetectionAlgorithm/6.Simulated_Data_5method_combine.R +++ b/OutlierDetectionAlgorithm/6.Simulated_Data_5method_combine.R @@ -11,7 +11,7 @@ params <- matrix( data = c( 'dataset.name', 'd', '0', 'character', 'working.directory', 'w', '0', 'character', - 'file.date', 'd', '0', 'character' + 'file.date', 'd', '0', 'character' ), ncol = 4, byrow = TRUE @@ -30,26 +30,26 @@ gene.zrange.fraction.negative.simulated.sum.1M.combined <- NULL gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.combined <- NULL for (i in 1:10) { load( - file = paste(file.date, '_Simulated_Data_5method_', dataset.name, '.', i, '.short.rda', sep = '') + file = paste(file.date, '_Simulated_Data_5method_', dataset.name, '.', i, '.short.rda', sep = '') ) gene.zrange.fraction.negative.simulated.sum.1M.combined <- rbind( - gene.zrange.fraction.negative.simulated.sum.1M.combined, - gene.zrange.fraction.negative.simulated.sum.1M - ) + gene.zrange.fraction.negative.simulated.sum.1M.combined, + gene.zrange.fraction.negative.simulated.sum.1M + ) gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.combined <- rbind( - gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.combined, - gene.zrange.fraction.negative.simulated.sum.bic.5method.1M - ) + gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.combined, + gene.zrange.fraction.negative.simulated.sum.bic.5method.1M + ) } gene.zrange.fraction.negative.simulated.sum.1M <- gene.zrange.fraction.negative.simulated.sum.1M.combined gene.zrange.fraction.negative.simulated.sum.bic.5method.1M <- gene.zrange.fraction.negative.simulated.sum.bic.5method.1M.combined save( - fpkm.tumor.symbol.filter, - bic.trim.distribution.fit.obs, - noise.min.off.bic.distribution.fit, - gene.zrange.fraction.negative.simulated.sum.bic.5method.1M, - gene.zrange.fraction.negative.simulated.sum.1M, - file = generate.filename('Simulated_Data_5method_combine', 
dataset.name, 'rda') - ) + fpkm.tumor.symbol.filter, + bic.trim.distribution.fit.obs, + noise.min.off.bic.distribution.fit, + gene.zrange.fraction.negative.simulated.sum.bic.5method.1M, + gene.zrange.fraction.negative.simulated.sum.1M, + file = generate.filename('Simulated_Data_5method_combine', dataset.name, 'rda') + ) diff --git a/OutlierDetectionAlgorithm/7.Significant_Outlier_Detection.R b/OutlierDetectionAlgorithm/7.Significant_Outlier_Detection.R index d2183ee..929bf90 100644 --- a/OutlierDetectionAlgorithm/7.Significant_Outlier_Detection.R +++ b/OutlierDetectionAlgorithm/7.Significant_Outlier_Detection.R @@ -6,7 +6,7 @@ # --row.chunks 1000 --matrix.chunk 1 --method.iteration 0 ### 7.Significant_Outlier_Detection.R #################################################### -# Compute p-values +# Compute p-values # Required R package library(BoutrosLab.utilities) @@ -22,8 +22,8 @@ params <- matrix( 'rank.file.', 'f', '0', 'character', 'combined.file', 'c', '0', 'character', 'row.chunks', 'r', '0', 'character', - 'matrix.chunk', 'm', '0', 'character', - 'method.iteration', 'i', '0', 'character' + 'matrix.chunk', 'm', '0', 'character', + 'method.iteration', 'i', '0', 'character' ), ncol = 4, byrow = TRUE @@ -128,7 +128,7 @@ clusterExport( ) gene.rank.p.value.one.gene <- NULL; -gene.rank.p.value.one.gene <- foreach (i = gene.number.start.end.matrix[matrix.chunk,'start']:gene.number.start.end.matrix[matrix.chunk,'end'], .combine = rbind) %dopar% { +gene.rank.p.value.one.gene <- foreach(i = gene.number.start.end.matrix[matrix.chunk,'start']:gene.number.start.end.matrix[matrix.chunk,'end'], .combine = rbind) %dopar% { methods <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine') observed.gene <- gene.zrange.fraction.cosine.last.point.bic[i, methods]; combine.matrix <- rbind( diff --git a/OutlierDetectionAlgorithm/8.Significant_Outlier_Pvalue_Calculation.R b/OutlierDetectionAlgorithm/8.Significant_Outlier_Pvalue_Calculation.R index 
448911b..9986f60 100644 --- a/OutlierDetectionAlgorithm/8.Significant_Outlier_Pvalue_Calculation.R +++ b/OutlierDetectionAlgorithm/8.Significant_Outlier_Pvalue_Calculation.R @@ -5,15 +5,15 @@ # --method.iteration 0 ### 8.Significant_Outlier_Pvalue_Calculation.R #################################################### -# Compute p-values +# Combine p-values across chunks library(BoutrosLab.utilities) library(getopt) params <- matrix( data = c( - 'dataset.name', 'd', '0', 'character', + 'dataset.name', 'd', '0', 'character', 'working.directory', 'w', '0', 'character', - 'method.iteration', 'i', '0', 'character' + 'method.iteration', 'i', '0', 'character' ), ncol = 4, byrow = TRUE @@ -28,28 +28,28 @@ method.iteration <- opt$method.iteration setwd(working.directory) files <- list.files( - pattern = 'Significant_Outlier_Detection' - ) + pattern = 'Significant_Outlier_Detection' + ) p.value.all <- NULL for (i in 1:length(files)) { load( - file = files[i] - ) + file = files[i] + ) assign( - x = 'variable.name', - value = paste('gene.rank.p.value.one.gene', method.iteration, sep = '.') - ) + x = 'variable.name', + value = paste('gene.rank.p.value.one.gene', method.iteration, sep = '.') + ) p.value.all <- rbind( - p.value.all, - get(x = variable.name) - ) + p.value.all, + get(x = variable.name) + ) } -p.value.all <- p.value.all[order(p.value.all$i),] +p.value.all <- p.value.all[order(p.value.all$i), ] p.value.all$q.value <- p.adjust( - p = p.value.all$obs.p.value, - method = 'fdr' - ) + p = p.value.all$obs.p.value, + method = 'fdr' + ) # assign back to original variable name assign(x = variable.name, value = p.value.all) diff --git a/metadata.yaml b/metadata.yaml new file mode 100644 index 0000000..5876e7c --- /dev/null +++ b/metadata.yaml @@ -0,0 +1,7 @@ +--- +Description: '' +Maintainers: [''] +Contributors: '' +Languages: ['R'] +Dependencies: ['doParallel', 'foreach', 'getopt', 'parallel'] +References: ''