Skip to content

Commit

Permalink
lintr changes; update simulated_data_5method for patient removal iteration
Browse files Browse the repository at this point in the history
  • Loading branch information
jmlivingstone committed Dec 14, 2023
1 parent 5efd555 commit 20f2cef
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 60 deletions.
31 changes: 16 additions & 15 deletions OutlierDetectionAlgorithm/1.Outlier_Detection.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,11 @@ molecular.data.filter <- fpkm.tumor.symbol.filter[, patient.part];
# Sort a vector and return it with the extreme values on both ends removed.
#
# Arguments:
#   x     vector to trim; it is sorted in increasing order before trimming.
#   trim  fraction of length(x) to drop from EACH end (default 0.05); it is
#         only used when length(x) > 10.
#
# Returns the retained elements of the sorted x.
#
# NOTE(review): this span is a rendered diff, so statements that appear twice
# in a row are the pre- and post-commit versions of the same line; each pair
# differs only in the spacing around `-`.
# NOTE(review): trim is multiplied by length(x), so it must be a fraction
# (0.05), not a percentage (5); once round(length(x) * trim) exceeds half of
# length(x), the start index is larger than the end index -- confirm callers.
trim.sample <- function(x, trim = 0.05) {
x <- sort(x);
if (length(x) <= 10) {
# short input: ignore trim and drop only the smallest and the largest value
# NOTE(review): for length(x) < 3 the sequence 2:(length(x) - 1) counts
# downwards, so the result is not a trimmed subset -- confirm inputs have
# at least 3 elements.
patient.trim.value <- 2:(length(x)-1);
} else {
patient.trim.value <- 2:(length(x) - 1);
} else {
# number of elements to drop per side, rounded to a whole count
trim.sample.number <- length(x) * trim;
trim.sample.number.integer <- round(trim.sample.number);
# indices of the sorted values that are kept
patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer);
patient.trim.value <- (trim.sample.number.integer + 1):(length(x) - trim.sample.number.integer);
}
x[patient.trim.value];
}
Expand Down Expand Up @@ -276,7 +276,7 @@ outlier.detection.cosine <- function(x, value.portion = 1) {
})
return(decimal.numbers)
})
add.minimum.value <- 1 / 10^as.numeric(max(unlist(decimal.number.max)));
add.minimum.value <- 1 / 10 ^ as.numeric(max(unlist(decimal.number.max)));

# function: Compute the cosine similarity of the largest data point
cosine.similarity.large.value.percent <- function(x, y, large.value.percent) {
Expand Down Expand Up @@ -312,7 +312,7 @@ outlier.detection.cosine <- function(x, value.portion = 1) {
sample.fpkm.qq.nozero <- sample.fpkm.qq + add.minimum.value;

# Trimmed samples -Trim 5% of each side
sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 0.05);
sample.trim.number <- trim.sample(x = seq(length(sample.fpkm.qq.nozero)), trim = 0.05);
sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number];
sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value;

Expand Down Expand Up @@ -349,7 +349,7 @@ outlier.detection.cosine <- function(x, value.portion = 1) {
last.cos <- cosine.similarity.large.value.percent(exp.quantile, obs.quantile.exp, large.value.percent = value.portion);
}
else if (4 == distribution.fit) {
### 4 gamma distribution
# 4 gamma distribution
mean.gamma <- mean(sample.fpkm.qq.nozero.trim);
sd.gamma <- sd(sample.fpkm.qq.nozero.trim);
gamma.shape <- (mean.gamma / sd.gamma) ^ 2;
Expand Down Expand Up @@ -378,7 +378,7 @@ clusterEvalQ(
expr = library(gamlss)
)

# Define a minimum value
# Define a minimum value (should set a seed)
random.col <- sample(patient.part, 1)
decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), function(x) {
decimal.numbers <- sapply(x, function(y) {
Expand All @@ -390,11 +390,11 @@ add.minimum.value <- 1 / 10 ^ as.numeric(max(unlist(decimal.number.max)));

bic.trim.distribution <- NULL;

# Use foreach to iterate over the rows of fpkm.tumor.symbol.filter in parallel
# Use foreach to iterate over the rows (genes) of fpkm.tumor.symbol.filter in parallel
bic.trim.distribution <- foreach(j = 1:nrow(fpkm.tumor.symbol.filter), .combine = rbind) %dopar% {
sample.fpkm.qq <- round(as.numeric(fpkm.tumor.symbol.filter[j,patient.part]), digits = 6);

sample.trim.number <- trim.sample(sample.number, 5);
sample.trim.number <- trim.sample(x = sample.number, trim = 5);
sample.fpkm.qq.sort <- sort(sample.fpkm.qq)[sample.trim.number];
sample.fpkm.qq.nozero <- sample.fpkm.qq.sort + add.minimum.value;

Expand All @@ -403,17 +403,18 @@ bic.trim.distribution <- foreach(j = 1:nrow(fpkm.tumor.symbol.filter), .combine
glm.gamma <- gamlss(sample.fpkm.qq.nozero ~ 1, family = GA);
glm.exp <- gamlss(sample.fpkm.qq.nozero ~ 1, family = EXP);

glm.bic <- c(glm.norm$sbc,
glm.lnorm$sbc,
glm.exp$sbc,
glm.gamma$sbc);
glm.bic <- c(
glm.norm$sbc,
glm.lnorm$sbc,
glm.exp$sbc,
glm.gamma$sbc
);
glm.bic;
}

stopCluster(cl = cl);

# Find the best fitted distribution
# - BIC
# Find the best fitted distribution - BIC
rownames(bic.trim.distribution) <- rownames(fpkm.tumor.symbol.filter);
bic.trim.distribution.fit <- apply(bic.trim.distribution, 1, which.min);

Expand Down
18 changes: 10 additions & 8 deletions OutlierDetectionAlgorithm/2.Distribution_Identification.R
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ obs.residue.quantile <- foreach(i = 1:nrow(fpkm.tumor.symbol.filter), .combine =
# 2. Log-normal distribution
mean.log <- mean(sample.fpkm.qq.nozero.trim);
sd.log <- sd(sample.fpkm.qq.nozero.trim);
m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2));
sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2)));
m2 <- log(mean.log ^ 2 / sqrt(sd.log ^ 2 + mean.log ^ 2));
sd2 <- sqrt(log(1 + (sd.log ^ 2 / mean.log ^ 2)));
lnorm.quantile <- qlnorm(p, meanlog = m2, sdlog = sd2);
obs.quantile.lnorm <- quantile(sample.fpkm.qq.nozero, prob = p);
obs.residue.non.trim <- obs.quantile.lnorm - lnorm.quantile;
Expand All @@ -118,7 +118,7 @@ obs.residue.quantile <- foreach(i = 1:nrow(fpkm.tumor.symbol.filter), .combine =
}

else if (4 == bic.trim.distribution.fit[i]) {
### 4 gamma distribution
# 4 gamma distribution
mean.gamma <- mean(sample.fpkm.qq.nozero.trim);
sd.gamma <- sd(sample.fpkm.qq.nozero.trim);
gamma.shape <- (mean.gamma / sd.gamma) ^ 2;
Expand All @@ -139,7 +139,7 @@ rownames(obs.residue.quantile) <- rownames(fpkm.tumor.symbol.filter);
obs.residue.quantile.trim <- apply(
X = obs.residue.quantile,
MARGIN = 1,
FUN - function(x) {
FUN = function(x) {
sort(as.numeric(x))
}
);
Expand Down Expand Up @@ -181,10 +181,12 @@ noise.min.off.bic.distribution <- foreach(j = 1:nrow(obs.residue.quantile.trim),
glm.gamma <- gamlss(sample.fpkm.qq.nozero ~ 1, family = GA);
glm.exp <- gamlss(sample.fpkm.qq.nozero ~ 1, family = EXP);

glm.bic <- c(glm.norm$sbc,
glm.lnorm$sbc,
glm.exp$sbc,
glm.gamma$sbc)
glm.bic <- c(
glm.norm$sbc,
glm.lnorm$sbc,
glm.exp$sbc,
glm.gamma$sbc
)
glm.bic;
}

Expand Down
14 changes: 8 additions & 6 deletions OutlierDetectionAlgorithm/3.Simulated_Data_generation_1.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ clusterEvalQ(
expr = library(extraDistr)
)

simulated.generation.negative <- function(x = fpkm.tumor.symbol.filter.distribution.fit, distribution = bic.trim.distribution.fit, num.negative = 10000, sample.size = sample.number) {
simulated.generation.negative <- function(x = fpkm.tumor.symbol.filter.bic, distribution = bic.trim.distribution.fit, num.negative = 10000, sample.size = sample.number) {
# Define a minimum value
random.col <- sample(sample.size, 1)
decimal.number.max <- lapply(na.omit(x[,random.col]), function(x) {
Expand All @@ -98,7 +98,8 @@ simulated.generation.negative <- function(x = fpkm.tumor.symbol.filter.distribut
if (!is.numeric(num.negative)) stop('num.negative should be numeric.')
if (!is.numeric(sample.size)) stop('sample.size should be numeric.')

random.number.negative <- sample(length(distribution), num.negative, replace = TRUE);
# shuffle values and labels to create simulated data
random.number.negative <- sample(x = length(distribution), size = num.negative, replace = TRUE);
names(random.number.negative) <- names(distribution)[random.number.negative]

# use the foreach function to parallelize the sapply loop
Expand All @@ -114,25 +115,26 @@ simulated.generation.negative <- function(x = fpkm.tumor.symbol.filter.distribut
sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value;

if (1 == distribution[i]) {
### 1) Normal distribution
# 1. Normal distribution
norm.mean <- mean(sample.fpkm.qq.nozero.trim);
norm.sd <- sd(sample.fpkm.qq.nozero.trim);
rtnorm(length(sample.size), mean = norm.mean, sd = norm.sd, a = 0);
}
else if (2 == distribution[i]) {
# 2. Log-normal distribution
mean.log <- mean(sample.fpkm.qq.nozero.trim);
sd.log <- sd(sample.fpkm.qq.nozero.trim);
m2 <- log(mean.log ^ 2 / sqrt(sd.log ^ 2 + mean.log ^ 2));
sd2 <- sqrt(log(1 + (sd.log ^ 2 / mean.log ^ 2)));
rlnorm(n = length(sample.size), m2, sd2);
}
else if (3 == distribution[i]) {
### 4) exponential distribution
# 3. Exponential distribution
exp.rate <- 1 / mean(sample.fpkm.qq.nozero.trim);
rexp(n = length(sample.size), rate = exp.rate);
}
else if (4 == distribution[i]) {
### 5) gamma distribution
# 4. Gamma distribution
mean.gamma <- mean(sample.fpkm.qq.nozero.trim);
sd.gamma <- sd(sample.fpkm.qq.nozero.trim);
gamma.shape <- (mean.gamma / sd.gamma) ^ 2;
Expand All @@ -150,7 +152,7 @@ seeds <- round(runif(n = ntimes, min = 1, max = 10000))
for (i in 1:ntimes) {
seed <- seeds[i]
set.seed(seed)
print(paste0('run negative random number:', i))

negative.random.number.bic <- simulated.generation.negative(
x = fpkm.tumor.symbol.filter.bic,
distribution = bic.trim.distribution.fit,
Expand Down
11 changes: 6 additions & 5 deletions OutlierDetectionAlgorithm/4.Simulated_Data_generation_2.R
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ decimal.number.max <- lapply(na.omit(fpkm.tumor.symbol.filter[,random.col]), fun
})
add.minimum.value <- 1 / 10 ^ as.numeric(max(unlist(decimal.number.max)));

# 2. residual
# 2. residual ** using magic numbers
residue.negative.random.number.bic <- obs.residue.quantile.trim[match(substr(rownames(negative.random.number.bic), 1, 15), substr(rownames(obs.residue.quantile.trim), 1, 15)),];

noise.min.off.bic.distribution.residue <- noise.min.off.bic.distribution.fit[match(substr(rownames(negative.random.number.bic), 1, 15), substr(names(noise.min.off.bic.distribution.fit), 1, 15))];
Expand Down Expand Up @@ -109,25 +109,26 @@ negative.random.number.noise.bic <- foreach(i = 1:nrow(residue.negative.random.n
}

if (1 == noise.min.off.bic.distribution.residue[i]) {
### 1) Normal distribution
# 1. Normal distribution
norm.mean <- mean(sample.fpkm.qq.nozero);
norm.sd <- sd(sample.fpkm.qq.nozero);
simulated.sample <- rtnorm(length(sample.number), mean = norm.mean, sd = norm.sd, a = 0);
}
else if (2 == noise.min.off.bic.distribution.residue[i]) {
# 2. Log-normal distribution
mean.log <- mean(sample.fpkm.qq.nozero);
sd.log <- sd(sample.fpkm.qq.nozero);
m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2));
sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2)));
simulated.sample <- rlnorm(n = length(sample.number), m2, sd2);
}
else if (3 == noise.min.off.bic.distribution.residue[i]) {
### 4) exponential distribution
# 3. Exponential distribution
exp.rate <- 1 / mean(sample.fpkm.qq.nozero);
simulated.sample <- rexp(n = length(sample.number), rate = exp.rate);
}
else if (4 == noise.min.off.bic.distribution.residue[i]) {
### 5) gamma distribution
# 4. Gamma distribution
mean.gamma <- mean(sample.fpkm.qq.nozero);
sd.gamma <- sd(sample.fpkm.qq.nozero);
gamma.shape <- (mean.gamma / sd.gamma) ^ 2;
Expand Down Expand Up @@ -163,5 +164,5 @@ save(
noise.min.off.bic.distribution.residue,
negative.random.number.noise.bic,
negative.simulated.sum,
file = generate.filename('Simulated_data_generation_2', paste(dataset.name, replicate, sep = '.'), 'rda')
file = generate.filename('Simulated_Data_generation_2', paste(dataset.name, replicate, sep = '.'), 'rda')
)
Loading

0 comments on commit 20f2cef

Please sign in to comment.