Skip to content

Commit

Permalink
fix merge conflict with jsahrmann-refactor-quantify-outliers
Browse files Browse the repository at this point in the history
  • Loading branch information
jmlivingstone committed Dec 6, 2023
2 parents b55dac9 + 3ad0938 commit 5efd555
Show file tree
Hide file tree
Showing 6 changed files with 1,186 additions and 50 deletions.
82 changes: 32 additions & 50 deletions OutlierDetectionAlgorithm/1.Outlier_Detection.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,21 +93,32 @@ annot.filter <- annot[which(0.01 > zero.portion), ]

molecular.data.filter <- fpkm.tumor.symbol.filter[, patient.part];

### Trim sample
trim.sample <- function(x, trim = 0.05) {
x <- sort(x);
if (length(x) <= 10) {
patient.trim.value <- 2:(length(x)-1);
} else {
trim.sample.number <- length(x) * trim;
trim.sample.number.integer <- round(trim.sample.number);
patient.trim.value <- (trim.sample.number.integer + 1):(length(x)-trim.sample.number.integer);
}
x[patient.trim.value];
}

# Would this be faster if they were separate functions instead of if statements ?

### Outlier detection function
# Default : methods = 'mean', trim = 0
# 1. MEAN and SD : methods = 'mean', trim = 0
# 2. TRIMMED MEAN and TRIMMED SD : methods = 'mean', trim = 5
# 2. TRIMMED MEAN and TRIMMED SD : methods = 'mean', trim = 0.05
# 3. MEDIAN and MAD : methods = 'median'
# 4. KMEAN : methods = 'kmean'

quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALSE) {
# a better name might be x.complete, x.na makes me thingk it includes NAs
# 4. KMEAN : methods = 'kmean', nstart = 1000
quantify.outliers <- function(x, methods = 'mean', trim = 0, nstart = 1, exclude.zero = FALSE) {
x.na <- na.omit(as.numeric(x));
if (methods == 'median') {
if (exclude.zero) {
x.nonzero <- x.na[0 != x.na];
if ('median' == methods) {
if (exclude.zero) {
x.nonzero <- x.na[0 != x.na];
data.median <- median(x.nonzero);
data.mad <- mad(x.nonzero);
}
Expand All @@ -119,37 +130,37 @@ quantify.outliers <- function(x, methods = 'mean', trim = 0, exclude.zero = FALS
x[which(!is.na(x))] <- result.na;
x;
}
else if (methods == 'kmean') {
else if ('kmean' == methods) {
if (exclude.zero) {
if (length(unique(as.numeric(x.na))) == 1) {
if (1 == length(unique(x.na))) {
kmean.matrix <- rep(NA, length(x.na));
names(kmean.matrix) <- names(x.na);
}
else {
data.order <- sort(x.na, decreasing = TRUE);
non.zero <- data.order[data.order > 0];
if (length(unique(as.numeric(non.zero))) <= 2) {
if (length(unique(non.zero)) <= 2) {
na.matrix <- rep(NA, length(non.zero));
cluster.zero <- c(na.matrix, rep(0, length(x.na[x.na == 0])));
cluster.zero <- c(na.matrix, rep(0, length(x.na[0 == x.na])));
kmean.matrix <- cluster.zero[match(x.na, data.order)];
names(kmean.matrix) <- names(x.na);
}
else {
kmean <- kmeans(non.zero, 2, nstart = 100);
kmean <- kmeans(non.zero, 2, nstart = nstart);
cluster <- kmean$cluster;
cluster.zero <- c(cluster, rep(0, length(x[x == 0])));
cluster.zero <- c(cluster, rep(0, length(x[0 == x])));
kmean.matrix <- cluster.zero[match(x.na, data.order)];
names(kmean.matrix) <- names(x.na);
}
}
}
else {
if (length(unique(as.numeric(x.na))) == 1) {
if (1 == length(unique(x.na))) {
kmean.matrix <- rep(NA, length(x.na));
names(kmean.matrix) <- names(x.na);
}
else {
kmean <- kmeans(x.na, 2, nstart = 100);
kmean <- kmeans(x.na, 2, nstart = nstart);
cluster <- kmean$cluster;
kmean.matrix <- cluster;
names(kmean.matrix) <- names(x.na);
Expand Down Expand Up @@ -192,8 +203,6 @@ data.mean <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar
data.mean <- data.frame(data.mean);

# 2. TRIMMED MEAN and TRIMMED SD : method = 'mean', trim = 5
print('Calculating using TRIMMED MEAN and TRIMMED SD')
print(Sys.time())
data.trimmean <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], trim = 5);
data.trimmean <- data.frame(data.trimmean);

Expand All @@ -204,9 +213,7 @@ data.median <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dop
data.median <- data.frame(data.median);

# 4. KMEAN : method = 'kmean'
print('Calculating using KMEANS')
print(Sys.time())
data.kmean <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], methods = 'kmean')
data.kmean <- foreach(i = 1:nrow(molecular.data.filter), .combine = rbind) %dopar% quantify.outliers(molecular.data.filter[i,], methods = 'kmean', nstart = 1000)
data.kmean <- data.frame(data.kmean);

stopCluster(cl = cl)
Expand Down Expand Up @@ -235,16 +242,15 @@ data.zrange.trimmean.t <- data.frame(t(data.zrange.trimmean));
data.zrange.median <- apply(data.median, 1, outlier.detection.zrange);
data.zrange.median.t <- data.frame(t(data.zrange.median));


### Calculate the kmean fraction #####
# Function
outlier.detection.kmean <- function(x) {
if (1 == length(unique(as.numeric(x)))) {
fraction <- NA;
}
else {
cluster.one <- length(x[x == 1]);
cluster.two <- length(x[x == 2]);
cluster.one <- length(x[1 == x]);
cluster.two <- length(x[2 == x]);
cluster.sum <- cluster.one + cluster.two;
smaller.value <- min(cluster.one, cluster.two);
fraction <- round(smaller.value / cluster.sum, digit = 4);
Expand All @@ -259,7 +265,6 @@ print('Outlier detection kmeans')
data.fraction.kmean <- apply(data.kmean, 1, outlier.detection.kmean);
data.fraction.kmean.t <- data.frame(t(data.fraction.kmean));


# 5. Cosine similarity
# function: Compute the cosine similarity of the largest data point
outlier.detection.cosine <- function(x, value.portion = 1) {
Expand All @@ -273,17 +278,6 @@ outlier.detection.cosine <- function(x, value.portion = 1) {
})
add.minimum.value <- 1 / 10^as.numeric(max(unlist(decimal.number.max)));

trim.sample <- function(x, trim.portion = 5) {
if (length(x) <= 10) {
patient.trim.value <- 2:(length(x) - 1);
} else {
trim.sample.number <- length(x) * (trim.portion / 100);
trim.sample.number.integer <- round(trim.sample.number, digits = 0);
patient.trim.value <- (trim.sample.number.integer + 1):(length(x) - trim.sample.number.integer);
}
patient.trim.value;
}

# function: Compute the cosine similarity of the largest data point
cosine.similarity.large.value.percent <- function(x, y, large.value.percent) {

Expand Down Expand Up @@ -318,7 +312,7 @@ outlier.detection.cosine <- function(x, value.portion = 1) {
sample.fpkm.qq.nozero <- sample.fpkm.qq + add.minimum.value;

# Trimmed samples -Trim 5% of each side
sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 5);
sample.trim.number <- trim.sample(seq(length(sample.fpkm.qq.nozero)), 0.05);
sample.fpkm.qq.trim <- sort(sample.fpkm.qq)[sample.trim.number];
sample.fpkm.qq.nozero.trim <- sample.fpkm.qq.trim + add.minimum.value;

Expand Down Expand Up @@ -369,18 +363,6 @@ outlier.detection.cosine <- function(x, value.portion = 1) {
cosine.sum.distribution.fit;
}

# Trimming function
trim.sample <- function(x, trim.portion = 5) {
if (length(x) <= 10) {
patient.trim.value <- 2:(length(x) - 1);
} else {
trim.sample.number <- length(x) * (trim.portion / 100);
trim.sample.number.integer <- round(trim.sample.number, digits = 0);
patient.trim.value <- (trim.sample.number.integer + 1):(length(x) - trim.sample.number.integer);
}
patient.trim.value;
}

print('Determine the distribution')
# Determine the distribution
# Find the best fitted distribution
Expand Down Expand Up @@ -411,6 +393,7 @@ bic.trim.distribution <- NULL;
# Use foreach to iterate over the rows of fpkm.tumor.symbol.filter in parallel
bic.trim.distribution <- foreach(j = 1:nrow(fpkm.tumor.symbol.filter), .combine = rbind) %dopar% {
sample.fpkm.qq <- round(as.numeric(fpkm.tumor.symbol.filter[j,patient.part]), digits = 6);

sample.trim.number <- trim.sample(sample.number, 5);
sample.fpkm.qq.sort <- sort(sample.fpkm.qq)[sample.trim.number];
sample.fpkm.qq.nozero <- sample.fpkm.qq.sort + add.minimum.value;
Expand Down Expand Up @@ -477,7 +460,6 @@ gene.zrange.fraction.cosine.last.point.bic <- data.frame(
rownames(gene.zrange.fraction.cosine.last.point.bic) <- rownames(fpkm.tumor.symbol.filter);
colnames(gene.zrange.fraction.cosine.last.point.bic) <- c('zrange.mean', 'zrange.median', 'zrange.trimmean', 'fraction.kmean', 'cosine', 'distribution');


print('Rank each method')
### Rank each methods #####
# Function
Expand Down
Loading

0 comments on commit 5efd555

Please sign in to comment.