Align features refactoring (vol. 2) #88

Merged: 9 commits, Aug 10, 2022
Changes from 1 commit
4 changes: 2 additions & 2 deletions R/adjust.time.R
@@ -108,7 +108,7 @@ adjust.time <- function(features,
rt_tol_relative = rt_tol_relative,
mz_tol_absolute = mz_tol_absolute,
do.plot = do.plot)
- rt_tol_relative <- all.ft$chr.tol
+ rt_tol_relative <- all.ft$rt.tol

message("**** performing time correction ****")
message(paste("m/z tolerance level: ", mz_tol_relative))
@@ -117,7 +117,7 @@ adjust.time <- function(features,
for(i in 1:number_of_samples) {
this <- features[[i]]
sel <- which(all.ft$lab == i)
- that <- cbind(all.ft$mz[sel], all.ft$chr[sel], all.ft$grps[sel])
+ that <- cbind(all.ft$mz[sel], all.ft$rt[sel], all.ft$grps[sel])
this <- this[order(this[, 1], this[, 2]), ]
that <- that[order(that[, 1], that[, 2]), ]
this <- cbind(this, V6=rep(i, nrow(this)), V7=that[, 3])
224 changes: 135 additions & 89 deletions R/feature.align.R
@@ -1,20 +1,78 @@
to_attach <- function(pick, number_of_samples, use = "sum") {
strengths <- rep(0, number_of_samples)
if (is.null(nrow(pick))) {
# this is very strange if it can ever happen
# maybe commas are missing? we want the same as below
# but also why if there are no rows...
strengths[pick[6]] <- pick[5]
return(c(pick[1], pick[2], pick[1], pick[1], strengths))
Member: I guess the problem lies there that nrow(pick) might be NULL if pick has the wrong type - meaning it is not a table but a list or so, and then it actually behaves differently.

Member: Exactly. That also happens with the nrow check in prof.to.features.

Author: Okay, that makes sense, it was just suspicious to me.

} else {
for (i in seq_along(strengths)) {
# select all areas from the same sample
areas <- pick[pick[, 6] == i, 5]
if (use == "sum")
- strengths[i] <- sum(pick[pick[, 6] == i, 5])
+ strengths[i] <- sum(areas)
if (use == "median")
- strengths[i] <- median(pick[pick[, 6] == i, 5])
+ strengths[i] <- median(areas)
# can be NA if pick does not contain any data from a sample
}
# average of m/z, average of rt, min of m/z, max of m/z, sum/median of areas
return(c(mean(pick[, 1]), mean(pick[, 2]), min(pick[, 1]),
max(pick[, 1]), strengths))
}
}
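
For orientation, here is a minimal sketch of what to_attach computes on a grouped peak table; the column layout (m/z, rt, rt, rt, area, sample id) matches the callers below, but the toy values are made up for illustration:

# hypothetical group of 3 peaks coming from 2 samples
pick <- rbind(
    c(100.001, 12.3, 12.3, 12.3, 500, 1),  # m/z, rt, rt, rt, area, sample id
    c(100.002, 12.4, 12.4, 12.4, 300, 1),
    c(100.003, 12.5, 12.5, 12.5, 400, 2)
)
to_attach(pick, number_of_samples = 2, use = "sum")
# c(100.002, 12.4, 100.001, 100.003, 800, 400)
# i.e. mean m/z, mean rt, min m/z, max m/z, then the summed area per sample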


select_rt <- function(sample, rt_tol_relative, min_occurrence, number_of_samples) {
# Kernel Density Estimation, this time for retention time
den <- density(sample[, 2], bw = rt_tol_relative / 1.414)
# again select statistically significant points
turns <- find.turn.point(den$y)
peaks <- den$x[turns$pks]
valleys <- den$x[turns$vlys]
for (i in seq_along(peaks)) {
lower_bound <- max(valleys[valleys < peaks[i]])
upper_bound <- min(valleys[valleys > peaks[i]])
# select data with rt within lower and upper bound from density estimation
selected <- which(sample[, 2] > lower_bound & sample[, 2] <= upper_bound)
sample_grouped <- sample[selected, ]
if (!is.null(nrow(sample_grouped))) {
if (length(unique(sample_grouped[, 6])) >= min_occurrence) {
# continue if data is still from at least 'min_occurrence' samples
return(c(to_attach(sample_grouped, number_of_samples, use = "sum"),
to_attach(sample_grouped[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
sd(sample_grouped[, 1], na.rm = TRUE)
)
)
}
}
}
}


select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples) {
# Kernel Density Estimation with target standard deviation 'mz_tol_relative' multiplied by median of m/z
des <- density(sample[, 1], bw = mz_tol_relative * median(sample[, 1]))
# find the peaks and valleys of a density function
turns <- find.turn.point(des$y)
peaks <- des$x[turns$pks]
valleys <- des$x[turns$vlys]
for (i in seq_along(peaks)) {
lower_bound <- max(valleys[valleys < peaks[i]])
upper_bound <- min(valleys[valleys > peaks[i]])
# select data with m/z within lower and upper bound from density estimation
selected <- which(sample[, 1] > lower_bound & sample[, 1] <= upper_bound)
sample_grouped <- sample[selected, ]
if (!is.null(nrow(sample_grouped))) {
# continue if data is still from at least 'min_occurrence' samples
if (length(unique(sample_grouped[, 6])) >= min_occurrence) {
return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, number_of_samples))
}
}
}
}
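
select_mz and select_rt share the same density-based windowing pattern. A self-contained sketch of that pattern, with find.turn.point approximated by a naive sign-change scan (an illustrative stand-in, not the package implementation):

# naive stand-in for find.turn.point: local maxima (pks) and minima (vlys) of a curve
naive_turn_points <- function(y) {
    d <- diff(sign(diff(y)))
    list(pks = which(d < 0) + 1, vlys = c(1, which(d > 0) + 1, length(y)))
}

set.seed(1)
mz <- c(rnorm(50, 100.00, 0.001), rnorm(50, 100.05, 0.001))  # two m/z clusters
den <- density(mz, bw = 1e-5 * median(mz))  # bw = mz_tol_relative * median(m/z)
turns <- naive_turn_points(den$y)
peaks <- den$x[turns$pks]
valleys <- den$x[turns$vlys]
# window around the first peak, exactly as in select_mz
lower_bound <- max(valleys[valleys < peaks[1]])
upper_bound <- min(valleys[valleys > peaks[1]])
selected <- which(mz > lower_bound & mz <= upper_bound)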


# returns a list of aligned features and original peak times
feature.align <- function(features,
min_occurrence = 2,
@@ -34,22 +92,22 @@ feature.align <- function(features,
if (number_of_samples > 1) {
values <- get_feature_values(features, rt_colname)
mz_values <- values$mz
- chr <- values$rt
- lab <- values$sample_id
+ rt <- values$rt
+ sample_id <- values$sample_id

- o <- order(mz_values, chr)
- mz_values <- mz_values[o]
- chr <- chr[o]
- lab <- lab[o]
+ # sort all values by m/z, breaking ties by rt
+ ordering <- order(mz_values, rt)
+ mz_values <- mz_values[ordering]
+ rt <- rt[ordering]
+ sample_id <- sample_id[ordering]
Comment on lines +135 to +139

Member: Data could be arranged in a tibble or data.frame.


# find relative m/z tolerance level
if (is.na(mz_tol_relative)) {
mz_tol_relative <- find.tol(mz_values, mz_max_diff = mz_max_diff, do.plot = do.plot)
if (length(mz_tol_relative) == 0) {
mz_tol_relative <- 1e-5
- warning(
-     "Automatic tolerance finding failed, 10 ppm was assigned. May need to manually assign alignment mz tolerance level."
- )
+ warning("Automatic tolerance finding failed, 10 ppm was assigned.
+     May need to manually assign alignment mz tolerance level.")
Comment on lines +146 to +147

Member (@maximskorik, Aug 9, 2022): This is something that we should maybe address: I doubt anyone will see this warning, given that most of our test cases produce dozens of warnings.

Author: Do you think the function should fail with an error? Or we could parametrise the whole function to either use the default value or fail with an error.

Member: No, I don't think an error would be an appropriate solution. I just wanted to point out that apLCMS may produce a meaningful warning, so we should make sure it doesn't get lost in a bunch of deprecation warnings, etc. I think we just have to reduce the number of other warnings (deprecation, imports, and so on). Should be done with #93.

}
} else if (do.plot) {
draw_plot(main = "alignment m/z tolerance level given",
@@ -62,119 +120,107 @@ feature.align <- function(features,
}

# find relative retention time tolerance level
- all.ft <- find.tol.time(mz_values,
-                         chr,
-                         lab,
-                         number_of_samples = number_of_samples,
-                         mz_tol_relative = mz_tol_relative,
-                         rt_tol_relative = rt_tol_relative,
-                         mz_tol_absolute = mz_tol_absolute,
-                         do.plot = do.plot)
- rt_tol_relative <- all.ft$chr.tol
+ # also does some preprocessing grouping steps
+ all_features <- find.tol.time(mz_values,
+                               rt,
+                               sample_id,
+                               number_of_samples = number_of_samples,
+                               mz_tol_relative = mz_tol_relative,
+                               rt_tol_relative = rt_tol_relative,
+                               mz_tol_absolute = mz_tol_absolute,
+                               do.plot = do.plot)
+ rt_tol_relative <- all_features$rt.tol

message("**** performing feature alignment ****")
message(paste("m/z tolerance level: ", mz_tol_relative))
message(paste("time tolerance level:", rt_tol_relative))

- aligned.ftrs <- pk.times <- rep(0, 4 + number_of_samples)
- mz.sd.rec <- 0
-
- labels <- unique(all.ft$grps)
+ # create zero vectors of length number_of_samples + 4?
+ aligned_features <- pk.times <- rep(0, 4 + number_of_samples)
+ mz_sd <- 0
+
+ labels <- unique(all_features$grps)
area <- grps <- mz_values

# grouping the features based on their m/z values (assuming the tolerance level)
sizes <- c(0, cumsum(sapply(features, nrow)))
for (i in 1:number_of_samples) {
- this <- features[[i]]
- sel <- which(all.ft$lab == i)
- that <- cbind(all.ft$mz[sel], all.ft$chr[sel], all.ft$grps[sel])
- this <- this[order(this[, 1], this[, 2]),]
- that <- that[order(that[, 1], that[, 2]),]
+ sample <- features[[i]]
+ # order by m/z, then by rt
+ sample <- sample[order(sample[, 1], sample[, 2]),]
+
+ # select preprocessed features belonging to the current sample
+ group_ids <- which(all_features$lab == i)
+ # select m/z, rt and their group ID
+ sample_grouped <- cbind(all_features$mz[group_ids], all_features$rt[group_ids], all_features$grps[group_ids])
+ # order them again? should be ordered already...
+ sample_grouped <- sample_grouped[order(sample_grouped[, 1], sample_grouped[, 2]),]
Member: Did you try to run tests without this line?

Author (@xtrojak, Aug 10, 2022): Good point; without it the test case fails, so I guess they weren't really ordered...


- mz_values[(sizes[i] + 1):sizes[i + 1]] <- this[, 1]
- chr[(sizes[i] + 1):sizes[i + 1]] <- this[, 2]
- area[(sizes[i] + 1):sizes[i + 1]] <- this[, 5]
- grps[(sizes[i] + 1):sizes[i + 1]] <- that[, 3]
- lab[(sizes[i] + 1):sizes[i + 1]] <- i
+ # update m/z, rt, area values with the ordered ones
+ mz_values[(sizes[i] + 1):sizes[i + 1]] <- sample[, 1]
+ rt[(sizes[i] + 1):sizes[i + 1]] <- sample[, 2]
+ area[(sizes[i] + 1):sizes[i + 1]] <- sample[, 5]
+ # assign group identifier
+ grps[(sizes[i] + 1):sizes[i + 1]] <- sample_grouped[, 3]
+ # assign sample identifier
+ sample_id[(sizes[i] + 1):sizes[i + 1]] <- i
}
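
The sizes vector above is plain cumulative-offset bookkeeping; a toy illustration with hypothetical row counts:

rows_per_sample <- c(3, 2, 4)            # hypothetical nrow() of each sample
sizes <- c(0, cumsum(rows_per_sample))   # 0 3 5 9
i <- 2
(sizes[i] + 1):sizes[i + 1]              # 4 5: the slice owned by sample 2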

- ttt <- table(all.ft$grps)
- curr.row <- sum(ttt >= min_occurrence) * 3
- mz.sd.rec <- rep(0, curr.row)
+ # table with the number of values in each group
+ groups_cardinality <- table(all_features$grps)
+ # count groups with at least the minimal occurrence
+ # (why times 3? shouldn't it be the number of samples?) !!!
+ curr.row <- sum(groups_cardinality >= min_occurrence) * 3
+ mz_sd <- rep(0, curr.row)

- sel.labels <- as.numeric(names(ttt)[ttt >= min_occurrence])
+ sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence])
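To make the min_occurrence filter above concrete, a toy run with assumed values:

grps_toy <- c(7, 7, 8, 9, 9, 9)                   # group label per feature
cardinality <- table(grps_toy)                    # 7 -> 2, 8 -> 1, 9 -> 3
as.numeric(names(cardinality)[cardinality >= 2])  # keeps groups 7 and 9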

# retention time alignment
- aligned.ftrs <-
+ aligned_features <-
foreach(i = seq_along(sel.labels), .combine = rbind) %do% {
- if (i %% 100 == 0)
-     gc()
- this.return <- NULL
- sel <- which(grps == sel.labels[i])
- if (length(sel) > 1) {
- this <- cbind(mz_values[sel], chr[sel], chr[sel], chr[sel], area[sel], lab[sel])
- if (length(unique(this[, 6])) >= min_occurrence) {
- this.den <- density(this[, 1], bw = mz_tol_relative * median(this[, 1]))
- turns <- find.turn.point(this.den$y)
- pks <- this.den$x[turns$pks]
- vlys <- this.den$x[turns$vlys]
- for (j in seq_along(pks)) {
- this.lower <- max(vlys[vlys < pks[j]])
- this.upper <- min(vlys[vlys > pks[j]])
- this.sel <- which(this[, 1] > this.lower & this[, 1] <= this.upper)
- that <- this[this.sel, ]
- if (!is.null(nrow(that))) {
- if (length(unique(that[, 6])) >= min_occurrence) {
- that.den <- density(that[, 2], bw = rt_tol_relative / 1.414)
- that.turns <- find.turn.point(that.den$y)
- that.pks <- that.den$x[that.turns$pks]
- that.vlys <- that.den$x[that.turns$vlys]
- for (k in seq_along(that.pks)) {
- that.lower <- max(that.vlys[that.vlys < that.pks[k]])
- that.upper <- min(that.vlys[that.vlys > that.pks[k]])
- thee <- that[that[, 2] > that.lower & that[, 2] <= that.upper, ]
- if (!is.null(nrow(thee))) {
- if (length(unique(thee[, 6])) >= min_occurrence) {
- this.return <-
- c(to_attach(thee, number_of_samples, use = "sum"),
- to_attach(thee[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
- sd(thee[, 1], na.rm = TRUE)
- )
- }
- }
- }
- }
- }
- }
+ gc() # call Garbage Collection for performance improvement?
+ # select a group
+ group_ids <- which(grps == sel.labels[i])
+ if (length(group_ids) > 1) {
+ # select data from the group
+ sample <- cbind(mz_values[group_ids], rt[group_ids], rt[group_ids], rt[group_ids], area[group_ids], sample_id[group_ids])
+ # continue if data is from at least 'min_occurrence' samples
+ if (length(unique(sample[, 6])) >= min_occurrence) {
+ return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples))
+ }
} else {
if (min_occurrence == 1) {
- thee <- c(mz_values[sel], chr[sel], chr[sel], chr[sel], area[sel], lab[sel])
- this.return <- c(to_attach(thee, number_of_samples, use = "sum"),
-                  to_attach(thee[c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
-                  NA
- )
+ sample_grouped <- c(mz_values[group_ids], rt[group_ids], rt[group_ids], rt[group_ids], area[group_ids], sample_id[group_ids])
+ return(c(to_attach(sample_grouped, number_of_samples, use = "sum"),
+          to_attach(sample_grouped[c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
+          NA
+     )
+ )
}
}
- this.return
+ return(NULL)
}

- pk.times <- aligned.ftrs[, (5 + number_of_samples):(2 * (4 + number_of_samples))]
- mz.sd.rec <- aligned.ftrs[, ncol(aligned.ftrs)]
- aligned.ftrs <- aligned.ftrs[, 1:(4 + number_of_samples)]
+ # select columns: average of m/z, average of rt, min of m/z, max of m/z, median of rt per sample (the second to_attach call)
+ pk.times <- aligned_features[, (5 + number_of_samples):(2 * (4 + number_of_samples))]
+ mz_sd <- aligned_features[, ncol(aligned_features)]
+ # select columns: average of m/z, average of rt, min of m/z, max of m/z, sum of areas per sample (the first to_attach call)
+ aligned_features <- aligned_features[, 1:(4 + number_of_samples)]
Comment on lines +238 to +241

Member: Computations for the numbers of columns could be extracted to variables to clearly indicate what is being used and why.

Author: This will be changed in #87 anyway.
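
For what it's worth, the extraction the reviewer asks for might look like the sketch below (the variable names are made up, and #87 supersedes this anyway):

n_cols <- 4 + number_of_samples            # mz, time, mz.min, mz.max + one column per sample
sum_cols <- 1:n_cols                       # first to_attach call (summed areas)
median_cols <- (n_cols + 1):(2 * n_cols)   # second to_attach call (median rt)
sd_col <- 2 * n_cols + 1                   # trailing m/z standard deviation

pk.times <- aligned_features[, median_cols]
mz_sd <- aligned_features[, sd_col]
aligned_features <- aligned_features[, sum_cols]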


- colnames(aligned.ftrs) <-
+ # rename columns on both tables; sample columns are "exp 1" ... "exp n"
+ colnames(aligned_features) <-
colnames(pk.times) <- c("mz", "time", "mz.min", "mz.max", paste("exp", 1:number_of_samples))

# return both tables and both computed tolerances
rec <- new("list")
- rec$aligned.ftrs <- aligned.ftrs
+ rec$aligned.ftrs <- aligned_features
rec$pk.times <- pk.times
rec$mz.tol <- mz_tol_relative
rec$chr.tol <- rt_tol_relative

if (do.plot) {
- hist(mz.sd.rec, xlab = "m/z SD", ylab = "Frequency",
+ hist(mz_sd, xlab = "m/z SD", ylab = "Frequency",
main = "m/z SD distribution")
hist(apply(pk.times[, -1:-4], 1, sd, na.rm = TRUE),
xlab = "Retention time SD", ylab = "Frequency",
Comment on lines 254 to 258

Member: This could be moved to the plotting.

Author: Same as in @zargham-ahmad's comment, do you have a suggestion? I think we would just create an artificial function which takes the same arguments as hist.

Comment on lines +255 to 258

Member: Can be moved to plot.R.

Author: Do you have any suggestions? I think we would just create an artificial function which takes the same arguments as hist.
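
The "artificial function" the author mentions could be as thin as this hypothetical helper for R/plot.R (a sketch only, not part of this PR):

# hypothetical wrapper so feature.align does not call hist() directly
plot_histogram <- function(values, xlab, main, ylab = "Frequency", ...) {
    hist(values, xlab = xlab, ylab = ylab, main = main, ...)
}

# feature.align would then call, e.g.:
# plot_histogram(mz_sd, xlab = "m/z SD", main = "m/z SD distribution")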

4 changes: 2 additions & 2 deletions R/find.tol.time.R
@@ -120,10 +120,10 @@ find.tol.time <- function(mz,

list(
mz = mz,
- chr = rt,
+ rt = rt,
lab = sample_id,
grps = groups,
- chr.tol = rt_tol_relative,
+ rt.tol = rt_tol_relative,
mz.tol = mz_tol_relative
)
}