Align features refactoring (vol. 2) #88

Merged: 9 commits, Aug 10, 2022
Changes from 1 commit
4 changes: 2 additions & 2 deletions R/adjust.time.R
@@ -108,7 +108,7 @@ adjust.time <- function(features,
rt_tol_relative = rt_tol_relative,
mz_tol_absolute = mz_tol_absolute,
do.plot = do.plot)
- rt_tol_relative <- all.ft$chr.tol
+ rt_tol_relative <- all.ft$rt.tol

message("**** performing time correction ****")
message(paste("m/z tolerance level: ", mz_tol_relative))
@@ -117,7 +117,7 @@ adjust.time <- function(features,
for(i in 1:number_of_samples) {
this <- features[[i]]
sel <- which(all.ft$lab == i)
- that <- cbind(all.ft$mz[sel], all.ft$chr[sel], all.ft$grps[sel])
+ that <- cbind(all.ft$mz[sel], all.ft$rt[sel], all.ft$grps[sel])
this <- this[order(this[, 1], this[, 2]), ]
that <- that[order(that[, 1], that[, 2]), ]
this <- cbind(this, V6=rep(i, nrow(this)), V7=that[, 3])
224 changes: 135 additions & 89 deletions R/feature.align.R
@@ -1,20 +1,78 @@
to_attach <- function(pick, number_of_samples, use = "sum") {
strengths <- rep(0, number_of_samples)
if (is.null(nrow(pick))) {
# this is very strange if it can ever happen
# maybe commas are missing? we want the same as below
# but also why if there are no rows...
strengths[pick[6]] <- pick[5]
return(c(pick[1], pick[2], pick[1], pick[1], strengths))
Member: I guess the problem lies there that nrow(pick) might be NULL if pick has the wrong type - meaning it is not a table but a list or so, and then it actually behaves differently.

Member: Exactly. That also happens with the nrow check in prof.to.features.

Author: Okay, that makes sense, it was just suspicious to me.

} else {
for (i in seq_along(strengths)) {
# select all areas from the same sample
areas <- pick[pick[, 6] == i, 5]
if (use == "sum")
- strengths[i] <- sum(pick[pick[, 6] == i, 5])
+ strengths[i] <- sum(areas)
if (use == "median")
- strengths[i] <- median(pick[pick[, 6] == i, 5])
+ strengths[i] <- median(areas)
# can be NA if pick does not contain any data from a sample
}
# average of m/z, average of rt, min of m/z, max of m/z, sum/median of areas
return(c(mean(pick[, 1]), mean(pick[, 2]), min(pick[, 1]),
max(pick[, 1]), strengths))
}
}
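
For orientation, here is a minimal sketch of what to_attach computes on a grouped peak table; the column layout (m/z, rt, rt, rt, area, sample id) matches the callers below, but the toy values are made up for illustration:

# hypothetical group of 3 peaks coming from 2 samples
pick <- rbind(
    c(100.001, 12.3, 12.3, 12.3, 500, 1),  # m/z, rt, rt, rt, area, sample id
    c(100.002, 12.4, 12.4, 12.4, 300, 1),
    c(100.003, 12.5, 12.5, 12.5, 400, 2)
)
to_attach(pick, number_of_samples = 2, use = "sum")
# c(100.002, 12.4, 100.001, 100.003, 800, 400)
# i.e. mean m/z, mean rt, min m/z, max m/z, then the summed area per sample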


select_rt <- function(sample, rt_tol_relative, min_occurrence, number_of_samples) {
# Kernel Density Estimation, this time for retention time
den <- density(sample[, 2], bw = rt_tol_relative / 1.414)
# again select statistically significant points
turns <- find.turn.point(den$y)
peaks <- den$x[turns$pks]
valleys <- den$x[turns$vlys]
for (i in seq_along(peaks)) {
lower_bound <- max(valleys[valleys < peaks[i]])
upper_bound <- min(valleys[valleys > peaks[i]])
# select data with rt within lower and upper bound from density estimation
selected <- which(sample[, 2] > lower_bound & sample[, 2] <= upper_bound)
sample_grouped <- sample[selected, ]
if (!is.null(nrow(sample_grouped))) {
if (length(unique(sample_grouped[, 6])) >= min_occurrence) {
# continue if data is still from at least 'min_occurrence' samples
return(c(to_attach(sample_grouped, number_of_samples, use = "sum"),
to_attach(sample_grouped[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
sd(sample_grouped[, 1], na.rm = TRUE)
)
)
}
}
}
}


select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples) {
# Kernel Density Estimation with target standard deviation 'mz_tol_relative' multiplied by median of m/z
des <- density(sample[, 1], bw = mz_tol_relative * median(sample[, 1]))
# find the peaks and valleys of a density function
turns <- find.turn.point(des$y)
peaks <- des$x[turns$pks]
valleys <- des$x[turns$vlys]
for (i in seq_along(peaks)) {
lower_bound <- max(valleys[valleys < peaks[i]])
upper_bound <- min(valleys[valleys > peaks[i]])
# select data with m/z within lower and upper bound from density estimation
selected <- which(sample[, 1] > lower_bound & sample[, 1] <= upper_bound)
sample_grouped <- sample[selected, ]
if (!is.null(nrow(sample_grouped))) {
# continue if data is still from at least 'min_occurrence' samples
if (length(unique(sample_grouped[, 6])) >= min_occurrence) {
return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, number_of_samples))
}
}
}
}
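
select_mz and select_rt share the same density-based windowing pattern. A self-contained sketch of that pattern, with find.turn.point approximated by a naive sign-change scan (an illustrative stand-in, not the package implementation):

# naive stand-in for find.turn.point: local maxima (pks) and minima (vlys) of a curve
naive_turn_points <- function(y) {
    d <- diff(sign(diff(y)))
    list(pks = which(d < 0) + 1, vlys = c(1, which(d > 0) + 1, length(y)))
}

set.seed(1)
mz <- c(rnorm(50, 100.00, 0.001), rnorm(50, 100.05, 0.001))  # two m/z clusters
den <- density(mz, bw = 1e-5 * median(mz))  # bw = mz_tol_relative * median(m/z)
turns <- naive_turn_points(den$y)
peaks <- den$x[turns$pks]
valleys <- den$x[turns$vlys]
# window around the first peak, exactly as in select_mz
lower_bound <- max(valleys[valleys < peaks[1]])
upper_bound <- min(valleys[valleys > peaks[1]])
selected <- which(mz > lower_bound & mz <= upper_bound)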


# returns a list of aligned features and original peak times
feature.align <- function(features,
min_occurrence = 2,
@@ -34,22 +92,22 @@ feature.align <- function(features,
if (number_of_samples > 1) {
values <- get_feature_values(features, rt_colname)
mz_values <- values$mz
- chr <- values$rt
- lab <- values$sample_id
+ rt <- values$rt
+ sample_id <- values$sample_id

- o <- order(mz_values, chr)
- mz_values <- mz_values[o]
- chr <- chr[o]
- lab <- lab[o]
+ # sort all values by m/z, breaking ties by rt
+ ordering <- order(mz_values, rt)
+ mz_values <- mz_values[ordering]
+ rt <- rt[ordering]
+ sample_id <- sample_id[ordering]
Comment on lines +135 to +139

Member: Data could be arranged in a tibble or data.frame.


# find relative m/z tolerance level
if (is.na(mz_tol_relative)) {
mz_tol_relative <- find.tol(mz_values, mz_max_diff = mz_max_diff, do.plot = do.plot)
if (length(mz_tol_relative) == 0) {
mz_tol_relative <- 1e-5
- warning(
-     "Automatic tolerance finding failed, 10 ppm was assigned. May need to manually assign alignment mz tolerance level."
- )
+ warning("Automatic tolerance finding failed, 10 ppm was assigned.
+     May need to manually assign alignment mz tolerance level.")
Comment on lines +146 to +147

Member (@maximskorik, Aug 9, 2022): This is something that we should maybe address: I doubt anyone will see this warning, given that most of our test cases produce dozens of warnings.

Author: Do you think the function should fail with an error? Or we could parametrise the whole function to either use the default value or fail with an error.

Member: No, I don't think an error would be an appropriate solution. I just wanted to point out that apLCMS may produce a meaningful warning, so we should make sure it doesn't get lost in a bunch of deprecation warnings, etc. I think we just have to reduce the number of other warnings (deprecation, imports, and so on). Should be done with #93.

}
} else if (do.plot) {
draw_plot(main = "alignment m/z tolerance level given",
@@ -62,119 +120,107 @@ feature.align <- function(features,
}

# find relative retention time tolerance level
- all.ft <- find.tol.time(mz_values,
-                         chr,
-                         lab,
-                         number_of_samples = number_of_samples,
-                         mz_tol_relative = mz_tol_relative,
-                         rt_tol_relative = rt_tol_relative,
-                         mz_tol_absolute = mz_tol_absolute,
-                         do.plot = do.plot)
- rt_tol_relative <- all.ft$chr.tol
+ # also does some preprocessing grouping steps
+ all_features <- find.tol.time(mz_values,
+                               rt,
+                               sample_id,
+                               number_of_samples = number_of_samples,
+                               mz_tol_relative = mz_tol_relative,
+                               rt_tol_relative = rt_tol_relative,
+                               mz_tol_absolute = mz_tol_absolute,
+                               do.plot = do.plot)
+ rt_tol_relative <- all_features$rt.tol

message("**** performing feature alignment ****")
message(paste("m/z tolerance level: ", mz_tol_relative))
message(paste("time tolerance level:", rt_tol_relative))

- aligned.ftrs <- pk.times <- rep(0, 4 + number_of_samples)
- mz.sd.rec <- 0
-
- labels <- unique(all.ft$grps)
+ # create zero vectors of length number_of_samples + 4?
+ aligned_features <- pk.times <- rep(0, 4 + number_of_samples)
+ mz_sd <- 0
+
+ labels <- unique(all_features$grps)
area <- grps <- mz_values

# grouping the features based on their m/z values (assuming the tolerance level)
sizes <- c(0, cumsum(sapply(features, nrow)))
for (i in 1:number_of_samples) {
- this <- features[[i]]
- sel <- which(all.ft$lab == i)
- that <- cbind(all.ft$mz[sel], all.ft$chr[sel], all.ft$grps[sel])
- this <- this[order(this[, 1], this[, 2]),]
- that <- that[order(that[, 1], that[, 2]),]
+ sample <- features[[i]]
+ # order by m/z, then by rt
+ sample <- sample[order(sample[, 1], sample[, 2]),]
+
+ # select preprocessed features belonging to the current sample
+ group_ids <- which(all_features$lab == i)
+ # select m/z, rt and their group ID
+ sample_grouped <- cbind(all_features$mz[group_ids], all_features$rt[group_ids], all_features$grps[group_ids])
+ # order them again? should be ordered already...
+ sample_grouped <- sample_grouped[order(sample_grouped[, 1], sample_grouped[, 2]),]
Member: Did you try to run tests without this line?

Author (@xtrojak, Aug 10, 2022): Good point; without it the test case fails, so I guess they weren't really ordered...


- mz_values[(sizes[i] + 1):sizes[i + 1]] <- this[, 1]
- chr[(sizes[i] + 1):sizes[i + 1]] <- this[, 2]
- area[(sizes[i] + 1):sizes[i + 1]] <- this[, 5]
- grps[(sizes[i] + 1):sizes[i + 1]] <- that[, 3]
- lab[(sizes[i] + 1):sizes[i + 1]] <- i
+ # update m/z, rt, area values with the ordered ones
+ mz_values[(sizes[i] + 1):sizes[i + 1]] <- sample[, 1]
+ rt[(sizes[i] + 1):sizes[i + 1]] <- sample[, 2]
+ area[(sizes[i] + 1):sizes[i + 1]] <- sample[, 5]
+ # assign group identifier
+ grps[(sizes[i] + 1):sizes[i + 1]] <- sample_grouped[, 3]
+ # assign sample identifier
+ sample_id[(sizes[i] + 1):sizes[i + 1]] <- i
}
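
The sizes vector above is plain cumulative-offset bookkeeping; a toy illustration with hypothetical row counts:

rows_per_sample <- c(3, 2, 4)            # hypothetical nrow() of each sample
sizes <- c(0, cumsum(rows_per_sample))   # 0 3 5 9
i <- 2
(sizes[i] + 1):sizes[i + 1]              # 4 5: the slice owned by sample 2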

- ttt <- table(all.ft$grps)
- curr.row <- sum(ttt >= min_occurrence) * 3
- mz.sd.rec <- rep(0, curr.row)
+ # table with the number of values in each group
+ groups_cardinality <- table(all_features$grps)
+ # count groups with at least the minimal occurrence
+ # (why times 3? shouldn't it be the number of samples?) !!!
+ curr.row <- sum(groups_cardinality >= min_occurrence) * 3
+ mz_sd <- rep(0, curr.row)

- sel.labels <- as.numeric(names(ttt)[ttt >= min_occurrence])
+ sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence])
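To make the min_occurrence filter above concrete, a toy run with assumed values:

grps_toy <- c(7, 7, 8, 9, 9, 9)                   # group label per feature
cardinality <- table(grps_toy)                    # 7 -> 2, 8 -> 1, 9 -> 3
as.numeric(names(cardinality)[cardinality >= 2])  # keeps groups 7 and 9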

# retention time alignment
- aligned.ftrs <-
+ aligned_features <-
foreach(i = seq_along(sel.labels), .combine = rbind) %do% {
- if (i %% 100 == 0)
-     gc()
- this.return <- NULL
- sel <- which(grps == sel.labels[i])
- if (length(sel) > 1) {
- this <- cbind(mz_values[sel], chr[sel], chr[sel], chr[sel], area[sel], lab[sel])
- if (length(unique(this[, 6])) >= min_occurrence) {
- this.den <- density(this[, 1], bw = mz_tol_relative * median(this[, 1]))
- turns <- find.turn.point(this.den$y)
- pks <- this.den$x[turns$pks]
- vlys <- this.den$x[turns$vlys]
- for (j in seq_along(pks)) {
- this.lower <- max(vlys[vlys < pks[j]])
- this.upper <- min(vlys[vlys > pks[j]])
- this.sel <- which(this[, 1] > this.lower & this[, 1] <= this.upper)
- that <- this[this.sel, ]
- if (!is.null(nrow(that))) {
- if (length(unique(that[, 6])) >= min_occurrence) {
- that.den <- density(that[, 2], bw = rt_tol_relative / 1.414)
- that.turns <- find.turn.point(that.den$y)
- that.pks <- that.den$x[that.turns$pks]
- that.vlys <- that.den$x[that.turns$vlys]
- for (k in seq_along(that.pks)) {
- that.lower <- max(that.vlys[that.vlys < that.pks[k]])
- that.upper <- min(that.vlys[that.vlys > that.pks[k]])
- thee <- that[that[, 2] > that.lower & that[, 2] <= that.upper, ]
- if (!is.null(nrow(thee))) {
- if (length(unique(thee[, 6])) >= min_occurrence) {
- this.return <-
- c(to_attach(thee, number_of_samples, use = "sum"),
- to_attach(thee[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
- sd(thee[, 1], na.rm = TRUE)
- )
- }
- }
- }
- }
- }
- }
+ gc() # call Garbage Collection for performance improvement?
+ # select a group
+ group_ids <- which(grps == sel.labels[i])
+ if (length(group_ids) > 1) {
+ # select data from the group
+ sample <- cbind(mz_values[group_ids], rt[group_ids], rt[group_ids], rt[group_ids], area[group_ids], sample_id[group_ids])
+ # continue if data is from at least 'min_occurrence' samples
+ if (length(unique(sample[, 6])) >= min_occurrence) {
+ return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples))
+ }
} else {
if (min_occurrence == 1) {
- thee <- c(mz_values[sel], chr[sel], chr[sel], chr[sel], area[sel], lab[sel])
- this.return <- c(to_attach(thee, number_of_samples, use = "sum"),
-                  to_attach(thee[c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
-                  NA
- )
+ sample_grouped <- c(mz_values[group_ids], rt[group_ids], rt[group_ids], rt[group_ids], area[group_ids], sample_id[group_ids])
+ return(c(to_attach(sample_grouped, number_of_samples, use = "sum"),
+          to_attach(sample_grouped[c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
+          NA
+     )
+ )
}
}
- this.return
+ return(NULL)
}

- pk.times <- aligned.ftrs[, (5 + number_of_samples):(2 * (4 + number_of_samples))]
- mz.sd.rec <- aligned.ftrs[, ncol(aligned.ftrs)]
- aligned.ftrs <- aligned.ftrs[, 1:(4 + number_of_samples)]
+ # select columns: average of m/z, average of rt, min of m/z, max of m/z, median of rt per sample (the second to_attach call)
+ pk.times <- aligned_features[, (5 + number_of_samples):(2 * (4 + number_of_samples))]
+ mz_sd <- aligned_features[, ncol(aligned_features)]
+ # select columns: average of m/z, average of rt, min of m/z, max of m/z, sum of areas per sample (the first to_attach call)
+ aligned_features <- aligned_features[, 1:(4 + number_of_samples)]
Comment on lines +238 to +241

Member: Computations for the numbers of columns could be extracted to variables to clearly indicate what is being used and why.

Author: This will be changed in #87 anyway.
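
For what it's worth, the extraction the reviewer asks for might look like the sketch below (the variable names are made up, and #87 supersedes this anyway):

n_cols <- 4 + number_of_samples            # mz, time, mz.min, mz.max + one column per sample
sum_cols <- 1:n_cols                       # first to_attach call (summed areas)
median_cols <- (n_cols + 1):(2 * n_cols)   # second to_attach call (median rt)
sd_col <- 2 * n_cols + 1                   # trailing m/z standard deviation

pk.times <- aligned_features[, median_cols]
mz_sd <- aligned_features[, sd_col]
aligned_features <- aligned_features[, sum_cols]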


- colnames(aligned.ftrs) <-
+ # rename columns on both tables; sample columns are "exp 1" ... "exp n"
+ colnames(aligned_features) <-
colnames(pk.times) <- c("mz", "time", "mz.min", "mz.max", paste("exp", 1:number_of_samples))

# return both tables and both computed tolerances
rec <- new("list")
- rec$aligned.ftrs <- aligned.ftrs
+ rec$aligned.ftrs <- aligned_features
rec$pk.times <- pk.times
rec$mz.tol <- mz_tol_relative
rec$chr.tol <- rt_tol_relative

if (do.plot) {
- hist(mz.sd.rec, xlab = "m/z SD", ylab = "Frequency",
+ hist(mz_sd, xlab = "m/z SD", ylab = "Frequency",
main = "m/z SD distribution")
hist(apply(pk.times[, -1:-4], 1, sd, na.rm = TRUE),
xlab = "Retention time SD", ylab = "Frequency",
Comment on lines 254 to 258

Member: This could be moved to the plotting.

Author: Same as in @zargham-ahmad's comment, do you have a suggestion? I think we would just create an artificial function which takes the same arguments as hist.

Comment on lines +255 to 258

Member: Can be moved to plot.R.

Author: Do you have any suggestions? I think we would just create an artificial function which takes the same arguments as hist.
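
The "artificial function" the author mentions could be as thin as this hypothetical helper for R/plot.R (a sketch only, not part of this PR):

# hypothetical wrapper so feature.align does not call hist() directly
plot_histogram <- function(values, xlab, main, ylab = "Frequency", ...) {
    hist(values, xlab = xlab, ylab = ylab, main = main, ...)
}

# feature.align would then call, e.g.:
# plot_histogram(mz_sd, xlab = "m/z SD", main = "m/z SD distribution")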

4 changes: 2 additions & 2 deletions R/find.tol.time.R
@@ -120,10 +120,10 @@ find.tol.time <- function(mz,

list(
mz = mz,
- chr = rt,
+ rt = rt,
lab = sample_id,
grps = groups,
- chr.tol = rt_tol_relative,
+ rt.tol = rt_tol_relative,
mz.tol = mz_tol_relative
)
}