diff --git a/NAMESPACE b/NAMESPACE index d8db1ea..4e394fc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(add_motifs) +export(contains_motif) export(count_seq_kmers) export(filter_chisq) export(filter_fcbf) @@ -11,7 +12,7 @@ export(filter_quipt) export(generate_kmer_data) export(generate_motif) export(generate_motifs) -export(generate_sequence) +export(generate_negative_sequence) export(generate_sequence_data) export(get_target_additive) export(get_target_interactions) diff --git a/R/motifs.R b/R/motifs.R index 703b020..dd4fed0 100644 --- a/R/motifs.R +++ b/R/motifs.R @@ -95,7 +95,7 @@ add_motifs <- function(motifs, sequence) { for(iter in na.omit(iterations)) { - if(length(iterations) == 1) + if(length(motifs) == 1) motifs_grid <- data.frame(unlist(maximum_motifs_positions)) else motifs_grid <- t(sapply(1:iter, function(i){ diff --git a/R/sequences.R b/R/sequences.R index 3fe3ace..be3df3e 100644 --- a/R/sequences.R +++ b/R/sequences.R @@ -1,3 +1,40 @@ + +#' Simple motifs check +#' +#' This function check if at least one of provided motifs is contained in the +#' provided sequence. +#' +#' @inheritParams generate_kmer_data +#' @param sequence a vector of characters +#' +#' @return randomly generated sequences +#' +#' @examples +#' alph <- 1:4 +#' motifs <- generate_motifs(alph, 3, 3, 3, 2) +#' sequence <- sample(alph, 100, replace = TRUE) +#' contains_motif(sequence, motifs) +#' +#' @export + +contains_motif <- function(sequence, motifs) { + + sequence <- paste0(sequence, collapse = "") + + for(i in 1:length(motifs)) { + ith_motif <- motifs[[i]] + ith_motif[ith_motif == "_"] <- "." + ith_motif <- paste0(ith_motif, collapse = "") + + find_motifs <- regexec(ith_motif, sequence)[[1]] + + if(find_motifs != -1) + return(TRUE) + } + FALSE +} + + #' Sampling from alphabet #' #' This function generates sequence of elements from alphabet with replacement @@ -7,14 +44,58 @@ #' @return randomly generated sequences #' #' @examples -#' generate_sequence(5, 1L:4) -#' generate_sequence(10, c("a", "b", "c")) -#' generate_sequence(10, c("a", "b", "c"), c(0.6, 0.2, 0.2)) +#' set.seed(2) +#' alph <- 1:4 +#' motifs <- generate_motifs(alph, 3, 3, 3, 2) +#' generate_negative_sequence(5, alph, motifs) +#' generate_negative_sequence(10, c("a", "b", "c"), motifs) +#' generate_negative_sequence(10, c("a", "b", "c"), c(0.6, 0.2, 0.2), motifs) #' #' @export -generate_sequence <- function(sequence_length, alphabet, seqProbs = NULL){ - sample(alphabet, size = sequence_length, replace = TRUE, prob = seqProbs) +generate_negative_sequence <- function(sequence_length, + alphabet, + motifs, + seqProbs = NULL, + attempts = 100){ + + candidate <- sample(alphabet, size = sequence_length, + replace = TRUE, prob = seqProbs) + + if(contains_motif(candidate, motifs)) { + + mask <- rep(FALSE, sequence_length) + + for(i in 1:length(motifs)) { + ith_motif <- motifs[[i]] + ith_motif[ith_motif == "_"] <- "." + ith_motif <- paste0(ith_motif, collapse = "") + + found_motifs <- gregexpr(ith_motif, paste0(candidate, collapse = ""))[[1]] + + if(length(found_motifs) == 1 && found_motifs == -1) { + next + } else { + motifs_ids <- as.vector( + sapply(found_motifs, function(x) x:(x + nchar(ith_motif) - 1)) + ) + mask[motifs_ids] <- TRUE + } + } + + attempt <- 1 + + while (contains_motif(candidate, motifs) & attempt <= attempts) { + attempt <- attempt + 1 + candidate[mask] <- sample(candidate[mask]) + } + + if(attempt == attempts) + stop(paste0("It was impossible to generate negative sequence under + provided assumptions within ", attempts, " attempts.")) + } + + candidate } #' Sequences generation @@ -60,17 +141,18 @@ generate_sequence_data <- function(n_seq, motifs_ids <- sample(1:length(motifs), n_injections[i]) motifs_map[i, motifs_ids] <- 1 selected_motifs <- motifs[motifs_ids] + new_seq <- add_motifs(selected_motifs, - generate_sequence(sequence_length, - alphabet, seqProbs)) + generate_negative_sequence(sequence_length, + alphabet, seqProbs)) list_of_masks[[i]] <- attr(new_seq, "masks") sequences[i, ] <- new_seq } for (i in 1:(n_seq - n_pos)) { - sequences[n_pos + i, ] <- generate_sequence(sequence_length, - alphabet, - seqProbs) + sequences[n_pos + i, ] <- generate_negative_sequence(sequence_length, + alphabet, + seqProbs) } attr(sequences, "max_injection") <- max_injection attr(sequences, "motifs_map") <- motifs_map diff --git a/man/contains_motif.Rd b/man/contains_motif.Rd new file mode 100644 index 0000000..bf964bd --- /dev/null +++ b/man/contains_motif.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sequences.R +\name{contains_motif} +\alias{contains_motif} +\title{Simple motifs check} +\usage{ +contains_motif(sequence, motifs) +} +\arguments{ +\item{sequence}{a vector of characters} + +\item{motifs}{list of motifs} +} +\value{ +randomly generated sequences +} +\description{ +This function check if at least one of provided motifs is contained in the +provided sequence. +} +\examples{ +alph <- 1:4 +motifs <- generate_motifs(alph, 3, 3, 3, 2) +sequence <- sample(alph, 100, replace = TRUE) +contains_motif(sequence, motifs) + +} diff --git a/man/generate_negative_sequence.Rd b/man/generate_negative_sequence.Rd new file mode 100644 index 0000000..7e7054c --- /dev/null +++ b/man/generate_negative_sequence.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sequences.R +\name{generate_negative_sequence} +\alias{generate_negative_sequence} +\title{Sampling from alphabet} +\usage{ +generate_negative_sequence( + sequence_length, + alphabet, + motifs, + seqProbs = NULL, + attempts = 100 +) +} +\arguments{ +\item{sequence_length}{sequence length} + +\item{alphabet}{elements used to build sequence} + +\item{motifs}{list of motifs} + +\item{seqProbs}{alphabet probabilities for sequences} +} +\value{ +randomly generated sequences +} +\description{ +This function generates sequence of elements from alphabet with replacement +} +\examples{ +set.seed(2) +alph <- 1:4 +motifs <- generate_motifs(alph, 3, 3, 3, 2) +generate_negative_sequence(5, alph, motifs) +generate_negative_sequence(10, c("a", "b", "c"), motifs) +generate_negative_sequence(10, c("a", "b", "c"), c(0.6, 0.2, 0.2), motifs) + +} diff --git a/man/generate_sequence.Rd b/man/generate_sequence.Rd deleted file mode 100644 index 39f0989..0000000 --- a/man/generate_sequence.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sequences.R -\name{generate_sequence} -\alias{generate_sequence} -\title{Sampling from alphabet} -\usage{ -generate_sequence(sequence_length, alphabet, seqProbs = NULL) -} -\arguments{ -\item{sequence_length}{sequence length} - -\item{alphabet}{elements used to build sequence} - -\item{seqProbs}{alphabet probabilities for sequences} -} -\value{ -randomly generated sequences -} -\description{ -This function generates sequence of elements from alphabet with replacement -} -\examples{ -generate_sequence(5, 1L:4) -generate_sequence(10, c("a", "b", "c")) -generate_sequence(10, c("a", "b", "c"), c(0.6, 0.2, 0.2)) - -} diff --git a/tests/testthat/test-motifs.R b/tests/testthat/test-motifs.R index 0c939d4..d94e414 100644 --- a/tests/testthat/test-motifs.R +++ b/tests/testthat/test-motifs.R @@ -70,7 +70,7 @@ test_that("generate list of motifs", { motifs <- generate_motifs(alphabet, 1, 1, n, d, weights, validate = FALSE) - expect_equal(motifs, list(c("c", "_", "_", "a", "a", "a"))) + expect_equal(motifs, list(c("c", "_", "a", "_", "a"))) }) test_that("motif injection", { @@ -81,13 +81,13 @@ test_that("motif injection", { motifs <- generate_motifs(letters[1:4], 2, 2, 4, 2) injected_sequence <- add_motifs(motifs, sequence) expect_equal(injected_sequence, - structure(c("a", "c", "b", "d", "c", "c", "d", "d"), + structure(c("d", "c", "a", "c", "c", "c", "c", "d"), motifs = list(c("d", "c"), c("a", "c", "_", "_", "c")), - masks = list(c(FALSE, FALSE, FALSE, TRUE, - TRUE, FALSE, FALSE, FALSE), - c(TRUE, TRUE, FALSE, FALSE, - TRUE, FALSE, FALSE, FALSE)))) + masks = list(c(TRUE, TRUE, FALSE, FALSE, FALSE, + FALSE, FALSE, FALSE), + c(FALSE, FALSE, TRUE, TRUE, FALSE, + FALSE, TRUE, FALSE)))) }) test_that("set of motifs cannot be injected", { diff --git a/tests/testthat/test-sequences.R b/tests/testthat/test-sequences.R index 0e6ed71..7f15adb 100644 --- a/tests/testthat/test-sequences.R +++ b/tests/testthat/test-sequences.R @@ -3,18 +3,23 @@ library(testthat) context("Sequence generation") -test_that("generate single sequence", { +test_that("generate single sequence", { set.seed(42) - seq <- generate_sequence(5, 1L:4) - expect_equal(seq, c(1L, 1L, 1L, 1L, 2L)) + + motifs <- generate_motifs(1:4, 3, 1, n = 4, d = 6) + sequence <- generate_negative_sequence(5, 1L:4, motifs) + expect_equal(sequence, c(3L, 2L, 1L, 2L, 4L)) }) test_that("sequence probabilites", { + set.seed(42) seqProbs <- runif(6) seqProbs <- seqProbs / sum(seqProbs) - long_sequence <- generate_sequence(10e6, 1:6, seqProbs) + motifs <- generate_motifs(1:4, 3, 1, n = 4, d = 6) + + long_sequence <- generate_negative_sequence(10e5, 1:6, motifs, seqProbs) seqFreqs <- c(table(long_sequence) / length(long_sequence)) names(seqFreqs) <- NULL expect_equal(seqProbs, c(seqFreqs), tolerance = 10e-4) @@ -64,7 +69,7 @@ test_that("k-mer counts", { kmers <- count_seq_kmers(seq_data, alph) - expect_equal(dim(kmers), c(20L, 6189L)) + expect_equal(dim(kmers), c(20L, 6201L)) kmers <- count_seq_kmers(seq_data, alph, n = 1, d = 0) expect_equal(ncol(kmers), length(alph)) @@ -87,8 +92,7 @@ test_that("k-mer data", { kmer_data <- generate_kmer_data(n_seq, sequence_length, alph, motifs, n_injections, fraction) - - expect_equal(dim(kmer_data), c(20L, 5718L)) + expect_equal(dim(kmer_data), c(20L, 5022L)) expect_equal(n_seq, nrow(kmer_data)) # correct fraction of positive sequences @@ -113,18 +117,17 @@ test_that("Interaction model works", { results <- generate_kmer_data(n_seq, sequence_length, alph, motifs, n_injections = 3) expect_identical( - c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, - 1L, 1L, 0L, 0L, 0L), get_target_interactions(results) + c(1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, + 0L, 0L, 1L, 0L, 0L), get_target_interactions(results) ) expect_equal( - c(3.2669325505849, 3.2669325505849, 0.824915252858773, 3.2669325505849, - 0.824915252858773, 1.01497719204053, 0.745780731318519, - 2.05459328205325, 1.16902287956327, 1.77926696231589, - -1.69982326356694, -1.69982326356694, -1.69982326356694, - -1.69982326356694, -1.69982326356694, -1.69982326356694, - -1.69982326356694, -1.69982326356694, -1.69982326356694, - -1.69982326356694), get_target_interactions(results, binary = FALSE) + c(2.3877928876318, 4.19229120924138, 2.12166230170988, 0.988599377218634, + 4.47515172231942, 1.58585822791792, 1.17262414400466, 0.266789596760646, + 4.73356275283732, 2.3877928876318, -1.03832712979056, -1.03832712979056, + -1.03832712979056, -1.03832712979056, -1.03832712979056, -1.03832712979056, + -1.03832712979056, -1.03832712979056, -1.03832712979056, -1.03832712979056 + ), get_target_interactions(results, binary = FALSE) ) }) @@ -141,12 +144,12 @@ test_that("Additive model works", { results <- generate_kmer_data(n_seq, sequence_length, alph, motifs, n_injections = 3) expect_identical( - c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, - 0L, 0L, 0L, 0L, 1L), get_target_additive(results) + c(0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, + 0L, 1L, 0L, 0L, 0L), get_target_additive(results) ) expect_equal( - c(2.05890567833558, 2.05890567833558, 0.39565183990635), + c(0.675145752727985, 1.19026312511414, 1.00235199206509), get_target_additive(results, binary = FALSE)[1:3] ) @@ -168,13 +171,13 @@ test_that("Logic model works", { results <- generate_kmer_data(n_seq, sequence_length, alph, motifs, n_injections = 3) expect_identical( - c(0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, - 1L, 0L, 0L, 0L, 1L), get_target_logic(results) + c(1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, + 0L, 0L, 0L, 0L, 0L), get_target_logic(results) ) expect_equal( - c(0.854367734864354, 0.854367734864354, 0.854367734864354, - 0.854367734864354), get_target_logic(results, binary = FALSE)[1:4] + c(0.583639514865354, 1.4445967883803, 1.4445967883803, 0.583639514865354 + ), get_target_logic(results, binary = FALSE)[1:4] ) expressions = matrix(rbinom(n_seq*2, 1, .5), nrow = n_seq) @@ -185,7 +188,7 @@ test_that("Logic model works", { ) expect_identical( - c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, + c(0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), get_target_logic(results, expressions = expressions, weights = c(1, 2)) )