Skip to content

Commit

Permalink
Merge pull request #3 from pmcharrison/generate
Browse files Browse the repository at this point in the history
Implemented `generate` option in `model_seq`
  • Loading branch information
pmcharrison authored May 31, 2021
2 parents a166226 + 8bd8411 commit d58e759
Show file tree
Hide file tree
Showing 16 changed files with 305 additions and 119 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: ppm
Type: Package
Title: Prediction by Partial Matching
Version: 0.2.0
Version: 0.3.0
Date: 2019-03-08
Author: Peter M. C. Harrison
Maintainer: Peter M. C. Harrison <[email protected]>
Expand All @@ -22,7 +22,7 @@ LinkingTo:
Rcpp,
BH,
testthat
RoxygenNote: 7.1.0
RoxygenNote: 7.1.1
Encoding: UTF-8
SystemRequirements: C++11
Suggests:
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# ppm 0.3.0

- Implemented `generate` option in `model_seq`.

# ppm 0.2.0

- Added NEWS.md.
Expand Down
63 changes: 46 additions & 17 deletions R/model-seq.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
#' Model sequence
#'
#' Analyses a sequence using a PPM model.
#' Analyses or generates a sequence using a PPM model.
#'
#' @param model
#' A PPM model object as produced by (for example)
#' \code{\link{new_ppm_simple}} or \code{\link{new_ppm_decay}}.
#'
#' @param seq
#' An integer vector defining the input sequence
#' When \code{generate = FALSE}, an integer vector defining the input sequence
#' (equivalently a numeric vector containing solely integers,
#' or a factor vector, both of which will be coerced to integer vectors).
#' When \code{generate = TRUE}, an integer scalar giving the desired length
#' of the generated sequence.
#'
#' @param time
#' (NULL or a numeric vector)
#' Timepoints corresponding to each element of the argument \code{seq}.
#' Timepoints corresponding to each element of the sequence.
#' Only used by certain model types (e.g. decay-based models).
#'
#' @param zero_indexed
Expand Down Expand Up @@ -43,6 +45,13 @@
#' Whether or not to return the entropy of each event prediction
#' (ignored if \code{predict = FALSE}).
#'
#' @param generate
#' (Logical scalar)
#' If \code{TRUE}, the output will correspond to a newly generated sequence
#' with length as specified by the \code{seq} argument,
#' produced by sampling from the model's predictive distribution.
#' The default is \code{FALSE}.
#'
#' @return
#' A \code{\link[tibble]{tibble}} which will be empty if \code{predict = FALSE}
#' and otherwise will contain one row for each element in the sequence,
Expand All @@ -68,7 +77,15 @@ model_seq <- function(model,
train = TRUE,
predict = TRUE,
return_distribution = TRUE,
return_entropy = TRUE) {
return_entropy = TRUE,
generate = FALSE) {
if (is.character(seq)) {
stop("'seq' cannot be a character vector; please provide a factor representation instead.")
}
if (generate && length(seq) == 1 && is.numeric(seq)) {
seq <- integer(seq)
}

seq <- as.integer(seq)

stopifnot(is_ppm(model))
Expand All @@ -78,20 +95,27 @@ model_seq <- function(model,
checkmate::qassert(predict, "B1")
checkmate::qassert(return_distribution, "B1")
checkmate::qassert(return_entropy, "B1")
checkmate::qassert(generate, "B1")
stopifnot(is.null(time) || is.numeric(time))

if (zero_indexed) {
if (any(seq < 0L))
stop("all elements of 'seq' must be greater than or equal to 0")
if (any(seq >= model$alphabet_size))
stop("all elements of 'seq' must be less than the model's alphabet size",
" (", model$alphabet_size, ")")
} else {
if (any(seq < 1L))
stop("all elements of 'seq' must be greater than or equal to 1")
if (any(seq > model$alphabet_size))
stop("all elements of 'seq' must be less than or equal to the model's alphabet size",
" (", model$alphabet_size, ")")
if (!generate) {
if (zero_indexed) {
if (any(seq < 0L))
stop("all elements of 'seq' must be greater than or equal to 0")
if (any(seq >= model$alphabet_size))
stop("all elements of 'seq' must be less than the model's alphabet size",
" (", model$alphabet_size, ")")
} else {
if (any(seq < 1L))
stop("all elements of 'seq' must be greater than or equal to 1")
if (any(seq > model$alphabet_size))
stop("all elements of 'seq' must be less than or equal to the model's alphabet size",
" (", model$alphabet_size, ")")
}
}

if (is.factor(seq) && length(model$alphabet_levels > 0) && !identical(levels(seq), model$alphabet_levels)) {
warning("sequence's factor levels seemed inconsistent with model$alphabet_levels")
}

if (any(diff(time) < 0)) stop("decreasing values of time are not permitted")
Expand All @@ -109,11 +133,16 @@ model_seq <- function(model,
train = train,
predict = predict,
return_distribution = return_distribution,
return_entropy = return_entropy
return_entropy = return_entropy,
generate = generate
)
df <- res$as_tibble()

if (!zero_indexed) df$symbol <- df$symbol + 1L

if (length(model$alphabet_levels) > 0) {
df$symbol <- factor(model$alphabet_levels[df$symbol], levels = model$alphabet_levels)
}

df
}
52 changes: 41 additions & 11 deletions R/new-model.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#' @param alphabet_size
#' (Integerish scalar)
#' The size of the alphabet upon which the model will be trained and tested.
#' If not provided, this will be taken as \code{length(alphabet_levels)}.
#'
#' @param order_bound
#' (Integerish scalar)
Expand Down Expand Up @@ -53,6 +54,11 @@
#' Whether to print (currently rather messy and ad hoc) debug output
#' for smoothing.
#'
#' @param alphabet_levels
#' (Character vector)
#' Optional vector of levels for the alphabet. If provided,
#' these will be used to define factor levels for the output.
#'
#' @note
#' The implementation does not scale well to very large order bounds (> 50).
#'
Expand All @@ -75,31 +81,44 @@ new_ppm_simple <- function(
exclusion = TRUE,
update_exclusion = TRUE,
escape = "c",
debug_smooth = FALSE
debug_smooth = FALSE,
alphabet_levels = character()
) {
if (missing(alphabet_size) && length(alphabet_levels) > 0) {
alphabet_size <- length(alphabet_levels)
}
if (length(alphabet_levels) > 0 && length(alphabet_levels) != alphabet_size) {
stop("length(alphabet_levels) must equal alphabet_size.")
}

checkmate::qassert(alphabet_size, "X1")
checkmate::qassert(order_bound, "X[0,)")
checkmate::qassert(shortest_deterministic, "B1")
checkmate::qassert(exclusion, "B1")
checkmate::qassert(update_exclusion, "B1")
checkmate::qassert(escape, "S1")
checkmate::qassert(debug_smooth, "B1")
checkmate::qassert(alphabet_levels, "S")

valid_escape_methods <- c("a", "b", "c", "d", "ax")
if (!escape %in% valid_escape_methods)
stop("escape parameter must be one of: ",
paste(valid_escape_methods, collapse = ", "))

new(
mod <- new(
ppm_simple,
alphabet_size = as.integer(alphabet_size),
order_bound = as.integer(order_bound),
shortest_deterministic = shortest_deterministic,
exclusion = exclusion,
update_exclusion = update_exclusion,
escape = escape,
debug_smooth = debug_smooth
alphabet_levels
)

mod$debug_smooth <- debug_smooth

mod
}

#' Create decay-based PPM model
Expand Down Expand Up @@ -177,10 +196,6 @@ new_ppm_simple <- function(
#' and explicitly disables exclusion and update exclusion.
#' See \insertCite{Harrison2020;textual}{ppm} for details.
#'
#' @param alphabet_size
#' (Integerish scalar)
#' The size of the alphabet from which sequences are drawn.
#'
#' @param order_bound
#' (Integerish scalar)
#' The model's Markov order bound.
Expand Down Expand Up @@ -259,6 +274,8 @@ new_ppm_simple <- function(
#' \code{\link{new_ppm_simple}},
#' \code{\link{model_seq}}.
#'
#' @inheritParams new_ppm_simple
#'
#' @md
#'
#' @references
Expand All @@ -281,8 +298,16 @@ new_ppm_decay <- function(
only_predict_from_buffer = FALSE,
seed = sample.int(.Machine$integer.max, 1),
debug_smooth = FALSE,
debug_decay = FALSE
debug_decay = FALSE,
alphabet_levels = character()
) {
if (missing(alphabet_size) && length(alphabet_levels) > 0) {
alphabet_size <- length(alphabet_levels)
}
if (length(alphabet_levels) > 0 && length(alphabet_levels) != alphabet_size) {
stop("length(alphabet_levels) must equal alphabet_size.")
}

checkmate::qassert(alphabet_size, "X1")
checkmate::qassert(order_bound, "X[0,)")
checkmate::qassert(ltm_weight, "N1[0,)")
Expand All @@ -298,6 +323,7 @@ new_ppm_decay <- function(
checkmate::qassert(only_predict_from_buffer, "B1")
checkmate::qassert(debug_smooth, "B1")
checkmate::qassert(debug_decay, "B1")
checkmate::qassert(alphabet_levels, "S")

decay_par = list(
ltm_weight = as.numeric(ltm_weight),
Expand All @@ -313,15 +339,19 @@ new_ppm_decay <- function(
only_predict_from_buffer = as.logical(only_predict_from_buffer)
)

new(
mod <- new(
ppm_decay,
alphabet_size = as.integer(alphabet_size),
order_bound = as.integer(order_bound),
decay_par = decay_par,
seed = as.integer(seed),
debug_smooth = debug_smooth,
debug_decay = debug_decay
alphabet_levels = alphabet_levels
)

mod$debug_decay <- debug_decay
mod$debug_smooth <- debug_smooth

mod
}

#' Is 'x' a 'ppm' object?
Expand Down
23 changes: 20 additions & 3 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,13 @@ print(as.integer(seq_2))
```

The *ppm* package treats factor objects like their underlying integer representations.
When modelling factor objects, it's recommended to pass the factor's levels
to the PPM model upon initialisation via the `alphabet_levels` argument. In this case, the
`alphabet_size` parameter need not be provided, as it is taken from the number of levels.

```{r}
mod <- new_ppm_simple(alphabet_levels = c("a", "b", "c", "d", "r"))
```

You feed sequences to the PPM model using the `model_seq` function.
By default, the model processes these sequences incrementally,
Expand Down Expand Up @@ -177,6 +184,16 @@ points(res_2$information_content,
type = "l", col = "red")
```

By setting `generate = TRUE`, we can also instruct the model to generate new samples
based on the statistics that it has learned so far. In this case the `seq` argument to
`model_seq` should be a single integer giving the desired length of the generated sequence.

```{r}
res_3 <- model_seq(mod, seq = 20, generate = TRUE)
res_3$symbol
```


## PPM-Decay

The original PPM algorithm has a perfect memory,
Expand Down Expand Up @@ -229,16 +246,16 @@ symbol observations.
seq_2_time <- seq_along(seq_2)
print(seq_2_time)
res_3 <- model_seq(mod_decay,
res_4 <- model_seq(mod_decay,
seq_2,
time = seq_2_time)
plot(res$information_content,
plot(res_4$information_content,
xlab = "Position",
ylab = "Information content (bits)",
type = "l",
ylim = c(0, 5))
points(res_3$information_content,
points(res_4$information_content,
type = "l", col = "blue")
```

Expand Down
Loading

0 comments on commit d58e759

Please sign in to comment.