Merge pull request #3 from pmcharrison/generate

Implement `generate` option in `model_seq`
pmcharrison · May 31, 2021 · d58e759 · d58e759
2 parents a166226 + 8bd8411
commit d58e759
Show file tree

Hide file tree

Showing 16 changed files with 305 additions and 119 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: ppm
 Type: Package
 Title: Prediction by Partial Matching
-Version: 0.2.0
+Version: 0.3.0
 Date: 2019-03-08
 Author: Peter M. C. Harrison
 Maintainer: Peter M. C. Harrison <[email protected]>
@@ -22,7 +22,7 @@ LinkingTo:
     Rcpp,
     BH,
     testthat
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
 Encoding: UTF-8
 SystemRequirements: C++11
 Suggests: 

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# ppm 0.3.0
+
+- Implemented `generate` option in `model_seq`.
+
 # ppm 0.2.0
 
 - Added NEWS.md.

diff --git a/R/model-seq.R b/R/model-seq.R
@@ -1,19 +1,21 @@
 #' Model sequence
 #' 
-#' Analyses a sequence using a PPM model.
+#' Analyses or generates a sequence using a PPM model.
 #' 
 #' @param model
 #' A PPM model object as produced by (for example)
 #' \code{\link{new_ppm_simple}} or \code{\link{new_ppm_decay}}.
 #' 
 #' @param seq
-#' An integer vector defining the input sequence
+#' When \code{generate = FALSE}, an integer vector defining the input sequence
 #' (equivalently a numeric vector containing solely integers,
 #' or a factor vector, both of which will be coerced to integer vectors).
+#' When \code{generate = TRUE}, an integer scalar giving the desired length 
+#' of the generated sequence.
 #' 
 #' @param time
 #' (NULL or a numeric vector)
-#' Timepoints corresponding to each element of the argument \code{seq}.
+#' Timepoints corresponding to each element of the sequence.
 #' Only used by certain model types (e.g. decay-based models).
 #' 
 #' @param zero_indexed
@@ -43,6 +45,13 @@
 #' Whether or not to return the entropy of each event prediction
 #' (ignored if \code{predict = FALSE}).
 #' 
+#' @param generate
+#' (Logical scalar)
+#' If \code{TRUE}, the output will correspond to a newly generated sequence
+#' with length as specified by the \code{seq} argument, 
+#' produced by sampling from the model's predictive distribution.
+#' The default is \code{FALSE}.
+#' 
 #' @return 
 #' A \code{\link[tibble]{tibble}} which will be empty if \code{predict = FALSE}
 #' and otherwise will contain one row for each element in the sequence,
@@ -68,7 +77,15 @@ model_seq <- function(model,
                       train = TRUE,
                       predict = TRUE,
                       return_distribution = TRUE,
-                      return_entropy = TRUE) {
+                      return_entropy = TRUE,
+                      generate = FALSE) {
+  if (is.character(seq)) {
+    stop("'seq' cannot be a character vector; please provide a factor representation instead.")
+  }
+  if (generate && length(seq) == 1 && is.numeric(seq)) {
+    seq <- integer(seq)
+  }
+
   seq <- as.integer(seq)
 
   stopifnot(is_ppm(model))
@@ -78,20 +95,27 @@ model_seq <- function(model,
   checkmate::qassert(predict, "B1")
   checkmate::qassert(return_distribution, "B1")
   checkmate::qassert(return_entropy, "B1")
+  checkmate::qassert(generate, "B1")
   stopifnot(is.null(time) || is.numeric(time))
 
-  if (zero_indexed) {
-    if (any(seq < 0L))
-      stop("all elements of 'seq' must be greater than or equal to 0")
-    if (any(seq >= model$alphabet_size))
-      stop("all elements of 'seq' must be less than the model's alphabet size", 
-           " (", model$alphabet_size, ")")
-  } else {
-    if (any(seq < 1L))
-      stop("all elements of 'seq' must be greater than or equal to 1")
-    if (any(seq > model$alphabet_size))
-      stop("all elements of 'seq' must be less than or equal to the model's alphabet size",
-           " (", model$alphabet_size, ")")
+  if (!generate) {
+    if (zero_indexed) {
+      if (any(seq < 0L))
+        stop("all elements of 'seq' must be greater than or equal to 0")
+      if (any(seq >= model$alphabet_size))
+        stop("all elements of 'seq' must be less than the model's alphabet size", 
+             " (", model$alphabet_size, ")")
+    } else {
+      if (any(seq < 1L))
+        stop("all elements of 'seq' must be greater than or equal to 1")
+      if (any(seq > model$alphabet_size))
+        stop("all elements of 'seq' must be less than or equal to the model's alphabet size",
+             " (", model$alphabet_size, ")")
+    }
+  }
+
+  if (is.factor(seq) && length(model$alphabet_levels > 0) && !identical(levels(seq), model$alphabet_levels)) {
+    warning("sequence's factor levels seemed inconsistent with model$alphabet_levels")
   }
 
   if (any(diff(time) < 0)) stop("decreasing values of time are not permitted")
@@ -109,11 +133,16 @@ model_seq <- function(model,
     train = train,
     predict = predict,
     return_distribution = return_distribution,
-    return_entropy = return_entropy
+    return_entropy = return_entropy,
+    generate = generate
   )
   df <- res$as_tibble()
 
   if (!zero_indexed) df$symbol <- df$symbol + 1L
 
+  if (length(model$alphabet_levels) > 0) {
+    df$symbol <- factor(model$alphabet_levels[df$symbol], levels = model$alphabet_levels)
+  }
+
   df
 }
diff --git a/R/new-model.R b/R/new-model.R
@@ -6,6 +6,7 @@
 #' @param alphabet_size
 #' (Integerish scalar)
 #' The size of the alphabet upon which the model will be trained and tested.
+#' If not provided, this will be taken as \code{length(alphabet_levels)}.
 #' 
 #' @param order_bound
 #' (Integerish scalar)
@@ -53,6 +54,11 @@
 #' Whether to print (currently rather messy and ad hoc) debug output
 #' for smoothing.
 #' 
+#' @param alphabet_levels
+#' (Character vector)
+#' Optional vector of levels for the alphabet. If provided,
+#' these will be used to define factor levels for the output.
+#' 
 #' @note
 #' The implementation does not scale well to very large order bounds (> 50).
 #' 
@@ -75,31 +81,44 @@ new_ppm_simple <- function(
   exclusion = TRUE,
   update_exclusion = TRUE,
   escape = "c",
-  debug_smooth = FALSE
+  debug_smooth = FALSE,
+  alphabet_levels = character()
 ) {
+  if (missing(alphabet_size) && length(alphabet_levels) > 0) {
+    alphabet_size <- length(alphabet_levels)
+  }
+  if (length(alphabet_levels) > 0 && length(alphabet_levels) != alphabet_size) {
+    stop("length(alphabet_levels) must equal alphabet_size.")
+  }
+
   checkmate::qassert(alphabet_size, "X1")
   checkmate::qassert(order_bound, "X[0,)")
   checkmate::qassert(shortest_deterministic, "B1")
   checkmate::qassert(exclusion, "B1")
   checkmate::qassert(update_exclusion, "B1")
   checkmate::qassert(escape, "S1")
   checkmate::qassert(debug_smooth, "B1")
+  checkmate::qassert(alphabet_levels, "S")
 
   valid_escape_methods <- c("a", "b", "c", "d", "ax")
   if (!escape %in% valid_escape_methods)
     stop("escape parameter must be one of: ",
          paste(valid_escape_methods, collapse = ", "))
 
-  new(
+  mod <- new(
     ppm_simple, 
     alphabet_size = as.integer(alphabet_size),
     order_bound = as.integer(order_bound),
     shortest_deterministic = shortest_deterministic,
     exclusion = exclusion,
     update_exclusion = update_exclusion,
     escape = escape,
-    debug_smooth = debug_smooth
+    alphabet_levels
   )
+
+  mod$debug_smooth <- debug_smooth
+
+  mod
 }
 
 #' Create decay-based PPM model
@@ -177,10 +196,6 @@ new_ppm_simple <- function(
 #' and explicitly disables exclusion and update exclusion.
 #' See \insertCite{Harrison2020;textual}{ppm} for details. 
 #' 
-#' @param alphabet_size
-#' (Integerish scalar)
-#' The size of the alphabet from which sequences are drawn.
-#' 
 #' @param order_bound
 #' (Integerish scalar)
 #' The model's Markov order bound.
@@ -259,6 +274,8 @@ new_ppm_simple <- function(
 #' \code{\link{new_ppm_simple}},
 #' \code{\link{model_seq}}.
 #' 
+#' @inheritParams new_ppm_simple
+#' 
 #' @md
 #' 
 #' @references
@@ -281,8 +298,16 @@ new_ppm_decay <- function(
   only_predict_from_buffer = FALSE,
   seed = sample.int(.Machine$integer.max, 1),
   debug_smooth = FALSE,
-  debug_decay = FALSE
+  debug_decay = FALSE,
+  alphabet_levels = character()
 ) {
+  if (missing(alphabet_size) && length(alphabet_levels) > 0) {
+    alphabet_size <- length(alphabet_levels)
+  }
+  if (length(alphabet_levels) > 0 && length(alphabet_levels) != alphabet_size) {
+    stop("length(alphabet_levels) must equal alphabet_size.")
+  }
+
   checkmate::qassert(alphabet_size, "X1")
   checkmate::qassert(order_bound, "X[0,)")
   checkmate::qassert(ltm_weight, "N1[0,)")
@@ -298,6 +323,7 @@ new_ppm_decay <- function(
   checkmate::qassert(only_predict_from_buffer, "B1")
   checkmate::qassert(debug_smooth, "B1")
   checkmate::qassert(debug_decay, "B1")
+  checkmate::qassert(alphabet_levels, "S")
 
   decay_par = list(
     ltm_weight = as.numeric(ltm_weight),
@@ -313,15 +339,19 @@ new_ppm_decay <- function(
     only_predict_from_buffer = as.logical(only_predict_from_buffer)
   )
 
-  new(
+  mod <- new(
     ppm_decay, 
     alphabet_size = as.integer(alphabet_size),
     order_bound = as.integer(order_bound),
     decay_par = decay_par,
     seed = as.integer(seed),
-    debug_smooth = debug_smooth,
-    debug_decay = debug_decay
+    alphabet_levels = alphabet_levels
   )
+
+  mod$debug_decay <- debug_decay
+  mod$debug_smooth <- debug_smooth
+
+  mod
 }
 
 #' Is 'x' a 'ppm' object?

diff --git a/README.Rmd b/README.Rmd
@@ -118,6 +118,13 @@ print(as.integer(seq_2))
 ```
 
 The *ppm* package treats factor objects like their underlying integer representations.
+When modelling factor objects, it's recommended to pass the factor's levels
+to the PPM model upon initialisation via the `alphabet_levels` argument. In this case,
+the `alphabet_size` parameter need not be provided.
+
+```{r}
+mod <- new_ppm_simple(alphabet_levels = c("a", "b", "c", "d", "r"))
+```
 
 You feed sequences to the PPM model using the `model_seq` function.
 By default, the model processes these sequences incrementally,
@@ -177,6 +184,16 @@ points(res_2$information_content,
        type = "l", col = "red")
 ```
 
+By setting `generate = TRUE`, we can also instruct the model to generate new samples
+based on the statistics that it has learned so far. In this case the `seq` argument to
+`model_seq` should be an integer corresponding to the desired length of the generated sequence.
+
+```{r}
+res_3 <- model_seq(mod, seq = 20, generate = TRUE)
+res_3$symbol
+```
+
+
 ## PPM-Decay
 
 The original PPM algorithm has a perfect memory,
@@ -229,16 +246,16 @@ symbol observations.
 seq_2_time <- seq_along(seq_2)
 print(seq_2_time)
 
-res_3 <- model_seq(mod_decay, 
+res_4 <- model_seq(mod_decay, 
                    seq_2,
                    time = seq_2_time)
 
-plot(res$information_content,
+plot(res_4$information_content,
      xlab = "Position",
      ylab = "Information content (bits)",
      type = "l", 
      ylim = c(0, 5))
-points(res_3$information_content,
+points(res_4$information_content,
        type = "l", col = "blue")
 ```