Merge branch 'release/2.6.0'

psychelzh · Nov 28, 2023 · 761df88 · 761df88
2 parents ae6bf0c + bed6516
commit 761df88
Show file tree

Hide file tree

Showing 27 changed files with 748 additions and 258 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: preproc.iquizoo
 Title: Utility Functions for Data Processing of Iquizoo Games
-Version: 2.5.2
+Version: 2.6.0
 Authors@R:
     person("Liang", "Zhang", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0001-9041-1150"))
@@ -15,6 +15,7 @@ Depends:
     R (>= 4.1.0)
 Imports:
     dplyr,
+    jsonlite,
     pracma,
     purrr,
     rlang (>= 0.1.2),
@@ -29,8 +30,10 @@ Suggests:
     readr,
     roxygen2,
     testthat (>= 3.0.0),
+    tidytable,
     withr
 Config/testthat/edition: 3
+Config/testthat/parallel: true
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.2.3
diff --git a/NAMESPACE b/NAMESPACE
@@ -26,6 +26,7 @@ export(multisense)
 export(nback)
 export(nle)
 export(nsymncmp)
+export(preproc_data)
 export(racer)
 export(rapm)
 export(refframe)
@@ -39,6 +40,7 @@ export(sumweighted)
 export(switchcost)
 export(symncmp)
 export(synwin)
+export(wrangle_data)
 import(dplyr)
 import(rlang)
 import(tidyr)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,13 @@
+# preproc.iquizoo 2.6.0
+
+## Breaking Changes
+
+* Added `wrangle_data()` and `preproc_data()` functions, which were previously in the tarflow.iquizoo package.
+
+## Bug Fixes
+
+* Fixed an edge case where `fit_numerosity()` could get stuck in an infinite loop.
+
 # preproc.iquizoo 2.5.2
 
 * Enhance code quality of internal functions.

diff --git a/R/nsymncmp.R b/R/nsymncmp.R
@@ -53,7 +53,72 @@ calc_numerosity <- function(data, name_bigset, name_smallset, name_acc) {
       "Cannot find fit after the max number of fitting.",
       "fit_not_converge"
     )
-    pars <- replace(pars, TRUE, NA_real_)
   }
   tibble::as_tibble_row(pars)
 }
+
+#' Fit a Simple Numerosity Model
+#'
+#' This model assumes the distribution of mental representation for a given
+#' number/count k is N(k, (w * k) ^ 2).
+#'
+#' The optimization is restarted `n_fit` times, each time from a freshly drawn
+#' random initial value, and the fit with the smallest objective value is
+#' returned.
+#'
+#' @template common
+#' @param name_bigset,name_smallset Variable name in `data` indicates bigger and
+#'   smaller set.
+#' @param name_acc Variable name in `data` indicates user's response is correct
+#'   or not.
+#' @param n_fit Number of fits to try to find the best estimate.
+#' @param seed Random seed. Default is 1 so that results can be reproduced.
+#' @return A [list()] with structure the same as [optim()]. If no usable
+#'   initial value can be drawn within 10 tries, a warning of class
+#'   `"no_good_init"` is signaled and a list with `par = c(w = NA_real_)` and
+#'   `convergence = 1` is returned instead.
+#' @export
+fit_numerosity <- function(data, name_bigset, name_smallset, name_acc,
+                           n_fit = 5, seed = 1) {
+  set.seed(seed)
+  b <- data[[name_bigset]]
+  s <- data[[name_smallset]]
+  acc <- data[[name_acc]]
+
+  min_objective <- Inf
+  best_fit <- NULL
+  for (j in seq_len(n_fit)) {
+    # try 10 times to find a good initial value; `ll_numerosity()` returns 1e6
+    # as a penalty whenever the likelihood is not finite
+    for (i in seq_len(10)) {
+      init <- c(w = stats::runif(1))
+      init_objective <- ll_numerosity(init, b, s, acc)
+      if (init_objective < 1e6) {
+        break
+      }
+    }
+    if (init_objective >= 1e6) {
+      warn(
+        "Cannot find a good initial value after 10 tries.",
+        "no_good_init"
+      )
+      return(list(par = c(w = NA_real_), convergence = 1))
+    }
+    fit <- stats::optim(
+      init, ll_numerosity,
+      method = "L-BFGS-B",
+      b = b, s = s, acc = acc,
+      lower = 0
+    )
+    # track the running minimum so that a later, worse fit cannot replace a
+    # better one found earlier
+    if (fit[["value"]] < min_objective) {
+      min_objective <- fit[["value"]]
+      best_fit <- fit
+    }
+  }
+  best_fit
+}
+
+# Negative log-likelihood of the simple numerosity model.
+#
+# With each count k represented as N(k, (w * k)^2), the difference between the
+# bigger set `b` and the smaller set `s` follows N(b - s, w^2 * (b^2 + s^2)).
+# Returns the penalty value 1e6 when any log-probability is not finite, so
+# that the optimizer steers away from such parameters.
+ll_numerosity <- function(pars, b, s, acc) {
+  means <- b - s
+  # `pnorm()` expects a standard deviation, not a variance
+  sds <- pars[["w"]] * sqrt(b^2 + s^2)
+
+  # incorrect means the mental representation is less than 0, so lower tail
+  dens <- ifelse(
+    !acc,
+    stats::pnorm(0, means, sds, lower.tail = TRUE, log.p = TRUE),
+    stats::pnorm(0, means, sds, lower.tail = FALSE, log.p = TRUE)
+  )
+
+  if (any(!is.finite(dens))) {
+    return(1e6)
+  }
+  -sum(dens)
+}
diff --git a/R/preproc.R b/R/preproc.R
@@ -0,0 +1,122 @@
+#' Wrangle Raw Data
+#'
+#' Convert each raw json string into a [data.frame()], keep the results in a
+#' list column, and drop the original json column.
+#'
+#' @param data The raw data.
+#' @param name_raw_json Name of the column that holds user's raw data as json
+#'   strings.
+#' @param name_raw_parsed Name of the list column used to store parsed data.
+#' @return A [data.frame] contains the parsed data.
+#' @export
+wrangle_data <- function(data,
+                         name_raw_json = "game_data",
+                         name_raw_parsed = "raw_parsed") {
+  json_strings <- data[[name_raw_json]]
+  data[[name_raw_parsed]] <- purrr::map(json_strings, parse_raw_json)
+  data |>
+    select(!all_of(name_raw_json))
+}
+
+#' Feed Raw Data to Pre-processing
+#'
+#' Calculate indices from data typically returned by [wrangle_data()].
+#'
+#' @details
+#'
+#' Rows whose raw data is empty (an empty vector, e.g. `NULL`, in the
+#' `name_raw_parsed` column) are dropped before any index is calculated. When
+#' no rows remain after that, a warning is signaled and `NULL` is returned.
+#'
+#' @param data A [data.frame] contains raw data.
+#' @param fn This can be a function or formula. See [rlang::as_function()] for
+#'   more details.
+#' @param ... Additional arguments passed to `fn`.
+#' @param name_raw_parsed Name of the list column that holds user's raw data
+#'   as [data.frame]s.
+#' @param pivot_results Whether to pivot the calculated indices. If `TRUE`,
+#'   the indices are reshaped into long format: index names go to the column
+#'   `pivot_names_to` and index values go to the column `pivot_values_to`. If
+#'   `FALSE`, the indices keep the format returned by `fn`.
+#' @param pivot_names_to,pivot_values_to The column names used to store index
+#'   names and values if `pivot_results` is `TRUE`. See [tidyr::pivot_longer()]
+#'   for more details.
+#' @return A [data.frame] contains the calculated indices.
+#' @export
+preproc_data <- function(data, fn, ...,
+                         name_raw_parsed = "raw_parsed",
+                         pivot_results = TRUE,
+                         pivot_names_to = "index_name",
+                         pivot_values_to = "score") {
+  data <- data |>
+    filter(!purrr::map_lgl(.data[[name_raw_parsed]], is_empty))
+  if (nrow(data) == 0) {
+    warn("No non-empty data found.")
+    return(NULL)
+  }
+  fn <- as_function(fn)
+  results <- mutate(
+    data,
+    calc_indices(.data[[name_raw_parsed]], fn, ...),
+    .keep = "unused"
+  )
+  if (!pivot_results) {
+    return(results)
+  }
+  # every column that did not come from `data` is a calculated index
+  results |>
+    pivot_longer(
+      cols = !any_of(names(data)),
+      names_to = pivot_names_to,
+      values_to = pivot_values_to
+    ) |>
+    vctrs::vec_restore(data)
+}
+
+# helper functions
+parse_raw_json <- function(jstr) {
+  # a failed parse degrades to `NULL` with a warning instead of an error
+  on_parse_error <- function(cnd) {
+    warn(
+      c(
+        "Failed to parse json string with the following error:",
+        conditionMessage(cnd),
+        i = "Will parse it as `NULL` instead."
+      )
+    )
+    NULL
+  }
+  parsed <- tryCatch(jsonlite::fromJSON(jstr), error = on_parse_error)
+  if (is_empty(parsed)) {
+    return(NULL)
+  }
+  # lower-case both the column names and every character column
+  lowered <- rename_with(parsed, tolower)
+  mutate(lowered, across(where(is.character), tolower))
+}
+
+# Bind a list of raw data frames into one, call `fn` on it grouped by list
+# element (through its `.by` argument), and drop the temporary id column from
+# the result. If `bind_rows()` from dplyr fails, a warning is signaled and
+# `tidytable::bind_rows()` followed by `utils::type.convert()` is used instead.
+calc_indices <- function(l, fn, ...) {
+  # used as a temporary id for each element
+  name_id <- ".id"
+  tryCatch(
+    bind_rows(l, .id = name_id),
+    error = function(cnd) {
+      warn(
+        c(
+          "Failed to bind raw data with the following error:",
+          conditionMessage(cnd),
+          i = "Will try using tidytable package."
+        )
+      )
+      # `bind_rows()` comes from dplyr, so name the right package here
+      check_installed(
+        "tidytable",
+        "because dplyr package fails to bind raw data."
+      )
+      tidytable::bind_rows(l, .id = name_id) |>
+        utils::type.convert(as.is = TRUE)
+    }
+  ) |>
+    fn(.by = name_id, ...) |>
+    select(!all_of(name_id))
+}
diff --git a/R/switch-congruence.R b/R/switch-congruence.R
@@ -186,3 +186,105 @@ switchcost <- function(data, .by = NULL, .input = NULL, .extra = NULL) {
   merge(spd_acc, switch_cost, by = .by) |>
     vctrs::vec_restore(data)
 }
+
+#' Switch cost
+#'
+#' Internal helper that computes switch cost by contrasting the `"switch"`
+#' condition against the `"repeat"` condition.
+#'
+#' @template common
+#' @param by The column name(s) in `data` used to be grouped by. If set to
+#'   `NULL`, all data will be treated as from one subject.
+#' @templateVar name_acc TRUE
+#' @templateVar name_rt TRUE
+#' @template names
+#' @param name_switch The column name of the `data` input holding the switch
+#'   type. It is coerced to a `factor` with the levels `"switch"` and
+#'   `"repeat"`, in this order.
+#' @keywords internal
+calc_switch_cost <- function(data, by, name_switch, name_rt, name_acc) {
+  switch_levels <- c("switch", "repeat")
+  data[[name_switch]] <- factor(data[[name_switch]], levels = switch_levels)
+  calc_cond_diff(
+    data,
+    by,
+    name_acc = name_acc,
+    name_rt = name_rt,
+    name_cond = name_switch,
+    name_diff_prefix = "switch_cost_"
+  )
+}
+
+#' Congruence effect
+#'
+#' Internal helper that computes congruence effect sizes by contrasting the
+#' incongruent condition against the congruent condition.
+#'
+#' @template common
+#' @param by The column name(s) in `data` used to be grouped by. If set to
+#'   `NULL`, all data will be treated as from one subject.
+#' @templateVar name_acc TRUE
+#' @templateVar name_rt TRUE
+#' @template names
+#' @param name_cong The column name of the `data` input holding the congruence
+#'   information, labelled `"inc"` for the incongruent condition and `"con"`
+#'   for the congruent condition. It is coerced to a `factor` with these two
+#'   levels, in this order.
+#' @return A [tibble][tibble::tibble-package] contains congruence effect results
+#'   on accuracy and response time.
+#' @keywords internal
+calc_cong_eff <- function(data, by, name_cong, name_acc, name_rt) {
+  cong_levels <- c("inc", "con")
+  data[[name_cong]] <- factor(data[[name_cong]], levels = cong_levels)
+  calc_cond_diff(
+    data,
+    by,
+    name_acc = name_acc,
+    name_rt = name_rt,
+    name_cond = name_cong,
+    name_diff_prefix = "cong_eff_"
+  )
+}
+
+# Compute the indices "pc", "mrt", "ies", "rcs" and "lisas" for each level of
+# `name_cond`, plus the difference between the first and the second level of
+# that factor for every index. Difference columns are named with
+# `name_diff_prefix`; the result also carries the per-condition indices.
+calc_cond_diff <- function(data, by, name_acc, name_rt,
+                           name_cond, name_diff_prefix) {
+  conds <- levels(data[[name_cond]])
+  index_each_cond <- calc_spd_acc(
+    data,
+    by = c(by, name_cond),
+    name_acc = name_acc,
+    name_rt = name_rt
+  ) |>
+    complete(.data[[name_cond]]) |>
+    select(all_of(c(by, name_cond, "pc", "mrt", "ies", "rcs", "lisas")))
+
+  # one row per index, one column per condition
+  index_long <- pivot_longer(
+    index_each_cond,
+    cols = -any_of(c(by, name_cond)),
+    names_to = "index_name",
+    values_to = "score"
+  )
+  index_by_cond <- pivot_wider(
+    index_long,
+    names_from = all_of(name_cond),
+    values_from = "score"
+  )
+
+  cond_diff <- index_by_cond |>
+    mutate(
+      diff = .data[[conds[[1]]]] - .data[[conds[[2]]]],
+      .keep = "unused"
+    ) |>
+    # flip the sign for "pc" and "rcs" so that larger values always
+    # correspond to a larger cost/effect
+    mutate(
+      diff = if_else(
+        .data$index_name %in% c("pc", "rcs"),
+        -diff, diff
+      )
+    )
+  diff_wide <- pivot_wider(
+    cond_diff,
+    names_from = "index_name",
+    values_from = "diff",
+    names_prefix = name_diff_prefix
+  )
+
+  each_cond_wide <- pivot_wider(
+    index_each_cond,
+    names_from = all_of(name_cond),
+    values_from = -any_of(c(by, name_cond))
+  )
+  merge(diff_wide, each_cond_wide, by = by)
+}