From 6264c6b7c215438a700ab564afbf5d3916d9d923 Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Thu, 5 Oct 2023 10:25:50 +0800 Subject: [PATCH 01/12] Increment version number to 2.5.2.9000 --- DESCRIPTION | 2 +- NEWS.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index e1c9629..204f7f9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: preproc.iquizoo Title: Utility Functions for Data Processing of Iquizoo Games -Version: 2.5.2 +Version: 2.5.2.9000 Authors@R: person("Liang", "Zhang", , "psychelzh@outlook.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9041-1150")) diff --git a/NEWS.md b/NEWS.md index 6c56703..584b7da 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +# preproc.iquizoo (development version) + # preproc.iquizoo 2.5.2 * Enhance code quality of internal functions. From d7a67d0c5585c8df90cb295ae3d5438dc0e29905 Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Thu, 5 Oct 2023 20:45:52 +0800 Subject: [PATCH 02/12] Enhance code quality Signed-off-by: Liang Zhang --- R/utils-common.R | 28 +++++++++---------- .../testthat/test-calc_staircase_wetherill.R | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/R/utils-common.R b/R/utils-common.R index 4ab0ad5..4bc567a 100644 --- a/R/utils-common.R +++ b/R/utils-common.R @@ -62,26 +62,26 @@ check_outliers_rt <- function(x, threshold = 2.5) { #' @return The mean threshold. #' @keywords internal calc_staircase_wetherill <- function(x) { - find_reversals <- function(x, type = c("peaks", "valleys")) { - type <- match.arg(type) - if (type == "valleys") x <- -x - mat <- pracma::findpeaks(x) - if (is.null(mat)) { - warn(paste("No", type, "found from input"), "input_not_suitable") - return() - } - if (type == "valleys") { - -mat[, 1] - } else { + find_reversals <- function(x) { + find_peaks_val <- function(x) { + mat <- pracma::findpeaks(x) + if (is.null(mat)) { + warn("Reversals not found from input", "no_reversals_found") + return(NA_real_) + } mat[, 1] } + list( + peaks = find_peaks_val(x), + valleys = -find_peaks_val(-x) + ) } - # use run length encoding to remove repetitions in transformed method + # remove repetitions in transformed method x <- rle(x)$values - reversals <- c("peaks", "valleys") |> - purrr::map(\(type) find_reversals(x, type)) + reversals <- find_reversals(x) reversals |> purrr::map( + # keep equal number of peaks and valleys \(x) utils::tail(x, min(lengths(reversals))) ) |> purrr::list_c() |> diff --git a/tests/testthat/test-calc_staircase_wetherill.R b/tests/testthat/test-calc_staircase_wetherill.R index 71fd870..c8f1c38 100644 --- a/tests/testthat/test-calc_staircase_wetherill.R +++ b/tests/testthat/test-calc_staircase_wetherill.R @@ -8,5 +8,5 @@ test_that("Correctly find all peaks and valleys", { test_that("Return NA if no peaks or valleys", { is.na(calc_staircase_wetherill(c(2:5))) |> expect_true() |> - expect_warning(class = "input_not_suitable") + expect_warning(class = "no_reversals_found") }) From 9c92a6e564bfe5c85a6927732b8e5e48b4bc495e Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Thu, 5 Oct 2023 20:52:41 +0800 Subject: [PATCH 03/12] Update utils structures Signed-off-by: Liang Zhang --- R/nsymncmp.R | 57 ++++++++++++++ R/switch-congruence.R | 102 ++++++++++++++++++++++++++ R/utils-common.R | 89 ---------------------- R/utils-switch-congruence.R | 101 ------------------------- R/{utils-speed-accuracy.R => utils.R} | 90 +++++++++++++++++++++++ R/utils_numerosity.R | 56 -------------- man/calc_cong_eff.Rd | 2 +- man/calc_sdt.Rd | 2 +- man/calc_spd_acc.Rd | 2 +- man/calc_staircase_wetherill.Rd | 2 +- man/calc_switch_cost.Rd | 2 +- man/check_outliers_rt.Rd | 2 +- man/fit_numerosity.Rd | 2 +- man/parse_char_resp.Rd | 2 +- man/update_settings.Rd | 2 +- 15 files changed, 258 insertions(+), 255 deletions(-) delete mode 100644 R/utils-common.R delete mode 100644 R/utils-switch-congruence.R rename R/{utils-speed-accuracy.R => utils.R} (63%) delete mode 100644 R/utils_numerosity.R diff --git a/R/nsymncmp.R b/R/nsymncmp.R index af49fb2..59878cf 100644 --- a/R/nsymncmp.R +++ b/R/nsymncmp.R @@ -57,3 +57,60 @@ calc_numerosity <- function(data, name_bigset, name_smallset, name_acc) { } tibble::as_tibble_row(pars) } + +#' Fit a Simple Numerosity Model +#' +#' This model assumes the distribution of mental representation for a given +#' number/count k is N(k, (w * k) ^ 2). +#' +#' @template common +#' @param name_bigset,name_smallset Variable name in `data` indicates bigger and +#' smaller set. +#' @param name_acc Variable name in `data` indicates user's response is correct +#' or not. +#' @param n_fit Number of fits to try to find the best estimate. +#' @param seed Random seed. Default is 1 so that results can be reproduced. +#' @return A [list()] with structure the same as [optim()]. +#' @export +fit_numerosity <- function(data, name_bigset, name_smallset, name_acc, + n_fit = 5, seed = 1) { + set.seed(seed) + b <- data[[name_bigset]] + s <- data[[name_smallset]] + acc <- data[[name_acc]] + + min_objective <- Inf + best_fit <- NULL + for (j in seq_len(n_fit)) { + repeat { + init <- c(w = stats::runif(1)) + if (ll_numerosity(init, b, s, acc) < 1e6) { + break + } + } + fit <- stats::optim( + init, ll_numerosity, + method = "L-BFGS-B", + b = b, s = s, acc = acc, + lower = 0 + ) + if (fit[["value"]] < min_objective) { + best_fit <- fit + } + } + best_fit +} + +ll_numerosity <- function(pars, b, s, acc) { + means <- b - s + sds <- pars["w"]^2 * (b^2 + s^2) + + # incorrect means the mental representation is less than 0, so lower tail + dens <- ifelse( + !acc, + stats::pnorm(0, means, sds, lower.tail = TRUE, log.p = TRUE), + stats::pnorm(0, means, sds, lower.tail = FALSE, log.p = TRUE) + ) + + return(ifelse(any(!is.finite(dens)), 1e6, -sum(dens))) +} diff --git a/R/switch-congruence.R b/R/switch-congruence.R index cbd4418..35fc13a 100644 --- a/R/switch-congruence.R +++ b/R/switch-congruence.R @@ -186,3 +186,105 @@ switchcost <- function(data, .by = NULL, .input = NULL, .extra = NULL) { merge(spd_acc, switch_cost, by = .by) |> vctrs::vec_restore(data) } + +#' Switch cost +#' +#' Utility function to calculate general and specific switch cost. +#' +#' @template common +#' @param by The column name(s) in `data` used to be grouped by. If set to +#' `NULL`, all data will be treated as from one subject. +#' @templateVar name_acc TRUE +#' @templateVar name_rt TRUE +#' @template names +#' @param name_switch The column name of the `data` input whose values are +#' the switch type, in which is a `character` vector with at least `"switch"` +#' and `"repeat"` values. +#' @keywords internal +calc_switch_cost <- function(data, by, name_switch, name_rt, name_acc) { + data[[name_switch]] <- factor(data[[name_switch]], c("switch", "repeat")) + calc_cond_diff( + data, + by, + name_cond = name_switch, + name_diff_prefix = "switch_cost_", + name_acc = name_acc, + name_rt = name_rt + ) +} + +#' Congruence effect +#' +#' Utility function to calculate congruence effect sizes. +#' +#' @template common +#' @param by The column name(s) in `data` used to be grouped by. If set to +#' `NULL`, all data will be treated as from one subject. +#' @templateVar name_acc TRUE +#' @templateVar name_rt TRUE +#' @template names +#' @param name_cong The column name of the `data` input whose values are the +#' congruence information, in which is a `character` vector with "incongruent +#' condition" (label: `"inc"`) and "congruent condition" (label: `"con"`). It +#' will be coerced as a `factor` vector with these two levels. +#' @return A [tibble][tibble::tibble-package] contains congruence effect results +#' on accuracy and response time. +#' @keywords internal +calc_cong_eff <- function(data, by, name_cong, name_acc, name_rt) { + data[[name_cong]] <- factor(data[[name_cong]], c("inc", "con")) + calc_cond_diff( + data, + by, + name_cond = name_cong, + name_diff_prefix = "cong_eff_", + name_acc = name_acc, + name_rt = name_rt + ) +} + +calc_cond_diff <- function(data, by, name_acc, name_rt, + name_cond, name_diff_prefix) { + conds <- levels(data[[name_cond]]) + index_each_cond <- data |> + calc_spd_acc( + by = c(by, name_cond), + name_acc = name_acc, + name_rt = name_rt + ) |> + complete(.data[[name_cond]]) |> + select(all_of(c(by, name_cond, "pc", "mrt", "ies", "rcs", "lisas"))) + index_each_cond |> + pivot_longer( + cols = -any_of(c(by, name_cond)), + names_to = "index_name", + values_to = "score" + ) |> + pivot_wider( + names_from = all_of(name_cond), + values_from = "score" + ) |> + mutate( + diff = .data[[conds[[1]]]] - .data[[conds[[2]]]], + .keep = "unused" + ) |> + # make sure larger values correspond to larger switch cost + mutate( + diff = if_else( + .data$index_name %in% c("pc", "rcs"), + -diff, diff + ) + ) |> + pivot_wider( + names_from = "index_name", + values_from = "diff", + names_prefix = name_diff_prefix + ) |> + merge( + index_each_cond |> + pivot_wider( + names_from = all_of(name_cond), + values_from = -any_of(c(by, name_cond)) + ), + by = by + ) +} diff --git a/R/utils-common.R b/R/utils-common.R deleted file mode 100644 index 4bc567a..0000000 --- a/R/utils-common.R +++ /dev/null @@ -1,89 +0,0 @@ -#' Convert character responses -#' -#' Simple function converts character correctness to numeric one. -#' -#' @param x The character vector to be parsed. -#' @param delim Delimiter used to join correctness when forming the character. -#' Usually is hyphen (i.e., `"-"`), which is the default. -#' @param convert_numeric A logical value indicating if the values should be -#' converted to `numeric` ones. -#' @return A list of the parsed result, the same length as the input vector. -#' @keywords internal -parse_char_resp <- function(x, delim = "-", convert_numeric = TRUE) { - parsed <- stringr::str_split(x, delim) - if (convert_numeric) { - parsed <- purrr::map(parsed, as.numeric) - } - parsed -} - - -#' Update settings with option settings -#' -#' Options are set in list can be tricky to update. This function makes partly -#' adding custom options work. -#' -#' @param origin The original settings. -#' @param updates The updates to settings -#' @return An update list of settings. -#' @keywords internal -update_settings <- function(origin, updates) { - if (is.null(updates)) { - return(origin) - } - purrr::imap(origin, ~ updates[[.y]] %||% .x) -} - -#' Outliers Detection for response time data -#' -#' This method is also called "transform" method, because it does a -#' transformation before applying z-score method. -#' -#' This is based on Cousineau, D., & Chartier, S. (2010), which is said to be -#' suitable for reaction time data. -#' -#' @param x A vector of input reaction time data. -#' @param threshold The threshold for determining whether a value is outlier or -#' not. Default is set at 2.5, which is best sample size dependent. -#' @return A logical vector of the detected outliers. -#' @keywords internal -check_outliers_rt <- function(x, threshold = 2.5) { - z_scores <- scale( - scale(x, min(x, na.rm = TRUE), diff(range(x, na.rm = TRUE))) - )[, 1] - abs(z_scores) > threshold -} - -#' Calculate threshold by staircase method -#' -#' Here we used the method suggested by Wetherill et al (1966). -#' -#' @param x The levels in data. -#' @return The mean threshold. -#' @keywords internal -calc_staircase_wetherill <- function(x) { - find_reversals <- function(x) { - find_peaks_val <- function(x) { - mat <- pracma::findpeaks(x) - if (is.null(mat)) { - warn("Reversals not found from input", "no_reversals_found") - return(NA_real_) - } - mat[, 1] - } - list( - peaks = find_peaks_val(x), - valleys = -find_peaks_val(-x) - ) - } - # remove repetitions in transformed method - x <- rle(x)$values - reversals <- find_reversals(x) - reversals |> - purrr::map( - # keep equal number of peaks and valleys - \(x) utils::tail(x, min(lengths(reversals))) - ) |> - purrr::list_c() |> - mean() -} diff --git a/R/utils-switch-congruence.R b/R/utils-switch-congruence.R deleted file mode 100644 index b12f91e..0000000 --- a/R/utils-switch-congruence.R +++ /dev/null @@ -1,101 +0,0 @@ -#' Switch cost -#' -#' Utility function to calculate general and specific switch cost. -#' -#' @template common -#' @param by The column name(s) in `data` used to be grouped by. If set to -#' `NULL`, all data will be treated as from one subject. -#' @templateVar name_acc TRUE -#' @templateVar name_rt TRUE -#' @template names -#' @param name_switch The column name of the `data` input whose values are -#' the switch type, in which is a `character` vector with at least `"switch"` -#' and `"repeat"` values. -#' @keywords internal -calc_switch_cost <- function(data, by, name_switch, name_rt, name_acc) { - data[[name_switch]] <- factor(data[[name_switch]], c("switch", "repeat")) - calc_cond_diff( - data, - by, - name_cond = name_switch, - name_diff_prefix = "switch_cost_", - name_acc = name_acc, - name_rt = name_rt - ) -} - -#' Congruence effect -#' -#' Utility function to calculate congruence effect sizes. -#' -#' @template common -#' @param by The column name(s) in `data` used to be grouped by. If set to -#' `NULL`, all data will be treated as from one subject. -#' @templateVar name_acc TRUE -#' @templateVar name_rt TRUE -#' @template names -#' @param name_cong The column name of the `data` input whose values are the -#' congruence information, in which is a `character` vector with "incongruent -#' condition" (label: `"inc"`) and "congruent condition" (label: `"con"`). It -#' will be coerced as a `factor` vector with these two levels. -#' @return A [tibble][tibble::tibble-package] contains congruence effect results -#' on accuracy and response time. -#' @keywords internal -calc_cong_eff <- function(data, by, name_cong, name_acc, name_rt) { - data[[name_cong]] <- factor(data[[name_cong]], c("inc", "con")) - calc_cond_diff( - data, - by, - name_cond = name_cong, - name_diff_prefix = "cong_eff_", - name_acc = name_acc, - name_rt = name_rt - ) -} - -calc_cond_diff <- function(data, by, name_acc, name_rt, - name_cond, name_diff_prefix) { - conds <- levels(data[[name_cond]]) - index_each_cond <- data |> - calc_spd_acc( - by = c(by, name_cond), - name_acc = name_acc, - name_rt = name_rt - ) |> - complete(.data[[name_cond]]) |> - select(all_of(c(by, name_cond, "pc", "mrt", "ies", "rcs", "lisas"))) - index_each_cond |> - pivot_longer( - cols = -any_of(c(by, name_cond)), - names_to = "index_name", - values_to = "score" - ) |> - pivot_wider( - names_from = all_of(name_cond), - values_from = "score" - ) |> - mutate( - diff = .data[[conds[[1]]]] - .data[[conds[[2]]]], - .keep = "unused" - ) |> - # make sure larger values correspond to larger switch cost - mutate( - diff = if_else( - .data$index_name %in% c("pc", "rcs"), - -diff, diff - ) - ) |> - pivot_wider( - names_from = "index_name", - values_from = "diff", - names_prefix = name_diff_prefix - ) |> - merge( - index_each_cond |> - pivot_wider( - names_from = all_of(name_cond), - values_from = -any_of(c(by, name_cond)) - ), - by = by - ) -} diff --git a/R/utils-speed-accuracy.R b/R/utils.R similarity index 63% rename from R/utils-speed-accuracy.R rename to R/utils.R index 5a5d23c..8a50440 100644 --- a/R/utils-speed-accuracy.R +++ b/R/utils.R @@ -129,3 +129,93 @@ calc_sdt <- function(data, by = NULL, name_acc = "acc", name_type = "type") { omissions = .data$e_s ) } + +#' Calculate threshold by staircase method +#' +#' Here we used the method suggested by Wetherill et al (1966). +#' +#' @param x The levels in data. +#' @return The mean threshold. +#' @keywords internal +calc_staircase_wetherill <- function(x) { + find_reversals <- function(x) { + find_peaks_val <- function(x) { + mat <- pracma::findpeaks(x) + if (is.null(mat)) { + warn("Reversals not found from input", "no_reversals_found") + return(NA_real_) + } + mat[, 1] + } + list( + peaks = find_peaks_val(x), + valleys = -find_peaks_val(-x) + ) + } + # remove repetitions in transformed method + x <- rle(x)$values + reversals <- find_reversals(x) + reversals |> + purrr::map( + # keep equal number of peaks and valleys + \(x) utils::tail(x, min(lengths(reversals))) + ) |> + purrr::list_c() |> + mean() +} + +#' Convert character responses +#' +#' Simple function converts character correctness to numeric one. +#' +#' @param x The character vector to be parsed. +#' @param delim Delimiter used to join correctness when forming the character. +#' Usually is hyphen (i.e., `"-"`), which is the default. +#' @param convert_numeric A logical value indicating if the values should be +#' converted to `numeric` ones. +#' @return A list of the parsed result, the same length as the input vector. +#' @keywords internal +parse_char_resp <- function(x, delim = "-", convert_numeric = TRUE) { + parsed <- stringr::str_split(x, delim) + if (convert_numeric) { + parsed <- purrr::map(parsed, as.numeric) + } + parsed +} + + +#' Update settings with option settings +#' +#' Options are set in list can be tricky to update. This function makes partly +#' adding custom options work. +#' +#' @param origin The original settings. +#' @param updates The updates to settings +#' @return An update list of settings. +#' @keywords internal +update_settings <- function(origin, updates) { + if (is.null(updates)) { + return(origin) + } + purrr::imap(origin, ~ updates[[.y]] %||% .x) +} + +#' Outliers Detection for response time data +#' +#' This method is also called "transform" method, because it does a +#' transformation before applying z-score method. +#' +#' This is based on Cousineau, D., & Chartier, S. (2010), which is said to be +#' suitable for reaction time data. +#' +#' @param x A vector of input reaction time data. +#' @param threshold The threshold for determining whether a value is outlier or +#' not. Default is set at 2.5, which is best sample size dependent. +#' @return A logical vector of the detected outliers. +#' @keywords internal +check_outliers_rt <- function(x, threshold = 2.5) { + z_scores <- scale( + scale(x, min(x, na.rm = TRUE), diff(range(x, na.rm = TRUE))) + )[, 1] + abs(z_scores) > threshold +} diff --git a/R/utils_numerosity.R b/R/utils_numerosity.R deleted file mode 100644 index 1085d2e..0000000 --- a/R/utils_numerosity.R +++ /dev/null @@ -1,56 +0,0 @@ -#' Fit a Simple Numerosity Model -#' -#' This model assumes the distribution of mental representation for a given -#' number/count k is N(k, (w * k) ^ 2). -#' -#' @template common -#' @param name_bigset,name_smallset Variable name in `data` indicates bigger and -#' smaller set. -#' @param name_acc Variable name in `data` indicates user's response is correct -#' or not. -#' @param n_fit Number of fits to try to find the best estimate. -#' @param seed Random seed. Default is 1 so that results can be reproduced. -#' @return A [list()] with structure the same as [optim()]. -#' @export -fit_numerosity <- function(data, name_bigset, name_smallset, name_acc, - n_fit = 5, seed = 1) { - set.seed(seed) - b <- data[[name_bigset]] - s <- data[[name_smallset]] - acc <- data[[name_acc]] - - min_objective <- Inf - best_fit <- NULL - for (j in seq_len(n_fit)) { - repeat { - init <- c(w = stats::runif(1)) - if (ll_numerosity(init, b, s, acc) < 1e6) { - break - } - } - fit <- stats::optim( - init, ll_numerosity, - method = "L-BFGS-B", - b = b, s = s, acc = acc, - lower = 0 - ) - if (fit[["value"]] < min_objective) { - best_fit <- fit - } - } - best_fit -} - -ll_numerosity <- function(pars, b, s, acc) { - means <- b - s - sds <- pars["w"]^2 * (b^2 + s^2) - - # incorrect means the mental representation is less than 0, so lower tail - dens <- ifelse( - !acc, - stats::pnorm(0, means, sds, lower.tail = TRUE, log.p = TRUE), - stats::pnorm(0, means, sds, lower.tail = FALSE, log.p = TRUE) - ) - - return(ifelse(any(!is.finite(dens)), 1e6, -sum(dens))) -} diff --git a/man/calc_cong_eff.Rd b/man/calc_cong_eff.Rd index 1879f87..35c37e9 100644 --- a/man/calc_cong_eff.Rd +++ b/man/calc_cong_eff.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-switch-congruence.R +% Please edit documentation in R/switch-congruence.R \name{calc_cong_eff} \alias{calc_cong_eff} \title{Congruence effect} diff --git a/man/calc_sdt.Rd b/man/calc_sdt.Rd index 6d75d9c..b8daee7 100644 --- a/man/calc_sdt.Rd +++ b/man/calc_sdt.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-speed-accuracy.R +% Please edit documentation in R/utils.R \name{calc_sdt} \alias{calc_sdt} \title{Signal Detection Theory} diff --git a/man/calc_spd_acc.Rd b/man/calc_spd_acc.Rd index e8dbc85..f5de041 100644 --- a/man/calc_spd_acc.Rd +++ b/man/calc_spd_acc.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-speed-accuracy.R +% Please edit documentation in R/utils.R \name{calc_spd_acc} \alias{calc_spd_acc} \title{Calculate basic speed and accuracy scores} diff --git a/man/calc_staircase_wetherill.Rd b/man/calc_staircase_wetherill.Rd index a15ade6..606898c 100644 --- a/man/calc_staircase_wetherill.Rd +++ b/man/calc_staircase_wetherill.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-common.R +% Please edit documentation in R/utils.R \name{calc_staircase_wetherill} \alias{calc_staircase_wetherill} \title{Calculate threshold by staircase method} diff --git a/man/calc_switch_cost.Rd b/man/calc_switch_cost.Rd index 53216de..1773be7 100644 --- a/man/calc_switch_cost.Rd +++ b/man/calc_switch_cost.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-switch-congruence.R +% Please edit documentation in R/switch-congruence.R \name{calc_switch_cost} \alias{calc_switch_cost} \title{Switch cost} diff --git a/man/check_outliers_rt.Rd b/man/check_outliers_rt.Rd index 1f8c77a..4aeaed3 100644 --- a/man/check_outliers_rt.Rd +++ b/man/check_outliers_rt.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-common.R +% Please edit documentation in R/utils.R \name{check_outliers_rt} \alias{check_outliers_rt} \title{Outliers Detection for response time data} diff --git a/man/fit_numerosity.Rd b/man/fit_numerosity.Rd index 035d37f..515826e 100644 --- a/man/fit_numerosity.Rd +++ b/man/fit_numerosity.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_numerosity.R +% Please edit documentation in R/nsymncmp.R \name{fit_numerosity} \alias{fit_numerosity} \title{Fit a Simple Numerosity Model} diff --git a/man/parse_char_resp.Rd b/man/parse_char_resp.Rd index 38dca80..8206564 100644 --- a/man/parse_char_resp.Rd +++ b/man/parse_char_resp.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-common.R +% Please edit documentation in R/utils.R \name{parse_char_resp} \alias{parse_char_resp} \title{Convert character responses} diff --git a/man/update_settings.Rd b/man/update_settings.Rd index 8050663..da6a7b4 100644 --- a/man/update_settings.Rd +++ b/man/update_settings.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-common.R +% Please edit documentation in R/utils.R \name{update_settings} \alias{update_settings} \title{Update settings with option settings} From cb8934181aafec8dde8d772fd0da9eab722a673d Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Tue, 31 Oct 2023 11:46:04 +0800 Subject: [PATCH 04/12] Let fit_numerosity find initial at most 10 times The old `repeat` method will make the program stuck in some cases. Signed-off-by: Liang Zhang --- R/nsymncmp.R | 13 +++++++++++-- tests/testthat/test-nsymncmp.R | 15 +++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/R/nsymncmp.R b/R/nsymncmp.R index 59878cf..025d9fc 100644 --- a/R/nsymncmp.R +++ b/R/nsymncmp.R @@ -82,12 +82,21 @@ fit_numerosity <- function(data, name_bigset, name_smallset, name_acc, min_objective <- Inf best_fit <- NULL for (j in seq_len(n_fit)) { - repeat { + # try 10 times to find a good initial value + for (i in seq_len(10)) { init <- c(w = stats::runif(1)) - if (ll_numerosity(init, b, s, acc) < 1e6) { + init_objective <- ll_numerosity(init, b, s, acc) + if (init_objective < 1e6) { break } } + if (init_objective >= 1e6) { + warn( + "Cannot find a good initial value after 10 tries.", + "no_good_init" + ) + return(list(par = c(w = NA_real_), convergence = 1)) + } fit <- stats::optim( init, ll_numerosity, method = "L-BFGS-B", diff --git a/tests/testthat/test-nsymncmp.R b/tests/testthat/test-nsymncmp.R index 582e609..04b5fbb 100644 --- a/tests/testthat/test-nsymncmp.R +++ b/tests/testthat/test-nsymncmp.R @@ -52,3 +52,18 @@ test_that("Warning if not converged", { nsymncmp(data) |> expect_warning(class = "fit_not_converge") }) + +test_that("Warn if no initial values found", { + data <- data.frame( + b = rep(0, 10), + s = rep(0, 10), + acc = rep(1, 10) + ) + fit_numerosity( + data, + name_bigset = "b", + name_smallset = "s", + name_acc = "acc" + ) |> + expect_warning(class = "no_good_init") +}) From ec85135c394a0b0ea499ec70696553efde544cd9 Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Tue, 31 Oct 2023 11:46:32 +0800 Subject: [PATCH 05/12] Return w value even not converged Signed-off-by: Liang Zhang --- R/nsymncmp.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/nsymncmp.R b/R/nsymncmp.R index 025d9fc..a7a49f1 100644 --- a/R/nsymncmp.R +++ b/R/nsymncmp.R @@ -53,7 +53,6 @@ calc_numerosity <- function(data, name_bigset, name_smallset, name_acc) { "Cannot find fit after the max number of fitting.", "fit_not_converge" ) - pars <- replace(pars, TRUE, NA_real_) } tibble::as_tibble_row(pars) } From 137d2d5ebfb12ac03e9ff6314cdee6add18e50b7 Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Tue, 31 Oct 2023 11:48:26 +0800 Subject: [PATCH 06/12] Update changelog Signed-off-by: Liang Zhang --- NEWS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS.md b/NEWS.md index 584b7da..34698c8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # preproc.iquizoo (development version) +## Bug Fixes + +* Fixed an edge case when `fit_numerosity()` will stuck in infinite loop. + # preproc.iquizoo 2.5.2 * Enhance code quality of internal functions. From 5e9d8cd47caca773acb75cb2765ec1ccb099c357 Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Tue, 31 Oct 2023 15:00:34 +0800 Subject: [PATCH 07/12] Use parallel testing Signed-off-by: Liang Zhang --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 204f7f9..c2edb13 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,6 +31,7 @@ Suggests: testthat (>= 3.0.0), withr Config/testthat/edition: 3 +Config/testthat/parallel: true Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 From d2679c115884b822fd89efa4a45d114c0d8185eb Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Tue, 31 Oct 2023 15:02:11 +0800 Subject: [PATCH 08/12] Fix test glitches Signed-off-by: Liang Zhang --- tests/testthat/test-calc_staircase_wetherill.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-calc_staircase_wetherill.R b/tests/testthat/test-calc_staircase_wetherill.R index c8f1c38..1139fe6 100644 --- a/tests/testthat/test-calc_staircase_wetherill.R +++ b/tests/testthat/test-calc_staircase_wetherill.R @@ -8,5 +8,6 @@ test_that("Correctly find all peaks and valleys", { test_that("Return NA if no peaks or valleys", { is.na(calc_staircase_wetherill(c(2:5))) |> expect_true() |> + expect_warning(class = "no_reversals_found") |> expect_warning(class = "no_reversals_found") }) From b440a03facac53d8f7211a997c068977b1257261 Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Tue, 28 Nov 2023 17:43:41 +0800 Subject: [PATCH 09/12] Accept function from tarflow See https://github.com/psychelzh/tarflow.iquizoo/issues/58 for details Signed-off-by: Liang Zhang --- NAMESPACE | 2 + NEWS.md | 4 + R/preproc.R | 122 ++++++++++++++++++++++++ man/preproc_data.Rd | 49 ++++++++++ man/wrangle_data.Rd | 22 +++++ tests/testthat/_snaps/preproc.md | 155 +++++++++++++++++++++++++++++++ tests/testthat/helper-preproc.R | 8 ++ tests/testthat/test-preproc.R | 79 ++++++++++++++++ 8 files changed, 441 insertions(+) create mode 100644 R/preproc.R create mode 100644 man/preproc_data.Rd create mode 100644 man/wrangle_data.Rd create mode 100644 tests/testthat/_snaps/preproc.md create mode 100644 tests/testthat/helper-preproc.R create mode 100644 tests/testthat/test-preproc.R diff --git a/NAMESPACE b/NAMESPACE index b964a4c..40a2f5e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,6 +26,7 @@ export(multisense) export(nback) export(nle) export(nsymncmp) +export(preproc_data) export(racer) export(rapm) export(refframe) @@ -39,6 +40,7 @@ export(sumweighted) export(switchcost) export(symncmp) export(synwin) +export(wrangle_data) import(dplyr) import(rlang) import(tidyr) diff --git a/NEWS.md b/NEWS.md index 34698c8..d66b004 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # preproc.iquizoo (development version) +## Breaking Changes + +* Added `wrangle_data()` and `preproc_data()` functions, which were previously in tarflow.iquizoo package. + ## Bug Fixes * Fixed an edge case when `fit_numerosity()` will stuck in infinite loop. diff --git a/R/preproc.R b/R/preproc.R new file mode 100644 index 0000000..85efef5 --- /dev/null +++ b/R/preproc.R @@ -0,0 +1,122 @@ +#' Wrangle Raw Data +#' +#' Parse raw json string data as [data.frame()] and store them in a list column. +#' +#' @param data The raw data. +#' @param name_raw_json The column name in which stores user's raw data in +#' format of json string. +#' @param name_raw_parsed The name used to store parsed data. +#' @return A [data.frame] contains the parsed data. +#' @export +wrangle_data <- function(data, + name_raw_json = "game_data", + name_raw_parsed = "raw_parsed") { + data[[name_raw_parsed]] <- purrr::map( + data[[name_raw_json]], + parse_raw_json + ) + select(data, !all_of(name_raw_json)) +} + +#' Feed Raw Data to Pre-processing +#' +#' Calculate indices using data typically returned by [wrangle_data()]. +#' +#' @details +#' +#' Observations with empty raw data (empty vector, e.g. `NULL`, in +#' `name_raw_parsed` column) are removed before calculating indices. If no +#' observations left after removing, a warning is signaled and `NULL` is +#' returned. +#' +#' @param data A [data.frame] contains raw data. +#' @param fn This can be a function or formula. See [rlang::as_function()] for +#' more details. +#' @param ... Additional arguments passed to `fn`. +#' @param name_raw_parsed The column name in which stores user's raw data in +#' format of a list of [data.frame]s. +#' @param pivot_results Whether to pivot the calculated indices. If `TRUE`, the +#' calculated indices are pivoted into long format, with each index name +#' stored in the column of `pivot_names_to`, and each index value stored in +#' the column of `pivot_values_to`. If `FALSE`, the calculated indices are +#' stored in the same format as returned by `fn`. +#' @param pivot_names_to,pivot_values_to The column names used to store index +#' names and values if `pivot_results` is `TRUE`. See [tidyr::pivot_longer()] +#' for more details. +#' @return A [data.frame] contains the calculated indices. +#' @export +preproc_data <- function(data, fn, ..., + name_raw_parsed = "raw_parsed", + pivot_results = TRUE, + pivot_names_to = "index_name", + pivot_values_to = "score") { + data <- filter(data, !purrr::map_lgl(.data[[name_raw_parsed]], is_empty)) + if (nrow(data) == 0) { + warn("No non-empty data found.") + return() + } + fn <- as_function(fn) + results <- data |> + mutate( + calc_indices(.data[[name_raw_parsed]], fn, ...), + .keep = "unused" + ) + if (pivot_results) { + results <- results |> + pivot_longer( + cols = !any_of(names(data)), + names_to = pivot_names_to, + values_to = pivot_values_to + ) |> + vctrs::vec_restore(data) + } + results +} + +# helper functions +parse_raw_json <- function(jstr) { + parsed <- tryCatch( + jsonlite::fromJSON(jstr), + error = function(cnd) { + warn( + c( + "Failed to parse json string with the following error:", + conditionMessage(cnd), + i = "Will parse it as `NULL` instead." + ) + ) + return() + } + ) + if (is_empty(parsed)) { + return() + } + parsed |> + rename_with(tolower) |> + mutate(across(where(is.character), tolower)) +} + +calc_indices <- function(l, fn, ...) { + # used as a temporary id for each element + name_id <- ".id" + tryCatch( + bind_rows(l, .id = name_id), + error = function(cnd) { + warn( + c( + "Failed to bind raw data with the following error:", + conditionMessage(cnd), + i = "Will try using tidytable package." + ) + ) + check_installed( + "tidytable", + "because tidyr package fails to bind raw data." + ) + tidytable::bind_rows(l, .id = name_id) |> + utils::type.convert(as.is = TRUE) + } + ) |> + fn(.by = name_id, ...) |> + select(!all_of(name_id)) +} diff --git a/man/preproc_data.Rd b/man/preproc_data.Rd new file mode 100644 index 0000000..6e07878 --- /dev/null +++ b/man/preproc_data.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preproc.R +\name{preproc_data} +\alias{preproc_data} +\title{Feed Raw Data to Pre-processing} +\usage{ +preproc_data( + data, + fn, + ..., + name_raw_parsed = "raw_parsed", + pivot_results = TRUE, + pivot_names_to = "index_name", + pivot_values_to = "score" +) +} +\arguments{ +\item{data}{A \link{data.frame} contains raw data.} + +\item{fn}{This can be a function or formula. See \code{\link[rlang:as_function]{rlang::as_function()}} for +more details.} + +\item{...}{Additional arguments passed to \code{fn}.} + +\item{name_raw_parsed}{The column name in which stores user's raw data in +format of a list of \link{data.frame}s.} + +\item{pivot_results}{Whether to pivot the calculated indices. If \code{TRUE}, the +calculated indices are pivoted into long format, with each index name +stored in the column of \code{pivot_names_to}, and each index value stored in +the column of \code{pivot_values_to}. If \code{FALSE}, the calculated indices are +stored in the same format as returned by \code{fn}.} + +\item{pivot_names_to, pivot_values_to}{The column names used to store index +names and values if \code{pivot_results} is \code{TRUE}. See \code{\link[tidyr:pivot_longer]{tidyr::pivot_longer()}} +for more details.} +} +\value{ +A \link{data.frame} contains the calculated indices. +} +\description{ +Calculate indices using data typically returned by \code{\link[=wrangle_data]{wrangle_data()}}. +} +\details{ +Observations with empty raw data (empty vector, e.g. \code{NULL}, in +\code{name_raw_parsed} column) are removed before calculating indices. If no +observations left after removing, a warning is signaled and \code{NULL} is +returned. +} diff --git a/man/wrangle_data.Rd b/man/wrangle_data.Rd new file mode 100644 index 0000000..7d6f296 --- /dev/null +++ b/man/wrangle_data.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preproc.R +\name{wrangle_data} +\alias{wrangle_data} +\title{Wrangle Raw Data} +\usage{ +wrangle_data(data, name_raw_json = "game_data", name_raw_parsed = "raw_parsed") +} +\arguments{ +\item{data}{The raw data.} + +\item{name_raw_json}{The column name in which stores user's raw data in +format of json string.} + +\item{name_raw_parsed}{The name used to store parsed data.} +} +\value{ +A \link{data.frame} contains the parsed data. +} +\description{ +Parse raw json string data as \code{\link[=data.frame]{data.frame()}} and store them in a list column. +} diff --git a/tests/testthat/_snaps/preproc.md b/tests/testthat/_snaps/preproc.md new file mode 100644 index 0000000..b4151e1 --- /dev/null +++ b/tests/testthat/_snaps/preproc.md @@ -0,0 +1,155 @@ +# Basic situation in `preproc_data()` + + { + "type": "list", + "attributes": { + "names": { + "type": "character", + "attributes": {}, + "value": ["user_id", "index_name", "score"] + }, + "row.names": { + "type": "integer", + "attributes": {}, + "value": [1, 2] + }, + "class": { + "type": "character", + "attributes": {}, + "value": ["tbl_df", "tbl", "data.frame"] + } + }, + "value": [ + { + "type": "integer", + "attributes": {}, + "value": [1, 2] + }, + { + "type": "character", + "attributes": {}, + "value": ["nhit", "nhit"] + }, + { + "type": "double", + "attributes": {}, + "value": ["NaN", 1] + } + ] + } + +--- + + { + "type": "list", + "attributes": { + "names": { + "type": "character", + "attributes": {}, + "value": ["user_id", "nhit"] + }, + "row.names": { + "type": "integer", + "attributes": {}, + "value": [1, 2] + }, + "class": { + "type": "character", + "attributes": {}, + "value": ["tbl_df", "tbl", "data.frame"] + } + }, + "value": [ + { + "type": "integer", + "attributes": {}, + "value": [1, 2] + }, + { + "type": "double", + "attributes": {}, + "value": ["NaN", 1] + } + ] + } + +# Deal with `NULL` in parsed data + + { + "type": "list", + "attributes": { + "names": { + "type": "character", + "attributes": {}, + "value": ["user_id", "index_name", "score"] + }, + "row.names": { + "type": "integer", + "attributes": {}, + "value": [1, 2] + }, + "class": { + "type": "character", + "attributes": {}, + "value": ["tbl_df", "tbl", "data.frame"] + } + }, + "value": [ + { + "type": "integer", + "attributes": {}, + "value": [1, 3] + }, + { + "type": "character", + "attributes": {}, + "value": ["nhit", "nhit"] + }, + { + "type": "double", + "attributes": {}, + "value": ["NaN", 1] + } + ] + } + +# Can deal with mismatch column types in raw data + + { + "type": "list", + "attributes": { + "names": { + "type": "character", + "attributes": {}, + "value": ["user_id", "index_name", "score"] + }, + "row.names": { + "type": "integer", + "attributes": {}, + "value": [1, 2, 3] + }, + "class": { + "type": "character", + "attributes": {}, + "value": ["tbl_df", "tbl", "data.frame"] + } + }, + "value": [ + { + "type": "integer", + "attributes": {}, + "value": [1, 2, 3] + }, + { + "type": "character", + "attributes": {}, + "value": ["nhit", "nhit", "nhit"] + }, + { + "type": "double", + "attributes": {}, + "value": ["NaN", 2, 3] + } + ] + } + diff --git a/tests/testthat/helper-preproc.R b/tests/testthat/helper-preproc.R new file mode 100644 index 0000000..354db01 --- /dev/null +++ b/tests/testthat/helper-preproc.R @@ -0,0 +1,8 @@ +prep_fun <- function(data, .by = NULL) { + data |> + group_by(pick(all_of(.by))) |> + summarise( + nhit = mean(.data$nhit[.data$feedback == 1]), + .groups = "drop" + ) +} diff --git a/tests/testthat/test-preproc.R b/tests/testthat/test-preproc.R new file mode 100644 index 0000000..e5b08f6 --- /dev/null +++ b/tests/testthat/test-preproc.R @@ -0,0 +1,79 @@ +test_that("Basic situation for `wrangle_data()`", { + js_str <- r"([{"a": 1, "b": 2}])" + data <- tibble::tibble(game_data = js_str) + wrangle_data(data) |> + expect_silent() |> + expect_named("raw_parsed") |> + purrr::pluck("raw_parsed", 1) |> + expect_identical(jsonlite::fromJSON(js_str)) + wrangle_data(data, name_raw_parsed = "parsed") |> + expect_silent() |> + expect_named("parsed") +}) + +test_that("Can deal with invalid or empty json", { + data_case_invalid <- data.frame(game_data = "[1") + wrangle_data(data_case_invalid) |> + expect_warning("Failed to parse json string") |> + purrr::pluck("raw_parsed", 1) |> + expect_null() + data_case_empty <- data.frame(game_data = c("[]", "{}")) + wrangle_data(data_case_empty) |> + purrr::pluck("raw_parsed") |> + purrr::walk(expect_null) +}) + +test_that("Change names and values to lowercase", { + js_str <- r"([{"A": "A"}, {"A": "B"}])" + data <- tibble::tibble(game_data = js_str) + wrangle_data(data) |> + expect_silent() |> + purrr::pluck("raw_parsed", 1) |> + expect_identical(data.frame(a = c("a", "b"))) +}) + +test_that("Basic situation in `preproc_data()`", { + data <- tibble::tibble( + user_id = 1:2, + raw_parsed = list( + data.frame(nhit = 1, feedback = 0), + data.frame(nhit = 1, feedback = 1) + ) + ) + preproc_data(data, prep_fun) |> + expect_snapshot_value(style = "json2") + preproc_data(data, prep_fun, pivot_results = FALSE) |> + expect_snapshot_value(style = "json2") +}) + +test_that("Deal with `NULL` in parsed data", { + tibble::tibble(raw_parsed = list(NULL)) |> + preproc_data(prep_fun) |> + expect_null() |> + expect_warning("No non-empty data found.") + tibble::tibble( + user_id = 1:3, + raw_parsed = list( + data.frame(nhit = 1, feedback = 0), + NULL, + data.frame(nhit = 1, feedback = 1) + ) + ) |> + preproc_data(prep_fun) |> + expect_snapshot_value(style = "json2") +}) + +test_that("Can deal with mismatch column types in raw data", { + skip_if_not_installed("tidytable") + data <- tibble::tibble( + user_id = 1:3, + raw_parsed = list( + data.frame(nhit = 1, feedback = 0), + data.frame(nhit = 2, feedback = 1), + data.frame(nhit = "3", feedback = 1) + ) + ) + preproc_data(data, prep_fun) |> + expect_snapshot_value(style = "json2") |> + expect_warning("Failed to bind raw data") +}) From eacbdc90b9d835768023db5948c3a3a8e90acd38 Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Tue, 28 Nov 2023 18:03:43 +0800 Subject: [PATCH 10/12] Add more used packages Signed-off-by: Liang Zhang --- DESCRIPTION | 2 ++ 1 file changed, 2 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index c2edb13..d563142 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,6 +15,7 @@ Depends: R (>= 4.1.0) Imports: dplyr, + jsonlite, pracma, purrr, rlang (>= 0.1.2), @@ -29,6 +30,7 @@ Suggests: readr, roxygen2, testthat (>= 3.0.0), + tidytable, withr Config/testthat/edition: 3 Config/testthat/parallel: true From f6f4398d8d06f4e00c0fbaec542f628df6102c22 Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Wed, 29 Nov 2023 01:10:13 +0800 Subject: [PATCH 11/12] Increment version number to 2.6.0 --- DESCRIPTION | 2 +- NEWS.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d563142..5f190cc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: preproc.iquizoo Title: Utility Functions for Data Processing of Iquizoo Games -Version: 2.5.2.9000 +Version: 2.6.0 Authors@R: person("Liang", "Zhang", , "psychelzh@outlook.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9041-1150")) diff --git a/NEWS.md b/NEWS.md index d66b004..f020b80 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# preproc.iquizoo (development version) +# preproc.iquizoo 2.6.0 ## Breaking Changes From bed6516b069f11a201f122bc6a2b7ba3908cdb64 Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Wed, 29 Nov 2023 01:22:07 +0800 Subject: [PATCH 12/12] Add reference structure Signed-off-by: Liang Zhang --- _pkgdown.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index c49d4fa..ec9c251 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,3 +1,16 @@ url: https://psychelzh.github.io/preproc.iquizoo template: bootstrap: 5 +reference: + - title: "High-level API for data pre-processing" + desc: Typically you would like to call these two functions to invoke processing. + contents: + - wrangle_data + - preproc_data + - title: "Low-level data pre-processing functions" + desc: > + These functions are typically used in the `preproc_data` function, + separately for each task paradigm. + contents: + - -wrangle_data + - -preproc_data