From 5963913060bb12ccb91500718a3e638b9b958541 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 1 Mar 2019 13:49:43 -0800 Subject: [PATCH 1/7] Boldly/foolishly roughing out code before writing tests --- R/add-variable.R | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/R/add-variable.R b/R/add-variable.R index 03bf1c9fb..85a57e081 100644 --- a/R/add-variable.R +++ b/R/add-variable.R @@ -81,6 +81,7 @@ validateVarDefRows <- function(vardef, numrows) { POSTNewVariable <- function(catalog_url, variable) { do.POST <- function(x) crPOST(catalog_url, body = toJSON(x, digits = 15)) + is_binddef <- FALSE if (!any(c("expr", "derivation") %in% names(variable))) { ## If deriving a variable, skip this and go straight to POSTing if (variable$type %in% c("multiple_response", "categorical_array")) { @@ -102,6 +103,12 @@ POSTNewVariable <- function(catalog_url, variable) { } is_binddef <- is.character(variable$subvariables) && !("categories" %in% names(variable)) + if (is_binddef) { + # Pop the magic flag off + # TODO: allow setting this magic flag in makeArray() + do_post_bind_magic <- variable$autonames %||% FALSE + variable$autonames <- NULL + } is_arraydef <- is_catvardef(variable) && !any(vapply(variable$subvariables, is_catvardef, logical(1))) case3 <- !(is_binddef | is_arraydef) @@ -131,6 +138,11 @@ POSTNewVariable <- function(catalog_url, variable) { } } out <- do.POST(variable) + if (is_binddef && do_post_bind_magic) { + # Look for common variable name stems and clean that + var <- VariableEntity(crGET(out)) + cleanImportedArray(var) + } invisible(out) } @@ -148,3 +160,48 @@ checkVarDefErrors <- function(new_var_urls) { ) } } + +cleanImportedArray <- function (variable) { + MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable + prefix <- findCommonPrefix(names(subvariales(variable))) + # If length of the common stem is enough, extract it, + # remove it from the subvar names, + # remove trailing whitespace/punctuation, + # and set it as variable description. + if (nchar(prefix) >= MIN_PREFIX_LENGTH) { + # Use wildcard regexp with length just in case there are special chars in prefix. + # We already know that the prefix matches. + re <- paste0("^.{", nchar(prefix), "}") + names(subvariables(variable)) <- sub(re, "", names(subvariables(variable))) + # Now, remove whitespace and some punctuation from end of prefix, but + # don't remove a question mark or other reasonable punctuation + prefix <- sub("[[:space:]\\-\\:;]*$", "", prefix) + description(variable) <- prefix + } + return(variable) +} + +findCommonPrefix <- function (x) { + # Find the shortest one and start with that + nchars <- nchar(x) + shortest <- which.min(nchars) + step_size <- prefix_length <- min(nchars) + while (step_size > 0 and prefix_length > 0) { + step_size <- round(step_size / 2) + # Bisect to find the common stem + stems <- substr(1, prefix_length, x) + if (length(unique(stems)) == 1) { + # Go longer + prefix_length <- prefix_length + step_size + } else { + # Go shorter + prefix_length <- prefix_length - step_size + } + } + + if (prefix_length > 0) { + return(substr(1, prefix_length, shortest)) + } else { + return("") + } +} From 643c177548728e32ce03145c1c57706a537f8296 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 1 Mar 2019 14:41:52 -0800 Subject: [PATCH 2/7] Make prefix function work --- R/add-variable.R | 58 +++++++++++++++---------------- tests/testthat/test-clean-array.R | 9 +++++ 2 files changed, 37 insertions(+), 30 deletions(-) create mode 100644 tests/testthat/test-clean-array.R diff --git a/R/add-variable.R b/R/add-variable.R index 85a57e081..b88336bf2 100644 --- a/R/add-variable.R +++ b/R/add-variable.R @@ -162,46 +162,44 @@ checkVarDefErrors <- function(new_var_urls) { } cleanImportedArray <- function (variable) { - MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable - prefix <- findCommonPrefix(names(subvariales(variable))) - # If length of the common stem is enough, extract it, - # remove it from the subvar names, - # remove trailing whitespace/punctuation, - # and set it as variable description. - if (nchar(prefix) >= MIN_PREFIX_LENGTH) { - # Use wildcard regexp with length just in case there are special chars in prefix. - # We already know that the prefix matches. - re <- paste0("^.{", nchar(prefix), "}") - names(subvariables(variable)) <- sub(re, "", names(subvariables(variable))) - # Now, remove whitespace and some punctuation from end of prefix, but - # don't remove a question mark or other reasonable punctuation - prefix <- sub("[[:space:]\\-\\:;]*$", "", prefix) - description(variable) <- prefix + if (length(subvariables(variable)) > 1) { + MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable + prefix <- findCommonPrefix(names(subvariables(variable))) + # If length of the common stem is enough, extract it, + # remove it from the subvar names, + # remove trailing whitespace/punctuation, + # and set it as variable description. + if (nchar(prefix) >= MIN_PREFIX_LENGTH) { + # Use wildcard regexp with length just in case there are special chars in prefix. + # We already know that the prefix matches. + re <- paste0("^.{", nchar(prefix), "}") + names(subvariables(variable)) <- sub(re, "", names(subvariables(variable))) + # Now, remove whitespace and some punctuation from end of prefix, but + # don't remove a question mark or other reasonable punctuation + prefix <- sub("[[:space:]\\-\\:;]*$", "", prefix) + description(variable) <- prefix + } } return(variable) } findCommonPrefix <- function (x) { # Find the shortest one and start with that - nchars <- nchar(x) - shortest <- which.min(nchars) - step_size <- prefix_length <- min(nchars) - while (step_size > 0 and prefix_length > 0) { - step_size <- round(step_size / 2) + step_size <- prefix_length <- min(nchar(x)) + out <- "" + while (step_size > 0 && prefix_length > 0) { # Bisect to find the common stem - stems <- substr(1, prefix_length, x) - if (length(unique(stems)) == 1) { - # Go longer + step_size <- round(step_size / 2) + stems <- unique(substr(x, 1, prefix_length)) + if (length(stems) == 1) { + # Keep this one + out <- stems + # Try longer prefix_length <- prefix_length + step_size } else { - # Go shorter + # Try shorter prefix_length <- prefix_length - step_size } } - - if (prefix_length > 0) { - return(substr(1, prefix_length, shortest)) - } else { - return("") - } + return(out) } diff --git a/tests/testthat/test-clean-array.R b/tests/testthat/test-clean-array.R new file mode 100644 index 000000000..717ce3c32 --- /dev/null +++ b/tests/testthat/test-clean-array.R @@ -0,0 +1,9 @@ +context("Cleaning array variables") + +test_that("findCommonPrefix", { + expect_identical(findCommonPrefix(c("abc def", "ab cd ef")), "ab") + expect_identical(findCommonPrefix(c("XX select all. A", "XX select all. BB")), "XX select all. ") + expect_identical(findCommonPrefix(c("A", "B")), "") + expect_identical(findCommonPrefix(c("abc defg", "abc defg")), "abc defg") + expect_identical(findCommonPrefix(c("abc defg", "gfed cbaooo")), "") +}) From 3bc814c2633228aa9342e70f5e250138b33120f0 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 1 Mar 2019 14:42:39 -0800 Subject: [PATCH 3/7] Move code to clean-array.R --- R/add-variable.R | 43 ------------------------------------------- R/clean-array.R | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 43 deletions(-) create mode 100644 R/clean-array.R diff --git a/R/add-variable.R b/R/add-variable.R index b88336bf2..648a374fe 100644 --- a/R/add-variable.R +++ b/R/add-variable.R @@ -160,46 +160,3 @@ checkVarDefErrors <- function(new_var_urls) { ) } } - -cleanImportedArray <- function (variable) { - if (length(subvariables(variable)) > 1) { - MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable - prefix <- findCommonPrefix(names(subvariables(variable))) - # If length of the common stem is enough, extract it, - # remove it from the subvar names, - # remove trailing whitespace/punctuation, - # and set it as variable description. - if (nchar(prefix) >= MIN_PREFIX_LENGTH) { - # Use wildcard regexp with length just in case there are special chars in prefix. - # We already know that the prefix matches. - re <- paste0("^.{", nchar(prefix), "}") - names(subvariables(variable)) <- sub(re, "", names(subvariables(variable))) - # Now, remove whitespace and some punctuation from end of prefix, but - # don't remove a question mark or other reasonable punctuation - prefix <- sub("[[:space:]\\-\\:;]*$", "", prefix) - description(variable) <- prefix - } - } - return(variable) -} - -findCommonPrefix <- function (x) { - # Find the shortest one and start with that - step_size <- prefix_length <- min(nchar(x)) - out <- "" - while (step_size > 0 && prefix_length > 0) { - # Bisect to find the common stem - step_size <- round(step_size / 2) - stems <- unique(substr(x, 1, prefix_length)) - if (length(stems) == 1) { - # Keep this one - out <- stems - # Try longer - prefix_length <- prefix_length + step_size - } else { - # Try shorter - prefix_length <- prefix_length - step_size - } - } - return(out) -} diff --git a/R/clean-array.R b/R/clean-array.R new file mode 100644 index 000000000..2891ccc5b --- /dev/null +++ b/R/clean-array.R @@ -0,0 +1,42 @@ +cleanImportedArray <- function (variable) { + if (length(subvariables(variable)) > 1) { + MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable + prefix <- findCommonPrefix(names(subvariables(variable))) + # If length of the common stem is enough, extract it, + # remove it from the subvar names, + # remove trailing whitespace/punctuation, + # and set it as variable description. + if (nchar(prefix) >= MIN_PREFIX_LENGTH) { + # Use wildcard regexp with length just in case there are special chars in prefix. + # We already know that the prefix matches. + re <- paste0("^.{", nchar(prefix), "}") + names(subvariables(variable)) <- sub(re, "", names(subvariables(variable))) + # Now, remove whitespace and some punctuation from end of prefix, but + # don't remove a question mark or other reasonable punctuation + prefix <- sub("[[:space:]\\-\\:;]*$", "", prefix) + description(variable) <- prefix + } + } + return(variable) +} + +findCommonPrefix <- function (x) { + # Find the shortest one and start with that + step_size <- prefix_length <- min(nchar(x)) + out <- "" + while (step_size > 0 && prefix_length > 0) { + # Bisect to find the common stem + step_size <- round(step_size / 2) + stems <- unique(substr(x, 1, prefix_length)) + if (length(stems) == 1) { + # Keep this one + out <- stems + # Try longer + prefix_length <- prefix_length + step_size + } else { + # Try shorter + prefix_length <- prefix_length - step_size + } + } + return(out) +} From 5dbb07bc136b342c24d0b816d7e97858bbb1c766 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 1 Mar 2019 14:48:06 -0800 Subject: [PATCH 4/7] collate --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 5d2e4a51e..1adf095ff 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -64,6 +64,7 @@ Collate: 'categories.R' 'category.R' 'change-category-id.R' + 'clean-array.R' 'combine-categories.R' 'compare-categories.R' 'compare-datasets.R' From f876457e7fcb102d1cbe7b67e4b1d0cfd3c2b58a Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 1 Mar 2019 15:48:08 -0800 Subject: [PATCH 5/7] Docs --- NAMESPACE | 1 + R/clean-array.R | 16 ++++++++++++++++ man/cleanImportedArray.Rd | 27 +++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) create mode 100644 man/cleanImportedArray.Rd diff --git a/NAMESPACE b/NAMESPACE index 5bd276fdb..4b0062a91 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -109,6 +109,7 @@ export(categoriesFromLevels) export(cd) export(changeCategoryID) export(checkForNewVersion) +export(cleanImportedArray) export(cleanseBatches) export(collapseCategories) export(combine) diff --git a/R/clean-array.R b/R/clean-array.R index 2891ccc5b..960ce5acf 100644 --- a/R/clean-array.R +++ b/R/clean-array.R @@ -1,3 +1,19 @@ +#' Clean up an array variable imported from a lossy file format +#' +#' Array and multiple-response variables coming in from SPSS or other file +#' formats generally need some work to reconstruct the "right" metadata because +#' they have to shove both parent and subvariable metadata into the "varlabels" +#' of the subvariables. This often follows a pattern of having varlabels with a +#' prefix containing the parent question wording (description) and a suffix that +#' is the actual response label. +#' +#' This function detects this prefix and reconstructs what may have been the +#' original array definition. +#' +#' @param variable An array Variable +#' @return `variable` with edits pushed to the API. A common prefix on +#' subvariable names is extracted and set as the variable's description. +#' @export cleanImportedArray <- function (variable) { if (length(subvariables(variable)) > 1) { MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable diff --git a/man/cleanImportedArray.Rd b/man/cleanImportedArray.Rd new file mode 100644 index 000000000..b87618f24 --- /dev/null +++ b/man/cleanImportedArray.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clean-array.R +\name{cleanImportedArray} +\alias{cleanImportedArray} +\title{Clean up an array variable imported from a lossy file format} +\usage{ +cleanImportedArray(variable) +} +\arguments{ +\item{variable}{An array Variable} +} +\value{ +\code{variable} with edits pushed to the API. A common prefix on +subvariable names is extracted and set as the variable's description. +} +\description{ +Array and multiple-response variables coming in from SPSS or other file +formats generally need some work to reconstruct the "right" metadata because +they have to shove both parent and subvariable metadata into the "varlabels" +of the subvariables. This often follows a pattern of having varlabels with a +prefix containing the parent question wording (description) and a suffix that +is the actual response label. +} +\details{ +This function detects this prefix and reconstructs what may have been the +original array definition. +} From eb8546bc9921bb978cc2011cdef5d75384daecc0 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 1 Mar 2019 16:19:52 -0800 Subject: [PATCH 6/7] Tweak cleanImportedArray after manual testing --- R/clean-array.R | 18 ++++++++++-------- man/cleanImportedArray.Rd | 6 +++++- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/R/clean-array.R b/R/clean-array.R index 960ce5acf..99e5ec36c 100644 --- a/R/clean-array.R +++ b/R/clean-array.R @@ -1,5 +1,5 @@ -#' Clean up an array variable imported from a lossy file format -#' +#' Clean up an array variable imported from a lossy file format +#' #' Array and multiple-response variables coming in from SPSS or other file #' formats generally need some work to reconstruct the "right" metadata because #' they have to shove both parent and subvariable metadata into the "varlabels" @@ -7,29 +7,31 @@ #' prefix containing the parent question wording (description) and a suffix that #' is the actual response label. #' -#' This function detects this prefix and reconstructs what may have been the +#' This function detects this prefix and reconstructs what may have been the #' original array definition. #' #' @param variable An array Variable -#' @return `variable` with edits pushed to the API. A common prefix on +#' @param min.prefix.length Integer: how many characters long does the common +#' string need to be in order to consider it significant enough to use? Default +#' is 20. +#' @return `variable` with edits pushed to the API. A common prefix on #' subvariable names is extracted and set as the variable's description. #' @export -cleanImportedArray <- function (variable) { +cleanImportedArray <- function (variable, min.prefix.length=20) { if (length(subvariables(variable)) > 1) { - MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable prefix <- findCommonPrefix(names(subvariables(variable))) # If length of the common stem is enough, extract it, # remove it from the subvar names, # remove trailing whitespace/punctuation, # and set it as variable description. - if (nchar(prefix) >= MIN_PREFIX_LENGTH) { + if (nchar(prefix) >= min.prefix.length) { # Use wildcard regexp with length just in case there are special chars in prefix. # We already know that the prefix matches. re <- paste0("^.{", nchar(prefix), "}") names(subvariables(variable)) <- sub(re, "", names(subvariables(variable))) # Now, remove whitespace and some punctuation from end of prefix, but # don't remove a question mark or other reasonable punctuation - prefix <- sub("[[:space:]\\-\\:;]*$", "", prefix) + prefix <- sub("[[:space:]\\-\\:;|]*$", "", prefix) description(variable) <- prefix } } diff --git a/man/cleanImportedArray.Rd b/man/cleanImportedArray.Rd index b87618f24..4b9b4aacb 100644 --- a/man/cleanImportedArray.Rd +++ b/man/cleanImportedArray.Rd @@ -4,10 +4,14 @@ \alias{cleanImportedArray} \title{Clean up an array variable imported from a lossy file format} \usage{ -cleanImportedArray(variable) +cleanImportedArray(variable, min.prefix.length = 20) } \arguments{ \item{variable}{An array Variable} + +\item{min.prefix.length}{Integer: how many characters long does the common +string need to be in order to consider it significant enough to use? Default +is 20.} } \value{ \code{variable} with edits pushed to the API. A common prefix on From c640f4d3ff92f5cc003f32209a458e9d102a1472 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sun, 3 Mar 2019 08:31:33 -0800 Subject: [PATCH 7/7] update spelling --- inst/WORDLIST | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inst/WORDLIST b/inst/WORDLIST index b631ce493..1a374b1c3 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -112,6 +112,7 @@ JSON JSONing Jupyter libcurl +lossy magrittr makeWeight MemberCatalog @@ -136,6 +137,7 @@ POSIXt POSTed POSTing POSTs +powerpoint PPA pre programmatically