Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Array import magic #365

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ Collate:
'categories.R'
'category.R'
'change-category-id.R'
'clean-array.R'
'combine-categories.R'
'compare-categories.R'
'compare-datasets.R'
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ export(categoriesFromLevels)
export(cd)
export(changeCategoryID)
export(checkForNewVersion)
export(cleanImportedArray)
export(cleanseBatches)
export(collapseCategories)
export(combine)
Expand Down
12 changes: 12 additions & 0 deletions R/add-variable.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ validateVarDefRows <- function(vardef, numrows) {
POSTNewVariable <- function(catalog_url, variable) {
do.POST <- function(x) crPOST(catalog_url, body = toJSON(x, digits = 15))

is_binddef <- FALSE
if (!any(c("expr", "derivation") %in% names(variable))) {
## If deriving a variable, skip this and go straight to POSTing
if (variable$type %in% c("multiple_response", "categorical_array")) {
Expand All @@ -102,6 +103,12 @@ POSTNewVariable <- function(catalog_url, variable) {
}
is_binddef <- is.character(variable$subvariables) &&
!("categories" %in% names(variable))
if (is_binddef) {
# Pop the magic flag off
# TODO: allow setting this magic flag in makeArray()
do_post_bind_magic <- variable$autonames %||% FALSE
variable$autonames <- NULL
}
is_arraydef <- is_catvardef(variable) &&
!any(vapply(variable$subvariables, is_catvardef, logical(1)))
case3 <- !(is_binddef | is_arraydef)
Expand Down Expand Up @@ -131,6 +138,11 @@ POSTNewVariable <- function(catalog_url, variable) {
}
}
out <- do.POST(variable)
if (is_binddef && do_post_bind_magic) {
# Look for common variable name stems and clean that
var <- VariableEntity(crGET(out))
cleanImportedArray(var)
}
invisible(out)
}

Expand Down
60 changes: 60 additions & 0 deletions R/clean-array.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#' Clean up an array variable imported from a lossy file format
#'
#' Array and multiple-response variables coming in from SPSS or other file
#' formats generally need some work to reconstruct the "right" metadata because
#' they have to shove both parent and subvariable metadata into the "varlabels"
#' of the subvariables. This often follows a pattern of having varlabels with a
#' prefix containing the parent question wording (description) and a suffix that
#' is the actual response label.
#'
#' This function detects this prefix and reconstructs what may have been the
#' original array definition. The variable is left untouched when it has fewer
#' than two subvariables, when the shared prefix is shorter than
#' `min.prefix.length`, or when removing the prefix would leave any subvariable
#' with an empty name (i.e. one subvariable name is exactly the prefix).
#'
#' @param variable An array Variable
#' @param min.prefix.length Integer: how many characters long does the common
#' string need to be in order to consider it significant enough to use? Default
#' is 20.
#' @return `variable` with edits pushed to the API. A common prefix on
#' subvariable names is extracted and set as the variable's description.
#' @export
cleanImportedArray <- function (variable, min.prefix.length=20) {
    subvars <- subvariables(variable)
    if (length(subvars) > 1) {
        subvar_names <- names(subvars)
        prefix <- findCommonPrefix(subvar_names)
        prefix_length <- nchar(prefix)
        # Only act if the common stem is long enough to be meaningful
        if (prefix_length >= min.prefix.length) {
            # Drop the prefix by position rather than by pattern: we already
            # know it matches, the prefix may contain regex special characters,
            # and a "^.{n}" pattern is rejected by the regex engine once n
            # exceeds its bounded-repetition limit (long question wordings).
            suffixes <- substring(subvar_names, prefix_length + 1)
            # If a subvariable name *is* the prefix, stripping would leave it
            # nameless, so don't touch anything in that case.
            if (all(nzchar(suffixes))) {
                names(subvariables(variable)) <- suffixes
                # Now, remove whitespace and some punctuation from end of
                # prefix, but don't remove a question mark or other reasonable
                # punctuation
                prefix <- sub("[[:space:]\\-\\:;|]*$", "", prefix)
                description(variable) <- prefix
            }
        }
    }
    return(variable)
}

# Find the longest string that every element of `x` starts with.
#
# @param x Character vector of strings to compare.
# @return A length-1 character vector: the longest shared leading substring,
# or "" if the strings share no leading characters, or if `x` is empty or
# contains NA.
findCommonPrefix <- function (x) {
    # Duplicates can't change the answer; unique() also drops any names on x
    x <- unique(x)
    if (length(x) == 0 || anyNA(x)) {
        return("")
    }
    # Binary search on the prefix length. "The first k characters agree" is
    # monotone in k (if it holds for k it holds for every shorter k), so we
    # can maintain the invariant:
    #   * the first `lo` characters are known to be shared
    #   * no shared prefix can be longer than `hi`
    lo <- 0L
    hi <- min(nchar(x))
    while (lo < hi) {
        # Round the midpoint up so that `lo` always advances and we terminate
        mid <- (lo + hi + 1L) %/% 2L
        if (length(unique(substr(x, 1, mid))) == 1) {
            # Shared at this length: try longer
            lo <- mid
        } else {
            # Not shared: the answer is strictly shorter
            hi <- mid - 1L
        }
    }
    return(substr(x[1], 1, lo))
}
2 changes: 2 additions & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ JSON
JSONing
Jupyter
libcurl
lossy
magrittr
makeWeight
MemberCatalog
Expand All @@ -136,6 +137,7 @@ POSIXt
POSTed
POSTing
POSTs
powerpoint
PPA
pre
programmatically
Expand Down
31 changes: 31 additions & 0 deletions man/cleanImportedArray.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions tests/testthat/test-clean-array.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
context("Cleaning array variables")

test_that("findCommonPrefix", {
    # Small wrapper so each case reads as "these strings share this prefix"
    shared <- function (...) findCommonPrefix(c(...))

    # Strings diverge partway through
    expect_identical(shared("abc def", "ab cd ef"), "ab")
    # Trailing whitespace is part of the shared prefix
    expect_identical(
        shared("XX select all. A", "XX select all. BB"),
        "XX select all. "
    )
    # Nothing in common at all
    expect_identical(shared("A", "B"), "")
    expect_identical(shared("abc defg", "gfed cbaooo"), "")
    # Identical strings: the whole string is the prefix
    expect_identical(shared("abc defg", "abc defg"), "abc defg")
})