From 5963913060bb12ccb91500718a3e638b9b958541 Mon Sep 17 00:00:00 2001
From: Neal Richardson <neal@crunch.io>
Date: Fri, 1 Mar 2019 13:49:43 -0800
Subject: [PATCH 1/7] Boldly/foolishly roughing out code before writing tests

---
 R/add-variable.R | 57 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/R/add-variable.R b/R/add-variable.R
index 03bf1c9fb..85a57e081 100644
--- a/R/add-variable.R
+++ b/R/add-variable.R
@@ -81,6 +81,7 @@ validateVarDefRows <- function(vardef, numrows) {
 POSTNewVariable <- function(catalog_url, variable) {
     do.POST <- function(x) crPOST(catalog_url, body = toJSON(x, digits = 15))
 
+    is_binddef <- FALSE
     if (!any(c("expr", "derivation") %in% names(variable))) {
         ## If deriving a variable, skip this and go straight to POSTing
         if (variable$type %in% c("multiple_response", "categorical_array")) {
@@ -102,6 +103,12 @@ POSTNewVariable <- function(catalog_url, variable) {
             }
             is_binddef <- is.character(variable$subvariables) &&
                 !("categories" %in% names(variable))
+            if (is_binddef) {
+                # Pop the magic flag off
+                # TODO: allow setting this magic flag in makeArray()
+                do_post_bind_magic <- variable$autonames %||% FALSE
+                variable$autonames <- NULL
+            }
             is_arraydef <- is_catvardef(variable) &&
                 !any(vapply(variable$subvariables, is_catvardef, logical(1)))
             case3 <- !(is_binddef | is_arraydef)
@@ -131,6 +138,11 @@ POSTNewVariable <- function(catalog_url, variable) {
         }
     }
     out <- do.POST(variable)
+    if (is_binddef && do_post_bind_magic) {
+        # Look for common variable name stems and clean that
+        var <- VariableEntity(crGET(out))
+        cleanImportedArray(var)
+    }
     invisible(out)
 }
 
@@ -148,3 +160,48 @@ checkVarDefErrors <- function(new_var_urls) {
         )
     }
 }
+
+cleanImportedArray <- function (variable) {
+    MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable
+    prefix <- findCommonPrefix(names(subvariales(variable)))
+    # If length of the common stem is enough, extract it,
+    # remove it from the subvar names,
+    # remove trailing whitespace/punctuation,
+    # and set it as variable description.
+    if (nchar(prefix) >= MIN_PREFIX_LENGTH) {
+        # Use wildcard regexp with length just in case there are special chars in prefix.
+        # We already know that the prefix matches.
+        re <- paste0("^.{", nchar(prefix), "}")
+        names(subvariables(variable)) <- sub(re, "", names(subvariables(variable)))
+        # Now, remove whitespace and some punctuation from end of prefix, but
+        # don't remove a question mark or other reasonable punctuation
+        prefix <- sub("[[:space:]\\-\\:;]*$", "", prefix)
+        description(variable) <- prefix
+    }
+    return(variable)
+}
+
+findCommonPrefix <- function (x) {
+    # Find the shortest one and start with that
+    nchars <- nchar(x)
+    shortest <- which.min(nchars)
+    step_size <- prefix_length <- min(nchars)
+    while (step_size > 0 and prefix_length > 0) {
+        step_size <- round(step_size / 2)
+        # Bisect to find the common stem
+        stems <- substr(1, prefix_length, x)
+        if (length(unique(stems)) == 1) {
+            # Go longer
+            prefix_length <- prefix_length + step_size
+        } else {
+            # Go shorter
+            prefix_length <- prefix_length - step_size
+        }
+    }
+
+    if (prefix_length > 0) {
+        return(substr(1, prefix_length, shortest))
+    } else {
+        return("")
+    }
+}

From 643c177548728e32ce03145c1c57706a537f8296 Mon Sep 17 00:00:00 2001
From: Neal Richardson <neal@crunch.io>
Date: Fri, 1 Mar 2019 14:41:52 -0800
Subject: [PATCH 2/7] Make prefix function work

---
 R/add-variable.R                  | 58 +++++++++++++++----------------
 tests/testthat/test-clean-array.R |  9 +++++
 2 files changed, 37 insertions(+), 30 deletions(-)
 create mode 100644 tests/testthat/test-clean-array.R

diff --git a/R/add-variable.R b/R/add-variable.R
index 85a57e081..b88336bf2 100644
--- a/R/add-variable.R
+++ b/R/add-variable.R
@@ -162,46 +162,44 @@ checkVarDefErrors <- function(new_var_urls) {
 }
 
 cleanImportedArray <- function (variable) {
-    MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable
-    prefix <- findCommonPrefix(names(subvariales(variable)))
-    # If length of the common stem is enough, extract it,
-    # remove it from the subvar names,
-    # remove trailing whitespace/punctuation,
-    # and set it as variable description.
-    if (nchar(prefix) >= MIN_PREFIX_LENGTH) {
-        # Use wildcard regexp with length just in case there are special chars in prefix.
-        # We already know that the prefix matches.
-        re <- paste0("^.{", nchar(prefix), "}")
-        names(subvariables(variable)) <- sub(re, "", names(subvariables(variable)))
-        # Now, remove whitespace and some punctuation from end of prefix, but
-        # don't remove a question mark or other reasonable punctuation
-        prefix <- sub("[[:space:]\\-\\:;]*$", "", prefix)
-        description(variable) <- prefix
+    if (length(subvariables(variable)) > 1) {
+        MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable
+        prefix <- findCommonPrefix(names(subvariables(variable)))
+        # If length of the common stem is enough, extract it,
+        # remove it from the subvar names,
+        # remove trailing whitespace/punctuation,
+        # and set it as variable description.
+        if (nchar(prefix) >= MIN_PREFIX_LENGTH) {
+            # Use wildcard regexp with length just in case there are special chars in prefix.
+            # We already know that the prefix matches.
+            re <- paste0("^.{", nchar(prefix), "}")
+            names(subvariables(variable)) <- sub(re, "", names(subvariables(variable)))
+            # Now, remove whitespace and some punctuation from end of prefix, but
+            # don't remove a question mark or other reasonable punctuation
+            prefix <- sub("[[:space:]\\-\\:;]*$", "", prefix)
+            description(variable) <- prefix
+        }
     }
     return(variable)
 }
 
 findCommonPrefix <- function (x) {
     # Find the shortest one and start with that
-    nchars <- nchar(x)
-    shortest <- which.min(nchars)
-    step_size <- prefix_length <- min(nchars)
-    while (step_size > 0 and prefix_length > 0) {
-        step_size <- round(step_size / 2)
+    step_size <- prefix_length <- min(nchar(x))
+    out <- ""
+    while (step_size > 0 && prefix_length > 0) {
         # Bisect to find the common stem
-        stems <- substr(1, prefix_length, x)
-        if (length(unique(stems)) == 1) {
-            # Go longer
+        step_size <- round(step_size / 2)
+        stems <- unique(substr(x, 1, prefix_length))
+        if (length(stems) == 1) {
+            # Keep this one
+            out <- stems
+            # Try longer
             prefix_length <- prefix_length + step_size
         } else {
-            # Go shorter
+            # Try shorter
             prefix_length <- prefix_length - step_size
         }
     }
-
-    if (prefix_length > 0) {
-        return(substr(1, prefix_length, shortest))
-    } else {
-        return("")
-    }
+    return(out)
 }
diff --git a/tests/testthat/test-clean-array.R b/tests/testthat/test-clean-array.R
new file mode 100644
index 000000000..717ce3c32
--- /dev/null
+++ b/tests/testthat/test-clean-array.R
@@ -0,0 +1,9 @@
+context("Cleaning array variables")
+
+test_that("findCommonPrefix", {
+    expect_identical(findCommonPrefix(c("abc def", "ab cd ef")), "ab")
+    expect_identical(findCommonPrefix(c("XX select all. A", "XX select all. BB")), "XX select all. ")
+    expect_identical(findCommonPrefix(c("A", "B")), "")
+    expect_identical(findCommonPrefix(c("abc defg", "abc defg")), "abc defg")
+    expect_identical(findCommonPrefix(c("abc defg", "gfed cbaooo")), "")
+})

From 3bc814c2633228aa9342e70f5e250138b33120f0 Mon Sep 17 00:00:00 2001
From: Neal Richardson <neal@crunch.io>
Date: Fri, 1 Mar 2019 14:42:39 -0800
Subject: [PATCH 3/7] Move code to clean-array.R

---
 R/add-variable.R | 43 -------------------------------------------
 R/clean-array.R  | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 43 deletions(-)
 create mode 100644 R/clean-array.R

diff --git a/R/add-variable.R b/R/add-variable.R
index b88336bf2..648a374fe 100644
--- a/R/add-variable.R
+++ b/R/add-variable.R
@@ -160,46 +160,3 @@ checkVarDefErrors <- function(new_var_urls) {
         )
     }
 }
-
-cleanImportedArray <- function (variable) {
-    if (length(subvariables(variable)) > 1) {
-        MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable
-        prefix <- findCommonPrefix(names(subvariables(variable)))
-        # If length of the common stem is enough, extract it,
-        # remove it from the subvar names,
-        # remove trailing whitespace/punctuation,
-        # and set it as variable description.
-        if (nchar(prefix) >= MIN_PREFIX_LENGTH) {
-            # Use wildcard regexp with length just in case there are special chars in prefix.
-            # We already know that the prefix matches.
-            re <- paste0("^.{", nchar(prefix), "}")
-            names(subvariables(variable)) <- sub(re, "", names(subvariables(variable)))
-            # Now, remove whitespace and some punctuation from end of prefix, but
-            # don't remove a question mark or other reasonable punctuation
-            prefix <- sub("[[:space:]\\-\\:;]*$", "", prefix)
-            description(variable) <- prefix
-        }
-    }
-    return(variable)
-}
-
-findCommonPrefix <- function (x) {
-    # Find the shortest one and start with that
-    step_size <- prefix_length <- min(nchar(x))
-    out <- ""
-    while (step_size > 0 && prefix_length > 0) {
-        # Bisect to find the common stem
-        step_size <- round(step_size / 2)
-        stems <- unique(substr(x, 1, prefix_length))
-        if (length(stems) == 1) {
-            # Keep this one
-            out <- stems
-            # Try longer
-            prefix_length <- prefix_length + step_size
-        } else {
-            # Try shorter
-            prefix_length <- prefix_length - step_size
-        }
-    }
-    return(out)
-}
diff --git a/R/clean-array.R b/R/clean-array.R
new file mode 100644
index 000000000..2891ccc5b
--- /dev/null
+++ b/R/clean-array.R
@@ -0,0 +1,42 @@
+cleanImportedArray <- function (variable) {
+    if (length(subvariables(variable)) > 1) {
+        MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable
+        prefix <- findCommonPrefix(names(subvariables(variable)))
+        # If length of the common stem is enough, extract it,
+        # remove it from the subvar names,
+        # remove trailing whitespace/punctuation,
+        # and set it as variable description.
+        if (nchar(prefix) >= MIN_PREFIX_LENGTH) {
+            # Use wildcard regexp with length just in case there are special chars in prefix.
+            # We already know that the prefix matches.
+            re <- paste0("^.{", nchar(prefix), "}")
+            names(subvariables(variable)) <- sub(re, "", names(subvariables(variable)))
+            # Now, remove whitespace and some punctuation from end of prefix, but
+            # don't remove a question mark or other reasonable punctuation
+            prefix <- sub("[[:space:]\\-\\:;]*$", "", prefix)
+            description(variable) <- prefix
+        }
+    }
+    return(variable)
+}
+
+findCommonPrefix <- function (x) {
+    # Find the shortest one and start with that
+    step_size <- prefix_length <- min(nchar(x))
+    out <- ""
+    while (step_size > 0 && prefix_length > 0) {
+        # Bisect to find the common stem
+        step_size <- round(step_size / 2)
+        stems <- unique(substr(x, 1, prefix_length))
+        if (length(stems) == 1) {
+            # Keep this one
+            out <- stems
+            # Try longer
+            prefix_length <- prefix_length + step_size
+        } else {
+            # Try shorter
+            prefix_length <- prefix_length - step_size
+        }
+    }
+    return(out)
+}

From 5dbb07bc136b342c24d0b816d7e97858bbb1c766 Mon Sep 17 00:00:00 2001
From: Neal Richardson <neal@crunch.io>
Date: Fri, 1 Mar 2019 14:48:06 -0800
Subject: [PATCH 4/7] collate

---
 DESCRIPTION | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DESCRIPTION b/DESCRIPTION
index 5d2e4a51e..1adf095ff 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -64,6 +64,7 @@ Collate:
     'categories.R'
     'category.R'
     'change-category-id.R'
+    'clean-array.R'
     'combine-categories.R'
     'compare-categories.R'
     'compare-datasets.R'

From f876457e7fcb102d1cbe7b67e4b1d0cfd3c2b58a Mon Sep 17 00:00:00 2001
From: Neal Richardson <neal@crunch.io>
Date: Fri, 1 Mar 2019 15:48:08 -0800
Subject: [PATCH 5/7] Docs

---
 NAMESPACE                 |  1 +
 R/clean-array.R           | 16 ++++++++++++++++
 man/cleanImportedArray.Rd | 27 +++++++++++++++++++++++++++
 3 files changed, 44 insertions(+)
 create mode 100644 man/cleanImportedArray.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 5bd276fdb..4b0062a91 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -109,6 +109,7 @@ export(categoriesFromLevels)
 export(cd)
 export(changeCategoryID)
 export(checkForNewVersion)
+export(cleanImportedArray)
 export(cleanseBatches)
 export(collapseCategories)
 export(combine)
diff --git a/R/clean-array.R b/R/clean-array.R
index 2891ccc5b..960ce5acf 100644
--- a/R/clean-array.R
+++ b/R/clean-array.R
@@ -1,3 +1,19 @@
+#' Clean up an array variable imported from a lossy file format 
+#' 
+#' Array and multiple-response variables coming in from SPSS or other file
+#' formats generally need some work to reconstruct the "right" metadata because
+#' they have to shove both parent and subvariable metadata into the "varlabels"
+#' of the subvariables. This often follows a pattern of having varlabels with a
+#' prefix containing the parent question wording (description) and a suffix that
+#' is the actual response label.
+#'
+#' This function detects this prefix and reconstructs what may have been the 
+#' original array definition.
+#'
+#' @param variable An array Variable
+#' @return `variable` with edits pushed to the API. A common prefix on 
+#' subvariable names is extracted and set as the variable's description.
+#' @export
 cleanImportedArray <- function (variable) {
     if (length(subvariables(variable)) > 1) {
         MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable
diff --git a/man/cleanImportedArray.Rd b/man/cleanImportedArray.Rd
new file mode 100644
index 000000000..b87618f24
--- /dev/null
+++ b/man/cleanImportedArray.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/clean-array.R
+\name{cleanImportedArray}
+\alias{cleanImportedArray}
+\title{Clean up an array variable imported from a lossy file format}
+\usage{
+cleanImportedArray(variable)
+}
+\arguments{
+\item{variable}{An array Variable}
+}
+\value{
+\code{variable} with edits pushed to the API. A common prefix on
+subvariable names is extracted and set as the variable's description.
+}
+\description{
+Array and multiple-response variables coming in from SPSS or other file
+formats generally need some work to reconstruct the "right" metadata because
+they have to shove both parent and subvariable metadata into the "varlabels"
+of the subvariables. This often follows a pattern of having varlabels with a
+prefix containing the parent question wording (description) and a suffix that
+is the actual response label.
+}
+\details{
+This function detects this prefix and reconstructs what may have been the
+original array definition.
+}

From eb8546bc9921bb978cc2011cdef5d75384daecc0 Mon Sep 17 00:00:00 2001
From: Neal Richardson <neal@crunch.io>
Date: Fri, 1 Mar 2019 16:19:52 -0800
Subject: [PATCH 6/7] Tweak cleanImportedArray after manual testing

---
 R/clean-array.R           | 18 ++++++++++--------
 man/cleanImportedArray.Rd |  6 +++++-
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/R/clean-array.R b/R/clean-array.R
index 960ce5acf..99e5ec36c 100644
--- a/R/clean-array.R
+++ b/R/clean-array.R
@@ -1,5 +1,5 @@
-#' Clean up an array variable imported from a lossy file format 
-#' 
+#' Clean up an array variable imported from a lossy file format
+#'
 #' Array and multiple-response variables coming in from SPSS or other file
 #' formats generally need some work to reconstruct the "right" metadata because
 #' they have to shove both parent and subvariable metadata into the "varlabels"
@@ -7,29 +7,31 @@
 #' prefix containing the parent question wording (description) and a suffix that
 #' is the actual response label.
 #'
-#' This function detects this prefix and reconstructs what may have been the 
+#' This function detects this prefix and reconstructs what may have been the
 #' original array definition.
 #'
 #' @param variable An array Variable
-#' @return `variable` with edits pushed to the API. A common prefix on 
+#' @param min.prefix.length Integer: how many characters long does the common
+#' string need to be in order to consider it significant enough to use? Default
+#' is 20.
+#' @return `variable` with edits pushed to the API. A common prefix on
 #' subvariable names is extracted and set as the variable's description.
 #' @export
-cleanImportedArray <- function (variable) {
+cleanImportedArray <- function (variable, min.prefix.length=20) {
     if (length(subvariables(variable)) > 1) {
-        MIN_PREFIX_LENGTH <- 20 # TODO: tune/make configurable
         prefix <- findCommonPrefix(names(subvariables(variable)))
         # If length of the common stem is enough, extract it,
         # remove it from the subvar names,
         # remove trailing whitespace/punctuation,
         # and set it as variable description.
-        if (nchar(prefix) >= MIN_PREFIX_LENGTH) {
+        if (nchar(prefix) >= min.prefix.length) {
             # Use wildcard regexp with length just in case there are special chars in prefix.
             # We already know that the prefix matches.
             re <- paste0("^.{", nchar(prefix), "}")
             names(subvariables(variable)) <- sub(re, "", names(subvariables(variable)))
             # Now, remove whitespace and some punctuation from end of prefix, but
             # don't remove a question mark or other reasonable punctuation
-            prefix <- sub("[[:space:]\\-\\:;]*$", "", prefix)
+            prefix <- sub("[[:space:]\\-\\:;|]*$", "", prefix)
             description(variable) <- prefix
         }
     }
diff --git a/man/cleanImportedArray.Rd b/man/cleanImportedArray.Rd
index b87618f24..4b9b4aacb 100644
--- a/man/cleanImportedArray.Rd
+++ b/man/cleanImportedArray.Rd
@@ -4,10 +4,14 @@
 \alias{cleanImportedArray}
 \title{Clean up an array variable imported from a lossy file format}
 \usage{
-cleanImportedArray(variable)
+cleanImportedArray(variable, min.prefix.length = 20)
 }
 \arguments{
 \item{variable}{An array Variable}
+
+\item{min.prefix.length}{Integer: how many characters long does the common
+string need to be in order to consider it significant enough to use? Default
+is 20.}
 }
 \value{
 \code{variable} with edits pushed to the API. A common prefix on

From c640f4d3ff92f5cc003f32209a458e9d102a1472 Mon Sep 17 00:00:00 2001
From: Neal Richardson <neal@crunch.io>
Date: Sun, 3 Mar 2019 08:31:33 -0800
Subject: [PATCH 7/7] update spelling

---
 inst/WORDLIST | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/inst/WORDLIST b/inst/WORDLIST
index b631ce493..1a374b1c3 100644
--- a/inst/WORDLIST
+++ b/inst/WORDLIST
@@ -112,6 +112,7 @@ JSON
 JSONing
 Jupyter
 libcurl
+lossy
 magrittr
 makeWeight
 MemberCatalog
@@ -136,6 +137,7 @@ POSIXt
 POSTed
 POSTing
 POSTs
+powerpoint
 PPA
 pre
 programmatically