added fixColnames, improved and renamed trySplit

Qile0317 · Jul 4, 2024 · e80513a · e80513a
1 parent a7f0191
commit e80513a
Show file tree

Hide file tree

Showing 6 changed files with 178 additions and 32 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -11,6 +11,7 @@ License: MIT + file LICENSE
 Encoding: UTF-8
 Language: en-US
 Imports: 
+    assertthat,
     BiocManager,
     devtools,
     dplyr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -13,6 +13,7 @@ export(enclose)
 export(encloseBr)
 export(enumerateit)
 export(findMissingRdSections)
+export(fixColnames)
 export(fmrs)
 export(getAvgHex)
 export(getChar)
@@ -56,7 +57,7 @@ export(substrEnd)
 export(subtract)
 export(tableToNumeric)
 export(test_quietly_that)
-export(trySplit)
+export(trySplitWords)
 export(val)
 export(val1)
 export(validateObject)

diff --git a/R/spelling.R b/R/spelling.R
@@ -1,38 +1,44 @@
-#' Try to Split Names Based on Naming Convention
+#' Try to Split Words Based on Naming Convention
 #'
 #' This function attempts to split each string into its component words (by default,
 #' all in lowercase) based on camelCase, PascalCase, or snake_case conventions. If
-#' the string does not match any of these conventions, it returns the original string.
+#' the string does not match any of these conventions, it returns all groups of letters.
 #'
-#' @param x A character string or vector to be analyzed and split.
+#' @param ... character(s) to be split, treated as a single vector after unlisting
 #' @param conseq A logical indicating whether the `conseq` argument in [splitCamel()]/
 #' [splitPascal()] should be `TRUE` or `FALSE`.
 #' @param strictSnake A logical indicating the `strict` argument in [isSnakeCase()].
 #' @param uncase A logical indicating whether to remove all casing in the output to
 #' lowercase.
 #' 
-#' @return A list of character vectors, each containing the parts of the string 
-#'         split according to its naming convention or the original string if no 
-#'         convention matches.
+#' @return A list of character vectors, each containing the parts of the string
+#'         split into individual words.
 #' @export
 #' @keywords spelling
 #' @seealso \code{\link{splitCamel}}, \code{\link{splitPascal}}, \code{\link{splitSnake}},
 #'          \code{\link{isCamelCase}}, \code{\link{isPascalCase}}, \code{\link{isSnakeCase}}
-#' 
-#' @examples
-#' trySplit("camelCaseExample")
-#' trySplit("PascalCaseExample")
-#' trySplit("snake_case_example")
-#' trySplit("some|random|case")
 #'
-trySplit <- function(x, conseq = TRUE, strictSnake = FALSE, uncase = TRUE) {
+#' @examples
+#' trySplitWords("camelCaseExample")
+#' trySplitWords("PascalCaseExample")
+#' trySplitWords("snake_case_example", c("more_snake_cases"), "third_snake_case")
+#' trySplitWords("some|random|case")
+#' trySplitWords("Space Words", "UPPER_CASE", uncase = TRUE)
+#'
+trySplitWords <- function(
+    ..., conseq = TRUE, strictSnake = FALSE, uncase = TRUE
+) {
+
+    x <- unlist(list(...), use.names = FALSE)
+    assertthat::assert_that(is.character(x))
+
     lapply(x, function(y) {
         if (isCamelCase(y) || isPascalCase(y)) {
             out <- splitCamel(y, conseq = isTRUE(conseq))[[1]]
         } else if (isSnakeCase(y, strict = isTRUE(strictSnake))) {
             out <- splitSnake(y)[[1]]
         } else {
-            out <- y
+            out <- regmatches(y, gregexpr("[a-zA-Z]+", y))[[1]]
         }
         if (isTRUE(uncase)) return(tolower(out))
         out
@@ -63,6 +69,9 @@ trySplit <- function(x, conseq = TRUE, strictSnake = FALSE, uncase = TRUE) {
 #' @keywords spelling
 #' @source \url{https://stackoverflow.com/questions/8406974/splitting-camelcase-in-r}
 splitCamel <- function(x, conseq = TRUE) {
+
+    assertthat::assert_that(is.character(x))
+
     if (isTRUE(conseq)) {
         return(strsplit(
             x,
@@ -93,7 +102,10 @@ splitPascal <- splitCamel
 #' splitSnake("this_is_snake_case")
 #' splitSnake("another_example_here")
 #'
-splitSnake <- function(x) strsplit(x, "_", fixed = TRUE)
+splitSnake <- function(x) {
+    # Only character input can be split; fail fast on anything else.
+    assertthat::assert_that(is.character(x))
+    # Literal (non-regex) split on every underscore: one list element per input string.
+    pieces <- strsplit(x, split = "_", fixed = TRUE)
+    pieces
+}
 
 #' Check if String is camelCase
 #'
@@ -110,7 +122,10 @@ splitSnake <- function(x) strsplit(x, "_", fixed = TRUE)
 #' isCamelCase("CamelCase")   # returns FALSE
 #' isCamelCase("camelcase")   # returns TRUE
 #'
-isCamelCase <- function(x) grepl("^[a-z]+[A-Z]?([A-Za-z]*?)$", x)
+isCamelCase <- function(x) {
+    # Reject non-character input before any pattern matching.
+    assertthat::assert_that(is.character(x))
+    # Letters only, and the first character must be lower-case.
+    camelPattern <- "^[a-z]+[A-Z]?([A-Za-z]*?)$"
+    grepl(pattern = camelPattern, x = x)
+}
 
 #' Check if String is PascalCase
 #'
@@ -126,7 +141,10 @@ isCamelCase <- function(x) grepl("^[a-z]+[A-Z]?([A-Za-z]*?)$", x)
 #' isPascalCase("PascalCase") # returns TRUE
 #' isPascalCase("pascalCase") # returns FALSE
 #' isPascalCase("Pascalcase") # returns TRUE (any all-letter string starting with a capital matches)
-isPascalCase <- function(x) grepl("^[A-Z]+[a-z]?([A-Za-z]*?)$", x)
+isPascalCase <- function(x) {
+    # Reject non-character input before any pattern matching.
+    assertthat::assert_that(is.character(x))
+    # Letters only, and the first character must be upper-case.
+    pascalPattern <- "^[A-Z]+[a-z]?([A-Za-z]*?)$"
+    grepl(pattern = pascalPattern, x = x)
+}
 
 #' Check if String is snake_case
 #'
@@ -149,6 +167,9 @@ isPascalCase <- function(x) grepl("^[A-Z]+[a-z]?([A-Za-z]*?)$", x)
 #' isSnakeCase("Snake_Case", FALSE) # returns TRUE
 #' 
 isSnakeCase <- function(x, strict = TRUE) {
+
+    assertthat::assert_that(is.character(x))
+
     grepl(
         ifelse(
             isTRUE(strict),
@@ -174,6 +195,7 @@ isSnakeCase <- function(x, strict = TRUE) {
 #' # Check if 'b' is a vowel
 #' isVowel("b")
 isVowel <- function(x) {  # element-wise test: is each element of `x` one of the vowels a/e/i/o/u?
+    assertthat::assert_that(is.character(x))  # fail fast on non-character input
     tolower(x) %in% c("a", "e", "i", "o", "u")  # case-insensitive, whole-element match; returns a logical vector
 }
 
@@ -191,7 +213,10 @@ isVowel <- function(x) {
 #' startsWithVowel("apple")
 #' # Check if "banana" starts with a vowel
 #' startsWithVowel("banana")
-startsWithVowel <- function(x) isVowel(getChar(x, 1))
+startsWithVowel <- function(x) {
+    # Reject non-character input before extracting anything from it.
+    assertthat::assert_that(is.character(x))
+    # Take the first character of `x` and classify it.
+    firstChar <- getChar(x, 1)
+    isVowel(firstChar)
+}
 
 #' Prepend an Indefinite Article to a String
 #'
@@ -208,6 +233,7 @@ startsWithVowel <- function(x) isVowel(getChar(x, 1))
 #' # Prepend an indefinite article to "banana"
 #' prependIndefArticle("banana")
 prependIndefArticle <- function(x) {  # prefix `x` with "a " or "an " depending on startsWithVowel(x)
+    assertthat::assert_that(is.character(x))  # fail fast on non-character input
     paste("a", ifelse(startsWithVowel(x), "n", ""), " ", x, sep = "")  # "a" + optional "n" + space + x, joined without separators
 }
 
@@ -223,7 +249,10 @@ prependIndefArticle <- function(x) {
 #' @examples
 #' # Remove spaces from "hello world"
 #' stripSpaces("hello world")
-stripSpaces <- function(x) gsub(" ", "", x)
+stripSpaces <- function(x) {
+    # Reject non-character input before substituting.
+    assertthat::assert_that(is.character(x))
+    # Delete every space character; other whitespace is left untouched.
+    gsub(pattern = " ", replacement = "", x = x)
+}
 
 #' Find the Closest Word in a Set to a Given Word
 #'
@@ -241,6 +270,10 @@ stripSpaces <- function(x) gsub(" ", "", x)
 #' closestWord("hello", c("hallo", "hullo", "hey"))
 closestWord <- function(s, strset, distFunc = utils::adist) {
 
+    assertthat::assert_that(is.character(s))
+    assertthat::assert_that(is.character(strset))
+    assertthat::assert_that(is.function(distFunc) && (length(formals(distFunc)) >= 2))
+
     strset <- unique(strset)
     if (length(strset) == 1) return(strset)
 

diff --git a/R/wrangling.R b/R/wrangling.R
@@ -110,3 +110,70 @@ setRownames <- function(object, newRownames) {
     rownames(object) <- newRownames
     object
 }
+
+#' Fix Column Names
+#'
+#' @description
+#' `r lifecycle::badge("experimental")`
+#'
+#' This function fixes the column names of a given object so that all words are
+#' lower-case and separated by a specified delimiter, and any special characters
+#' are replaced according to a substitution map.
+#'
+#' @details
+#' The steps run in this order: every `subMap` and then `.subMap` substitution;
+#' replacement of `invalidRegex` matches with `spacing`; insertion of `spacing`
+#' at each lower-to-upper case boundary; whitespace trimming and lower-casing;
+#' removal of leading/trailing `spacing`; collapsing of repeated `spacing`; and,
+#' if `unique` is `TRUE`, de-duplication with [make.unique()]. An object without
+#' column names is returned unchanged.
+#'
+#' @param object A data frame or matrix.
+#' @param invalidRegex A single character string containing a regular expression
+#' pattern for invalid characters to replace. Default is
+#' "( )|(\\()|(\\))|(\\.)|(/)".
+#' @param spacing A single character string, used literally, that replaces
+#' invalid characters and separates words. Default is "_".
+#' @param subMap A named list where the names are regular expressions and the
+#' values are the replacement strings. These substitutions are applied before
+#' `.subMap`.
+#' @param .subMap A named list where the names are regular expressions and the
+#' values are the replacement strings. These substitutions are applied after
+#' `subMap`. The default maps runs of percent, dollar, plus, minus, asterisk,
+#' hash, ampersand and at signs to "pct", "dollars", "plus", "minus", "star",
+#' "cnt", "and" and "at" respectively.
+#' @param unique A logical indicating whether to ensure unique column names by
+#' appending a suffix (separated by `spacing`) if necessary. Default is FALSE.
+#'
+#' @return The data frame or matrix with fixed column names.
+#' @export
+#' @keywords wrangling
+#' @examples
+#' # Fix column names of a data frame; `check.names = FALSE` keeps the raw
+#' # names so that there is something left to fix
+#' df <- data.frame(
+#'     `A (1)` = c(1, 2, 3), `B/C` = c(4, 5, 6), `D+E` = c(7, 8, 9),
+#'     check.names = FALSE
+#' )
+#' fixColnames(df)
+#' fixColnames(df, spacing = ".")
+fixColnames <- function(
+    object,
+    invalidRegex = "( )|(\\()|(\\))|(\\.)|(/)",
+    spacing = "_",
+    subMap = NULL,
+    .subMap = list(
+        "%+" = "pct",
+        "\\$+" = "dollars",
+        "\\++" = "plus",
+        "-+" = "minus",
+        "\\*+" = "star",
+        "#+" = "cnt",
+        "&+" = "and",
+        "@+" = "at"
+    ),
+    unique = FALSE
+) {
+
+    assertthat::assert_that(is.character(invalidRegex) && length(invalidRegex) == 1)
+    assertthat::assert_that(is.character(spacing) && length(spacing) == 1)
+
+    newColnames <- colnames(object)
+    # Nothing to fix when the object carries no column names (e.g. a bare matrix).
+    if (is.null(newColnames)) return(object)
+
+    # Apply all substitutions: user-supplied entries first, then the defaults.
+    subMap <- append(subMap, .subMap)
+    for (pattern in names(subMap)) {
+        replacement <- subMap[[pattern]]
+        newColnames <- gsub(pattern, replacement, newColnames)
+    }
+
+    # `spacing` is a literal delimiter: escape it once for use inside a regex
+    # pattern and once for use inside a gsub() replacement string.
+    spacingPattern <- gsub(
+        "([\\\\^$.|?*+()\\[\\]{}])", "\\\\\\1", spacing, perl = TRUE
+    )
+    spacingReplacement <- gsub("\\", "\\\\", spacing, fixed = TRUE)
+
+    # Replace invalid characters, then split camelCase words with the delimiter.
+    newColnames <- gsub(invalidRegex, spacingReplacement, newColnames)
+    newColnames <- gsub(
+        "([a-z])([A-Z])", paste0("\\1", spacingReplacement, "\\2"), newColnames
+    )
+    newColnames <- tolower(trimws(newColnames))
+
+    # An empty delimiter leaves nothing to trim or collapse.
+    if (nzchar(spacing)) {
+        spacingRun <- paste0("(", spacingPattern, ")+")
+        # Drop delimiters at either end, then squeeze repeats down to one.
+        newColnames <- gsub(
+            paste0("^", spacingRun, "|", spacingRun, "$"), "", newColnames
+        )
+        newColnames <- gsub(spacingRun, spacingReplacement, newColnames)
+    }
+
+    if (unique) newColnames <- make.unique(newColnames, sep = spacing)
+
+    # Assign the new column names to the object
+    colnames(object) <- newColnames
+    object
+}
diff --git a/man/fixColnames.Rd b/man/fixColnames.Rd
diff --git a/man/trySplit.Rd → man/trySplitWords.Rd b/man/trySplit.Rd → man/trySplitWords.Rd