From e80513a1a67508c2841d518e60356f6ff12350ca Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Wed, 3 Jul 2024 22:06:03 -0700
Subject: [PATCH] added fixColnames, improved and renamed trySplit

---
 DESCRIPTION                           |  1 +
 NAMESPACE                             |  3 +-
 R/spelling.R                          | 71 ++++++++++++++++++++-------
 R/wrangling.R                         | 67 +++++++++++++++++++++++++
 man/fixColnames.Rd                    | 44 +++++++++++++++++
 man/{trySplit.Rd => trySplitWords.Rd} | 24 ++++-----
 6 files changed, 178 insertions(+), 32 deletions(-)
 create mode 100644 man/fixColnames.Rd
 rename man/{trySplit.Rd => trySplitWords.Rd} (62%)

diff --git a/DESCRIPTION b/DESCRIPTION
index 40dfbbc..357027d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -11,6 +11,7 @@ License: MIT + file LICENSE
 Encoding: UTF-8
 Language: en-US
 Imports: 
+    assertthat,
     BiocManager,
     devtools,
     dplyr,
diff --git a/NAMESPACE b/NAMESPACE
index 7e1a34a..660701e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -13,6 +13,7 @@ export(enclose)
 export(encloseBr)
 export(enumerateit)
 export(findMissingRdSections)
+export(fixColnames)
 export(fmrs)
 export(getAvgHex)
 export(getChar)
@@ -56,7 +57,7 @@ export(substrEnd)
 export(subtract)
 export(tableToNumeric)
 export(test_quietly_that)
-export(trySplit)
+export(trySplitWords)
 export(val)
 export(val1)
 export(validateObject)
diff --git a/R/spelling.R b/R/spelling.R
index faad571..d8e799f 100644
--- a/R/spelling.R
+++ b/R/spelling.R
@@ -1,38 +1,44 @@
-#' Try to Split Names Based on Naming Convention
+#' Try to Split Words Based on Naming Convention
 #'
 #' This function attempts to split characters into its component words (and by default,
 #' all in lowercase) based on  camelCase, PascalCase, or snake_case conventions. If
-#' the string does not match any of these conventions, it returns the original string.
+#' the string does not match any of these conventions, it returns all groups of letters.
 #'
-#' @param x A character string or vector to be analyzed and split.
+#' @param ... character(s) to be split, treated as a single vector after unlisting
 #' @param conseq A logical indicating whether the `conseq` argument in [splitCamel()]/
 #' [splitPascal()] should be `TRUE` or `FALSE`.
 #' @param strictSnake A logical indicating the `strict` argument in [isSnakeCase()].
 #' @param uncase A logical indicating whether to remove all casing in the output to
 #' lowercase.
 #' 
-#' @return A list of character vectors, each containing the parts of the string 
-#'         split according to its naming convention or the original string if no 
-#'         convention matches.
+#' @return A list of character vectors, each containing the parts of the string
+#'         split into individual words.
 #' @export
 #' @keywords spelling
 #' @seealso \code{\link{splitCamel}}, \code{\link{splitPascal}}, \code{\link{splitSnake}},
 #'          \code{\link{isCamelCase}}, \code{\link{isPascalCase}}, \code{\link{isSnakeCase}}
-#' 
-#' @examples
-#' trySplit("camelCaseExample")
-#' trySplit("PascalCaseExample")
-#' trySplit("snake_case_example")
-#' trySplit("some|random|case")
 #'
-trySplit <- function(x, conseq = TRUE, strictSnake = FALSE, uncase = TRUE) {
+#' @examples
+#' trySplitWords("camelCaseExample")
+#' trySplitWords("PascalCaseExample")
+#' trySplitWords("snake_case_example", c("more_snake_cases"), "third_snake_case")
+#' trySplitWords("some|random|case")
+#' trySplitWords("Space Words", "UPPER_CASE", uncase = TRUE)
+#'
+trySplitWords <- function(
+    ..., conseq = TRUE, strictSnake = FALSE, uncase = TRUE
+) {
+
+    x <- unlist(list(...), use.names = FALSE)
+    assertthat::assert_that(is.character(x))
+
     lapply(x, function(y) {
         if (isCamelCase(y) || isPascalCase(y)) {
             out <- splitCamel(y, conseq = isTRUE(conseq))[[1]]
         } else if (isSnakeCase(y, strict = isTRUE(strictSnake))) {
             out <- splitSnake(y)[[1]]
         } else {
-            out <- y
+            out <- regmatches(y, gregexpr("[a-zA-Z]+", y))[[1]]
         }
         if (isTRUE(uncase)) return(tolower(out))
         out
@@ -63,6 +69,9 @@ trySplit <- function(x, conseq = TRUE, strictSnake = FALSE, uncase = TRUE) {
 #' @keywords spelling
 #' @source \url{https://stackoverflow.com/questions/8406974/splitting-camelcase-in-r}
 splitCamel <- function(x, conseq = TRUE) {
+
+    assertthat::assert_that(is.character(x))
+    
     if (isTRUE(conseq)) {
         return(strsplit(
             x,
@@ -93,7 +102,10 @@ splitPascal <- splitCamel
 #' splitSnake("this_is_snake_case")
 #' splitSnake("another_example_here")
 #'
-splitSnake <- function(x) strsplit(x, "_", fixed = TRUE)
+splitSnake <- function(x) {
+    assertthat::assert_that(is.character(x))
+    strsplit(x, "_", fixed = TRUE)
+}
 
 #' Check if String is camelCase
 #'
@@ -110,7 +122,10 @@ splitSnake <- function(x) strsplit(x, "_", fixed = TRUE)
 #' isCamelCase("CamelCase")   # returns FALSE
 #' isCamelCase("camelcase")   # returns TRUE
 #'
-isCamelCase <- function(x) grepl("^[a-z]+[A-Z]?([A-Za-z]*?)$", x)
+isCamelCase <- function(x) {
+    assertthat::assert_that(is.character(x))
+    grepl("^[a-z]+[A-Z]?([A-Za-z]*?)$", x)
+}
 
 #' Check if String is PascalCase
 #'
@@ -126,7 +141,10 @@ isCamelCase <- function(x) grepl("^[a-z]+[A-Z]?([A-Za-z]*?)$", x)
 #' isPascalCase("PascalCase") # returns TRUE
 #' isPascalCase("pascalCase") # returns FALSE
 #' isPascalCase("Pascalcase") # returns FALSE
-isPascalCase <- function(x) grepl("^[A-Z]+[a-z]?([A-Za-z]*?)$", x)
+isPascalCase <- function(x) {
+    assertthat::assert_that(is.character(x))
+    grepl("^[A-Z]+[a-z]?([A-Za-z]*?)$", x)
+}
 
 #' Check if String is snake_case
 #'
@@ -149,6 +167,9 @@ isPascalCase <- function(x) grepl("^[A-Z]+[a-z]?([A-Za-z]*?)$", x)
 #' isSnakeCase("Snake_Case", FALSE) # returns TRUE
 #' 
 isSnakeCase <- function(x, strict = TRUE) {
+
+    assertthat::assert_that(is.character(x))
+
     grepl(
         ifelse(
             isTRUE(strict),
@@ -174,6 +195,7 @@ isSnakeCase <- function(x, strict = TRUE) {
 #' # Check if 'b' is a vowel
 #' isVowel("b")
 isVowel <- function(x) {
+    assertthat::assert_that(is.character(x))
     tolower(x) %in% c("a", "e", "i", "o", "u")
 }
 
@@ -191,7 +213,10 @@ isVowel <- function(x) {
 #' startsWithVowel("apple")
 #' # Check if "banana" starts with a vowel
 #' startsWithVowel("banana")
-startsWithVowel <- function(x) isVowel(getChar(x, 1))
+startsWithVowel <- function(x) {
+    assertthat::assert_that(is.character(x))
+    isVowel(getChar(x, 1))
+}
 
 #' Prepend an Indefinite Article to a String
 #'
@@ -208,6 +233,7 @@ startsWithVowel <- function(x) isVowel(getChar(x, 1))
 #' # Prepend an indefinite article to "banana"
 #' prependIndefArticle("banana")
 prependIndefArticle <- function(x) {
+    assertthat::assert_that(is.character(x))
     paste("a", ifelse(startsWithVowel(x), "n", ""), " ", x, sep = "")
 }
 
@@ -223,7 +249,10 @@ prependIndefArticle <- function(x) {
 #' @examples
 #' # Remove spaces from "hello world"
 #' stripSpaces("hello world")
-stripSpaces <- function(x) gsub(" ", "", x)
+stripSpaces <- function(x) {
+    assertthat::assert_that(is.character(x))
+    gsub(" ", "", x)
+}
 
 #' Find the Closest Word in a Set to a Given Word
 #'
@@ -241,6 +270,10 @@ stripSpaces <- function(x) gsub(" ", "", x)
 #' closestWord("hello", c("hallo", "hullo", "hey"))
 closestWord <- function(s, strset, distFunc = utils::adist) {
 
+    assertthat::assert_that(is.character(s))
+    assertthat::assert_that(is.character(strset))
+    assertthat::assert_that(is.function(distFunc) && (length(formals(distFunc)) >= 2))
+
     strset <- unique(strset)
     if (length(strset) == 1) return(strset)
 
diff --git a/R/wrangling.R b/R/wrangling.R
index ce7d451..1c6e0fb 100644
--- a/R/wrangling.R
+++ b/R/wrangling.R
@@ -110,3 +110,70 @@ setRownames <- function(object, newRownames) {
     rownames(object) <- newRownames
     object
 }
+
+#' Fix Column Names
+#'
+#' @description 
+#' `r lifecycle::badge("experimental")`
+#'
+#' This function fixes the column names of a given object so that all words are spaced by a specified delimiter, 
+#' and any special characters are replaced according to a substitution map.
+#'
+#' @param object A data frame or matrix.
+#' @param invalidRegex A character string containing a regular expression pattern for invalid characters to replace. Default is "( )|(\\()|(\\))|(\\.)|(/)".
+#' @param spacing A character string to replace invalid characters with. Default is "_".
+#' @param subMap A named list where the names are regular expressions and the values are the replacement strings. These substitutions are applied before `.subMap`.
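+#' @param .subMap A named list where the names are regular expressions and the values are the replacement strings. These substitutions are applied after `subMap`. Default is a list mapping runs of common symbols (percent, dollar, plus, minus, asterisk, hash, ampersand, and at signs) to the words "pct", "dollars", "plus", "minus", "star", "cnt", "and", and "at".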
+#' @param unique A logical indicating whether to ensure unique column names by appending a suffix if necessary. Default is FALSE.
+#'
+#' @return The data frame or matrix with fixed column names.
+#' @export
+#' @keywords wrangling
+#' @examples
+#' # Fix column names of a data frame
+#' df <- data.frame(`A (1)` = c(1, 2, 3), `B/C` = c(4, 5, 6), `D+E` = c(7, 8, 9), check.names = FALSE)
+#' fixColnames(df)
+fixColnames <- function(
+    object,
+    invalidRegex = "( )|(\\()|(\\))|(\\.)|(/)",
+    spacing = "_",
+    subMap = NULL,
+    .subMap = list(
+        "%+" = "pct",
+        "\\$+" = "dollars",
+        "\\++" = "plus",
+        "-+" = "minus",
+        "\\*+" = "star",
+        "#+" = "cnt",
+        "&+" = "and",
+        "@+" = "at"
+    ),
+    unique = FALSE
+) {
+
+    assertthat::assert_that(is.character(invalidRegex) && length(invalidRegex) == 1)
+    assertthat::assert_that(is.character(spacing) && length(spacing) == 1)
+
+    subMap <- append(subMap, .subMap)
+
+    # Apply all substitutions from the substitution maps
+    newColnames <- colnames(object)
+    for (pattern in names(subMap)) {
+        replacement <- subMap[[pattern]]
+        newColnames <- gsub(pattern, replacement, newColnames)
+    }
+
+    gsubr <- function(x, pattern, replacement) gsub(pattern, replacement, x)
+    
+    newColnames <- newColnames %>%
+        gsubr(invalidRegex, spacing) %>%
+        gsubr("([a-z])([A-Z])", "\\1_\\2") %>%
+        (function(x) tolower(trimws(x))) %>%
+        gsubr("(^_+|_+$)", "") %>%
+        gsubr("_+", "_") %>%
+        (function(x) if (unique) make.unique(x, sep = spacing) else x)
+
+    # Assign the new column names to the object
+    colnames(object) <- newColnames
+    object
+}
diff --git a/man/fixColnames.Rd b/man/fixColnames.Rd
new file mode 100644
index 0000000..56151a8
--- /dev/null
+++ b/man/fixColnames.Rd
@@ -0,0 +1,44 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/wrangling.R
+\name{fixColnames}
+\alias{fixColnames}
+\title{Fix Column Names}
+\usage{
+fixColnames(
+  object,
+  invalidRegex = "( )|(\\\\()|(\\\\))|(\\\\.)|(/)",
+  spacing = "_",
+  subMap = NULL,
+  .subMap = list(`\%+` = "pct", `\\\\$+` = "dollars", `\\\\++` = "plus", `-+` = "minus",
+    `\\\\*+` = "star", `#+` = "cnt", `&+` = "and", `@+` = "at"),
+  unique = FALSE
+)
+}
+\arguments{
+\item{object}{A data frame or matrix.}
+
+\item{invalidRegex}{A character string containing a regular expression pattern for invalid characters to replace. Default is "( )|(\\()|(\\))|(\\.)|(/)".}
+
+\item{spacing}{A character string to replace invalid characters with. Default is "_".}
+
+\item{subMap}{A named list where the names are regular expressions and the values are the replacement strings. These substitutions are applied before \code{.subMap}.}
+
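+\item{.subMap}{A named list where the names are regular expressions and the values are the replacement strings. These substitutions are applied after \code{subMap}. Default is a list mapping runs of common symbols (percent, dollar, plus, minus, asterisk, hash, ampersand, and at signs) to the words "pct", "dollars", "plus", "minus", "star", "cnt", "and", and "at".}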
+
+\item{unique}{A logical indicating whether to ensure unique column names by appending a suffix if necessary. Default is FALSE.}
+}
+\value{
+The data frame or matrix with fixed column names.
+}
+\description{
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
+This function fixes the column names of a given object so that all words are spaced by a specified delimiter,
+and any special characters are replaced according to a substitution map.
+}
+\examples{
+# Fix column names of a data frame
+df <- data.frame(`A (1)` = c(1, 2, 3), `B/C` = c(4, 5, 6), `D+E` = c(7, 8, 9), check.names = FALSE)
+fixColnames(df)
+}
+\keyword{wrangling}
diff --git a/man/trySplit.Rd b/man/trySplitWords.Rd
similarity index 62%
rename from man/trySplit.Rd
rename to man/trySplitWords.Rd
index 9ba959d..6a4f234 100644
--- a/man/trySplit.Rd
+++ b/man/trySplitWords.Rd
@@ -1,13 +1,13 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/spelling.R
-\name{trySplit}
-\alias{trySplit}
-\title{Try to Split Names Based on Naming Convention}
+\name{trySplitWords}
+\alias{trySplitWords}
+\title{Try to Split Words Based on Naming Convention}
 \usage{
-trySplit(x, conseq = TRUE, strictSnake = FALSE, uncase = TRUE)
+trySplitWords(..., conseq = TRUE, strictSnake = FALSE, uncase = TRUE)
 }
 \arguments{
-\item{x}{A character string or vector to be analyzed and split.}
+\item{...}{character(s) to be split, treated as a single vector after unlisting}
 
 \item{conseq}{A logical indicating whether the \code{conseq} argument in \code{\link[=splitCamel]{splitCamel()}}/
 \code{\link[=splitPascal]{splitPascal()}} should be \code{TRUE} or \code{FALSE}.}
@@ -19,19 +19,19 @@ lowercase.}
 }
 \value{
 A list of character vectors, each containing the parts of the string
-split according to its naming convention or the original string if no
-convention matches.
+split into individual words.
 }
 \description{
 This function attempts to split characters into its component words (and by default,
 all in lowercase) based on  camelCase, PascalCase, or snake_case conventions. If
-the string does not match any of these conventions, it returns the original string.
+the string does not match any of these conventions, it returns all groups of letters.
 }
 \examples{
-trySplit("camelCaseExample")
-trySplit("PascalCaseExample")
-trySplit("snake_case_example")
-trySplit("some|random|case")
+trySplitWords("camelCaseExample")
+trySplitWords("PascalCaseExample")
+trySplitWords("snake_case_example", c("more_snake_cases"), "third_snake_case")
+trySplitWords("some|random|case")
+trySplitWords("Space Words", "UPPER_CASE", uncase = TRUE)
 
 }
 \seealso{