From e80513a1a67508c2841d518e60356f6ff12350ca Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Wed, 3 Jul 2024 22:06:03 -0700 Subject: [PATCH] added fixColnames, improved and renamed trySplit --- DESCRIPTION | 1 + NAMESPACE | 3 +- R/spelling.R | 71 ++++++++++++++++++++------- R/wrangling.R | 67 +++++++++++++++++++++++++ man/fixColnames.Rd | 44 +++++++++++++++++ man/{trySplit.Rd => trySplitWords.Rd} | 24 ++++----- 6 files changed, 178 insertions(+), 32 deletions(-) create mode 100644 man/fixColnames.Rd rename man/{trySplit.Rd => trySplitWords.Rd} (62%) diff --git a/DESCRIPTION b/DESCRIPTION index 40dfbbc..357027d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,6 +11,7 @@ License: MIT + file LICENSE Encoding: UTF-8 Language: en-US Imports: + assertthat, BiocManager, devtools, dplyr, diff --git a/NAMESPACE b/NAMESPACE index 7e1a34a..660701e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ export(enclose) export(encloseBr) export(enumerateit) export(findMissingRdSections) +export(fixColnames) export(fmrs) export(getAvgHex) export(getChar) @@ -56,7 +57,7 @@ export(substrEnd) export(subtract) export(tableToNumeric) export(test_quietly_that) -export(trySplit) +export(trySplitWords) export(val) export(val1) export(validateObject) diff --git a/R/spelling.R b/R/spelling.R index faad571..d8e799f 100644 --- a/R/spelling.R +++ b/R/spelling.R @@ -1,38 +1,44 @@ -#' Try to Split Names Based on Naming Convention +#' Try to Split Words Based on Naming Convention #' #' This function attempts to split characters into its component words (and by default, #' all in lowercase) based on camelCase, PascalCase, or snake_case conventions. If -#' the string does not match any of these conventions, it returns the original string. +#' the string does not match any of these conventions, it returns all groups of letters. #' -#' @param x A character string or vector to be analyzed and split. +#' @param ... character(s) to be split, treated as a single vector after unlisting #' @param conseq A logical indicating whether the `conseq` argument in [splitCamel()]/ #' [splitPascal()] should be `TRUE` or `FALSE`. #' @param strictSnake A logical indicating the `strict` argument in [isSnakeCase()]. #' @param uncase A logical indicating whether to remove all casing in the output to #' lowercase. #' -#' @return A list of character vectors, each containing the parts of the string -#' split according to its naming convention or the original string if no -#' convention matches. +#' @return A list of character vectors, each containing the parts of the string +#' split into individual words. #' @export #' @keywords spelling #' @seealso \code{\link{splitCamel}}, \code{\link{splitPascal}}, \code{\link{splitSnake}}, #' \code{\link{isCamelCase}}, \code{\link{isPascalCase}}, \code{\link{isSnakeCase}} -#' -#' @examples -#' trySplit("camelCaseExample") -#' trySplit("PascalCaseExample") -#' trySplit("snake_case_example") -#' trySplit("some|random|case") #' -trySplit <- function(x, conseq = TRUE, strictSnake = FALSE, uncase = TRUE) { +#' @examples +#' trySplitWords("camelCaseExample") +#' trySplitWords("PascalCaseExample") +#' trySplitWords("snake_case_example", c("more_snake_cases"), "third_snake_case") +#' trySplitWords("some|random|case") +#' trySplitWords("Space Words", "UPPER_CASE", uncase = TRUE) +#' +trySplitWords <- function( + ..., conseq = TRUE, strictSnake = FALSE, uncase = TRUE +) { + + x <- unlist(list(...), use.names = FALSE) + assertthat::assert_that(is.character(x)) + lapply(x, function(y) { if (isCamelCase(y) || isPascalCase(y)) { out <- splitCamel(y, conseq = isTRUE(conseq))[[1]] } else if (isSnakeCase(y, strict = isTRUE(strictSnake))) { out <- splitSnake(y)[[1]] } else { - out <- y + out <- regmatches(y, gregexpr("[a-zA-Z]+", y))[[1]] } if (isTRUE(uncase)) return(tolower(out)) out @@ -63,6 +69,9 @@ trySplit <- function(x, conseq = TRUE, strictSnake = FALSE, uncase = TRUE) { #' @keywords spelling #' @source \url{https://stackoverflow.com/questions/8406974/splitting-camelcase-in-r} splitCamel <- function(x, conseq = TRUE) { + + assertthat::assert_that(is.character(x)) + if (isTRUE(conseq)) { return(strsplit( x, @@ -93,7 +102,10 @@ splitPascal <- splitCamel #' splitSnake("this_is_snake_case") #' splitSnake("another_example_here") #' -splitSnake <- function(x) strsplit(x, "_", fixed = TRUE) +splitSnake <- function(x) { + assertthat::assert_that(is.character(x)) + strsplit(x, "_", fixed = TRUE) +} #' Check if String is camelCase #' @@ -110,7 +122,10 @@ splitSnake <- function(x) strsplit(x, "_", fixed = TRUE) #' isCamelCase("CamelCase") # returns FALSE #' isCamelCase("camelcase") # returns TRUE #' -isCamelCase <- function(x) grepl("^[a-z]+[A-Z]?([A-Za-z]*?)$", x) +isCamelCase <- function(x) { + assertthat::assert_that(is.character(x)) + grepl("^[a-z]+[A-Z]?([A-Za-z]*?)$", x) +} #' Check if String is PascalCase #' @@ -126,7 +141,10 @@ isCamelCase <- function(x) grepl("^[a-z]+[A-Z]?([A-Za-z]*?)$", x) #' isPascalCase("PascalCase") # returns TRUE #' isPascalCase("pascalCase") # returns FALSE #' isPascalCase("Pascalcase") # returns FALSE -isPascalCase <- function(x) grepl("^[A-Z]+[a-z]?([A-Za-z]*?)$", x) +isPascalCase <- function(x) { + assertthat::assert_that(is.character(x)) + grepl("^[A-Z]+[a-z]?([A-Za-z]*?)$", x) +} #' Check if String is snake_case #' @@ -149,6 +167,9 @@ isPascalCase <- function(x) grepl("^[A-Z]+[a-z]?([A-Za-z]*?)$", x) #' isSnakeCase("Snake_Case", FALSE) # returns TRUE #' isSnakeCase <- function(x, strict = TRUE) { + + assertthat::assert_that(is.character(x)) + grepl( ifelse( isTRUE(strict), @@ -174,6 +195,7 @@ isSnakeCase <- function(x, strict = TRUE) { #' # Check if 'b' is a vowel #' isVowel("b") isVowel <- function(x) { + assertthat::assert_that(is.character(x)) tolower(x) %in% c("a", "e", "i", "o", "u") } @@ -191,7 +213,10 @@ isVowel <- function(x) { #' startsWithVowel("apple") #' # Check if "banana" starts with a vowel #' startsWithVowel("banana") -startsWithVowel <- function(x) isVowel(getChar(x, 1)) +startsWithVowel <- function(x) { + assertthat::assert_that(is.character(x)) + isVowel(getChar(x, 1)) +} #' Prepend an Indefinite Article to a String #' @@ -208,6 +233,7 @@ startsWithVowel <- function(x) isVowel(getChar(x, 1)) #' # Prepend an indefinite article to "banana" #' prependIndefArticle("banana") prependIndefArticle <- function(x) { + assertthat::assert_that(is.character(x)) paste("a", ifelse(startsWithVowel(x), "n", ""), " ", x, sep = "") } @@ -223,7 +249,10 @@ prependIndefArticle <- function(x) { #' @examples #' # Remove spaces from "hello world" #' stripSpaces("hello world") -stripSpaces <- function(x) gsub(" ", "", x) +stripSpaces <- function(x) { + assertthat::assert_that(is.character(x)) + gsub(" ", "", x) +} #' Find the Closest Word in a Set to a Given Word #' @@ -241,6 +270,10 @@ stripSpaces <- function(x) gsub(" ", "", x) #' closestWord("hello", c("hallo", "hullo", "hey")) closestWord <- function(s, strset, distFunc = utils::adist) { + assertthat::assert_that(is.character(s)) + assertthat::assert_that(is.character(strset)) + assertthat::assert_that(is.function(distFunc) && (length(formals(distFunc)) >= 2)) + strset <- unique(strset) if (length(strset) == 1) return(strset) diff --git a/R/wrangling.R b/R/wrangling.R index ce7d451..1c6e0fb 100644 --- a/R/wrangling.R +++ b/R/wrangling.R @@ -110,3 +110,70 @@ setRownames <- function(object, newRownames) { rownames(object) <- newRownames object } + +#' Fix Column Names +#' +#' @description +#' `r lifecycle::badge("experimental")` +#' +#' This function fixes the column names of a given object so that all words are spaced by a specified delimiter, +#' and any special characters are replaced according to a substitution map. +#' +#' @param object A data frame or matrix. +#' @param invalidRegex A character string containing a regular expression pattern for invalid characters to replace. Default is "( )|(\\()|(\\))|(\\.)|(/)". +#' @param spacing A character string to replace invalid characters with. Default is "_". +#' @param subMap A named list where the names are regular expressions and the values are the replacement strings. These substitutions are applied before `.subMap`. +#' @param .subMap A named list where the names are regular expressions and the values are the replacement strings. These substitutions are applied after `subMap`. Default is list("\\+" = "plus"). +#' @param unique A logical indicating whether to ensure unique column names by appending a suffix if necessary. Default is FALSE. +#' +#' @return The data frame or matrix with fixed column names. +#' @export +#' @keywords wrangling +#' @examples +#' # Fix column names of a data frame +#' df <- data.frame(`A (1)` = c(1, 2, 3), `B/C` = c(4, 5, 6), `D+E` = c(7, 8, 9)) +#' fixColnames(df) +fixColnames <- function( + object, + invalidRegex = "( )|(\\()|(\\))|(\\.)|(/)", + spacing = "_", + subMap = NULL, + .subMap = list( + "%+" = "pct", + "\\$+" = "dollars", + "\\++" = "plus", + "-+" = "minus", + "\\*+" = "star", + "#+" = "cnt", + "&+" = "and", + "@+" = "at" + ), + unique = FALSE +) { + + assertthat::assert_that(is.character(invalidRegex) && length(invalidRegex) == 1) + assertthat::assert_that(is.character(spacing) && length(spacing) == 1) + + subMap <- append(subMap, .subMap) + + # Apply all substitutions from the substitution maps + newColnames <- colnames(object) + for (pattern in names(subMap)) { + replacement <- subMap[[pattern]] + newColnames <- gsub(pattern, replacement, newColnames) + } + + gsubr <- function(x, pattern, replacement) gsub(pattern, replacement, x) + + newColnames <- newColnames %>% + gsubr(invalidRegex, spacing) %>% + gsubr("([a-z])([A-Z])", "\\1_\\2") %>% + (function(x) tolower(trimws(x))) %>% + gsubr("(^_+|_+$)", "") %>% + gsubr("_+", "_") %>% + (function(x) if (unique) make.unique(x, sep = spacing) else x) + + # Assign the new column names to the object + colnames(object) <- newColnames + object +} diff --git a/man/fixColnames.Rd b/man/fixColnames.Rd new file mode 100644 index 0000000..56151a8 --- /dev/null +++ b/man/fixColnames.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wrangling.R +\name{fixColnames} +\alias{fixColnames} +\title{Fix Column Names} +\usage{ +fixColnames( + object, + invalidRegex = "( )|(\\\\()|(\\\\))|(\\\\.)|(/)", + spacing = "_", + subMap = NULL, + .subMap = list(`\%+` = "pct", `\\\\$+` = "dollars", `\\\\++` = "plus", `-+` = "minus", + `\\\\*+` = "star", `#+` = "cnt", `&+` = "and", `@+` = "at"), + unique = FALSE +) +} +\arguments{ +\item{object}{A data frame or matrix.} + +\item{invalidRegex}{A character string containing a regular expression pattern for invalid characters to replace. Default is "( )|(\\()|(\\))|(\\.)|(/)".} + +\item{spacing}{A character string to replace invalid characters with. Default is "_".} + +\item{subMap}{A named list where the names are regular expressions and the values are the replacement strings. These substitutions are applied before \code{.subMap}.} + +\item{.subMap}{A named list where the names are regular expressions and the values are the replacement strings. These substitutions are applied after \code{subMap}. Default is list("\\+" = "plus").} + +\item{unique}{A logical indicating whether to ensure unique column names by appending a suffix if necessary. Default is FALSE.} +} +\value{ +The data frame or matrix with fixed column names. +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +This function fixes the column names of a given object so that all words are spaced by a specified delimiter, +and any special characters are replaced according to a substitution map. +} +\examples{ +# Fix column names of a data frame +df <- data.frame(`A (1)` = c(1, 2, 3), `B/C` = c(4, 5, 6), `D+E` = c(7, 8, 9)) +fixColnames(df) +} +\keyword{wrangling} diff --git a/man/trySplit.Rd b/man/trySplitWords.Rd similarity index 62% rename from man/trySplit.Rd rename to man/trySplitWords.Rd index 9ba959d..6a4f234 100644 --- a/man/trySplit.Rd +++ b/man/trySplitWords.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/spelling.R -\name{trySplit} -\alias{trySplit} -\title{Try to Split Names Based on Naming Convention} +\name{trySplitWords} +\alias{trySplitWords} +\title{Try to Split Words Based on Naming Convention} \usage{ -trySplit(x, conseq = TRUE, strictSnake = FALSE, uncase = TRUE) +trySplitWords(..., conseq = TRUE, strictSnake = FALSE, uncase = TRUE) } \arguments{ -\item{x}{A character string or vector to be analyzed and split.} +\item{...}{character(s) to be split, treated as a single vector after unlisting} \item{conseq}{A logical indicating whether the \code{conseq} argument in \code{\link[=splitCamel]{splitCamel()}}/ \code{\link[=splitPascal]{splitPascal()}} should be \code{TRUE} or \code{FALSE}.} @@ -19,19 +19,19 @@ lowercase.} } \value{ A list of character vectors, each containing the parts of the string -split according to its naming convention or the original string if no -convention matches. +split into individual words. } \description{ This function attempts to split characters into its component words (and by default, all in lowercase) based on camelCase, PascalCase, or snake_case conventions. If -the string does not match any of these conventions, it returns the original string. +the string does not match any of these conventions, it returns all groups of letters. } \examples{ -trySplit("camelCaseExample") -trySplit("PascalCaseExample") -trySplit("snake_case_example") -trySplit("some|random|case") +trySplitWords("camelCaseExample") +trySplitWords("PascalCaseExample") +trySplitWords("snake_case_example", c("more_snake_cases"), "third_snake_case") +trySplitWords("some|random|case") +trySplitWords("Space Words", "UPPER_CASE", uncase = TRUE) } \seealso{