Skip to content

Commit

Permalink
added fixColnames, improved and renamed trySplit
Browse files Browse the repository at this point in the history
  • Loading branch information
Qile0317 committed Jul 4, 2024
1 parent a7f0191 commit e80513a
Show file tree
Hide file tree
Showing 6 changed files with 178 additions and 32 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ License: MIT + file LICENSE
Encoding: UTF-8
Language: en-US
Imports:
assertthat,
BiocManager,
devtools,
dplyr,
Expand Down
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export(enclose)
export(encloseBr)
export(enumerateit)
export(findMissingRdSections)
export(fixColnames)
export(fmrs)
export(getAvgHex)
export(getChar)
Expand Down Expand Up @@ -56,7 +57,7 @@ export(substrEnd)
export(subtract)
export(tableToNumeric)
export(test_quietly_that)
export(trySplit)
export(trySplitWords)
export(val)
export(val1)
export(validateObject)
Expand Down
71 changes: 52 additions & 19 deletions R/spelling.R
Original file line number Diff line number Diff line change
@@ -1,38 +1,44 @@
#' Try to Split Names Based on Naming Convention
#' Try to Split Words Based on Naming Convention
#'
#' This function attempts to split each string into its component words (lowercased by
#' default) based on camelCase, PascalCase, or snake_case conventions. If
#' the string does not match any of these conventions, it returns the original string.
#' the string does not match any of these conventions, it returns all groups of letters.
#'
#' @param x A character string or vector to be analyzed and split.
#' @param ... character(s) to be split, treated as a single vector after unlisting
#' @param conseq A logical indicating whether the `conseq` argument in [splitCamel()]/
#' [splitPascal()] should be `TRUE` or `FALSE`.
#' @param strictSnake A logical indicating the `strict` argument in [isSnakeCase()].
#' @param uncase A logical indicating whether to convert all words in the output to
#' lowercase.
#'
#' @return A list of character vectors, each containing the parts of the string
#' split according to its naming convention or the original string if no
#' convention matches.
#' @return A list of character vectors, each containing the parts of the string
#' split into individual words.
#' @export
#' @keywords spelling
#' @seealso \code{\link{splitCamel}}, \code{\link{splitPascal}}, \code{\link{splitSnake}},
#' \code{\link{isCamelCase}}, \code{\link{isPascalCase}}, \code{\link{isSnakeCase}}
#'
#' @examples
#' trySplit("camelCaseExample")
#' trySplit("PascalCaseExample")
#' trySplit("snake_case_example")
#' trySplit("some|random|case")
#'
trySplit <- function(x, conseq = TRUE, strictSnake = FALSE, uncase = TRUE) {
#' @examples
#' trySplitWords("camelCaseExample")
#' trySplitWords("PascalCaseExample")
#' trySplitWords("snake_case_example", c("more_snake_cases"), "third_snake_case")
#' trySplitWords("some|random|case")
#' trySplitWords("Space Words", "UPPER_CASE", uncase = TRUE)
#'
trySplitWords <- function(
..., conseq = TRUE, strictSnake = FALSE, uncase = TRUE
) {

x <- unlist(list(...), use.names = FALSE)
assertthat::assert_that(is.character(x))

lapply(x, function(y) {
if (isCamelCase(y) || isPascalCase(y)) {
out <- splitCamel(y, conseq = isTRUE(conseq))[[1]]
} else if (isSnakeCase(y, strict = isTRUE(strictSnake))) {
out <- splitSnake(y)[[1]]
} else {
out <- y
out <- regmatches(y, gregexpr("[a-zA-Z]+", y))[[1]]
}
if (isTRUE(uncase)) return(tolower(out))
out
Expand Down Expand Up @@ -63,6 +69,9 @@ trySplit <- function(x, conseq = TRUE, strictSnake = FALSE, uncase = TRUE) {
#' @keywords spelling
#' @source \url{https://stackoverflow.com/questions/8406974/splitting-camelcase-in-r}
splitCamel <- function(x, conseq = TRUE) {

assertthat::assert_that(is.character(x))

if (isTRUE(conseq)) {
return(strsplit(
x,
Expand Down Expand Up @@ -93,7 +102,10 @@ splitPascal <- splitCamel
#' splitSnake("this_is_snake_case")
#' splitSnake("another_example_here")
#'
splitSnake <- function(x) strsplit(x, "_", fixed = TRUE)
splitSnake <- function(x) {
  # Only character input can be split; anything else fails the assertion.
  assertthat::assert_that(is.character(x))

  # Split every element on the literal underscore (no regex interpretation),
  # giving one character vector of parts per input element.
  strsplit(x, split = "_", fixed = TRUE)
}

#' Check if String is camelCase
#'
Expand All @@ -110,7 +122,10 @@ splitSnake <- function(x) strsplit(x, "_", fixed = TRUE)
#' isCamelCase("CamelCase") # returns FALSE
#' isCamelCase("camelcase") # returns TRUE
#'
isCamelCase <- function(x) grepl("^[a-z]+[A-Z]?([A-Za-z]*?)$", x)
isCamelCase <- function(x) {
  assertthat::assert_that(is.character(x))

  # Must start with a lowercase letter and contain letters only; a single
  # all-lowercase word therefore also matches.
  camelPattern <- "^[a-z]+[A-Z]?([A-Za-z]*?)$"
  grepl(pattern = camelPattern, x = x)
}

#' Check if String is PascalCase
#'
Expand All @@ -126,7 +141,10 @@ isCamelCase <- function(x) grepl("^[a-z]+[A-Z]?([A-Za-z]*?)$", x)
#' isPascalCase("PascalCase") # returns TRUE
#' isPascalCase("pascalCase") # returns FALSE
#' isPascalCase("Pascalcase") # returns TRUE
isPascalCase <- function(x) grepl("^[A-Z]+[a-z]?([A-Za-z]*?)$", x)
isPascalCase <- function(x) {
  assertthat::assert_that(is.character(x))

  # Must start with an uppercase letter and contain letters only; a single
  # capitalised word therefore also matches.
  pascalPattern <- "^[A-Z]+[a-z]?([A-Za-z]*?)$"
  grepl(pattern = pascalPattern, x = x)
}

#' Check if String is snake_case
#'
Expand All @@ -149,6 +167,9 @@ isPascalCase <- function(x) grepl("^[A-Z]+[a-z]?([A-Za-z]*?)$", x)
#' isSnakeCase("Snake_Case", FALSE) # returns TRUE
#'
isSnakeCase <- function(x, strict = TRUE) {

assertthat::assert_that(is.character(x))

grepl(
ifelse(
isTRUE(strict),
Expand All @@ -174,6 +195,7 @@ isSnakeCase <- function(x, strict = TRUE) {
#' # Check if 'b' is a vowel
#' isVowel("b")
isVowel <- function(x) {
  assertthat::assert_that(is.character(x))

  # Case-insensitive membership test against the five vowels; elements that
  # are not exactly one of them (including NA) yield FALSE.
  vowels <- c("a", "e", "i", "o", "u")
  match(tolower(x), vowels, nomatch = 0L) > 0L
}

Expand All @@ -191,7 +213,10 @@ isVowel <- function(x) {
#' startsWithVowel("apple")
#' # Check if "banana" starts with a vowel
#' startsWithVowel("banana")
startsWithVowel <- function(x) isVowel(getChar(x, 1))
startsWithVowel <- function(x) {
  assertthat::assert_that(is.character(x))

  # Take the first character of the input, then test whether it is a vowel.
  firstChar <- getChar(x, 1)
  isVowel(firstChar)
}

#' Prepend an Indefinite Article to a String
#'
Expand All @@ -208,6 +233,7 @@ startsWithVowel <- function(x) isVowel(getChar(x, 1))
#' # Prepend an indefinite article to "banana"
#' prependIndefArticle("banana")
prependIndefArticle <- function(x) {
  assertthat::assert_that(is.character(x))

  # "a" becomes "an" when the string starts with a vowel.
  articleSuffix <- ifelse(startsWithVowel(x), "n", "")
  paste0("a", articleSuffix, " ", x)
}

Expand All @@ -223,7 +249,10 @@ prependIndefArticle <- function(x) {
#' @examples
#' # Remove spaces from "hello world"
#' stripSpaces("hello world")
stripSpaces <- function(x) gsub(" ", "", x)
stripSpaces <- function(x) {
  assertthat::assert_that(is.character(x))

  # Delete every space character from each element.
  gsub(pattern = " ", replacement = "", x = x)
}

#' Find the Closest Word in a Set to a Given Word
#'
Expand All @@ -241,6 +270,10 @@ stripSpaces <- function(x) gsub(" ", "", x)
#' closestWord("hello", c("hallo", "hullo", "hey"))
closestWord <- function(s, strset, distFunc = utils::adist) {

assertthat::assert_that(is.character(s))
assertthat::assert_that(is.character(strset))
assertthat::assert_that(is.function(distFunc) && (length(formals(distFunc)) >= 2))

strset <- unique(strset)
if (length(strset) == 1) return(strset)

Expand Down
67 changes: 67 additions & 0 deletions R/wrangling.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,70 @@ setRownames <- function(object, newRownames) {
rownames(object) <- newRownames
object
}

#' Fix Column Names
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' This function fixes the column names of a given object so that all words are
#' spaced by a specified delimiter, and any special characters are replaced
#' according to a substitution map. The steps are applied in this order:
#' substitutions from `subMap` then `.subMap`, replacement of invalid
#' characters by `spacing`, splitting of camelCase boundaries with `spacing`,
#' trimming of whitespace and lowercasing, removal of leading/trailing
#' `spacing`, and collapsing of repeated `spacing` into one.
#'
#' @param object A data frame or matrix.
#' @param invalidRegex A single character string containing a regular
#' expression pattern for invalid characters to replace. Default is
#' "( )|(\\()|(\\))|(\\.)|(/)".
#' @param spacing A single character string used to replace invalid characters
#' and to separate words. Default is "_".
#' @param subMap A named list where the names are regular expressions and the
#' values are the replacement strings. These substitutions are applied before
#' `.subMap`.
#' @param .subMap A named list where the names are regular expressions and the
#' values are the replacement strings. These substitutions are applied after
#' `subMap`. By default, runs of `%`, `$`, `+`, `-`, `*`, `#`, `&` and `@` are
#' replaced with "pct", "dollars", "plus", "minus", "star", "cnt", "and" and
#' "at" respectively.
#' @param unique A logical indicating whether to ensure unique column names by
#' appending a suffix (separated by `spacing`) if necessary. Default is FALSE.
#'
#' @return The data frame or matrix with fixed column names. An object without
#' column names is returned unchanged.
#' @export
#' @keywords wrangling
#' @examples
#' # Fix column names of a data frame
#' df <- data.frame(
#'   `A (1)` = c(1, 2, 3), `B/C` = c(4, 5, 6), `D+E` = c(7, 8, 9),
#'   check.names = FALSE
#' )
#' fixColnames(df)
fixColnames <- function(
  object,
  invalidRegex = "( )|(\\()|(\\))|(\\.)|(/)",
  spacing = "_",
  subMap = NULL,
  .subMap = list(
    "%+" = "pct",
    "\\$+" = "dollars",
    "\\++" = "plus",
    "-+" = "minus",
    "\\*+" = "star",
    "#+" = "cnt",
    "&+" = "and",
    "@+" = "at"
  ),
  unique = FALSE
) {

  assertthat::assert_that(is.character(invalidRegex) && length(invalidRegex) == 1)
  assertthat::assert_that(is.character(spacing) && length(spacing) == 1)

  newColnames <- colnames(object)

  # Nothing to fix when the object carries no column names.
  if (is.null(newColnames)) return(object)

  # Apply all substitutions; user-supplied entries come first so that they are
  # applied (and looked up by name) ahead of the built-in defaults.
  subMap <- append(subMap, .subMap)
  for (pattern in names(subMap)) {
    newColnames <- gsub(pattern, subMap[[pattern]], newColnames)
  }

  # Replace invalid characters, then split camelCase boundaries using the
  # requested delimiter rather than a hard-coded underscore.
  newColnames <- gsub(invalidRegex, spacing, newColnames)
  newColnames <- gsub(
    "([a-z])([A-Z])", paste0("\\1", spacing, "\\2"), newColnames
  )
  newColnames <- tolower(trimws(newColnames))

  if (nzchar(spacing)) {
    # Escape regex metacharacters so the delimiter is matched literally.
    spacingRegex <- gsub(
      "([.\\\\|()\\[\\]{}^$*+?])", "\\\\\\1", spacing, perl = TRUE
    )
    spacingRun <- paste0("(", spacingRegex, ")+")

    # Drop leading/trailing delimiters, then collapse repeated delimiters.
    newColnames <- gsub(
      paste0("^", spacingRun, "|", spacingRun, "$"), "", newColnames
    )
    newColnames <- gsub(spacingRun, spacing, newColnames)
  }

  if (isTRUE(unique)) {
    newColnames <- make.unique(newColnames, sep = spacing)
  }

  # Assign the new column names to the object
  colnames(object) <- newColnames
  object
}
44 changes: 44 additions & 0 deletions man/fixColnames.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 12 additions & 12 deletions man/trySplit.Rd → man/trySplitWords.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit e80513a

Please sign in to comment.