CRAN accepted (#13)

* Minor refactor from full package review comments * Detect text that can be interpreted ambiguously - Returns NA for all ambiguities - Include warning for ambiguities - Returns NA for R keywords like NA, NaN, Inf, NULL, TRUE, FALSE * Automatic readme update * Bump version to indicate stable release * Update CITATION.cff * update recon lifecycle * Update CITATION.cff * doc updates after devtools::check() * Add NEWS.md * prep for cran submission * Update CITATION.cff * Increment version number to 1.0.0 * Update CITATION.cff * post CRAN submission * Remove cat as suggested by CRAN review * Add expect_warning() assertion - Minor tests refactor * cran submission notice --------- Co-authored-by: GitHub Action <[email protected]>
epiverse-trace · Aug 5, 2024 · 4f9f676 · 4f9f676
1 parent fc32825
commit 4f9f676
Show file tree

Hide file tree

Showing 14 changed files with 234 additions and 119 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -10,4 +10,6 @@
 ^\.Rproj\.user$
 ^CITATION\.cff$
 ^__.*$
-^\.vscode$
+^\.vscode.*
+^cran-comments\.md$
+^CRAN-SUBMISSION$
diff --git a/CITATION.cff b/CITATION.cff
@@ -1,54 +1,50 @@
-# -----------------------------------------------------------
-# CITATION file created with {cffr} R package, v0.5.0
+# --------------------------------------------
+# CITATION file created with {cffr} R package
 # See also: https://docs.ropensci.org/cffr/
-# -----------------------------------------------------------
+# --------------------------------------------
 
 cff-version: 1.2.0
 message: 'To cite package "numberize" in publications use:'
 type: software
 license: MIT
-title: 'numberize: Convert words to numbers in multiple languages'
-version: 0.0.1
-abstract: Converts numbers written as English, French or Spanish words to their equivalent
-  number. English, French or Spanish words to their equivalent numeric.
+title: 'numberize: Convert Words to Numbers in Multiple Languages'
+version: 1.0.0
+abstract: Converts written out numbers into their equivalent numbers. Supports numbers
+  written out in English, French, or Spanish.
 authors:
-- family-names: Laddha
-  given-names: Avinash
-  email: [email protected]
-- family-names: Azam
-  given-names: James M.
-  email: [email protected]
-  orcid: https://orcid.org/0000-0001-5782-7330
-- family-names: Gupte
-  given-names: Pratik
-  email: [email protected]
-  orcid: https://orcid.org/0000-0001-5294-7819
-- family-names: Lambert
-  given-names: Joshua W.
-  email: [email protected]
-  orcid: https://orcid.org/0000-0001-5218-3046
 - family-names: Gruson
   given-names: Hugo
   email: [email protected]
   orcid: https://orcid.org/0000-0002-4094-1476
 - family-names: Ahadzie
-  given-names: Banky
-  email: [email protected]
+  given-names: Bankole
+  email: [email protected]
 repository-code: https://github.com/epiverse-trace/numberize
 url: https://github.com/epiverse-trace/numberize
 contact:
 - family-names: Ahadzie
-  given-names: Banky
-  email: [email protected]
+  given-names: Bankole
+  email: [email protected]
 keywords:
 - r-package
 - r-programming
 references:
+- type: software
+  title: 'R: A Language and Environment for Statistical Computing'
+  notes: Depends
+  url: https://www.R-project.org/
+  authors:
+  - name: R Core Team
+  institution:
+    name: R Foundation for Statistical Computing
+    address: Vienna, Austria
+  year: '2024'
+  version: '>= 3.5.0'
 - type: software
   title: spelling
   abstract: 'spelling: Tools for Spell Checking in R'
   notes: Suggests
-  url: https://docs.ropensci.org/spelling/
+  url: https://ropensci.r-universe.dev/spelling
   repository: https://CRAN.R-project.org/package=spelling
   authors:
   - family-names: Ooms
@@ -71,3 +67,4 @@ references:
     email: [email protected]
   year: '2024'
   version: '>= 3.0.0'
+
diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION
@@ -0,0 +1,3 @@
+Version: 1.0.0
+Date: 2024-06-10 18:28:56 UTC
+SHA: 7222d306dbf134c3804fd6860c39bc4684a255fd
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,36 +1,36 @@
 Package: numberize
-Title: Convert words to numbers in multiple languages
-Version: 0.0.1
+Title: Convert Words to Numbers in Multiple Languages
+Version: 1.0.0
 Authors@R: c(
-    person("Avinash", "Laddha", , "[email protected]", role = "aut"),
-    person("James M.", "Azam", , "[email protected]", role = "aut",
-           comment = c(ORCID = "0000-0001-5782-7330")),
-    person("Jaime A.", "Pavlich-Mariscal", , "[email protected]", role = "ctb",
-           comment = c(ORCID = "0000-0002-3892-6680")),
-    person("Pratik", "Gupte", , "[email protected]", role = "aut",
-           comment = c(ORCID = "0000-0001-5294-7819")),
-    person("Joshua W.", "Lambert", , "[email protected]", role = "aut",
+    person("Chris", "Hartgerink", , "[email protected]", role = "rev",
+           comment = c(ORCID = "0000-0003-1050-6809")),
+    person("Joshua W.", "Lambert", , "[email protected]", role = "ctb",
            comment = c(ORCID = "0000-0001-5218-3046")),
+    person("Karim", "Mané", , "[email protected]", role = "ctb",
+           comment = c(ORCID = "0000-0002-9892-2999")),
     person("Hugo", "Gruson", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0002-4094-1476")),
-    person("Banky", "Ahadzie", , "[email protected]", role = c("aut", "cre"))
+    person("Bankole", "Ahadzie", , "[email protected]", role = c("aut", "cre", "cph"))
   )
-Description: Converts numbers written as English, French or Spanish words
-    to their equivalent number.  English, French or Spanish words to their
-    equivalent numeric.
+Description: Converts written out numbers into their equivalent numbers.
+    Supports numbers written out in English, French, or Spanish.
 License: MIT + file LICENSE
 URL: https://github.com/epiverse-trace/numberize
 BugReports: https://github.com/epiverse-trace/numberize/issues
+Depends: 
+    R (>= 3.5.0)
 Suggests: 
     spelling,
     testthat (>= 3.0.0)
+Config/Department: Centre for the Mathematical Modelling of Infectious
+    Diseases
+Config/DepartmentURL: 
+    https://www.lshtm.ac.uk/research/centres/centre-mathematical-modelling-infectious-diseases
+Config/Institution: London School of Hygiene and Tropical Medicine
+Config/Needs/website: epiverse-trace/epiversetheme
+Config/Recon: stable
+Config/testthat/edition: 3
 Encoding: UTF-8
 Language: en-US
-Config/testthat/edition: 3
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.1
-Config/Institution: London School of Hygiene and Tropical Medicine
-Config/Needs/website: epiverse-trace/epiversetheme
-Config/Department: Centre for the Mathematical Modelling of Infectious Diseases
-Config/DepartmentURL: https://www.lshtm.ac.uk/research/centres/centre-mathematical-modelling-infectious-diseases
-Config/Recon: experimental
diff --git a/LICENSE b/LICENSE
@@ -1,2 +1,2 @@
-YEAR: 2022
-COPYRIGHT HOLDER: numberizeR authors
+YEAR: 2024
+COPYRIGHT HOLDER: numberize authors
diff --git a/LICENSE.md b/LICENSE.md
@@ -1,6 +1,6 @@
 # MIT License
 
-Copyright (c) 2022 numberizeR authors
+Copyright (c) 2024 numberize authors
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,5 @@
+# numberize 1.0.0
+
+# numberize 0.0.1
+
+* Initial CRAN submission.
diff --git a/R/numberize.R b/R/numberize.R
@@ -1,27 +1,44 @@
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #' Generate a numeric vector from text in a supported language.
 #'
 #' @param text Word(s) that spell numbers. e.g. "one", "deux", "trois"
 #' @param lang The text's language. Currently one of `"en" | "fr" | "es"`.
 #'
 #' @return A numeric vector.
 #' @keywords internal
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 digits_from <- function(text, lang = "en") {
+  invalid_structure <- function(positions) { # TRUE on adjacent repeats
+    valid_position <- c( # position labels that are checked
+      "units", "tens", "hundreds", "thousand", "million", "billion", "trillion"
+    )
+    for (i in seq_along(valid_position)) { # one label at a time
+      index <- which(positions %in% valid_position[i]) # where label occurs
+      is_adjacent <- any(diff(index) == 1) # consecutive indices?
+      if (is_adjacent) {
+        return(is_adjacent) # stop at the first adjacent repeat
+      }
+    }
+    FALSE # no label sits at two consecutive indices
+  }
+
   # data frame that maps numbers to words
   numbers <- data.frame(
     stringsAsFactors = FALSE,
     digit = c(
       0:30, # because es is unique to 30
-      seq(40, 90, by = 10),
+      seq(40, 70, by = 10),
+      71:80,
+      90:99,
       seq(100, 900, by = 100), 1000, 1E6, 1E9, 1E12
     ),
     en = c(
       "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
       "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
       "sixteen", "seventeen", "eighteen", "nineteen",
       "twenty", "", "", "", "", "", "", "", "", "",
-      "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
+      "thirty", "forty", "fifty", "sixty",
+      "seventy", "", "", "", "", "", "", "", "", "",
+      "eighty",
+      "ninety", "", "", "", "", "", "", "", "", "",
       "hundred", "", "", "", "", "", "", "", "",
       "thousand", "million", "billion", "trillion"
     ),
@@ -30,9 +47,11 @@ digits_from <- function(text, lang = "en") {
       "nueve", "diez", "once", "doce", "trece", "catorce", "quince",
       "diecis\u00e9is", "diecisiete", "dieciocho", "diecinueve", "veinte",
       "veintiuno", "veintid\u00f3s", "veintitr\u00e9s", "veinticuatro",
-      "veinticinco", "veintis\u00e9is", "veintisiete", "veintiocho",
-      "veintinueve", "treinta", "cuarenta", "cincuenta", "sesenta",
-      "setenta", "ochenta", "noventa",
+      "veinticinco", "veintis\u00e9is", "veintisiete", "veintiocho", "veintinueve", # nolint
+      "treinta", "cuarenta", "cincuenta", "sesenta",
+      "setenta", "", "", "", "", "", "", "", "", "",
+      "ochenta",
+      "noventa", "", "", "", "", "", "", "", "", "",
       "ciento", "doscientos", "trescientos", "cuatrocientos", "quinientos",
       "seiscientos", "setecientos", "ochocientos", "novecientos",
       "mil", "mill\u00f3n", "mil-millones", "bill\u00f3n"
@@ -42,13 +61,37 @@ digits_from <- function(text, lang = "en") {
       "huit", "neuf", "dix", "onze", "douze", "treize", "quatorze",
       "quinze", "seize", "dix sept", "dix huit", "dix neuf",
       "vingt", "", "", "", "", "", "", "", "", "",
-      "trente", "quarante", "cinquante",
-      "soixante", "soixante dix", "quatre-vingt", "quatre-vingt dix",
+      "trente", "quarante", "cinquante", "soixante",
+      "soixante-dix", "soixante-onze", "soixante-douze", "soixante-treize",
+      "soixante-quatorze", "soixante-quinze", "soixante-seize",
+      "soixante-dix-sept", "soixante-dix-huit", "soixante-dix-neuf",
+      "quatre-vingt",
+      "quatre-vingt-dix", "quatre-vingt-onze", "quatre-vingt-douze", "quatre-vingt-treize", # nolint
+      "quatre-vingt-quatorze", "quatre-vingt-quinze", "quatre-vingt-seize",
+      "quatre-vingt-dix-sept", "quatre-vingt-dix-huit", "quatre-vingt-dix-neuf",
       "cent", "", "", "", "", "", "", "", "",
       "mille", "million", "milliard", "billion"
+    ),
+    position = c(
+      rep("units", 10),
+      rep("tens", 45),
+      rep("hundreds", 9),
+      "thousand", "million", "billion", "trillion"
+    ),
+    positional_digit = c(
+      0:9, # units
+      rep(1, 10), # tens (10-19)
+      rep(2, 10), # tens (20-29)
+      3:6, # tens (30-60)
+      rep(7, 10), # tens (70-79)
+      8, # tens (80)
+      rep(9, 10), # tens (90-99)
+      1:9, # hundreds (100-900)
+      rep(1, 4) # thousand, million, billion, trillion
     )
   )
 
+  original_text <- text # to report warning if necessary
   # clean and prep
   text <- tolower(text) # converts to string as a side effect
   text <- trimws(text)
@@ -62,18 +105,33 @@ digits_from <- function(text, lang = "en") {
     text <- gsub("\\sun\\s", " uno ", text)
   }
   if (lang == "fr") {
-    # lang=fr plural-> singular
+    # plural to singular
     text <- gsub("(cent|mille|million|milliard|billion)s\\b", "\\1", text)
-    # lang=fr one word
-    text <- gsub("quatre vingt", "quatre-vingt", text, fixed = TRUE)
+    # handle 70-79
+    text <- gsub(
+      "soixante (dix|onze|douze|treize|quatorze|quinze|seize)",
+      "soixante-\\1", text
+    )
+    text <- gsub("soixante-dix (sept|huit|neuf)", "soixante-dix-\\1", text)
+    # handle 90-99
+    text <- gsub(
+      "quatre vingt (dix|onze|douze|treize|quatorze|quinze|seize)",
+      "quatre-vingt-\\1", text
+    )
+    text <- gsub("quatre-vingt (sept|huit|neuf)", "quatre-vingt-\\1", text)
   }
 
   words <- strsplit(text, "\\s+")[[1]]
-  digits <- numbers[match(words, numbers[[lang]]), "digit"]
-  digits
+  positions <- numbers[match(words, numbers[[lang]]), "position"]
+  if (invalid_structure(positions)) {
+    warning(
+      "[", original_text, "] can be interpreted in different ways.\n"
+    )
+    return(NA)
+  }
+  numbers[match(words, numbers[[lang]]), "digit"]
 }
 
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #' Generate a number from a numeric vector.
 #' Uses `digits_from()` output to generate the numeric value of the text.
 #'
@@ -82,7 +140,6 @@ digits_from <- function(text, lang = "en") {
 #' @return A numeric value.
 #'
 #' @keywords internal
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 number_from <- function(digits) {
   thousand_index <- match(1000, digits, nomatch = 0)
   million_index <- match(1E6, digits, nomatch = 0)
@@ -107,7 +164,6 @@ number_from <- function(digits) {
   summed + total
 }
 
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #' Internal function used in the numberize() call for vectors.
 #'
 #' @param text Character string in a supported language.
@@ -118,10 +174,12 @@ number_from <- function(digits) {
 #'
 #' @keywords internal
 #'
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .numberize <- function(text, lang = c("en", "fr", "es")) {
-  # return NA if the input is NA
-  if (is.na(text)) {
+  text <- toString(text)
+  if (
+    trimws(text) %in%
+      c("NA", "TRUE", "FALSE", "nan", "Inf", "") || # check other R keywords
+      length(text) == 0) { # check for NULL
     return(NA)
   }
 
@@ -140,15 +198,19 @@ number_from <- function(digits) {
   }
 }
 
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #' Convert a vector string of spelled numbers in a supported language to
 #' its numeric equivalent.
 #'
-#' @param text Vector containing spelled numbers in a supported language.
-#' @param lang The text's language. Currently one of `"en" | "fr" | "es"`.
+#' The range of words supported is between \strong{zero} and
+#' \strong{nine hundred and ninety nine trillion, nine hundred and}
+#' \strong{ninety nine billion, nine hundred and ninety nine million, nine}
+#' \strong{hundred and ninety nine thousand, nine hundred and ninety nine}.
+#'
+#' @param text String vector of spelled numbers in a supported language.
+#' @param lang The text's language. Currently one of `c("en", "fr", "es")`.
 #' Default is "en"
 #'
-#' @return A vector of numeric values.
+#' @return A numeric vector.
 #'
 #' @examples
 #' # convert to numbers a scalar
@@ -158,9 +220,12 @@ number_from <- function(digits) {
 #' numberize(c("dix", "soixante-cinq", "deux mille vingt-quatre"), lang = "fr")
 #'
 #' @export
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 numberize <- function(text, lang = c("en", "fr", "es")) {
+  lang <- tolower(lang)
   lang <- match.arg(lang)
+  if (is.null(text)) {
+    return(NA)
+  }
   vapply(
     text,
     .numberize,