Apply our faster implementation of word throughout (#220)

As part of #196, we found that stringr::word was quite slow, and so implemented a faster version. This PR makes the new word function a private function accessible via APCalign:::word; adds tests for new function; extends use of this new function throughout Co-authored-by: ehwenk <[email protected]>
traitecoevo · dfalster · May 6, 2024 · Apr 18, 2024 · Apr 18, 2024 · Apr 19, 2024
commit 9be97769b39e2f3c65e15779311a833872520a2a
diff --git a/R/create_species_state_origin_matrix.R b/R/create_species_state_origin_matrix.R
@@ -44,7 +44,7 @@ separate_states <- function(data) {
 #' @noRd
 identify_places <- function(sep_state_data) {
   all_codes <- unique(stringr::str_trim(unlist(sep_state_data)))
-  unique(stringr::word(all_codes[!is.na(all_codes)], 1, 1))
+  unique(word(all_codes[!is.na(all_codes)], 1, 1))
 }
 
 #' @noRd

diff --git a/R/load_taxonomic_resources.R b/R/load_taxonomic_resources.R
@@ -55,38 +55,6 @@ load_taxonomic_resources <-
     ### Note: Use `zzzz zzzz` because the fuzzy matching algorithm can't handles NA's
     zzz <- "zzzz zzzz"
 
-    word <- function(string, start = 1L, end = start) {
-      if(end == start) {
-        str_split_i(string, " ", start)
-      } else if(end == start+1) {
-        w1 <- str_split_i(string, " ", start)
-        w2 <- str_split_i(string, " ", start+1)
-
-        out <- paste(w1, w2) 
-        out[is.na(w2)] <- NA_character_
-
-        out
-      } else if(end == start+2) {
-
-        w1 <- str_split_i(string, " ", start)
-        w2 <- str_split_i(string, " ", start+1)
-        w3 <- str_split_i(string, " ", start+2)
-
-        out <- paste(w1, w2, w3) 
-        out[is.na(w2) | is.na(w3)] <- NA_character_
-
-        out
-      } else {
-        i <- seq(start, end)
-
-        txt <- str_split(string, " ")
-        lngth <- purrr::map_int(txt, length)
-        out <- purrr::map(txt, ~paste(.x[i], collapse = " "))
-        out[lngth < end] <- NA
-        out
-      }
-    }
-
     taxonomic_resources$APC <- taxonomic_resources$APC %>%
       rename(
         taxon_ID = .data$taxonID,

diff --git a/R/match_taxa.R b/R/match_taxa.R
@@ -82,8 +82,8 @@ match_taxa <- function(
         update_na_with(strip_names(cleaned_name)),
       stripped_name2 = stripped_name2 %>%
         update_na_with(strip_names_extra(stripped_name)),
-      trinomial = stringr::word(stripped_name2, start = 1, end = 3),
-      binomial = stringr::word(stripped_name2, start = 1, end = 2),
+      trinomial = word(stripped_name2, start = 1, end = 3),
+      binomial = word(stripped_name2, start = 1, end = 2),
       genus = extract_genus(original_name)
     )
 
@@ -224,7 +224,7 @@ match_taxa <- function(
   i <-
     stringr::str_detect(taxa$tocheck$cleaned_name, "[:space:]sp\\.$") &
     taxa$tocheck$genus %in% resources$genera_all2$genus &
-    stringr::word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")
+    word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")
 
   ii <-
     match(
@@ -277,7 +277,7 @@ match_taxa <- function(
   i <-
     stringr::str_detect(taxa$tocheck$cleaned_name, "[:space:]sp\\.$") &
     taxa$tocheck$fuzzy_match_genus %in% resources$genera_accepted$genus &
-    stringr::word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")
+    word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")
 
   ii <-
     match(
@@ -316,7 +316,7 @@ match_taxa <- function(
   i <-
     stringr::str_detect(taxa$tocheck$cleaned_name, "[:space:]sp\\.$") &
     taxa$tocheck$fuzzy_match_genus_synonym %in% resources$genera_synonym$genus &
-    stringr::word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")
+    word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")
 
   ii <-
     match(
@@ -353,7 +353,7 @@ match_taxa <- function(
   i <-
     stringr::str_detect(taxa$tocheck$cleaned_name, "[:space:]sp\\.$") &
     taxa$tocheck$genus %in% resources$family_accepted$canonical_name &
-    stringr::word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")
+    word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")
 
   taxa$tocheck[i,] <- taxa$tocheck[i,] %>%
     mutate(
@@ -524,7 +524,7 @@ match_taxa <- function(
     mutate(
       taxonomic_dataset = NA_character_,
       taxon_rank = NA,
-      aligned_name_tmp = paste0(stringr::word(cleaned_name,1), " sp. [", cleaned_name),
+      aligned_name_tmp = paste0(word(cleaned_name,1), " sp. [", cleaned_name),
       aligned_name = NA,
       aligned_reason = paste0(
         "Taxon name includes '--' (double dash) indicating an intergrade between two taxa, but exact and fuzzy matches fail to align to a genus in the APC or APNI (",
@@ -726,7 +726,7 @@ match_taxa <- function(
     mutate(
       taxonomic_dataset = NA_character_,
       taxon_rank = NA,
-      aligned_name_tmp = paste0(stringr::word(cleaned_name,1), " sp. [", cleaned_name),
+      aligned_name_tmp = paste0(word(cleaned_name,1), " sp. [", cleaned_name),
       aligned_name = NA,
       aligned_reason = paste0(
         "Taxon name includes '/' (slash) indicating an uncertain species identification  but exact and fuzzy matches fail to align to a genus in the APC or APNI (",
@@ -1033,7 +1033,7 @@ match_taxa <- function(
     mutate(
       taxonomic_dataset = NA_character_,
       taxon_rank = NA,
-      aligned_name_tmp = paste0(stringr::word(cleaned_name,1), " sp. [", cleaned_name),
+      aligned_name_tmp = paste0(word(cleaned_name,1), " sp. [", cleaned_name),
       aligned_name = NA,
       aligned_reason = paste0(
         "Taxon name includes 'affinis' or 'aff' indicating an unknown taxon that bears an affinity to a different taxon in the same genus,  but exact and fuzzy matches fail to align to a genus in the APC or APNI (",
@@ -1297,7 +1297,7 @@ match_taxa <- function(
     mutate(
       taxonomic_dataset = NA_character_,
       taxon_rank = NA,
-      aligned_name_tmp = paste0(stringr::word(cleaned_name,1), " x [", cleaned_name),
+      aligned_name_tmp = paste0(word(cleaned_name,1), " x [", cleaned_name),
       aligned_name = NA,
       aligned_reason = paste0(
         "Taxon name includes ' x ' indicating a hybrid,  but exact and fuzzy matches fail to align to a genus in the APC or APNI (",
@@ -1927,7 +1927,7 @@ match_taxa <- function(
   # The 'taxon name' is then reformatted  as `family sp.` with the original name in square brackets.
 
   i <-
-    stringr::str_detect(stringr::word(taxa$tocheck$cleaned_name, 1), "aceae$") &
+    stringr::str_detect(word(taxa$tocheck$cleaned_name, 1), "aceae$") &
     taxa$tocheck$genus %in% resources$family_accepted$canonical_name
 
   taxa$tocheck[i,] <- taxa$tocheck[i,] %>%
@@ -1958,7 +1958,7 @@ match_taxa <- function(
   # The 'taxon name' is then reformatted  as `family sp.` with the original name in square brackets.
 
   i <-
-    stringr::str_detect(stringr::word(taxa$tocheck$cleaned_name, 1), "ae$") &
+    stringr::str_detect(word(taxa$tocheck$cleaned_name, 1), "ae$") &
     taxa$tocheck$genus %in% resources$family_synonym$canonical_name
 
   taxa$tocheck[i,] <- taxa$tocheck[i,] %>%

diff --git a/R/update_taxonomy.R b/R/update_taxonomy.R
@@ -456,7 +456,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources,
     dplyr::group_by(canonical_name) %>%
     dplyr::mutate(
       number_of_collapsed_taxa = sum(number_of_collapsed_taxa),
-      accepted_name_2 = paste(stringr::word(accepted_name_2, 1), "sp."),
+      accepted_name_2 = paste(word(accepted_name_2, 1), "sp."),
       alternative_possible_names = 
                     alternative_accepted_name_tmp %>%
                     unique() %>%
@@ -469,7 +469,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources,
     dplyr::mutate(
       alternative_possible_names = ifelse(taxonomic_status_aligned != "accepted" & canonical_name %in% resources$'APC list (accepted)'$canonical_name, NA, alternative_possible_names),
       alternative_possible_names = stringr::str_replace_all(alternative_possible_names, "\\ \\|\\ NA", ""),
-      suggested_collapsed_name = paste(stringr::word(accepted_name_2, 1), "sp. [collapsed names:", alternative_possible_names, "]"),
+      suggested_collapsed_name = paste(word(accepted_name_2, 1), "sp. [collapsed names:", alternative_possible_names, "]"),
       taxon_rank = ifelse(number_of_collapsed_taxa > 1 & species_and_infraspecific(taxon_rank), "genus", taxon_rank)
     ) %>%
     dplyr::select(-alternative_accepted_name_tmp, -alternative_possible_names)
@@ -561,7 +561,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources,
       ## there are rare cases of names within the APC that do not align to an accepted name.
       ## For these taxa, the `suggested_name` is the `aligned_name` and the family name must be added
       genus = ifelse(is.na(genus_accepted), genus, genus_accepted),
-      family = ifelse(is.na(family), resources$APC$family[match(stringr::word(suggested_name, 1), resources$APC$genus)], family),
+      family = ifelse(is.na(family), resources$APC$family[match(word(suggested_name, 1), resources$APC$genus)], family),
       update_reason = ifelse(
           (number_of_collapsed_taxa > 1) & !is.na(number_of_collapsed_taxa),
           "collapsed to genus due to ambiguity",
@@ -609,7 +609,7 @@ update_taxonomy_APNI_species_and_infraspecific_taxa <- function(data, resources)
         aligned_name,
         suggested_name
       ),
-      genus = stringr::word(suggested_name, 1)
+      genus = word(suggested_name, 1)
     ) %>%
     # when possible the genus of APNI names is matched to an APC-accepted genus and the appropriate genus-level taxon_ID is added
     dplyr::left_join(

diff --git a/R/word.R b/R/word.R
@@ -0,0 +1,47 @@
+#' Extract words from a sentence. Intended as a faster 
+#' replacement for stringr::word
+#'
+#' @param string A character vector
+
+#' @param start,end Pair of integer vectors giving range of words (inclusive)
+#'   to extract. The default value select the first word.
+#' @param sep Separator between words. Defaults to single space.
+#' @return A character vector with the same length as `string`/`start`/`end`.
+#'
+#' @examples
+#' spp <- c("Banksia serrata", "Actinotus helanthii")
+#' APCalign:::word(spp, 1)
+#' APCalign:::word(spp, 2)
+word <- function(string, start = 1L, end = start, sep = " ") {
+  if(end == start) {
+    str_split_i(string, " ", start)
+  } else if(end == start+1) {
+    w1 <- str_split_i(string, sep, start)
+    w2 <- str_split_i(string, sep, start+1)
+
+    out <- paste(w1, w2) 
+    out[is.na(w2)] <- NA_character_
+
+    return(out)
+  } else if(end == start+2) {
+
+    w1 <- str_split_i(string, sep, start)
+    w2 <- str_split_i(string, sep, start+1)
+    w3 <- str_split_i(string, sep, start+2)
+
+    out <- paste(w1, w2, w3) 
+    out[is.na(w2) | is.na(w3)] <- NA_character_
+
+    return(out)
+  } else {
+    i <- seq(start, end)
+
+    txt <- str_split(string, sep)
+    out <- purrr::map(txt, ~paste(.x[i], collapse = sep))
+
+    lngth <- purrr::map_int(txt, length)
+    out[lngth < end] <- NA
+
+    return(out)
+  }
+}
diff --git a/man/word.Rd b/man/word.Rd
diff --git a/tests/testthat/test-functions-word.R b/tests/testthat/test-functions-word.R
@@ -0,0 +1,24 @@
+test_that("Word", {
+
+  taxa <-
+    c(
+      NA,
+      "Banksia integrifolia",
+      "Acacia longifolia",
+      "Commersonia rosea",
+      "Thelymitra pauciflora",
+      "Justicia procumbens",
+      "Hibbertia",
+      "Rostellularia long leaves",
+      "Hibbertia sericea var  silliafolius",
+      "Hibbertia sp.",
+      "x Cynochloris macivorii",
+      "(Dockrillia pugioniformis x Dockrillia striolata) x Dockrillia pugioniformis"
+    )
+
+  expect_equal(APCalign:::word(taxa, 1), stringr::word(taxa, 1))
+  expect_equal(APCalign:::word(taxa, 2), stringr::word(taxa, 2))
+  expect_equal(APCalign:::word(taxa, 3), stringr::word(taxa, 3))
+  expect_equal(APCalign:::word(taxa, 1,2), stringr::word(taxa, 1,2))
+  expect_equal(APCalign:::word(taxa, 1,3), stringr::word(taxa, 1,3))
+})
diff --git a/tests/testthat/test-operation_executes.R b/tests/testthat/test-operation_executes.R
@@ -170,7 +170,7 @@ test_that("handles NAs inn inputs", {
   expect_equal(original_name, out2$original_name)
   expect_equal(original_name, out2$aligned_name)
   expect_equal(original_name, out2$accepted_name)
-  expect_equal(original_name[1], stringr::word(out2$suggested_name[1], start = 1, end = 2))
+  expect_equal(original_name[1], word(out2$suggested_name[1], start = 1, end = 2))
 
   })