diff --git a/R/load_taxonomic_resources.R b/R/load_taxonomic_resources.R index 1b04831f..2c7bd31d 100644 --- a/R/load_taxonomic_resources.R +++ b/R/load_taxonomic_resources.R @@ -56,7 +56,7 @@ load_taxonomic_resources <- dataset_name = .data$datasetName ) %>% mutate( - genus = stringr::word(canonical_name, 1), + genus = extract_genus(canonical_name), taxon_rank = stringr::str_to_lower(taxon_rank), taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"), taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"), @@ -82,7 +82,7 @@ load_taxonomic_resources <- name_element = .data$nameElement ) %>% mutate( - genus = stringr::word(canonical_name, 1), + genus = extract_genus(canonical_name), taxon_rank = stringr::str_to_lower(taxon_rank), taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"), taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"), @@ -121,7 +121,7 @@ load_taxonomic_resources <- ), binomial = ifelse(is.na(binomial), zzz, binomial), binomial = base::replace(binomial, duplicated(binomial), zzz), - genus = stringr::word(stripped_canonical, 1), + genus = extract_genus(stripped_canonical), trinomial = stringr::word(stripped_canonical2, start = 1, end = 3), trinomial = ifelse(is.na(trinomial), zzz, trinomial), trinomial = base::replace(trinomial, duplicated(trinomial), zzz), @@ -159,7 +159,7 @@ load_taxonomic_resources <- trinomial = stringr::word(stripped_canonical2, start = 1, end = 3), trinomial = ifelse(is.na(trinomial), "zzzz zzzz", trinomial), trinomial = base::replace(trinomial, duplicated(trinomial), "zzzz zzzz"), - genus = stringr::word(stripped_canonical, 1), + genus = extract_genus(stripped_canonical), taxonomic_dataset = "APNI" ) %>% dplyr::distinct() %>% diff --git a/R/match_taxa.R b/R/match_taxa.R index a2550a96..cf117f6c 100644 --- a/R/match_taxa.R +++ b/R/match_taxa.R @@ -84,7 +84,7 @@ match_taxa <- function( update_na_with(strip_names_2(cleaned_name)), trinomial = stringr::word(stripped_name2, start = 
1, end = 3), binomial = stringr::word(stripped_name2, start = 1, end = 2), - genus = stringr::word(original_name, start = 1, end = 1), + genus = extract_genus(original_name), fuzzy_match_genus = fuzzy_match_genera(genus, resources$genera_accepted$canonical_name), fuzzy_match_genus_known = diff --git a/R/standardise_names.R b/R/standardise_names.R index 016fa74d..b167ee32 100644 --- a/R/standardise_names.R +++ b/R/standardise_names.R @@ -98,3 +98,27 @@ standardise_names <- function(taxon_names) { stringr::str_squish() } +#' Extract Genus +#' +#' This function extracts the genus component of a scientific name. +#' A name whose first word is exactly "x" is treated as a hybrid genus and +#' its first two words are returned (e.g. "x Cynochloris"), while for any +#' other name just the first word is returned (e.g. "Banksia"). +#' +#' @param taxon_name Scientific name(s) to extract the genus from; split into words with `stringr::word()`. +#' +#' @return The genus for each element of `taxon_name`: the first two words for a hybrid genus, otherwise the first word. +#' +#' @examples +#' extract_genus(c("Banksia serrata", "x Cynochloris macivorii")) +#' +#' +extract_genus <- function(taxon_name) { + genus <- + ifelse( + stringr::word(taxon_name, 1) == "x", + stringr::word(taxon_name, start = 1, end = 2), + stringr::word(taxon_name, 1) + ) + genus +} diff --git a/R/update_taxonomy.R b/R/update_taxonomy.R index 892d56d2..36c15afe 100644 --- a/R/update_taxonomy.R +++ b/R/update_taxonomy.R @@ -97,7 +97,7 @@ update_taxonomy <- function(aligned_data, aligned_data %>% dplyr::select(original_name, aligned_name, taxon_rank, taxonomic_dataset, aligned_reason) %>% dplyr::mutate( - genus = stringr::word(aligned_name, 1), + genus = extract_genus(aligned_name), row_number = dplyr::row_number() ) @@ -300,7 +300,7 @@ update_taxonomy_APC_genus <- function(data, resources) { taxonomic_status = ifelse(is.na(accepted_name_usage_ID), as.character(my_order), paste("genus", taxonomic_status_genus)), taxon_ID_genus = resources$genera_all$taxon_ID[match(accepted_name_usage_ID, resources$genera_all$accepted_name_usage_ID)], # genus names in `aligned_name` that are not 
APC-accepted need to be updated to their current name in `suggested_name` - aligned_minus_genus = ifelse(is.na(genus_accepted), NA, stringr::str_replace(aligned_name, stringr::word(aligned_name, 1), "")), + aligned_minus_genus = ifelse(is.na(genus_accepted), NA, stringr::str_replace(aligned_name, extract_genus(aligned_name), "")), suggested_name = ifelse(taxonomic_status == "accepted", paste0(genus_accepted, aligned_minus_genus), NA), suggested_name = ifelse(taxonomic_status != "accepted", aligned_name, suggested_name), # indicate taxonomic_status of the genus name in `aligned_name` and why it needed to be updated for the `suggested_name` @@ -379,8 +379,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources, dplyr::filter(species_and_infraspecific(taxon_rank)) %>% dplyr::distinct(canonical_name, .keep_all = TRUE) %>% dplyr::select(canonical_name, accepted_name_usage_ID) - - + split_taxa_table <- resources$APC %>% dplyr::filter(species_and_infraspecific(taxon_rank)) %>% @@ -548,7 +547,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources, suggested_name = ifelse(is.na(suggested_name), aligned_name, suggested_name), taxonomic_status = ifelse(is.na(accepted_name), taxonomic_status_aligned, "accepted"), # for APC-accepted species, the `genus` is the first word of the `accepted_name` - genus_accepted = stringr::word(suggested_name, 1), + genus_accepted = extract_genus(suggested_name), taxon_ID_genus = resources$genera_all$taxon_ID[match(genus_accepted, resources$genera_all$canonical_name)], update_reason = taxonomic_status_aligned, taxonomic_dataset = "APC",