diff --git a/R/load_taxonomic_resources.R b/R/load_taxonomic_resources.R index 1b04831f..2c7bd31d 100644 --- a/R/load_taxonomic_resources.R +++ b/R/load_taxonomic_resources.R @@ -56,7 +56,7 @@ load_taxonomic_resources <- dataset_name = .data$datasetName ) %>% mutate( - genus = stringr::word(canonical_name, 1), + genus = extract_genus(canonical_name), taxon_rank = stringr::str_to_lower(taxon_rank), taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"), taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"), @@ -82,7 +82,7 @@ load_taxonomic_resources <- name_element = .data$nameElement ) %>% mutate( - genus = stringr::word(canonical_name, 1), + genus = extract_genus(canonical_name), taxon_rank = stringr::str_to_lower(taxon_rank), taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"), taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"), @@ -121,7 +121,7 @@ load_taxonomic_resources <- ), binomial = ifelse(is.na(binomial), zzz, binomial), binomial = base::replace(binomial, duplicated(binomial), zzz), - genus = stringr::word(stripped_canonical, 1), + genus = extract_genus(stripped_canonical), trinomial = stringr::word(stripped_canonical2, start = 1, end = 3), trinomial = ifelse(is.na(trinomial), zzz, trinomial), trinomial = base::replace(trinomial, duplicated(trinomial), zzz), @@ -159,7 +159,7 @@ load_taxonomic_resources <- trinomial = stringr::word(stripped_canonical2, start = 1, end = 3), trinomial = ifelse(is.na(trinomial), "zzzz zzzz", trinomial), trinomial = base::replace(trinomial, duplicated(trinomial), "zzzz zzzz"), - genus = stringr::word(stripped_canonical, 1), + genus = extract_genus(stripped_canonical), taxonomic_dataset = "APNI" ) %>% dplyr::distinct() %>% diff --git a/R/match_taxa.R b/R/match_taxa.R index a2550a96..cf117f6c 100644 --- a/R/match_taxa.R +++ b/R/match_taxa.R @@ -84,7 +84,7 @@ match_taxa <- function( update_na_with(strip_names_2(cleaned_name)), trinomial = stringr::word(stripped_name2, start = 
1, end = 3), binomial = stringr::word(stripped_name2, start = 1, end = 2), - genus = stringr::word(original_name, start = 1, end = 1), + genus = extract_genus(original_name), fuzzy_match_genus = fuzzy_match_genera(genus, resources$genera_accepted$canonical_name), fuzzy_match_genus_known = diff --git a/R/standardise_names.R b/R/standardise_names.R index 016fa74d..b167ee32 100644 --- a/R/standardise_names.R +++ b/R/standardise_names.R @@ -98,3 +98,27 @@ standardise_names <- function(taxon_names) { stringr::str_squish() } +#' Extract Genus +#' +#' This function extracts the genus component of a scientific name. +#' It checks whether the first word is the hybrid marker "x". For a hybrid genus, +#' the first two words of the taxon name are extracted (e.g. "x Cynochloris"), +#' while for a non-hybrid genus just the first word is extracted (e.g. "Banksia"). +#' +#' @param taxon_name A character vector of scientific (taxon) names. +#' +#' @return A character vector with the genus extracted from each element of `taxon_name`. +#' +#' @examples +#' \dontrun{extract_genus(c("Banksia integrifolia", "x Cynochloris"))} +#' +#' +extract_genus <- function(taxon_name) { + genus <- + ifelse( + stringr::word(taxon_name, 1) == "x", + stringr::word(taxon_name, start = 1, end = 2), + stringr::word(taxon_name, 1) + ) + genus +} diff --git a/R/update_taxonomy.R b/R/update_taxonomy.R index a6d5ac24..36c15afe 100644 --- a/R/update_taxonomy.R +++ b/R/update_taxonomy.R @@ -97,7 +97,7 @@ update_taxonomy <- function(aligned_data, aligned_data %>% dplyr::select(original_name, aligned_name, taxon_rank, taxonomic_dataset, aligned_reason) %>% dplyr::mutate( - genus = stringr::word(aligned_name, 1), + genus = extract_genus(aligned_name), row_number = dplyr::row_number() ) @@ -267,7 +267,7 @@ relevel_taxonomic_status_preferred_order <- function(taxonomic_status) { # Function to update names of taxa whose aligned_names are # taxon_rank = genus and taxonomic_dataset = APC update_taxonomy_APC_genus <- function(data, resources) { - + if(is.null(data)) return(NULL) data %>% @@ -300,7 +300,7 @@ update_taxonomy_APC_genus <- function(data, 
resources) { taxonomic_status = ifelse(is.na(accepted_name_usage_ID), as.character(my_order), paste("genus", taxonomic_status_genus)), taxon_ID_genus = resources$genera_all$taxon_ID[match(accepted_name_usage_ID, resources$genera_all$accepted_name_usage_ID)], # genus names in `aligned_name` that are not APC-accepted need to be updated to their current name in `suggested_name` - aligned_minus_genus = ifelse(is.na(genus_accepted), NA, stringr::str_replace(aligned_name, stringr::word(aligned_name, 1), "")), + aligned_minus_genus = ifelse(is.na(genus_accepted), NA, stringr::str_replace(aligned_name, extract_genus(aligned_name), "")), suggested_name = ifelse(taxonomic_status == "accepted", paste0(genus_accepted, aligned_minus_genus), NA), suggested_name = ifelse(taxonomic_status != "accepted", aligned_name, suggested_name), # indicate taxonomic_status of the genus name in `aligned_name` and why it needed to be updated for the `suggested_name` @@ -341,10 +341,10 @@ update_taxonomy_APNI_genus <- function(data, resources) { ) %>% # the `suggested_name` is set to the aligned_name and other columns are set to NA dplyr::mutate( - genus = NA_character_, # genera only in APNI don't appear in this column accepted_name = NA_character_, suggested_name = aligned_name, - taxonomic_status_genus = "unplaced" + taxonomic_status_genus = "genus unplaced by APC", + taxonomic_status = "genus unplaced by APC", ) } @@ -379,8 +379,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources, dplyr::filter(species_and_infraspecific(taxon_rank)) %>% dplyr::distinct(canonical_name, .keep_all = TRUE) %>% dplyr::select(canonical_name, accepted_name_usage_ID) - - + split_taxa_table <- resources$APC %>% dplyr::filter(species_and_infraspecific(taxon_rank)) %>% @@ -495,8 +494,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources, dplyr::select( aligned_name, taxonomic_status_aligned, - accepted_name_usage_ID, - scientific_name_ID + 
accepted_name_usage_ID ) ) %>% ## Second, find accepted names for each name in the species (and infraspecific taxon) list (sometimes they are the same) @@ -515,6 +513,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources, accepted_name, taxonomic_status, scientific_name, + scientific_name_ID, family, subclass, taxon_distribution @@ -548,7 +547,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources, suggested_name = ifelse(is.na(suggested_name), aligned_name, suggested_name), taxonomic_status = ifelse(is.na(accepted_name), taxonomic_status_aligned, "accepted"), # for APC-accepted species, the `genus` is the first word of the `accepted_name` - genus_accepted = stringr::word(suggested_name, 1), + genus_accepted = extract_genus(suggested_name), taxon_ID_genus = resources$genera_all$taxon_ID[match(genus_accepted, resources$genera_all$canonical_name)], update_reason = taxonomic_status_aligned, taxonomic_dataset = "APC", @@ -629,7 +628,10 @@ update_taxonomy_APNI_species_and_infraspecific_taxa <- function(data, resources) ## have proper columns filled in (`genus` & `taxonomic_ID_genus` & `taxonomic_dataset_genus`), ## while APNI names that do not align with an APC-accepted genus have these columns set to NA dplyr::mutate( - genus = ifelse(is.na(accepted_name_usage_ID_genus), NA_character_, resources$genera_all$canonical_name[match(accepted_name_usage_ID_genus, resources$genera_all$taxon_ID)]), + genus = ifelse( + is.na(accepted_name_usage_ID_genus), + genus, + resources$genera_all$canonical_name[match(accepted_name_usage_ID_genus, resources$genera_all$taxon_ID)]), taxon_ID_genus = resources$genera_all$taxon_ID[match(genus, resources$genera_all$canonical_name)], taxonomic_dataset_genus = ifelse(stringr::str_detect(taxonomic_dataset_genus, "APC"), "APC", taxonomic_dataset_genus) ) %>% diff --git a/tests/testthat/benchmarks/consistency_lookup.csv b/tests/testthat/benchmarks/consistency_lookup.csv index 
8afe1363..45386000 100644 --- a/tests/testthat/benchmarks/consistency_lookup.csv +++ b/tests/testthat/benchmarks/consistency_lookup.csv @@ -7,7 +7,7 @@ Acacia paraneura,Acacia paraneura,Exact match of taxon name to an APC-accepted c Athrotaxis laxiflolia,Arthrotaxis laxifolia,Fuzzy match of taxon name to an APC-known canonical name once punctuation and filler words are removed (2023-08-17),APC,https://id.biodiversity.org.au/instance/apni/951456,orthographic variant,NA,https://id.biodiversity.org.au/node/apni/7555184,Athrotaxis x laxifolia,Hook.,https://id.biodiversity.org.au/name/apni/245642,species,accepted,Cupressaceae,Pinidae,Tas,https://id.biodiversity.org.au/node/apni/7555184,Athrotaxis Banksia integrifolia,Banksia integrifolia,Exact match of taxon name to an APC-accepted canonical name once punctuation and filler words are removed (2023-08-17),APC,https://id.biodiversity.org.au/node/apni/2892579,accepted,misapplied,https://id.biodiversity.org.au/node/apni/2892579,Banksia integrifolia,L.f.,https://id.biodiversity.org.au/name/apni/107602,species,accepted,Proteaceae,Magnoliidae,"WA (naturalised), Qld, NSW, NI (naturalised), Vic, Tas",https://id.biodiversity.org.au/node/apni/2892579,Banksia Commersonia rosea,Commersonia rosea,Exact match of taxon name to an APC-known canonical name once punctuation and filler words are removed (2023-08-17),APC,https://id.biodiversity.org.au/instance/apni/924361,basionym,NA,https://id.biodiversity.org.au/node/apni/2902836,Androcalva rosea,(S.A.J.Bell & L.M.Copel.) C.F.Wilkins & Whitlock,https://id.biodiversity.org.au/name/apni/188176,species,accepted,Malvaceae,Magnoliidae,NSW,https://id.biodiversity.org.au/node/apni/2902836,Androcalva -Galactia striata,Galactia sp. [Galactia striata],Exact match of the first word of the taxon name to an APC-accepted genus (2023-08-17),APNI,NA,unplaced by APC,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +Galactia striata,Galactia sp. 
[Galactia striata],Exact match of the first word of the taxon name to an APC-accepted genus (2023-08-17),APNI,NA,unplaced by APC,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,Galactia Genoplesium insigne,Genoplesium insigne,Exact match of taxon name to an APC-known canonical name once punctuation and filler words are removed (2023-08-17),APC,https://id.biodiversity.org.au/instance/apni/51441225,nomenclatural synonym,NA,https://id.biodiversity.org.au/taxon/apni/51441228,Corunastylis insignis,(D.L.Jones) D.L.Jones & M.A.Clem.,https://id.biodiversity.org.au/name/apni/167942,species,accepted,Orchidaceae,Magnoliidae,NSW,https://id.biodiversity.org.au/taxon/apni/51441228,Corunastylis Hibbertia sericea,Hibbertia sericea,Exact match of taxon name to an APC-accepted canonical name once punctuation and filler words are removed (2023-08-17),APC,https://id.biodiversity.org.au/instance/apni/912933,pro parte misapplied,misapplied | pro parte misapplied,https://id.biodiversity.org.au/node/apni/2899370,Hibbertia crinita,Toelken,https://id.biodiversity.org.au/name/apni/90852,species,accepted,Dilleniaceae,Magnoliidae,"SA, Vic",https://id.biodiversity.org.au/node/apni/2899370,Hibbertia Hibbertia sericea,Hibbertia sericea,Exact match of taxon name to an APC-accepted canonical name once punctuation and filler words are removed (2023-08-17),APC,https://id.biodiversity.org.au/instance/apni/912932,pro parte misapplied,misapplied | pro parte misapplied,https://id.biodiversity.org.au/node/apni/2899370,Hibbertia crinita,Toelken,https://id.biodiversity.org.au/name/apni/90852,species,accepted,Dilleniaceae,Magnoliidae,"SA, Vic",https://id.biodiversity.org.au/node/apni/2899370,Hibbertia @@ -43,7 +43,7 @@ Hibbertia sericea,Hibbertia sericea,Exact match of taxon name to an APC-accepted Hibbertia sericea,Hibbertia sericea,Exact match of taxon name to an APC-accepted canonical name once punctuation and filler words are removed (2023-08-17),APC,https://id.biodiversity.org.au/instance/apni/916524,pro parte 
misapplied,misapplied | pro parte misapplied,https://id.biodiversity.org.au/node/apni/2901708,Hibbertia simulans,Toelken,https://id.biodiversity.org.au/name/apni/90852,species,accepted,Dilleniaceae,Magnoliidae,NSW,https://id.biodiversity.org.au/node/apni/2901708,Hibbertia Hibbertia sericea,Hibbertia sericea,Exact match of taxon name to an APC-accepted canonical name once punctuation and filler words are removed (2023-08-17),APC,https://id.biodiversity.org.au/instance/apni/916556,pro parte misapplied,misapplied | pro parte misapplied,https://id.biodiversity.org.au/node/apni/2910589,Hibbertia superans,Toelken,https://id.biodiversity.org.au/name/apni/90852,species,accepted,Dilleniaceae,Magnoliidae,NSW,https://id.biodiversity.org.au/node/apni/2910589,Hibbertia Hibbertia sericea,Hibbertia sericea,Exact match of taxon name to an APC-accepted canonical name once punctuation and filler words are removed (2023-08-17),APC,https://id.biodiversity.org.au/instance/apni/916566,pro parte misapplied,misapplied | pro parte misapplied,https://id.biodiversity.org.au/node/apni/2915755,Hibbertia villifera,Tepper ex Toelken,https://id.biodiversity.org.au/name/apni/90852,species,accepted,Dilleniaceae,Magnoliidae,SA,https://id.biodiversity.org.au/node/apni/2915755,Hibbertia -Hibbertia sp.,Hibbertia sp.,Exact match of taxon name ending with `sp.` to an APC accepted genus (2023-08-17),APNI,NA,unplaced by APC,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +Hibbertia sp.,Hibbertia sp.,Exact match of taxon name ending with `sp.` to an APC accepted genus (2023-08-17),APNI,NA,unplaced by APC,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,Hibbertia Hibbertia stricta,Hibbertia stricta,Exact match of taxon name to an APC-accepted canonical name once punctuation and filler words are removed (2023-08-17),APC,https://id.biodiversity.org.au/instance/apni/910062,pro parte misapplied,taxonomic synonym | misapplied | pro parte misapplied,https://id.biodiversity.org.au/node/apni/2906127,Hibbertia 
australis,N.A.Wakef.,https://id.biodiversity.org.au/name/apni/91036,species,accepted,Dilleniaceae,Magnoliidae,"SA, Vic",https://id.biodiversity.org.au/node/apni/2906127,Hibbertia Hibbertia stricta,Hibbertia stricta,Exact match of taxon name to an APC-accepted canonical name once punctuation and filler words are removed (2023-08-17),APC,https://id.biodiversity.org.au/instance/apni/916725,pro parte misapplied,taxonomic synonym | misapplied | pro parte misapplied,https://id.biodiversity.org.au/node/apni/2915804,Hibbertia devitata,Toelken,https://id.biodiversity.org.au/name/apni/91036,species,accepted,Dilleniaceae,Magnoliidae,"SA, Vic",https://id.biodiversity.org.au/node/apni/2915804,Hibbertia Hibbertia stricta,Hibbertia stricta,Exact match of taxon name to an APC-accepted canonical name once punctuation and filler words are removed (2023-08-17),APC,https://id.biodiversity.org.au/instance/apni/916770,pro parte misapplied,taxonomic synonym | misapplied | pro parte misapplied,https://id.biodiversity.org.au/taxon/apni/51300211,Hibbertia glebosa,Toelken,https://id.biodiversity.org.au/name/apni/91036,species,accepted,Dilleniaceae,Magnoliidae,SA,https://id.biodiversity.org.au/taxon/apni/51300211,Hibbertia diff --git a/tests/testthat/test-alignment_executes.R b/tests/testthat/test-alignment_executes.R index e87f004d..f204b515 100644 --- a/tests/testthat/test-alignment_executes.R +++ b/tests/testthat/test-alignment_executes.R @@ -191,7 +191,7 @@ test_that("handles APNI taxa and genus level IDs",{ original_name <- c("Acacia sp.", "Dendropanax amplifolius", "Acanthopanax divaricatum", "Eucalyptus sp.") taxon_rank <- c("genus", "species", "species", "genus") taxonomic_dataset <- c("APC", "APNI", "APNI", "APC") - genus_updated <- c("Acacia", NA, NA, "Eucalyptus") + genus_updated <- c("Acacia", "Dendropanax", "Acanthopanax", "Eucalyptus") out1 <- align_taxa(original_name, resources = resources)