Skip to content

Commit

Permalink
simplifying data download and adding a tryCatch (#169)
Browse files Browse the repository at this point in the history
removing contentid for the short term and simplifying data download.

---------

Co-authored-by: Elizabeth Wenk <[email protected]>
Co-authored-by: Fonti Kar <[email protected]>
  • Loading branch information
3 people authored Nov 13, 2023
1 parent 4d99afd commit 607a952
Show file tree
Hide file tree
Showing 7 changed files with 162 additions and 106 deletions.
1 change: 0 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ Imports:
forcats,
tibble,
dplyr,
contentid,
stringr,
stringi,
arrow,
Expand Down
210 changes: 120 additions & 90 deletions R/load_taxonomic_resources.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,90 +26,94 @@ load_taxonomic_resources <-
on.exit(message("...done"))

taxonomic_resources <-
dataset_access_function(version = version,
path = NULL,
type = stable_or_current_data)
dataset_access_function(
version = version,
path = tools::R_user_dir("APCalign"),
type = stable_or_current_data
)
names(taxonomic_resources) <- c("APC", "APNI")

## todo :review this, why zzz
### Note: Use `zzzz zzzz` because the fuzzy matching algorithm can't handle NAs
zzz <- "zzzz zzzz"

taxonomic_resources$APC <- taxonomic_resources$APC %>%
rename(
taxon_ID = .data$taxonID,
taxon_rank = .data$taxonRank,
name_type = .data$nameType,
taxonomic_status = .data$taxonomicStatus,
pro_parte = .data$proParte,
scientific_name = .data$scientificName,
scientific_name_ID = .data$scientificNameID,
accepted_name_usage_ID = .data$acceptedNameUsageID,
accepted_name_usage = .data$acceptedNameUsage,
canonical_name = .data$canonicalName,
scientific_name_authorship = .data$scientificNameAuthorship,
taxon_rank_sort_order = .data$taxonRankSortOrder,
taxon_remarks = .data$taxonRemarks,
taxon_distribution = .data$taxonDistribution,
higher_classification = .data$higherClassification,
nomenclatural_code = .data$nomenclaturalCode,
dataset_name = .data$datasetName
) %>%
mutate(
genus = stringr::word(canonical_name, 1),
taxon_rank = stringr::str_to_lower(taxon_rank),
taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"),
taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"),
taxon_rank = stringr::str_replace(taxon_rank, "ordo", "order"),
taxon_rank = stringr::str_replace(taxon_rank, "familia", "family"),
taxon_rank = stringr::str_replace(taxon_rank, "varietas", "variety"),
taxon_rank = stringr::str_replace(taxon_rank, "forma", "form"),
taxon_rank = stringr::str_replace(taxon_rank, "sectio", "section")
)

taxonomic_resources$APNI <- taxonomic_resources$APNI %>%
rename(
name_type = .data$nameType,
taxonomic_status = .data$taxonomicStatus,
taxon_rank = .data$taxonRank,
scientific_name = .data$scientificName,
scientific_name_ID = .data$scientificNameID,
canonical_name = .data$canonicalName,
scientific_name_authorship = .data$scientificNameAuthorship,
taxon_rank_sort_order = .data$taxonRankSortOrder,
nomenclatural_code = .data$nomenclaturalCode,
dataset_name = .data$datasetName,
name_element = .data$nameElement
) %>%
mutate(
genus = stringr::word(canonical_name, 1),
taxon_rank = stringr::str_to_lower(taxon_rank),
taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"),
taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"),
taxon_rank = stringr::str_replace(taxon_rank, "ordo", "order"),
taxon_rank = stringr::str_replace(taxon_rank, "familia", "family"),
taxon_rank = stringr::str_replace(taxon_rank, "varietas", "variety"),
taxon_rank = stringr::str_replace(taxon_rank, "forma", "form"),
taxon_rank = stringr::str_replace(taxon_rank, "sectio", "section")
)

taxonomic_resources$APC <- taxonomic_resources$APC %>%
rename(
taxon_ID = .data$taxonID,
taxon_rank = .data$taxonRank,
name_type = .data$nameType,
taxonomic_status = .data$taxonomicStatus,
pro_parte = .data$proParte,
scientific_name = .data$scientificName,
scientific_name_ID = .data$scientificNameID,
accepted_name_usage_ID = .data$acceptedNameUsageID,
accepted_name_usage = .data$acceptedNameUsage,
canonical_name = .data$canonicalName,
scientific_name_authorship = .data$scientificNameAuthorship,
taxon_rank_sort_order = .data$taxonRankSortOrder,
taxon_remarks = .data$taxonRemarks,
taxon_distribution = .data$taxonDistribution,
higher_classification = .data$higherClassification,
nomenclatural_code = .data$nomenclaturalCode,
dataset_name = .data$datasetName
) %>%
mutate(
genus = extract_genus(canonical_name),
taxon_rank = stringr::str_to_lower(taxon_rank),
taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"),
taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"),
taxon_rank = stringr::str_replace(taxon_rank, "ordo", "order"),
taxon_rank = stringr::str_replace(taxon_rank, "familia", "family"),
taxon_rank = stringr::str_replace(taxon_rank, "varietas", "variety"),
taxon_rank = stringr::str_replace(taxon_rank, "forma", "form"),
taxon_rank = stringr::str_replace(taxon_rank, "sectio", "section")
)
taxonomic_resources$APNI <- taxonomic_resources$APNI %>%
rename(
name_type = .data$nameType,
taxonomic_status = .data$taxonomicStatus,
taxon_rank = .data$taxonRank,
scientific_name = .data$scientificName,
scientific_name_ID = .data$scientificNameID,
canonical_name = .data$canonicalName,
scientific_name_authorship = .data$scientificNameAuthorship,
taxon_rank_sort_order = .data$taxonRankSortOrder,
nomenclatural_code = .data$nomenclaturalCode,
dataset_name = .data$datasetName,
name_element = .data$nameElement
) %>%
mutate(
genus = extract_genus(canonical_name),
taxon_rank = stringr::str_to_lower(taxon_rank),
taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"),
taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"),
taxon_rank = stringr::str_replace(taxon_rank, "ordo", "order"),
taxon_rank = stringr::str_replace(taxon_rank, "familia", "family"),
taxon_rank = stringr::str_replace(taxon_rank, "varietas", "variety"),
taxon_rank = stringr::str_replace(taxon_rank, "forma", "form"),
taxon_rank = stringr::str_replace(taxon_rank, "sectio", "section")
)
APC_tmp <-
taxonomic_resources$APC %>%
dplyr::arrange(taxonomic_status) %>%
dplyr::filter(taxon_rank %in% c("subspecies", "species", "form", "variety")) %>%
dplyr::filter(!stringr::str_detect(canonical_name, "[:space:]sp\\.$")) %>%
dplyr::select(canonical_name,
scientific_name,
taxonomic_status,
taxon_ID,
scientific_name_ID,
accepted_name_usage_ID,
name_type,
taxon_rank,
genus) %>%
dplyr::select(
canonical_name,
scientific_name,
taxonomic_status,
taxon_ID,
scientific_name_ID,
accepted_name_usage_ID,
name_type,
taxon_rank,
genus
) %>%
dplyr::mutate(
# strip_names removes punctuation and filler words associated with infraspecific taxa (subsp, var, f, ser)
stripped_canonical = strip_names(canonical_name),
stripped_canonical = strip_names(canonical_name),
## strip_names_2 removes punctuation, filler words associated with infraspecific taxa (subsp, var, f, ser), and filler words associated with species name cases (x, sp)
## strip_names_2 is essential for matches involving 2 or 3 words, since filler words should not be counted among those words
stripped_canonical2 = strip_names_2(canonical_name),
Expand All @@ -121,7 +125,7 @@ load_taxonomic_resources <-
),
binomial = ifelse(is.na(binomial), zzz, binomial),
binomial = base::replace(binomial, duplicated(binomial), zzz),
genus = stringr::word(stripped_canonical, 1),
genus = extract_genus(stripped_canonical),
trinomial = stringr::word(stripped_canonical2, start = 1, end = 3),
trinomial = ifelse(is.na(trinomial), zzz, trinomial),
trinomial = base::replace(trinomial, duplicated(trinomial), zzz),
Expand All @@ -143,7 +147,11 @@ load_taxonomic_resources <-
taxonomic_resources$APNI %>%
dplyr::filter(name_element != "sp.") %>%
dplyr::filter(!canonical_name %in% APC_tmp$canonical_name) %>%
dplyr::select(canonical_name, scientific_name, scientific_name_ID, name_type, taxon_rank) %>%
dplyr::select(canonical_name,
scientific_name,
scientific_name_ID,
name_type,
taxon_rank) %>%
dplyr::filter(taxon_rank %in% c("series", "subspecies", "species", "form", "variety")) %>%
dplyr::mutate(
taxonomic_status = "unplaced for APC",
Expand All @@ -159,7 +167,7 @@ load_taxonomic_resources <-
trinomial = stringr::word(stripped_canonical2, start = 1, end = 3),
trinomial = ifelse(is.na(trinomial), "zzzz zzzz", trinomial),
trinomial = base::replace(trinomial, duplicated(trinomial), "zzzz zzzz"),
genus = stringr::word(stripped_canonical, 1),
genus = extract_genus(stripped_canonical),
taxonomic_dataset = "APNI"
) %>%
dplyr::distinct() %>%
Expand Down Expand Up @@ -232,7 +240,7 @@ load_taxonomic_resources <-
taxonomic_resources[["family_accepted"]] <-
taxonomic_resources$APC %>%
dplyr::filter(taxon_rank %in% c("family"), taxonomic_status == "accepted")

return(taxonomic_resources)
}

Expand Down Expand Up @@ -262,7 +270,7 @@ load_taxonomic_resources <-
##' @noRd
dataset_access_function <-
function(version = default_version(),
path = NULL,
path = tools::R_user_dir("APCalign"),
type = "stable") {
if (type == "stable") {
return(dataset_get(version, path))
Expand Down Expand Up @@ -316,10 +324,11 @@ dataset_access_function <-
#'
#'
#' @noRd
default_version <- function(){
default_version <- function() {
# Get all the releases
output <- gh::gh("GET /repos/{owner}/{repo}/releases",
owner = "traitecoevo", repo = "APCalign")
owner = "traitecoevo",
repo = "APCalign")

# Determine how many versions there are
length(output)
Expand All @@ -336,35 +345,56 @@ default_version <- function(){
dataset_get <- function(version = default_version(),
path = tools::R_user_dir("APCalign")) {
#APC
url <-
apc.url <-
paste0(
"https://github.com/traitecoevo/APCalign/releases/download/",
version,
"/apc.parquet"
)
apc_hash <- contentid::register(url)
apc_file <- contentid::resolve(apc_hash, store = TRUE, path = path)
APC <- arrow::read_parquet(apc_file)

#APNI
url <-
apni.url <-
paste0(
"https://github.com/traitecoevo/APCalign/releases/download/",
version,
"/apni.parquet"
)

apni_hash <- contentid::register(url)
apni_file <- contentid::resolve(apni_hash, store = TRUE, path = path)

#only getting APNI names that are not in APC
APNI <- arrow::open_dataset(apni_file) %>% dplyr::filter(!.data$canonicalName %in% APC$canonicalName) %>% dplyr::collect()
download_and_read_parquet <- function(url, path_to_file) {
tryCatch({
utils::download.file(url, path_to_file, mode = "wb")
message("File downloaded successfully.")
return(arrow::read_parquet(path_to_file))
}, error = function(e) {
message(
"The whole internet or just the server may be down; error in downloading or reading the file: ",
e$message
)
return(NULL)
})
}

if (!dir.exists(path)) {
dir.create(path, recursive = TRUE)
}

path_to_apc <- file.path(path, paste0("apc", version, ".parquet"))
path_to_apni <-
file.path(path, paste0("apni", version, ".parquet"))

APC <- if (!file.exists(path_to_apc)) {
download_and_read_parquet(apc.url, path_to_apc)
} else {
arrow::read_parquet(path_to_apc)
}

APNI <- if (!file.exists(path_to_apni)) {
download_and_read_parquet(apni.url, path_to_apni)
} else {
arrow::read_parquet(path_to_apni)
}

#combine
current_list <- list(APC, APNI)
names(current_list) <- c("APC", "APNI")
return(current_list)
}

2 changes: 1 addition & 1 deletion R/match_taxa.R
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ match_taxa <- function(
update_na_with(strip_names_2(cleaned_name)),
trinomial = stringr::word(stripped_name2, start = 1, end = 3),
binomial = stringr::word(stripped_name2, start = 1, end = 2),
genus = stringr::word(original_name, start = 1, end = 1),
genus = extract_genus(original_name),
fuzzy_match_genus =
fuzzy_match_genera(genus, resources$genera_accepted$canonical_name),
fuzzy_match_genus_known =
Expand Down
25 changes: 25 additions & 0 deletions R/standardise_names.R
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,28 @@ standardise_names <- function(taxon_names) {
stringr::str_squish()
}

#' Extract the genus from a taxon name
#'
#' Returns the genus portion of each scientific name. When the first word
#' of a name is the hybrid marker "x", the genus is taken to be the first two
#' words (e.g. "x Cynochloris"); otherwise it is the first word alone
#' (e.g. "Banksia").
#'
#' @param taxon_name Character vector of taxon names; words are split
#'   with `stringr::word()`.
#'
#' @return A vector the same length as `taxon_name` holding the genus of
#'   each name.
#'
#' @examples
#' extract_genus(c("Banksia serrata", "x Cynochloris macivorii"))
#'
#' @keywords internal
#' @noRd
extract_genus <- function(taxon_name) {
  # The leading word decides whether the name denotes a hybrid genus.
  first_word <- stringr::word(taxon_name, 1)
  is_hybrid <- first_word == "x"

  ifelse(
    is_hybrid,
    stringr::word(taxon_name, start = 1, end = 2),
    first_word
  )
}
Loading

0 comments on commit 607a952

Please sign in to comment.