simplifying data download and adding a tryCatch (#169)

removing contentid for the short term and simplifying data download. --------- Co-authored-by: Elizabeth Wenk <[email protected]> Co-authored-by: Fonti Kar <[email protected]>
traitecoevo · Nov 13, 2023 · 607a952 · 607a952
1 parent 4d99afd
commit 607a952
Show file tree

Hide file tree

Showing 7 changed files with 162 additions and 106 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -21,7 +21,6 @@ Imports:
     forcats,
     tibble,
     dplyr,
-    contentid,
     stringr,
     stringi,
     arrow,

diff --git a/R/load_taxonomic_resources.R b/R/load_taxonomic_resources.R
@@ -26,90 +26,94 @@ load_taxonomic_resources <-
     on.exit(message("...done"))
 
     taxonomic_resources <-
-      dataset_access_function(version = version,
-                              path = NULL,
-                              type = stable_or_current_data)
+      dataset_access_function(
+        version = version,
+        path = tools::R_user_dir("APCalign"),
+        type = stable_or_current_data
+      )
     names(taxonomic_resources) <- c("APC", "APNI")
 
     ## TODO: review this -- why zzz?
     ### Note: Use `zzzz zzzz` because the fuzzy matching algorithm can't handle NAs
     zzz <- "zzzz zzzz"
 
-  taxonomic_resources$APC <- taxonomic_resources$APC %>%
-    rename(
-      taxon_ID = .data$taxonID,
-      taxon_rank = .data$taxonRank,
-      name_type = .data$nameType,
-      taxonomic_status = .data$taxonomicStatus,
-      pro_parte = .data$proParte,
-      scientific_name = .data$scientificName,
-      scientific_name_ID = .data$scientificNameID,
-      accepted_name_usage_ID = .data$acceptedNameUsageID,
-      accepted_name_usage = .data$acceptedNameUsage,
-      canonical_name = .data$canonicalName,
-      scientific_name_authorship = .data$scientificNameAuthorship,
-      taxon_rank_sort_order = .data$taxonRankSortOrder,
-      taxon_remarks = .data$taxonRemarks,
-      taxon_distribution = .data$taxonDistribution,
-      higher_classification = .data$higherClassification,
-      nomenclatural_code = .data$nomenclaturalCode,
-      dataset_name = .data$datasetName
-    ) %>%
-    mutate(
-      genus = stringr::word(canonical_name, 1),
-      taxon_rank = stringr::str_to_lower(taxon_rank),
-      taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"),
-      taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"),
-      taxon_rank = stringr::str_replace(taxon_rank, "ordo", "order"),
-      taxon_rank = stringr::str_replace(taxon_rank, "familia", "family"),
-      taxon_rank = stringr::str_replace(taxon_rank, "varietas", "variety"),
-      taxon_rank = stringr::str_replace(taxon_rank, "forma", "form"),
-      taxon_rank = stringr::str_replace(taxon_rank, "sectio", "section")
-    )
-
-  taxonomic_resources$APNI <- taxonomic_resources$APNI %>%
-    rename(
-      name_type = .data$nameType,
-      taxonomic_status = .data$taxonomicStatus,
-      taxon_rank = .data$taxonRank,
-      scientific_name = .data$scientificName,
-      scientific_name_ID = .data$scientificNameID,
-      canonical_name = .data$canonicalName,
-      scientific_name_authorship = .data$scientificNameAuthorship,
-      taxon_rank_sort_order = .data$taxonRankSortOrder,
-      nomenclatural_code = .data$nomenclaturalCode,
-      dataset_name = .data$datasetName,
-      name_element = .data$nameElement
-    )  %>%
-    mutate(
-      genus = stringr::word(canonical_name, 1),
-      taxon_rank = stringr::str_to_lower(taxon_rank),
-      taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"),
-      taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"),
-      taxon_rank = stringr::str_replace(taxon_rank, "ordo", "order"),
-      taxon_rank = stringr::str_replace(taxon_rank, "familia", "family"),
-      taxon_rank = stringr::str_replace(taxon_rank, "varietas", "variety"),
-      taxon_rank = stringr::str_replace(taxon_rank, "forma", "form"),
-      taxon_rank = stringr::str_replace(taxon_rank, "sectio", "section")
-    )
-
+    taxonomic_resources$APC <- taxonomic_resources$APC %>%
+      rename(
+        taxon_ID = .data$taxonID,
+        taxon_rank = .data$taxonRank,
+        name_type = .data$nameType,
+        taxonomic_status = .data$taxonomicStatus,
+        pro_parte = .data$proParte,
+        scientific_name = .data$scientificName,
+        scientific_name_ID = .data$scientificNameID,
+        accepted_name_usage_ID = .data$acceptedNameUsageID,
+        accepted_name_usage = .data$acceptedNameUsage,
+        canonical_name = .data$canonicalName,
+        scientific_name_authorship = .data$scientificNameAuthorship,
+        taxon_rank_sort_order = .data$taxonRankSortOrder,
+        taxon_remarks = .data$taxonRemarks,
+        taxon_distribution = .data$taxonDistribution,
+        higher_classification = .data$higherClassification,
+        nomenclatural_code = .data$nomenclaturalCode,
+        dataset_name = .data$datasetName
+      ) %>%
+      mutate(
+        genus = extract_genus(canonical_name),
+        taxon_rank = stringr::str_to_lower(taxon_rank),
+        taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"),
+        taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"),
+        taxon_rank = stringr::str_replace(taxon_rank, "ordo", "order"),
+        taxon_rank = stringr::str_replace(taxon_rank, "familia", "family"),
+        taxon_rank = stringr::str_replace(taxon_rank, "varietas", "variety"),
+        taxon_rank = stringr::str_replace(taxon_rank, "forma", "form"),
+        taxon_rank = stringr::str_replace(taxon_rank, "sectio", "section")
+      )
+    
+    taxonomic_resources$APNI <- taxonomic_resources$APNI %>%
+      rename(
+        name_type = .data$nameType,
+        taxonomic_status = .data$taxonomicStatus,
+        taxon_rank = .data$taxonRank,
+        scientific_name = .data$scientificName,
+        scientific_name_ID = .data$scientificNameID,
+        canonical_name = .data$canonicalName,
+        scientific_name_authorship = .data$scientificNameAuthorship,
+        taxon_rank_sort_order = .data$taxonRankSortOrder,
+        nomenclatural_code = .data$nomenclaturalCode,
+        dataset_name = .data$datasetName,
+        name_element = .data$nameElement
+      )  %>%
+      mutate(
+        genus = extract_genus(canonical_name),
+        taxon_rank = stringr::str_to_lower(taxon_rank),
+        taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"),
+        taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"),
+        taxon_rank = stringr::str_replace(taxon_rank, "ordo", "order"),
+        taxon_rank = stringr::str_replace(taxon_rank, "familia", "family"),
+        taxon_rank = stringr::str_replace(taxon_rank, "varietas", "variety"),
+        taxon_rank = stringr::str_replace(taxon_rank, "forma", "form"),
+        taxon_rank = stringr::str_replace(taxon_rank, "sectio", "section")
+      )
+    
     APC_tmp <-
       taxonomic_resources$APC %>%
       dplyr::arrange(taxonomic_status) %>%
       dplyr::filter(taxon_rank %in% c("subspecies", "species", "form", "variety")) %>%
       dplyr::filter(!stringr::str_detect(canonical_name, "[:space:]sp\\.$")) %>%
-      dplyr::select(canonical_name,
-                    scientific_name,
-                    taxonomic_status,
-                    taxon_ID,
-                    scientific_name_ID,
-                    accepted_name_usage_ID,
-                    name_type,
-                    taxon_rank,
-                    genus) %>%
+      dplyr::select(
+        canonical_name,
+        scientific_name,
+        taxonomic_status,
+        taxon_ID,
+        scientific_name_ID,
+        accepted_name_usage_ID,
+        name_type,
+        taxon_rank,
+        genus
+      ) %>%
       dplyr::mutate(
         # strip_names removes punctuation and filler words associated with infraspecific taxa (subsp, var, f, ser)
-        stripped_canonical = strip_names(canonical_name),        
+        stripped_canonical = strip_names(canonical_name),
         ## strip_names_2 removes punctuation, filler words associated with infraspecific taxa (subsp, var, f, ser), and filler words associated with species name cases (x, sp)
         ## strip_names_2 is essential for matches involving 2 or 3 words, since filler words should not count towards those words
         stripped_canonical2 = strip_names_2(canonical_name),
@@ -121,7 +125,7 @@ load_taxonomic_resources <-
         ),
         binomial = ifelse(is.na(binomial), zzz, binomial),
         binomial = base::replace(binomial, duplicated(binomial), zzz),
-        genus = stringr::word(stripped_canonical, 1),
+        genus = extract_genus(stripped_canonical),
         trinomial = stringr::word(stripped_canonical2, start = 1, end = 3),
         trinomial = ifelse(is.na(trinomial), zzz, trinomial),
         trinomial = base::replace(trinomial, duplicated(trinomial), zzz),
@@ -143,7 +147,11 @@ load_taxonomic_resources <-
       taxonomic_resources$APNI %>%
       dplyr::filter(name_element != "sp.") %>%
       dplyr::filter(!canonical_name %in% APC_tmp$canonical_name) %>%
-      dplyr::select(canonical_name, scientific_name, scientific_name_ID, name_type, taxon_rank) %>%
+      dplyr::select(canonical_name,
+                    scientific_name,
+                    scientific_name_ID,
+                    name_type,
+                    taxon_rank) %>%
       dplyr::filter(taxon_rank %in% c("series", "subspecies", "species", "form", "variety")) %>%
       dplyr::mutate(
         taxonomic_status = "unplaced for APC",
@@ -159,7 +167,7 @@ load_taxonomic_resources <-
         trinomial = stringr::word(stripped_canonical2, start = 1, end = 3),
         trinomial = ifelse(is.na(trinomial), "zzzz zzzz", trinomial),
         trinomial = base::replace(trinomial, duplicated(trinomial), "zzzz zzzz"),
-        genus = stringr::word(stripped_canonical, 1),
+        genus = extract_genus(stripped_canonical),
         taxonomic_dataset = "APNI"
       ) %>%
       dplyr::distinct() %>%
@@ -232,7 +240,7 @@ load_taxonomic_resources <-
     taxonomic_resources[["family_accepted"]] <-
       taxonomic_resources$APC %>%
       dplyr::filter(taxon_rank %in% c("family"), taxonomic_status == "accepted")
-
+    
     return(taxonomic_resources)
   }
 
@@ -262,7 +270,7 @@ load_taxonomic_resources <-
 ##' @noRd
 dataset_access_function <-
   function(version = default_version(),
-           path = NULL,
+           path = tools::R_user_dir("APCalign"),
            type = "stable") {
     if (type == "stable") {
       return(dataset_get(version, path))
@@ -316,10 +324,11 @@ dataset_access_function <-
 #'
 #'
 #' @noRd
-default_version <- function(){
+default_version <- function() {
   # Get all the releases
   output <- gh::gh("GET /repos/{owner}/{repo}/releases",
-                   owner = "traitecoevo", repo = "APCalign")
+                   owner = "traitecoevo",
+                   repo = "APCalign")
 
   # Determine how many versions there are
   length(output)
@@ -336,35 +345,56 @@ default_version <- function(){
 dataset_get <- function(version = default_version(),
                         path = tools::R_user_dir("APCalign")) {
   #APC
-  url <-
+  apc.url <-
     paste0(
       "https://github.com/traitecoevo/APCalign/releases/download/",
       version,
       "/apc.parquet"
     )
-  apc_hash <- contentid::register(url)
-  apc_file <- contentid::resolve(apc_hash, store = TRUE, path = path)
-  APC <- arrow::read_parquet(apc_file)
 
-  #APNI
-  url <-
+  apni.url <-
     paste0(
       "https://github.com/traitecoevo/APCalign/releases/download/",
       version,
       "/apni.parquet"
     )
-
-  apni_hash <- contentid::register(url)
-  apni_file <- contentid::resolve(apni_hash, store = TRUE, path = path)
 
-  #only getting APNI names that are not in APC
-  APNI <- arrow::open_dataset(apni_file) %>% dplyr::filter(!.data$canonicalName %in% APC$canonicalName) %>% dplyr::collect()
+  download_and_read_parquet <- function(url, path_to_file) {
+    tryCatch({
+      utils::download.file(url, path_to_file, mode = "wb")
+      message("File downloaded successfully.")
+      return(arrow::read_parquet(path_to_file))
+    }, error = function(e) {
+      message(
+        "The whole internet or just the server may be down; error in downloading or reading the file: ",
+        e$message
+      )
+      return(NULL)
+    })
+  }
 
+  if (!dir.exists(path)) {
+    dir.create(path, recursive = TRUE)
+  }
 
+  path_to_apc <- file.path(path, paste0("apc", version, ".parquet"))
+  path_to_apni <-
+    file.path(path, paste0("apni", version, ".parquet"))
+
+  APC <- if (!file.exists(path_to_apc)) {
+    download_and_read_parquet(apc.url, path_to_apc)
+  } else {
+    arrow::read_parquet(path_to_apc)
+  }
+
+  APNI <- if (!file.exists(path_to_apni)) {
+    download_and_read_parquet(apni.url, path_to_apni)
+  } else {
+    arrow::read_parquet(path_to_apni)
+  }
 
   #combine
   current_list <- list(APC, APNI)
   names(current_list) <- c("APC", "APNI")
   return(current_list)
 }
-
diff --git a/R/match_taxa.R b/R/match_taxa.R
@@ -84,7 +84,7 @@ match_taxa <- function(
         update_na_with(strip_names_2(cleaned_name)),
       trinomial = stringr::word(stripped_name2, start = 1, end = 3),
       binomial = stringr::word(stripped_name2, start = 1, end = 2),
-      genus = stringr::word(original_name, start = 1, end = 1),
+      genus = extract_genus(original_name),
       fuzzy_match_genus =
         fuzzy_match_genera(genus, resources$genera_accepted$canonical_name),
       fuzzy_match_genus_known =

diff --git a/R/standardise_names.R b/R/standardise_names.R
@@ -98,3 +98,28 @@ standardise_names <- function(taxon_names) {
     stringr::str_squish()
 }
 
+#' Extract Genus
+#'
+#' Extracts the genus component of a scientific name.
+#' A name whose first word is "x" is treated as having a hybrid genus, and
+#' the first two words of the taxon name are extracted (e.g. "x Cynochloris"),
+#' while for a non-hybrid genus just the first word is extracted (e.g. "Banksia").
+#'
+#' @param taxon_name Character vector of taxon names, passed to `stringr::word()`.
+#'
+#' @return The genus for each scientific name in `taxon_name`.
+#'
+#' @examples
+#' genus <- extract_genus(c("Banksia serrata", "x Cynochloris macivorii"))
+#' 
+#' @keywords internal
+#' @noRd
+extract_genus <- function(taxon_name) {
+  genus <- 
+    ifelse(
+      stringr::word(taxon_name, 1) == "x", # first word "x" marks a hybrid genus
+      stringr::word(taxon_name, start = 1, end = 2), # hybrid: keep "x" plus the genus word
+      stringr::word(taxon_name, 1) # otherwise the first word alone
+    )
+  genus
+}