
Commit cc54d1b

Merge pull request #203 from cmu-delphi/nsspForecast
initial nssp covid forecast
2 parents 965fce9 + e351fee

8 files changed: +807 additions, -94 deletions

.lintr

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@ linters: linters_with_defaults(
   line_length_linter(120),
   cyclocomp_linter = NULL,
   object_length_linter(length = 40L),
+  object_usage_linter = NULL,
   commented_code_linter = NULL
 )
 exclusions: list(
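
A side note on the linter change (not part of the commit, and the motivation is an assumption): `object_usage_linter` commonly reports false positives for variables that only appear inside tidy-evaluation pipelines, so disabling it quiets those warnings. To re-lint with the updated config:

```r
# Re-run lintr over the repository with the updated .lintr; any previous
# object_usage_linter findings should no longer be reported.
lintr::lint_dir(".")
```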

R/aux_data_utils.R

Lines changed: 14 additions & 2 deletions

@@ -687,10 +687,22 @@ up_to_date_nssp_state_archive <- function(disease = c("covid", "influenza")) {
     issues = "*"
   )
   nssp_state %>%
-    select(geo_value, time_value, issue, nssp = value) %>%
+    select(geo_value, time_value, version = issue, nssp = value) %>%
+    bind_rows(get_nssp_github()) %>%
     as_epi_archive(compactify = TRUE) %>%
     extract2("DT") %>%
     # End of week to midweek correction.
-    mutate(time_value = time_value + 3) %>%
+    mutate(time_value = floor_date(time_value, "week", week_start = 7) + 3) %>%
     as_epi_archive(compactify = TRUE)
 }
+
+get_nssp_github <- function() {
+  raw_file <- read_csv("https://raw.githubusercontent.com/CDCgov/covid19-forecast-hub/refs/heads/main/auxiliary-data/nssp-raw-data/latest.csv")
+  state_map <- get_population_data() %>% filter(state_id != "usa")
+  raw_file %>%
+    filter(county == "All") %>%
+    left_join(state_map, by = join_by(geography == state_name)) %>%
+    select(geo_value = state_id, time_value = week_end, nssp = percent_visits_covid) %>%
+    mutate(time_value = floor_date(time_value, "week", week_start = 7) + 3) %>%
+    mutate(version = Sys.Date())
+}
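
A usage sketch, not part of the diff, of how the new GitHub fallback feeds the archive. It assumes the packages this file already relies on (dplyr, readr, lubridate, epiprocess, magrittr) are attached and that `get_population_data()` is available elsewhere in the repo:

```r
# Build the versioned NSSP archive: Socrata issues plus the CDC GitHub
# snapshot, which get_nssp_github() stamps with version = Sys.Date().
nssp_archive <- up_to_date_nssp_state_archive("covid")

# The archive's data.table holds one row per (geo_value, time_value, version).
nssp_archive$DT %>%
  dplyr::filter(geo_value == "ca") %>%
  head()
```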

R/utils.R

Lines changed: 104 additions & 7 deletions

@@ -626,7 +626,6 @@ delete_files_from_s3 <- function(keys, bucket, batch_size = 500, .progress = TRUE) {
 
 
 MIN_TIMESTAMP <- as.POSIXct("2000-01-01 00:00:00S", tz = "UTC")
-MAX_TIMESTAMP <- as.POSIXct("2040-01-01 00:00:00S", tz = "UTC")
 
 #' Get the last time a covidcast signal was updated.
 #'

@@ -674,37 +673,40 @@ get_s3_object_last_modified <- function(key, bucket, missing_value = MIN_TIMESTAMP) {
 #' @param dataset_url The URL of the Socrata dataset.
 #'
 #' @return The last updated date of the Socrata dataset in POSIXct format.
-get_socrata_updated_at <- function(dataset_url, missing_value = MAX_TIMESTAMP) {
+get_socrata_updated_at <- function(dataset_url, missing_value) {
   tryCatch(
     {
-      httr::with_config(
+      rowsUpdatedAt <- httr::with_config(
         httr::config(timeout = 5),
         httr::RETRY("GET", dataset_url, times = 5, pause_min = 5, pause_cap = 5)
       ) %>%
         httr::content() %>%
         # This field comes in as integer seconds since epoch, so we need to convert it.
-        pluck("rowsUpdatedAt") %>%
-        as.POSIXct(origin = "1970-01-01", tz = "UTC")
+        pluck("rowsUpdatedAt")
+      if (is.null(rowsUpdatedAt)) {
+        return(missing_value)
+      }
+      rowsUpdatedAt %>% as.POSIXct(origin = "1970-01-01", tz = "UTC")
     },
     error = function(cond) {
       return(missing_value)
     }
   )
 }
 
-
 #' get the unique shared (geo_value, forecast_date, target_end_date) tuples present for each forecaster in `forecasts`
 get_unique <- function(forecasts) {
   forecasters <- forecasts %>%
     pull(forecaster) %>%
     unique()
   distinct <- map(
     forecasters,
-    \(x)
+    \(x) {
       forecasts %>%
         filter(forecaster == x) %>%
         select(geo_value, forecast_date, target_end_date) %>%
         distinct()
+    }
   )
   distinct_dates <- reduce(
     distinct,
@@ -746,3 +748,98 @@ filter_shared_geo_dates <- function(
     inner_join(viable_dates, by = c("geo_value", "forecast_date", "target_end_date"))
   )
 }
+
+
+#' Calculate MD5 hash of a file
+#'
+#' This function reads a file into memory, calculates an MD5 hash of the
+#' binary data, and returns the hash as a character string.
+#'
+#' @param file The path to the file to hash
+#' @param algorithm The hash algorithm to use. Defaults to "md5".
+get_file_hash <- function(file, algorithm = "md5") {
+  readBin(file, what = "raw", n = file.size(file)) %>%
+    digest::digest(algo = algorithm, serialize = FALSE)
+}
+
+#' Calculate MD5 hash of a tibble as Parquet data
+#'
+#' This function takes a tibble, writes it to a Parquet file in memory,
+#' and calculates an MD5 hash of the resulting binary data. This is useful
+#' for creating content-based hashes of data that can be used for caching
+#' or detecting changes in data.
+#'
+#' @param df A tibble or data frame to hash
+#' @param algorithm The hash algorithm to use. Defaults to "md5".
+#'   Other options include "sha1", "sha256", "crc32", etc.
+#'
+#' @return A character string containing the MD5 hash
+#'
+#' @examples
+#' \dontrun{
+#' library(dplyr)
+#' data <- tibble(x = 1:5, y = letters[1:5])
+#' hash <- get_tibble_hash(data)
+#' print(hash)
+#' }
+#'
+#' @export
+get_tibble_hash <- function(df, algorithm = "md5") {
+  temp_file <- tempfile(fileext = ".parquet")
+  on.exit(unlink(temp_file), add = TRUE)
+  nanoparquet::write_parquet(df, temp_file)
+  get_file_hash(temp_file, algorithm = algorithm)
+}
+
+#' Compare an S3 ETag with local hashes
+#'
+#' This function downloads a file from S3, calculates various hashes of the
+#' binary data, and compares them to the ETag of the S3 object. A test to verify
+#' that I understand how S3 ETags are computed.
+#'
+#' @param bucket The name of the S3 bucket.
+#' @param key The key of the S3 object.
+compare_s3_etag <- function(bucket, key, region = "us-east-1") {
+  # Download file to temp location
+  temp_file <- tempfile()
+  on.exit(unlink(temp_file), add = TRUE)
+
+  # Download from S3
+  aws.s3::save_object(object = key, bucket = bucket, file = temp_file, region = region)
+
+  # Get S3 metadata to extract ETag
+  s3_meta <- aws.s3::head_object(object = key, bucket = bucket, region = region)
+
+  # Extract ETag (remove quotes if present)
+  s3_etag <- gsub('"', '', attr(s3_meta, "etag"))
+
+  # Calculate various hashes of the local file
+  raw_data <- readBin(temp_file, "raw", file.info(temp_file)$size)
+
+  hashes <- list(
+    md5 = digest::digest(raw_data, algo = "md5", serialize = FALSE),
+    sha1 = digest::digest(raw_data, algo = "sha1", serialize = FALSE),
+    sha256 = digest::digest(raw_data, algo = "sha256", serialize = FALSE),
+    crc32 = digest::digest(raw_data, algo = "crc32", serialize = FALSE)
+  )
+
+  # Compare results
+  cat("S3 ETag:", s3_etag, "\n")
+  cat("Local hashes:\n")
+  for (name in names(hashes)) {
+    match_indicator <- if (hashes[[name]] == s3_etag) " ✓ MATCH" else ""
+    cat(sprintf(" %s: %s%s\n", name, hashes[[name]], match_indicator))
+  }
+
+  # Check if it's a multipart upload (contains hyphen)
+  if (grepl("-", s3_etag)) {
+    cat("\nNote: ETag contains hyphen - this was likely a multipart upload\n")
+    cat("Multipart ETags are MD5 of concatenated part MD5s, plus part count\n")
+  }
+
+  invisible(list(
+    s3_etag = s3_etag,
+    local_hashes = hashes,
+    file_size = file.info(temp_file)$size
+  ))
+}
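
A small, illustrative sketch of the new hashing helpers (not part of the commit); it assumes the digest and nanoparquet packages are installed, as the helpers require:

```r
# Two writes of the same tibble should yield identical parquet bytes and
# therefore identical hashes (assuming nanoparquet writes deterministically),
# which is what makes hash-based duplicate detection work downstream.
df <- tibble::tibble(geo_value = c("ca", "ny"), nssp = c(1.2, 3.4))
identical(get_tibble_hash(df), get_tibble_hash(df))

# get_file_hash() works on any file and accepts other digest algorithms.
path <- tempfile(fileext = ".csv")
write.csv(df, path, row.names = FALSE)
get_file_hash(path, algorithm = "sha256")
```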

scripts/build_nhsn_archive.R

Lines changed: 66 additions & 26 deletions

@@ -47,7 +47,9 @@ config <- list(
   prelim_metadata_url = "https://data.cdc.gov/api/views/mpgq-jmmr",
   raw_file_name_prefix = "nhsn_data_raw",
   s3_bucket = "forecasting-team-data",
-  archive_s3_key = "nhsn_data_archive.parquet"
+  archive_s3_key = "nhsn_data_archive.parquet",
+  local_raw_cache_path = "cache/nhsn_raw_cache",
+  hash_archive_file = "nhsn_hash_archive.parquet"
 )
 
 

@@ -79,6 +81,7 @@ get_last_raw_update_at <- function(type = c("raw", "prelim"), missing_value = MIN_TIMESTAMP) {
   )
 }
 
+
 #' Download the latest NHSN data from Socrata
 #'
 #' This function downloads the latest NHSN data from Socrata, if it has been

@@ -87,44 +90,81 @@ get_last_raw_update_at <- function(type = c("raw", "prelim"), missing_value = MIN_TIMESTAMP) {
 #'
 #' @param verbose Whether to print verbose output.
 update_nhsn_data_raw <- function() {
-  # If this request fails (which occurs surprisingly often, eyeroll), we
-  # will just return a future date (2040-01-01) and download anyway.
-  raw_update_at <- get_socrata_updated_at(config$raw_metadata_url)
-  # Same here.
-  prelim_update_at <- get_socrata_updated_at(config$prelim_metadata_url)
+  current_time <- with_tz(Sys.time(), tzone = "UTC")
+  # WARNING: These Socrata metadata fields have been unreliable. If they fail, they
+  # default to current time, which will trigger a download and then we compare
+  # with hash archive.
+  raw_update_at <- get_socrata_updated_at(config$raw_metadata_url, missing_value = current_time)
+  prelim_update_at <- get_socrata_updated_at(config$prelim_metadata_url, missing_value = current_time)
+  # Get the last time the raw data was updated from S3.
   last_raw_file_update_at <- get_last_raw_update_at("raw")
   last_prelim_file_update_at <- get_last_raw_update_at("prelim")
 
+  # Some derived values for logging and file naming.
+  raw_update_at_local <- with_tz(raw_update_at)
+  raw_update_at_formatted <- format(raw_update_at, "%Y-%m-%d_%H-%M-%OS5")
+  raw_file <- glue("{config$raw_file_name_prefix}_{raw_update_at_formatted}.parquet")
+  local_file_path <- here::here(config$local_raw_cache_path, raw_file)
+  prelim_update_at_local <- with_tz(prelim_update_at)
+  prelim_update_at_formatted <- format(prelim_update_at, "%Y-%m-%d_%H-%M-%OS5")
+  prelim_file <- glue("{config$raw_file_name_prefix}_{prelim_update_at_formatted}_prelim.parquet")
+  local_prelim_file_path <- here::here(config$local_raw_cache_path, prelim_file)
+  hash_archive_path <- here::here(config$local_raw_cache_path, config$hash_archive_file)
+
+  # Open the hash archive file.
+  hash_archive <- nanoparquet::read_parquet(hash_archive_path)
+
+  # If the raw data has been updated or there was a failure getting metadata,
+  # download it.
   if (raw_update_at > last_raw_file_update_at) {
-    raw_update_at_local <- with_tz(raw_update_at)
     cli_inform("The raw data has been updated at {raw_update_at_local} (UTC: {raw_update_at}).")
-    raw_update_at_formatted <- format(raw_update_at, "%Y-%m-%d_%H-%M-%OS5")
-    raw_file <- glue("{config$raw_file_name_prefix}_{raw_update_at_formatted}.parquet")
     cli_inform("Downloading the raw data... {raw_file}")
-    read_csv(config$raw_query_url) %>% s3write_using(write_parquet, object = raw_file, bucket = config$s3_bucket)
+    read_csv(config$raw_query_url) %>% write_parquet(local_file_path)
+
+    # Get the hash of the raw file.
+    raw_file_hash <- get_file_hash(local_file_path)
+
+    # If the raw file hash is not in the archive, add it to S3 and local file.
+    if (!raw_file_hash %in% hash_archive$hash) {
+      hash_archive <- bind_rows(hash_archive, tibble(file = raw_file, hash = raw_file_hash))
+      cli_inform("Adding raw file to S3 and local cache.")
+
+      # Back up the raw file to S3.
+      # s3write_using(write_parquet, object = raw_file, bucket = config$s3_bucket)
+
+      # Write the hash archive back to the file.
+      write_parquet(hash_archive, hash_archive_path)
+    } else {
+      cli_inform("New raw file is a duplicate, removing from local cache.")
+      unlink(local_file_path)
+    }
   }
 
+  # If the prelim data has been updated or there was a failure getting metadata,
+  # download it.
   if (prelim_update_at > last_prelim_file_update_at) {
-    prelim_update_at_local <- with_tz(prelim_update_at)
     cli_inform("The prelim data has been updated at {prelim_update_at_local} (UTC: {prelim_update_at}).")
-    prelim_update_at_formatted <- format(prelim_update_at, "%Y-%m-%d_%H-%M-%OS5")
-    prelim_file <- glue("{config$raw_file_name_prefix}_{prelim_update_at_formatted}_prelim.parquet")
     cli_inform("Downloading the prelim data... {prelim_file}")
-    read_csv(config$prelim_query_url) %>% s3write_using(write_parquet, object = prelim_file, bucket = config$s3_bucket)
-  }
+    read_csv(config$prelim_query_url) %>% write_parquet(local_prelim_file_path)
 
-  # Since we may have downloaded a duplicate file above, filter out the ones
-  # that have the same ETag. (I don't feel like rederiving AWS S3's ETag field
-  # and computing ahead of time.)
-  delete_df <- delete_duplicates_from_s3_by_etag(config$s3_bucket, config$raw_file_name_prefix, dry_run = FALSE)
-  if (nrow(delete_df) > 0) {
-    cli_inform("Deleted {nrow(delete_df)} duplicate files from S3.")
-    cli_inform("Deleted files:")
-    cli_inform(paste0(" - ", delete_df$Key))
-  } else {
-    cli_inform("No duplicate files to delete.")
+    # Get the hash of the prelim file.
+    prelim_file_hash <- get_file_hash(local_prelim_file_path)
+
+    # If the prelim file hash is not in the archive, add it to S3 and local file.
+    if (!prelim_file_hash %in% hash_archive$hash) {
+      hash_archive <- bind_rows(hash_archive, tibble(file = prelim_file, hash = prelim_file_hash))
+      cli_inform("Adding prelim file to S3 and local cache.")
+
+      # Back up the prelim file to S3.
+      # s3write_using(write_parquet, object = prelim_file, bucket = config$s3_bucket)
+
+      # Write the hash archive back to the file.
+      write_parquet(hash_archive, hash_archive_path)
+    } else {
+      cli_inform("New prelim file is a duplicate, removing from local cache.")
+      unlink(local_prelim_file_path)
+    }
   }
-  cli_inform("Finished fetching NHSN data.")
 }
 
 #' Process Raw NHSN Data File
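
One operational assumption worth noting (not something this diff creates): `update_nhsn_data_raw()` now reads the hash archive unconditionally, so the local cache needs to exist with a seed file in the expected `file`/`hash` schema before the first run. A hypothetical one-time bootstrap:

```r
# Hypothetical bootstrap of the local cache directory and an empty hash archive
# matching the columns the script appends to (file, hash).
cache_dir <- here::here("cache/nhsn_raw_cache")
dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)
nanoparquet::write_parquet(
  tibble::tibble(file = character(), hash = character()),
  file.path(cache_dir, "nhsn_hash_archive.parquet")
)
```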

scripts/covid_geo_exclusions.csv

Lines changed: 15 additions & 0 deletions

@@ -12,6 +12,21 @@ forecast_date,forecaster,geo_value,weight
 2024-10-01, climate_geo_agged, all, 0
 2024-10-01, climate_quantile_extrapolated, all, 0
 ##################
+# Jun 25
+##################
+2024-10-01, all, mp, 0
+2024-10-01, windowed_seasonal, all, 0.0001
+2024-10-01, windowed_seasonal_extra_sources, all, 3
+2024-10-01, climate_linear, all, 0.0001
+2024-10-01, linear, all, 3
+2024-10-01, linearlog, all, 0
+2024-10-01, climate_base, all, 2
+2024-10-01, climate_geo_agged, all, 0
+2024-10-01, climate_quantile_extrapolated, all, 0.001
+2024-10-01, windowed_seasonal, ak, 10
+2024-10-01, windowed_seasonal_extra_sources, ak, 0.001
+2024-10-01, climate_linear, ak, 0.001
+##################
 # April 30th
 ##################
 2025-04-30, all, mp, 0
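
For reference, an assumption about how this weights file is consumed rather than anything shown in the diff: the `#` banner rows mean any reader has to skip comment lines, e.g.:

```r
# Read the per-forecaster geo weights; comment = "#" drops the banner rows and
# read_csv trims the space after each comma by default.
weights <- readr::read_csv(
  "scripts/covid_geo_exclusions.csv",
  comment = "#",
  show_col_types = FALSE
)
```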
