@@ -47,7 +47,9 @@ config <- list(
   prelim_metadata_url = "https://data.cdc.gov/api/views/mpgq-jmmr",
   raw_file_name_prefix = "nhsn_data_raw",
   s3_bucket = "forecasting-team-data",
-  archive_s3_key = "nhsn_data_archive.parquet"
+  archive_s3_key = "nhsn_data_archive.parquet",
+  local_raw_cache_path = "cache/nhsn_raw_cache",
+  hash_archive_file = "nhsn_hash_archive.parquet"
 )


@@ -79,6 +81,7 @@ get_last_raw_update_at <- function(type = c("raw", "prelim"), missing_value = MI
   )
 }

+
 #' Download the latest NHSN data from Socrata
 #'
 #' This function downloads the latest NHSN data from Socrata, if it has been
@@ -87,44 +90,81 @@ get_last_raw_update_at <- function(type = c("raw", "prelim"), missing_value = MI
 #'
 #' @param verbose Whether to print verbose output.
 update_nhsn_data_raw <- function() {
-  # If this request fails (which occurs surprisingly often, eyeroll), we
-  # will just return a future date (2040-01-01) and download anyway.
-  raw_update_at <- get_socrata_updated_at(config$raw_metadata_url)
-  # Same here.
-  prelim_update_at <- get_socrata_updated_at(config$prelim_metadata_url)
+  current_time <- with_tz(Sys.time(), tzone = "UTC")
+  # WARNING: These Socrata metadata fields have been unreliable. If they fail, they
+  # default to current time, which will trigger a download and then we compare
+  # with hash archive.
+  raw_update_at <- get_socrata_updated_at(config$raw_metadata_url, missing_value = current_time)
+  prelim_update_at <- get_socrata_updated_at(config$prelim_metadata_url, missing_value = current_time)
+  # Get the last time the raw data was updated from S3.
   last_raw_file_update_at <- get_last_raw_update_at("raw")
   last_prelim_file_update_at <- get_last_raw_update_at("prelim")

+  # Some derived values for logging and file naming.
+  raw_update_at_local <- with_tz(raw_update_at)
+  raw_update_at_formatted <- format(raw_update_at, "%Y-%m-%d_%H-%M-%OS5")
+  raw_file <- glue("{config$raw_file_name_prefix}_{raw_update_at_formatted}.parquet")
+  local_file_path <- here::here(config$local_raw_cache_path, raw_file)
+  prelim_update_at_local <- with_tz(prelim_update_at)
+  prelim_update_at_formatted <- format(prelim_update_at, "%Y-%m-%d_%H-%M-%OS5")
+  prelim_file <- glue("{config$raw_file_name_prefix}_{prelim_update_at_formatted}_prelim.parquet")
+  local_prelim_file_path <- here::here(config$local_raw_cache_path, prelim_file)
+  hash_archive_path <- here::here(config$local_raw_cache_path, config$hash_archive_file)
+
+  # Open the hash archive file.
+  hash_archive <- nanoparquet::read_parquet(hash_archive_path)
+
+  # If the raw data has been updated or there was a failure getting metadata,
+  # download it.
   if (raw_update_at > last_raw_file_update_at) {
-    raw_update_at_local <- with_tz(raw_update_at)
     cli_inform("The raw data has been updated at {raw_update_at_local} (UTC: {raw_update_at}).")
-    raw_update_at_formatted <- format(raw_update_at, "%Y-%m-%d_%H-%M-%OS5")
-    raw_file <- glue("{config$raw_file_name_prefix}_{raw_update_at_formatted}.parquet")
     cli_inform("Downloading the raw data... {raw_file}")
-    read_csv(config$raw_query_url) %>% s3write_using(write_parquet, object = raw_file, bucket = config$s3_bucket)
+    read_csv(config$raw_query_url) %>% write_parquet(local_file_path)
+
+    # Get the hash of the raw file.
+    raw_file_hash <- get_file_hash(local_file_path)
+
+    # If the raw file hash is not in the archive, add it to S3 and local file.
+    if (!raw_file_hash %in% hash_archive$hash) {
+      hash_archive <- bind_rows(hash_archive, tibble(file = raw_file, hash = raw_file_hash))
+      cli_inform("Adding raw file to S3 and local cache.")
+
+      # Back up the raw file to S3.
+      # s3write_using(write_parquet, object = raw_file, bucket = config$s3_bucket)
+
+      # Write the hash archive back to the file.
+      write_parquet(hash_archive, hash_archive_path)
+    } else {
+      cli_inform("New raw file is a duplicate, removing from local cache.")
+      unlink(local_file_path)
+    }
   }

+  # If the prelim data has been updated or there was a failure getting metadata,
+  # download it.
   if (prelim_update_at > last_prelim_file_update_at) {
-    prelim_update_at_local <- with_tz(prelim_update_at)
     cli_inform("The prelim data has been updated at {prelim_update_at_local} (UTC: {prelim_update_at}).")
-    prelim_update_at_formatted <- format(prelim_update_at, "%Y-%m-%d_%H-%M-%OS5")
-    prelim_file <- glue("{config$raw_file_name_prefix}_{prelim_update_at_formatted}_prelim.parquet")
     cli_inform("Downloading the prelim data... {prelim_file}")
-    read_csv(config$prelim_query_url) %>% s3write_using(write_parquet, object = prelim_file, bucket = config$s3_bucket)
-  }
+    read_csv(config$prelim_query_url) %>% write_parquet(local_prelim_file_path)

-  # Since we may have downloaded a duplicate file above, filter out the ones
-  # that have the same ETag. (I don't feel like rederiving AWS S3's ETag field
-  # and computing ahead of time.)
-  delete_df <- delete_duplicates_from_s3_by_etag(config$s3_bucket, config$raw_file_name_prefix, dry_run = FALSE)
-  if (nrow(delete_df) > 0) {
-    cli_inform("Deleted {nrow(delete_df)} duplicate files from S3.")
-    cli_inform("Deleted files:")
-    cli_inform(paste0("- ", delete_df$Key))
-  } else {
-    cli_inform("No duplicate files to delete.")
+    # Get the hash of the prelim file.
+    prelim_file_hash <- get_file_hash(local_prelim_file_path)
+
+    # If the prelim file hash is not in the archive, add it to S3 and local file.
+    if (!prelim_file_hash %in% hash_archive$hash) {
+      hash_archive <- bind_rows(hash_archive, tibble(file = prelim_file, hash = prelim_file_hash))
+      cli_inform("Adding prelim file to S3 and local cache.")
+
+      # Back up the prelim file to S3.
+      # s3write_using(write_parquet, object = prelim_file, bucket = config$s3_bucket)
+
+      # Write the hash archive back to the file.
+      write_parquet(hash_archive, hash_archive_path)
+    } else {
+      cli_inform("New prelim file is a duplicate, removing from local cache.")
+      unlink(local_prelim_file_path)
+    }
   }
-  cli_inform("Finished fetching NHSN data.")
 }

 #' Process Raw NHSN Data File
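
Note: get_file_hash() is called above but not defined in this diff. A minimal sketch of such a helper, assuming base R's tools::md5sum() is an acceptable hash function (the project's actual helper may differ):

# Hypothetical sketch; the real get_file_hash() is defined elsewhere in the repo.
get_file_hash <- function(file_path) {
  # tools::md5sum() returns a named character vector; drop the name to get a plain hash string.
  unname(tools::md5sum(file_path))
}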
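
Note: nanoparquet::read_parquet(hash_archive_path) will fail if the cache file does not exist yet. A possible first-run guard, assuming the two-column file/hash schema implied by the bind_rows() calls above (not part of this commit):

# Hypothetical bootstrap: create an empty hash archive before the first read.
if (!file.exists(hash_archive_path)) {
  dir.create(dirname(hash_archive_path), recursive = TRUE, showWarnings = FALSE)
  nanoparquet::write_parquet(
    tibble::tibble(file = character(), hash = character()),
    hash_archive_path
  )
}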