-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 2176eaf
Showing
43 changed files
with
3,312 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
.Rproj.user | ||
.Rhistory | ||
.RData | ||
.Ruserdata | ||
emoji-cheat-sheet.com |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
|
||
|
||
##------------------------------------------------------------------------------ | ||
## INITIALIZE | ||
##------------------------------------------------------------------------------ | ||
|
||
library(geneorama) | ||
sourceDir("R/functions/", trace = FALSE) | ||
geneorama::loadinstall_libraries(c("sp", "rgdal")) | ||
|
||
##------------------------------------------------------------------------------
## GET DATA
##------------------------------------------------------------------------------

## refresh_wnv() and open_latest_wnv_file() come from R/functions/
refresh_wnv()
wnv <- open_latest_wnv_file()

## Pick the Oracle client library and credential file by operating system
if (Sys.info()[["sysname"]] != "Linux") {
    ## Windows / RODBC
    geneorama::loadinstall_libraries("RODBC")
    oracle_traps <- download_oracle_traps(
        credential_file = "untracked/zdt_credentials.txt")
} else {
    ## Linux / ROracle
    geneorama::loadinstall_libraries("ROracle")
    oracle_traps <- download_oracle_traps_linux(
        credential_file = "untracked/zdt_credentials_prod.txt")
}
|
||
##------------------------------------------------------------------------------
## PROCESS ORACLE DATA
##------------------------------------------------------------------------------

## Filter out bad records, simplify & rename columns, and combine a few records
## (all done inside filter_oracle_trap_xy, using the address repair list)
address_fix_list <- fread("untracked/oracle_traps_address_repair_list.csv")
oracle_traps <- filter_oracle_trap_xy(oracle_traps, address_fix_list)

## Convert the state plane coordinates (OX, OY) to latitude / longitude and
## store them next to the originals as OLAT / OLON
oracle_latlon <- oracle_traps[, stateplane2latlon(OX, OY)]
oracle_traps[, `:=`(OLAT = oracle_latlon$latitude,
                    OLON = oracle_latlon$longitude)]
rm(oracle_latlon)
|
||
##------------------------------------------------------------------------------
## PROCESS DATA PORTAL DATA
##------------------------------------------------------------------------------

## Replace "CDC" with "GRAVID" in trap_type so the values match oracle
## (`:=` updates wnv by reference, so no reassignment is needed)
wnv[, trap_type := gsub("CDC", "GRAVID", trap_type)]

## Convert latitude / longitude to state plane coordinates and keep the
## rounded values as X and Y
wnv_xy <- latlon2stateplane(wnv$latitude, wnv$longitude)
wnv[, c("X", "Y") := list(round(wnv_xy$x), round(wnv_xy$y))]
rm(wnv_xy)
|
||
##------------------------------------------------------------------------------
## MERGE ORACLE TRAP DATA INTO DATA PORTAL DATA
##------------------------------------------------------------------------------

## Cast the join key to integer on the oracle side
## (presumably to match the type of wnv$season_year -- confirm)
oracle_traps[, season_year := as.integer(season_year)]

## Oracle data has detail for each year, so the year is part of the join key.
## merge() defaults to an inner join: wnv rows without an oracle match drop out.
wnv <- merge(x = wnv,
             y = oracle_traps,
             by = c("trap", "season_year", "trap_type"))
|
||
##------------------------------------------------------------------------------
## CALCULATE UNIQUE ID BASED ON TRAP, BLOCK, LAT/LON, AND TYPE
##------------------------------------------------------------------------------

## One row per distinct combination of the columns below. Counting with .N and
## keyby is only a way to get the unique, sorted combinations.
id_table <- wnv[, .N,
                keyby = list(trap,
                             block,
                             oracle_address = address,
                             latitude, longitude,
                             X = round(X),
                             Y = round(Y),
                             OLAT,
                             OLON,
                             OX = round(OX),
                             OY = round(OY),
                             census_block,
                             trap_type,
                             ## Distance between the data portal and oracle
                             ## coordinates, in state plane units
                             distance_difference = sqrt((X - OX)^2 +
                                                        (Y - OY)^2))]
## Remove N, it's not needed
id_table[, N := NULL]

## Add in id, which is in the format of "id###"
## seq_len(.N) is safe when id_table is empty; 1:nrow() would yield c(1, 0)
id_table[, id := sprintf("id%0.3i", seq_len(.N))]
|
||
##------------------------------------------------------------------------------
## SAVE RESULTS AND ANNOUNCE FINISH FOR LOG
##------------------------------------------------------------------------------

## Write the id table to disk, then print a confirmation line
saveRDS(object = id_table, file = "data/10_calculate_idtable.Rds")
cat("file data/10_calculate_idtable.Rds has been created \n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,276 @@ | ||
|
||
|
||
##------------------------------------------------------------------------------ | ||
## INITIALIZE / IMPORT DATA | ||
##------------------------------------------------------------------------------ | ||
|
||
library(geneorama) | ||
sourceDir("R/functions/", trace = FALSE) | ||
loadinstall_libraries(c("fishmethods", "ggplot2", "labeling", "caret", "sp")) | ||
|
||
## Refresh helpers come from R/functions/
refresh_noaa()
refresh_wnv()

## Open every file in data/ whose name matches "noaa_values"
noaa_files <- list.files(path = "data/",
                         pattern = "noaa_values",
                         full.names = TRUE)
noaa <- open_noaa_files(noaa_files)

# noaa_files <- list.files("data/", pattern="noaa_attr", full.names=TRUE)
# noaa_attrs <- open_noaa_files(noaa_files)

## WNV test results, the trap id table, and the ward boundaries
wnv_original <- open_latest_wnv_file()
id_table <- readRDS(file = "data/10_calculate_idtable.Rds")

ward_map <- readRDS(file = "data/BoundariesWards.Rds")
|
||
##------------------------------------------------------------------------------
## MERGE IN UNIQUE TRAP ID (BASED ON TRAP, BLOCK, LAT/LON, AND TYPE)
##------------------------------------------------------------------------------

## Merge in trap id created in previous script.
## latitude, longitude and trap_type are dropped from the WNV data and taken
## from id_table instead (the O* columns are renamed to the plain names).
## NOTE(review): the join key is only (trap, block). If id_table holds more
## than one row for a trap / block pair, WNV rows are duplicated -- confirm.
wnv <- merge(x = wnv_original[i = TRUE,
                              j = .SD,
                              .SDcols = -c("latitude", "longitude",
                                           "trap_type")],
             y = id_table[, list(trap, trap_type, block, id,
                                 latitude = OLAT, longitude = OLON,
                                 X = OX, Y = OY, census_block)],
             by = c("trap", "block"),
             sort = FALSE)
|
||
##------------------------------------------------------------------------------
## FILTER / CLEAN UP SPECIES
##------------------------------------------------------------------------------

## Record counts by species and year, before recoding
dcast(wnv, species ~ year(date), fun.aggregate = length, value.var = "result")
# clipper(dcast(wnv, species~year(date), fun.aggregate = length, value.var = "result"))

## Species that are collapsed into the single "other" label
other_species <- c("CULEX ERRATICUS", "CULEX SALINARIUS",
                   "CULEX TARSALIS", "UNSPECIFIED CULEX")

## Short labels for the species that keep their own category.
## The "pipiens_restauns" spelling is kept as is: later column references
## in this script use the same spelling.
spec_labels <- c("CULEX PIPIENS" = "pipiens",
                 "CULEX PIPIENS/RESTUANS" = "pipiens_restauns",
                 "CULEX RESTUANS" = "restuans",
                 "CULEX TERRITANS" = "territans")

## Start from the raw species name, then overwrite the known values
wnv[, spec := species]
wnv[species %in% other_species, spec := "other"]
for (species_name in names(spec_labels)) {
    wnv[species == species_name, spec := spec_labels[[species_name]]]
}
rm(spec_labels, species_name)

## Record counts by recoded species and year
dcast(wnv, spec ~ year(date), fun.aggregate = length, value.var = "result")
# clipper(dcast(wnv, spec~year(date), fun.aggregate = length, value.var = "result"))


# dcast(wnv[result==TRUE], species~year(date), fun.aggregate = length, value.var = "result")
# dcast(wnv[result==FALSE], species~year(date), fun.aggregate = length, value.var = "result")
# dcast(wnv, species~year(date), fun.aggregate = length, value.var = "result")
|
||
##------------------------------------------------------------------------------
## Fix dates
## "week" is a better measure of the date than the date field, because there are
## often errors with multiple measures in a single week
##------------------------------------------------------------------------------

## The main collection date has shifted over time from Tuesday, to Friday, and
## now it's Thursday. This table shows record counts by season and weekday.
dcast(data = wnv[, .N, by = list(season_year, wday = wday(date))],
      formula = season_year ~ wday,
      value.var = "N")

## Keep the reported date as date_orig and rebuild date from the week number:
## the year-rounded original date plus (week / 52) * 365 days.
## NOTE(review): round(date_orig, "year") presumably gives the first day of
## the year (round.IDate) -- confirm the class of date_orig.
setnames(wnv, old = "date", new = "date_orig")
wnv[, date := as.IDate(round(date_orig, "year") + (week / 52) * 365)]
setcolorder(wnv,
            c('season_year', 'week', 'test_id', 'block', 'trap', 'trap_type',
              'date_orig', 'number_of_mosquitoes', 'result', 'species',
              'id', 'date', 'latitude', 'longitude', 'X', 'Y',
              'census_block', 'spec'))

## Distribution of the difference between rebuilt and original dates
wnv[, .N, keyby = list(date - date_orig)]
|
||
##------------------------------------------------------------------------------
## CONVERT DATA TO WIDE
##------------------------------------------------------------------------------

## One row per date / trap id; one column per spec x result combination,
## holding the summed mosquito count (0 where a combination has no records)
dat <- dcast(wnv,
             date + season_year + week + id + trap + latitude + longitude +
                 X + Y + census_block + trap_type ~ spec + result,
             value.var = "number_of_mosquitoes",
             fun.aggregate = sum, fill = 0, na.rm = TRUE)

## setcolorder() errors if any listed column is missing, so this also acts as
## a check that every spec x result combination exists in the data
setcolorder(dat, c('season_year', 'week', 'date', 'trap', 'id', 'trap_type',
                   'latitude', 'longitude', 'X', 'Y', 'census_block',
                   'pipiens_restauns_TRUE', 'pipiens_restauns_FALSE',
                   'pipiens_TRUE', 'pipiens_FALSE',
                   'restuans_TRUE', 'restuans_FALSE',
                   'territans_TRUE', 'territans_FALSE',
                   'other_TRUE', 'other_FALSE'))

# ## Add in row count from original WNV data
# dat <- merge(dat,
#              wnv[ , .N, list(date, week, id, trap, trap_type)],
#              by = c("date", "week", "id", "trap", "trap_type"))
# dat

# encoding_matrix <- get_encoding_list(dat)

NAsummary(dat)
|
||
##------------------------------------------------------------------------------
## GEOCODE WARD
##------------------------------------------------------------------------------

## Manual process: point-in-polygon lookup of each trap against the ward map

## Promote dat to a spatial object using longitude / latitude as coordinates
sp::coordinates(dat) <- c("longitude", "latitude")

## Show the ward map projection, then reproject the map to WGS84 long / lat
ward_map@proj4string
ward_map <- sp::spTransform(ward_map,
                            sp::CRS("+proj=longlat +datum=WGS84"))

## Give the points the same projection as the map, then overlay
dat@proj4string <- ward_map@proj4string
system.time(geo <- sp::over(dat, ward_map))

## Back to a data.table, with the ward from the overlay attached
## NOTE(review): if geo$ward is a factor, as.integer() returns level codes
## rather than ward numbers -- confirm the column type in ward_map.
dat <- as.data.table(dat)
dat[, ward := as.integer(geo$ward)]

## chigeocodR process
# system.time(addrs <- dat[ , chigeocodR::reverseGeocode(lat = latitude,
#                                                        lon = longitude)])
# system.time(wards <- chigeocodR::forwardGeocode(streetAddresses = addrs$address,
#                                                 geoTypes = "Ward"))
# setnames(addrs, c("lat", "lon"), c("latitude", "longitude"))
# wards <- as.data.table(wards)
# dat <- merge(dat, addrs, c("latitude", "longitude"))
# dat <- merge(dat, wards, c("address"))
# setnames(dat, "geovalues.Ward", "ward")
|
||
|
||
##------------------------------------------------------------------------------
## TRAP OBSERVATION COUNT FOR CREDIBILITY
## IN THE MODEL WE CAN EXCLUDE TRAPS WITH VERY LOW COUNTS
##------------------------------------------------------------------------------

## Number of rows in dat for each trap id, repeated on every row of that id
dat[, trap_obs_count := .N, by = id]
|
||
##------------------------------------------------------------------------------
## REMOVE SOME VARIABLES WITH MISSING DATA FOR SIMPLICITY
##------------------------------------------------------------------------------

# noaa[ , WDF5 := NULL] ## Should impute later, this is wind speed
# noaa[ , WSF5 := NULL] ## Should impute later, this is wind speed

## Drop the WT01 / WT02 / WT03 / WT08 columns in a single by-reference update
noaa[, c("WT01", "WT02", "WT03", "WT08") := NULL]

NAsummary(noaa)
|
||
##------------------------------------------------------------------------------
## ADD Y VALUES
##------------------------------------------------------------------------------

## Row totals over every "*_TRUE" (positive) and "*_FALSE" (negative) column.
## Reduce(`+`, .SD) adds the columns element-wise: it is vectorized, keeps the
## column type, and avoids apply() coercing the data.table to a matrix.
dat[, total_true := Reduce(`+`, .SD),
    .SDcols = grep("_TRUE", colnames(dat))]
dat[, total_false := Reduce(`+`, .SD),
    .SDcols = grep("_FALSE", colnames(dat))]
dat[, total_mosquitoes := total_true + total_false]

## Share of positive mosquitoes, and a 0 / 1 flag for "any positive".
## Both are NA / NaN on a row where total_mosquitoes is 0.
dat[, pct_wnv := total_true / total_mosquitoes]
dat[, wnv := as.integer(pct_wnv != 0)]
|
||
##------------------------------------------------------------------------------
## CREATE VARIABLES BASED ON PREVIOUS PYTHON WORK AND BASED ON LAGGED VALUES
##------------------------------------------------------------------------------

## Sort by trap id and date so the shifts below run in time order per trap
setkey(dat, id, date, week)
NAsummary(dat)

## Demo of the shift function
# dat[ , date_prev1 := shift(as.character(date), -1), by = id]
# dat[ , list(id, date, date_prev1)]
# dat[ , date_prev1 := NULL, by = id]

## WNV result shifted by -1 and -2 within each trap id
## NOTE(review): per the demo above, shift(x, -1) presumably returns the
## previous observation -- confirm which shift() is on the search path.
dat[, wnvw1 := shift(wnv, -1), by = id]
dat[, wnvw2 := shift(wnv, -2), by = id]
## Future WNV for forecast testing
dat[, wnv_f1 := shift(wnv, 1), by = id]

## Running count of positive results per trap id and calendar year,
## shifted by -1
season_prev_summary <- dat[
    ,
    list(date,
         wnv_ytd = shift(cumsum(wnv), -1)),
    keyby = list(id, year = year(date))]
# season_prev_summary
# season_prev_summary[id=="id169"]

## Attach wnv_ytd to dat; the helper year column is left out of the join
dat <- merge(x = dat,
             y = season_prev_summary[, list(id, date, wnv_ytd)],
             by = c("id", "date"))
# split(dat, dat$id)
# dat[id=="id169"]

## First create total
## culx   = every species column
## pip    = pipiens only
## res    = restuans only
## pipres = pipiens + restuans (the mixed pipiens_restauns columns are not used)
## other  = other + territans
dat[, `:=`(
    culx = other_FALSE + other_TRUE + pipiens_FALSE + pipiens_TRUE +
        pipiens_restauns_FALSE + pipiens_restauns_TRUE + restuans_FALSE +
        restuans_TRUE + territans_FALSE + territans_TRUE,
    pip = pipiens_FALSE + pipiens_TRUE,
    res = restuans_FALSE + restuans_TRUE,
    pipres = pipiens_FALSE + pipiens_TRUE + restuans_FALSE + restuans_TRUE,
    other = other_FALSE + other_TRUE + territans_FALSE + territans_TRUE)]

## Then shift the totals (same -1 / -2 shifts as for wnv, per trap id)
dat[, culx1   := shift(culx,   -1), by = id]
dat[, culx2   := shift(culx,   -2), by = id]
dat[, pip1    := shift(pip,    -1), by = id]
dat[, pip2    := shift(pip,    -2), by = id]
dat[, res1    := shift(res,    -1), by = id]
dat[, res2    := shift(res,    -2), by = id]
dat[, pipres1 := shift(pipres, -1), by = id]
dat[, pipres2 := shift(pipres, -2), by = id]
dat[, other1  := shift(other,  -1), by = id]
dat[, other2  := shift(other,  -2), by = id]
|
||
## Calculate previous week values, then join them to the data
## Use all possible dates for flexibility
dates <- unique(dat$date)
noaa

## xx: one look-back window per collection date.
## NOTE(review): [date - 8, date - 1] is inclusive on both ends, i.e. 8 days
## rather than 7 -- confirm the intended window.
xx <- data.table(date = dates, start = dates - 8, end = dates - 1)
## yy: each NOAA day as a zero-length interval, keyed for foverlaps()
yy <- noaa[, list(AWND, PRCP, SNOW, SNWD, TMAX, TMIN, WDF2, WSF2),
           keyby = list(start = date, end = date)]
jj <- foverlaps(xx, yy)

## Mean of each weather variable over the window of each collection date.
## Each output column is listed exactly once; tmin used to appear twice,
## which produced two columns with the same name.
weather_summary <- jj[i = TRUE,
                      j = list(awnd = mean(AWND),
                               prcp = mean(PRCP),
                               snow = mean(SNOW),
                               snwd = mean(SNWD),
                               tmax = mean(TMAX),
                               tmin = mean(TMIN),
                               wdf2 = mean(WDF2),
                               wsf2 = mean(WSF2)),
                      keyby = list(date)]
weather_summary
rm(xx, yy, jj)

## Screen the predictors, then drop the snow variables
caret::nearZeroVar(weather_summary)
weather_summary <- weather_summary[, .SD, .SDcols = -c("snow", "snwd")]
caret::findLinearCombos(weather_summary[, list(tmin, tmax, awnd, prcp, wdf2, wsf2)])
cor(weather_summary[, list(tmin, tmax, awnd, prcp, wdf2, wsf2)])

## Attach the weather summary to every row with a matching collection date
dat <- merge(dat, weather_summary, "date")
|
||
|
||
## Diagnostics / plots
## Wrapped in if (FALSE) so it never runs with the script; run it by hand.
if (FALSE) {
    ## Positive count (pos) and row count (N) per calendar month
    msum <- dat[i = TRUE,
                j = list(pos = sum(wnv), .N),
                by = list(date = round(date, "month"),
                          month = month(date))]
    msum[, month := as.factor(month)]
    msum
    ## Long format: one row per month / measure. measure.vars is spelled out
    ## rather than relying on partial matching of `measure`.
    msum <- melt(msum, id.vars = "month", measure.vars = c("pos", "N"))
    msum <- msum[!(variable == "pos" & value == 0)]

    ## Dot plot then boxplot
    # ggplot(msum, aes(x=month, y = value, colour = variable)) + geom_point()
    # ggplot(msum, aes(month, value)) + geom_boxplot(aes(colour = variable))
    mmsum <- data.frame(msum[, list(mean = mean(value)), list(month, variable)])
    ggplot(msum, aes(month, value)) +
        geom_boxplot(aes(colour = variable), width = .5) +
        geom_line(aes(month, mean, colour = variable, group = variable),
                  data = mmsum, size = 2) +
        geom_point(aes(month, mean, colour = variable, group = variable),
                   data = mmsum, size = 2, colour = "black") +
        ggtitle(paste0("Citywide count of traps collected (BLUE) compared to\n",
                       "count of traps that were WNV positive (ORANGE)\n",
                       "2008 - 2016\n"))
}
|
||
##------------------------------------------------------------------------------
## SAVE RESULTS
##------------------------------------------------------------------------------

## Show the final table, write it to disk, and print a confirmation line
dat
saveRDS(object = dat, file = "data/21_full_wnv_data_aggregated.Rds")
cat("created 21_full_wnv_data_aggregated.Rds\n")
|
||
|
Oops, something went wrong.