initial commit

Chicago · Jun 27, 2017 · 2176eaf · 2176eaf
commit 2176eaf
Show file tree

Hide file tree

Showing 43 changed files with 3,312 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+emoji-cheat-sheet.com
diff --git a/R/10_calculate_idtable.R b/R/10_calculate_idtable.R
@@ -0,0 +1,89 @@
+
+
+##------------------------------------------------------------------------------
+## INITIALIZE
+##------------------------------------------------------------------------------
+
+library(geneorama)
+sourceDir("R/functions/", trace = FALSE)
+geneorama::loadinstall_libraries(c("sp", "rgdal"))
+
+##------------------------------------------------------------------------------
+## GET DATA
+##------------------------------------------------------------------------------
+## Refresh the WNV extract, then load the latest file
+refresh_wnv()
+wnv <- open_latest_wnv_file()
+
+## The Oracle trap table is downloaded with a platform-specific helper,
+## driver package and credential file.
+if (Sys.info()[["sysname"]] != "Linux") {
+    ## Windows / RODBC
+    geneorama::loadinstall_libraries("RODBC")
+    oracle_traps <- download_oracle_traps(
+        credential_file = "untracked/zdt_credentials.txt")
+} else {
+    ## Linux / ROracle
+    geneorama::loadinstall_libraries("ROracle")
+    oracle_traps <- download_oracle_traps_linux(
+        credential_file = "untracked/zdt_credentials_prod.txt")
+}
+
+##------------------------------------------------------------------------------
+## PROCESS ORACLE DATA
+##------------------------------------------------------------------------------
+
+## Filter out bad records, simplify & rename columns, and combine a few
+## records; the manual address repair list is passed to the filter helper
+address_fix_list <- fread("untracked/oracle_traps_address_repair_list.csv")
+oracle_traps <- filter_oracle_trap_xy(oracle_traps, address_fix_list)
+
+## Convert the OX / OY pairs to latitude / longitude and attach both
+## results in a single assignment
+oracle_latlon <- oracle_traps[ , stateplane2latlon(OX, OY)]
+oracle_traps[ , c("OLAT", "OLON") := list(oracle_latlon$latitude,
+                                         oracle_latlon$longitude)]
+rm(oracle_latlon)
+
+##------------------------------------------------------------------------------
+## PROCESS DATA PORTAL DATA
+##------------------------------------------------------------------------------
+
+## Replace the text CDC with GRAVID in trap_type so it matches oracle
+## (:= updates wnv by reference, so no reassignment is needed)
+wnv[ , trap_type := gsub("CDC", "GRAVID", trap_type)]
+
+## Convert the portal latitude / longitude and store the rounded x / y
+wnv_xy <- latlon2stateplane(wnv$latitude, wnv$longitude)
+wnv[ , c("X", "Y") := list(round(wnv_xy$x), round(wnv_xy$y))]
+rm(wnv_xy)
+
+##------------------------------------------------------------------------------
+## MERGE ORACLE TRAP DATA INTO DATA PORTAL DATA
+##------------------------------------------------------------------------------
+
+## season_year is one of the join keys; coerce it to integer first
+## (presumably to match the type of wnv$season_year -- confirm)
+oracle_traps[ , season_year := as.integer(season_year)]
+
+## Oracle data has detail for each year, so the year is part of the key
+wnv <- merge(x = wnv,
+             y = oracle_traps,
+             by = c("trap", "season_year", "trap_type"))
+
+##------------------------------------------------------------------------------
+## CALCULATE UNIQUE ID BASED ON TRAP, BLOCK, LAT/LON, AND TYPE
+##------------------------------------------------------------------------------
+## One row per distinct combination of the columns listed in keyby.
+## distance_difference is the straight-line distance between the portal
+## coordinates (X, Y) and the Oracle coordinates (OX, OY).  The group count
+## .N is only a by-product of the aggregation.
+id_table <- wnv[ , .N, keyby = list(trap, 
+                                    block, 
+                                    oracle_address = address,
+                                    latitude, longitude,  
+                                    X = round(X), 
+                                    Y = round(Y), 
+                                    OLAT, 
+                                    OLON, 
+                                    OX = round(OX), 
+                                    OY = round(OY), 
+                                    census_block,
+                                    trap_type,
+                                    distance_difference = sqrt((X-OX)^2 + (Y-OY)^2))]
+## Remove N, it's not needed
+id_table[ , N := NULL]
+
+## Add in id, which is in the format of "id###"
+## seq_len() is used instead of 1:nrow() so a zero-row table gets zero ids
+## rather than the two-element sequence c(1, 0)
+id_table[ , id := sprintf("id%0.3i", seq_len(nrow(id_table)))]
+
+##------------------------------------------------------------------------------
+## SAVE RESULTS AND ANNOUNCE FINISH FOR LOG
+##------------------------------------------------------------------------------
+## Persist the id table for the downstream scripts
+saveRDS(object = id_table,
+        file = "data/10_calculate_idtable.Rds")
+
+## Completion message for the log
+cat("file data/10_calculate_idtable.Rds has been created \n")
diff --git a/R/21_create_features.R b/R/21_create_features.R
@@ -0,0 +1,276 @@
+
+
+##------------------------------------------------------------------------------
+## INITIALIZE / IMPORT DATA
+##------------------------------------------------------------------------------
+
+library(geneorama)
+sourceDir("R/functions/", trace = FALSE)
+loadinstall_libraries(c("fishmethods", "ggplot2", "labeling", "caret", "sp"))
+
+## Run both refresh helpers before reading anything from data/
+refresh_noaa()
+refresh_wnv()
+
+## Weather values: every file under data/ whose name matches "noaa_values"
+noaa_files <- list.files(path = "data/",
+                         pattern = "noaa_values",
+                         full.names = TRUE)
+noaa <- open_noaa_files(noaa_files)
+
+# noaa_files <- list.files("data/", pattern="noaa_attr", full.names=TRUE)
+# noaa_attrs <- open_noaa_files(noaa_files)
+
+## Trap test results, the id table saved as data/10_calculate_idtable.Rds,
+## and the ward boundaries
+wnv_original <- open_latest_wnv_file()
+id_table <- readRDS(file = "data/10_calculate_idtable.Rds")
+
+ward_map <- readRDS(file = "data/BoundariesWards.Rds")
+
+##------------------------------------------------------------------------------
+## CALCULATE UNIQUE ID BASED ON TRAP, BLOCK, LAT/LON, AND TYPE
+##------------------------------------------------------------------------------
+
+## Merge in trap id created in previous script.
+## The portal's own latitude / longitude / trap_type columns are dropped and
+## replaced by trap_type and the Oracle-based OLAT / OLON / OX / OY values
+## carried in id_table.
+## NOTE(review): the join key is only (trap, block), while id_table is
+## built from a wider key; if it holds more than one row for a trap/block
+## pair this merge repeats the matching wnv rows -- confirm.
+wnv <- merge(x = wnv_original[i = TRUE,
+                              j = .SD,
+                              .SDcols = -c("latitude", "longitude", "trap_type")],
+             y = id_table[ , list(trap, trap_type, block, id,
+                                  latitude = OLAT, longitude = OLON,
+                                  X = OX, Y = OY, census_block)],
+             by = c("trap", "block"),
+             ## FALSE spelled out: `F` is an ordinary, reassignable variable
+             sort = FALSE)
+
+##------------------------------------------------------------------------------
+## FILTER / CLEAN UP SPECIES
+##------------------------------------------------------------------------------
+
+## Row counts per species and calendar year, before recoding
+dcast(wnv, species ~ year(date), fun.aggregate = length, value.var = "result")
+# clipper(dcast(wnv, species~year(date), fun.aggregate = length, value.var = "result"))
+
+## Species that are collapsed into the single level "other"
+other_species <- c("CULEX ERRATICUS",
+                   "CULEX SALINARIUS",
+                   "CULEX TARSALIS",
+                   "UNSPECIFIED CULEX")
+
+## spec starts as a copy of species and is then overwritten one level at a
+## time; any species not named below keeps its original label.
+## ("pipiens_restauns" is spelled this way on purpose: later column names
+## are built from it.)
+wnv[ , spec := species
+     ][species %in% other_species, spec := "other"
+     ][species == "CULEX PIPIENS", spec := "pipiens"
+     ][species == "CULEX PIPIENS/RESTUANS", spec := "pipiens_restauns"
+     ][species == "CULEX RESTUANS", spec := "restuans"
+     ][species == "CULEX TERRITANS", spec := "territans"]
+
+## Same tabulation on the recoded label
+dcast(wnv, spec ~ year(date), fun.aggregate = length, value.var = "result")
+# clipper(dcast(wnv, spec~year(date), fun.aggregate = length, value.var = "result"))
+
+
+# dcast(wnv[result==TRUE], species~year(date), fun.aggregate = length, value.var = "result")
+# dcast(wnv[result==FALSE], species~year(date), fun.aggregate = length, value.var = "result")
+# dcast(wnv, species~year(date), fun.aggregate = length, value.var = "result")
+
+##------------------------------------------------------------------------------
+## Fix dates 
+## "week" is a better measure of the date than the date field, because there are
+## often errors with multiple measures in a single week
+##------------------------------------------------------------------------------
+
+## The main collection date has shifted over time from Tuesday, to Friday, and
+## now it's Thursday
+## Diagnostic: row counts by season_year (rows) and day of week (columns)
+dcast(wnv[ , .N, list(season_year, wday=wday(date))], 
+      season_year ~ wday, value.var = "N")
+## Keep the reported date as date_orig; "date" is rebuilt from week below
+setnames(wnv, "date", "date_orig")
+## Synthetic date = round(date_orig, "year") plus week/52 of 365 days.
+## NOTE(review): assumes date_orig is a data.table IDate, for which
+## round(x, "year") returns the first day of that year -- confirm the class.
+## NOTE(review): a week value of 53 gives an offset above 365 days, i.e. a
+## date in the following calendar year -- confirm whether that can occur.
+wnv[ , date := as.IDate(round(date_orig, "year") + (week / 52) * 365)]
+## Explicit column order for wnv
+setcolorder(wnv, c('season_year', 'week', 'test_id', 'block', 'trap', 'trap_type', 
+                   'date_orig', 'number_of_mosquitoes', 'result', 'species', 
+                   'id', 'date', 'latitude', 'longitude', 'X', 'Y', 
+                   'census_block', 'spec'))
+## Diagnostic: distribution of the gap between synthetic and reported dates
+wnv[ , .N, keyby = list(date - date_orig)]
+
+##------------------------------------------------------------------------------
+## CONVERT DATA TO WIDE
+##------------------------------------------------------------------------------
+
+## One row per date / season_year / week / id / trap / location / trap_type.
+## The value columns are named <spec>_<result> (e.g. pipiens_TRUE) and hold
+## the summed number_of_mosquitoes; combinations with no records become 0.
+dat <- dcast(wnv, 
+             date + season_year + week + id + trap + latitude + longitude + 
+                 X + Y + census_block + trap_type ~ spec + result,
+             value.var = "number_of_mosquitoes",
+             ## TRUE spelled out: `T` is an ordinary, reassignable variable
+             fun.aggregate = sum, fill = 0, na.rm = TRUE)
+## Explicit column order: identifiers first, then the species / result counts
+setcolorder(dat, c('season_year', 'week', 'date', 'trap', 'id', 'trap_type', 
+                   'latitude', 'longitude', 'X', 'Y', 'census_block',
+                   'pipiens_restauns_TRUE', 'pipiens_restauns_FALSE', 
+                   'pipiens_TRUE', 'pipiens_FALSE',
+                   'restuans_TRUE', 'restuans_FALSE', 
+                   'territans_TRUE', 'territans_FALSE',
+                   'other_TRUE', 'other_FALSE'))
+
+# ## Add in row count from original WNV data
+# dat <- merge(dat, 
+#              wnv[ , .N, list(date, week, id, trap, trap_type)],
+#              by = c("date", "week", "id", "trap", "trap_type"))
+# dat
+
+# encoding_matrix <- get_encoding_list(dat)
+
+## Diagnostic: missing-value summary of the wide table
+NAsummary(dat)
+
+##------------------------------------------------------------------------------
+## GEOCODE WARD 
+##------------------------------------------------------------------------------
+
+## Manual process
+## Turn dat into a spatial points object using its longitude / latitude columns
+sp::coordinates(dat) <- c("longitude", "latitude")
+## Diagnostic: show the ward map's projection before it is transformed
+ward_map@proj4string
+## Re-project the ward polygons to WGS84 longitude / latitude
+ward_map <- sp::spTransform(ward_map, sp::CRS("+proj=longlat +datum=WGS84"))
+## Copy the map's projection onto the points (slot assignment, no transform)
+dat@proj4string <- ward_map@proj4string
+## Overlay the points on the ward map; the call is timed with system.time
+system.time(geo <- sp::over(dat, ward_map))
+## Back to a data.table before adding the ward column
+dat <- as.data.table(dat)
+## NOTE(review): if geo$ward is a factor, as.integer() returns the level
+## codes rather than the ward numbers -- confirm the column's type.
+dat[ , ward := as.integer(geo$ward)]
+
+## chigeocodR process
+# system.time(addrs <- dat[ , chigeocodR::reverseGeocode(lat = latitude, 
+#                                                        lon = longitude)])
+# system.time(wards <- chigeocodR::forwardGeocode(streetAddresses = addrs$address, 
+#                                                 geoTypes = "Ward"))
+# setnames(addrs, c("lat", "lon"), c("latitude", "longitude"))
+# wards <- as.data.table(wards)
+# dat <- merge(dat, addrs, c("latitude", "longitude"))
+# dat <- merge(dat, wards, c("address"))
+# setnames(dat, "geovalues.Ward", "ward")
+
+
+##------------------------------------------------------------------------------
+## TRAP OBSERVATION COUNT FOR CREDIBILITY
+## IN THE MODEL WE CAN EXCLUDE TRAPS WITH VERY LOW COUNTS
+##------------------------------------------------------------------------------
+
+## Number of rows in dat for each trap id, repeated on every row of that id
+dat[ , trap_obs_count := .N, by = id]
+
+##------------------------------------------------------------------------------
+## REMOVE SOME VARIABLES WITH MISSING DATA FOR SIMPLICITY
+##------------------------------------------------------------------------------
+
+# noaa[ , WDF5 := NULL]  ## Should impute later, this is wind speed
+# noaa[ , WSF5 := NULL]  ## Should impute later, this is wind speed
+## Drop the four WT columns in a single by-reference assignment
+noaa[ , c("WT01", "WT02", "WT03", "WT08") := NULL]
+
+## Diagnostic: missing-value summary of the remaining weather columns
+NAsummary(noaa)
+
+##------------------------------------------------------------------------------
+## ADD Y VALUES
+##------------------------------------------------------------------------------
+## Row totals over the *_TRUE and *_FALSE count columns.
+## Assigned with := and Reduce("+") rather than `dat$x <- apply(...)`:
+## apply() first coerces the selected columns to a matrix, and `$<-` does
+## not update the data.table by reference.
+dat[ , total_true := Reduce("+", .SD), .SDcols = grep("_TRUE", colnames(dat))]
+dat[ , total_false := Reduce("+", .SD), .SDcols = grep("_FALSE", colnames(dat))]
+dat[ , total_mosquitoes := total_true + total_false]
+
+## Share of the total in the *_TRUE columns, and a 0/1 flag for a non-zero
+## share.
+## NOTE(review): a row whose total is 0 gives NaN for pct_wnv and NA for
+## wnv -- confirm such rows cannot occur.
+dat[ , pct_wnv := total_true / total_mosquitoes]
+dat[ , wnv := as.integer(0 != pct_wnv)]
+
+##------------------------------------------------------------------------------
+## CREATE VARIABLES BASED ON PREVIOUS PYTHON WORK AND BASED ON LAGGED VALUES
+##------------------------------------------------------------------------------
+
+## Sort dat by trap id, then date, then week so that the row shifts below
+## move along each trap's time line
+setkey(dat, id, date, week)
+NAsummary(dat)
+
+## Demo of the shift function
+# dat[ , date_prev1 := shift(as.character(date), -1), by = id]
+# dat[ , list(id, date, date_prev1)]
+# dat[ , date_prev1 := NULL, by = id]
+
+## wnv shifted by -1 and -2 rows within each id
+## NOTE(review): the direction of shift(x, -1) depends on which `shift` is
+## on the search path.  data.table::shift treats a negative n (default
+## type = "lag") as a lead, i.e. the NEXT row, which would contradict the
+## "previous" naming used here; a shift() defined in geneorama or under
+## R/functions/ may behave the other way round -- confirm.
+dat[ , wnvw1 := shift(wnv, -1), by = id]
+dat[ , wnvw2 := shift(wnv, -2), by = id]
+## Future WNV for forecast testing
+dat[ , wnv_f1 := shift(wnv, 1), by = id]
+
+## Per id and calendar year: running total of wnv, shifted by -1 row
+## (the same shift-direction caveat applies)
+season_prev_summary <- dat[i = TRUE,
+                           j = list(date,
+                                    wnv_ytd = shift(cumsum(wnv), -1)),
+                           keyby = list(id, year = year(date))]
+# season_prev_summary
+# season_prev_summary[id=="id169"]
+
+## Attach wnv_ytd to dat by (id, date); the helper year column is dropped
+dat <- merge(dat,
+             season_prev_summary[,.SD,.SDcols=-"year"],
+             c("id", "date"))
+# split(dat, dat$id)
+# dat[id=="id169"]
+# split(dat, dat$id)
+# dat[id=="id169"]
+
+## First create total
+## culx: all ten species / result count columns
+dat[ , culx := other_FALSE + other_TRUE +
+         pipiens_FALSE + pipiens_TRUE +
+         pipiens_restauns_FALSE + pipiens_restauns_TRUE +
+         restuans_FALSE + restuans_TRUE +
+         territans_FALSE + territans_TRUE]
+## Per-group totals (positive plus negative counts)
+dat[ , pip := pipiens_FALSE + pipiens_TRUE]
+dat[ , res := restuans_FALSE + restuans_TRUE]
+## NOTE(review): pipres adds the pipiens and restuans columns only; the
+## pipiens_restauns_* columns are not part of it -- confirm this is intended.
+dat[ , pipres := pipiens_FALSE + pipiens_TRUE +
+         restuans_FALSE + restuans_TRUE]
+dat[ , other := other_FALSE + other_TRUE +
+         territans_FALSE + territans_TRUE]
+
+## Then shift the totals: for each total, the -1 and -2 row shifts within id
+dat[ , c("culx1", "culx2") := list(shift(culx, -1), shift(culx, -2)),
+     by = id]
+dat[ , c("pip1", "pip2") := list(shift(pip, -1), shift(pip, -2)),
+     by = id]
+dat[ , c("res1", "res2") := list(shift(res, -1), shift(res, -2)),
+     by = id]
+dat[ , c("pipres1", "pipres2") := list(shift(pipres, -1), shift(pipres, -2)),
+     by = id]
+dat[ , c("other1", "other2") := list(shift(other, -1), shift(other, -2)),
+     by = id]
+
+## Calculate previous week values, then join them to the data
+## Use all possible dates for flexibility
+dates <- unique(dat$date)
+noaa
+## Each collection date gets the look-back window [date - 8, date - 1]
+xx <- data.table(date = dates, start = dates - 8, end = dates - 1)
+## Every NOAA row becomes a zero-length interval [date, date]; keyby sets
+## the (start, end) key that foverlaps() needs on its second argument
+yy <- noaa[ , list(AWND, PRCP, SNOW, SNWD, TMAX, TMIN, WDF2, WSF2),
+            keyby = list(start = date, end = date)]
+jj <- foverlaps(xx, yy)
+## Window means per collection date.  tmin is computed exactly once: it was
+## previously listed twice, which produced two columns named "tmin" and
+## carried the duplicate name into the merge with dat below.
+## NOTE(review): mean() is called without na.rm, so one missing reading in
+## a window makes that summary NA -- confirm this is intended.
+weather_summary <- jj[i = TRUE,
+                      j = list(tmin = mean(TMIN),
+                               awnd = mean(AWND),
+                               prcp = mean(PRCP),
+                               snow = mean(SNOW),
+                               snwd = mean(SNWD),
+                               tmax = mean(TMAX),
+                               wdf2 = mean(WDF2),
+                               wsf2 = mean(WSF2)),
+                      keyby = list(date)]
+weather_summary
+rm(xx,yy,jj)
+## Diagnostic: near-zero-variance columns; snow and snwd are then dropped
+caret::nearZeroVar(weather_summary)
+weather_summary <- weather_summary[,.SD,.SDcols=-c("snow", "snwd")]
+## Diagnostics: linear dependencies and correlations among the kept columns
+caret::findLinearCombos(weather_summary[ , list(tmin, tmax, awnd, prcp, wdf2, wsf2)])
+cor(weather_summary[ , list(tmin, tmax, awnd, prcp, wdf2, wsf2)])
+## Attach the weather summaries to every row with the same date
+dat <- merge(dat, weather_summary, "date")
+
+
+## Diagnostics / plots
+## Disabled by default; change the condition (or run the body by hand) to
+## produce the monthly summary plot.
+if(FALSE){
+    ## Positive rows and total rows per (date rounded to month, month number)
+    ## TRUE spelled out: `T` is an ordinary, reassignable variable
+    msum <- dat[i = TRUE,
+                list(pos = sum(wnv), .N), 
+                list(date = round(date, "month"), 
+                     month = month(date))]
+    msum[ , month:=as.factor(month)]
+    msum
+    ## Long format: one row per month / measure
+    msum <- melt(msum, id.vars = "month", measure = c("pos", "N"))
+    ## Drop the rows where the positive count is zero
+    msum <- msum[!(variable=="pos" & value == 0)]
+
+    ## Dot plot then boxplot
+    # ggplot(msum, aes(x=month, y = value, colour = variable)) + geom_point()
+    # ggplot(msum, aes(month, value)) + geom_boxplot(aes(colour = variable))
+    ## Mean per month and measure, drawn as a line and points over the boxes
+    mmsum <- data.frame(msum[ , list(mean = mean(value)), list(month, variable)])
+    ggplot(msum, aes(month, value)) + 
+        geom_boxplot(aes(colour = variable), width = .5) +
+        geom_line(aes(month, mean, colour = variable, group = variable), 
+                  data= mmsum, size = 2) +
+        geom_point(aes(month, mean, colour = variable, group = variable), 
+                   data= mmsum, size = 2, colour = "black") +
+        ggtitle(paste0("Citywide count of traps collected (BLUE) compared to\n",
+                       "count of traps that were WNV positive (ORANGE)\n",
+                       "2008 - 2016\n"))
+}
+
+##------------------------------------------------------------------------------
+## SAVE RESULTS
+##------------------------------------------------------------------------------
+## Show the final table, write it to disk, and announce the file
+dat
+saveRDS(object = dat,
+        file = "data/21_full_wnv_data_aggregated.Rds")
+cat("created 21_full_wnv_data_aggregated.Rds\n")
+
+