Seasonal flow timeseries model
from paired flow and meteorological/weather record

ALR July 2014
Conte Anadromous Fish Research Center

Load libraries, functions, etc

library(lme4)
library(devtools)
library(knitr)
library(ggplot2)
library(plyr)

# Report/setup section: knitr output option, shared helper functions,
# project directories, and the season labels used to key every per-season
# model and loop below.

# Print chunk output without a comment prefix
opts_chunk$set(comment=NA)

#utility functions, load (source) from my saved gist
# (pinned to a specific sha1 so a changed gist is not sourced silently)
source_gist("https://gist.github.com/anarosner/ba285306fc0ce9d812a5", sha1="b25a1b73e02cc2b2d2c590f6c0b2c9c9945fa980")

# Machine-specific absolute paths: model code and processed input data
model_dir<-"C:/ALR/Models/MetToFlow"
model_data_dir<-"C:/ALR/Models_processed_data"

#load (source) this project's general functions
# The functions live in an Rmd file, so the code is extracted to a temporary
# .R script with purl(), sourced, and the temporary script is deleted again.
# NOTE(review): setwd() changes the working directory for the rest of the
# session; helpers such as non.zero() and goodness() used later are
# presumably defined by the gist or by this file - confirm.
setwd(file.path(model_dir,"A_get_data"))  
purl(input="0_general_functions.Rmd",output="0_general_functions.R",documentation=0)
source("0_general_functions.R")
file.remove("0_general_functions.R")

# Season labels; must match the values of the `season` column in the data
season.names<-c("winter","spring","summer","fall")

Load seasonal data, and look at number of records, sites, and years

# Read the saved seasonal data; the file is expected to provide `dseasonal`
load(file = file.path(model_data_dir, "flow_timeseries", "dseasonal.Rdata"))

# Keep only records that have a value for every field the models rely on
dseasonal <- dseasonal[
     complete.cases(dseasonal[, c("HydrologicGroupAB", "flow", "rain", "precip.e.lag2")]),
]


# Exploratory counts: how many gages (unique site_no) and how many records
# remain under each candidate definition of an "unregulated" site.
# The [1] lines are the console output recorded when the report was rendered.

# length(unique(dseasonal$site_no))
# #238 unique gages
# 
# gages with no large barriers
length(unique(dseasonal$site_no[dseasonal$large_barriers==0]))

[1] 120

# gages with no dams at all in the TNC inventory
length(unique(dseasonal$site_no[dseasonal$TNC_DamCount==0]))

[1] 97

# gages with < 0.5 sq km of on-channel open water
length(unique(dseasonal$site_no[dseasonal$OnChannelWaterSqKM<.5]))

[1] 171

# gages with < 0.5 sq km on-channel water AND no large barriers
length(unique(dseasonal$site_no[dseasonal$OnChannelWaterSqKM<.5 & dseasonal$large_barriers==0]))

[1] 110

# gages with < 0.5 sq km on-channel water AND no dams at all
length(unique(dseasonal$site_no[dseasonal$OnChannelWaterSqKM<.5 & dseasonal$TNC_DamCount==0]))

[1] 89

# summary of the first two counts above:
#120 with no LARGE barriers
#97 with no  barriers 


# 7870 total records
nrow(dseasonal)

[1] 7870

# 4140 records from sites w/ no LARGE barriers
nrow(dseasonal[dseasonal$large_barriers==0,])

[1] 4140

# 6226 records from sites w/ <.5 sq km impoundments
nrow(dseasonal[dseasonal$OnChannelWaterSqKM<.5,])

[1] 6226

# 3879 records from sites w/ no LARGE barriers and <.5 sq km impoundments
nrow(dseasonal[dseasonal$OnChannelWaterSqKM<.5 & dseasonal$large_barriers==0,])

[1] 3879

# 3283 records from sites w/ no barriers
nrow(dseasonal[dseasonal$TNC_DamCount==0,])

[1] 3283

# 3114 records from sites w/ no barriers and <.5 sq km impoundments
# (this is the definition used for the unregulated subset d.unreg)
nrow(dseasonal[dseasonal$OnChannelWaterSqKM<.5 & dseasonal$TNC_DamCount==0,])

[1] 3114

# Unregulated gages only. A record qualifies when
#   - the TNC barrier inventory lists zero dams, and
#   - upstream on-channel open water covers less than 0.5 sq km
d.unreg <- subset(dseasonal, TNC_DamCount == 0 & OnChannelWaterSqKM < 0.5)

Create calibration and validation sets

# View(d.unreg[,c(1:9,58,59,21:23)])

# Validation design:
#   1. hold out a handful of gages entirely, and
#   2. hold out a handful of years across all gages,
# so that the combined hold-out is roughly 10-15% of all records.


# Gages: draw 8 from those with qseasonal > 12
# NOTE(review): an earlier note described this as "more than 4 yrs records";
# qseasonal is presumably the number of seasonal records per gage - confirm
# which threshold is intended.
set.seed(933550175)
val.gages <- sample(
     x = with(d.unreg, unique(site_no[qseasonal > 12])),
     size = 8,
     replace = FALSE
)
# val.gages

# Years: draw 5 from all years present in the unregulated set
set.seed(993889335)
val.year <- sample(x = unique(d.unreg$year), size = 5, replace = FALSE)
sort(val.year)

[1] 1954 1956 1983 1999 2006

# Split the unregulated records into calibration and validation sets and
# report the size of each piece. A record is a validation record when it
# belongs to a held-out gage OR falls in a held-out year.
# The [1] lines are the console output recorded when the report was rendered.
# NOTE(review): val.year was sampled from the `year` column, but membership
# here is tested with year(as.Date(date)); confirm the two always agree.
# year() is not provided by a package loaded in this file - presumably it
# comes from the sourced helpers; verify.

#check number of records chosen as part of validation gages and validation years
nrow(d.unreg) #3114

[1] 3114

# records from gages eligible to be validation gages
nrow(d.unreg[d.unreg$qseasonal>12,]) #2870

[1] 2870

# records at the held-out gages
nrow(subset(d.unreg,site_no %in% val.gages)) #291

[1] 291

# records in the held-out years
nrow(subset(d.unreg,year(as.Date(date)) %in% val.year)) #212

[1] 212

# overlap: held-out gage AND held-out year (counted once in d.valid)
nrow(subset(d.unreg, site_no %in% val.gages & year(as.Date(date)) %in% val.year )) #22

[1] 22

#subset and count size of calib and valid data sets
# calibration = neither a held-out gage nor a held-out year
d.calib<-subset(d.unreg,!(site_no %in% val.gages | year(as.Date(date)) %in% val.year))
nrow(d.calib) #2633

[1] 2633

# validation = exact complement of the calibration condition
d.valid<-subset(d.unreg, site_no %in% val.gages | year(as.Date(date)) %in% val.year)
nrow(d.valid) #481

[1] 481

# validation share of all unregulated records, in percent
nrow(d.valid)/nrow(d.unreg)*100 #15.44637    approx 15% of total records

[1] 15.45

[1] 15.45

# Write both partitions into a single .Rdata file in the calibration-data folder
setwd(file.path(model_dir, "c_seasonal_means", "calib_data"))
save(list = c("d.calib", "d.valid"), file = "calib_valid.Rdata")

Create fixed-effects model

# Base fixed-effects model, fitted to all calibration records (all seasons).
# Response and predictors are log-transformed; the commented-out terms are
# candidate predictors that were tried and left out. This object only serves
# as the template that the per-season models are derived from via update().
# non.zero() is defined outside this section (presumably it guards against
# log(0) - confirm).
m.fixed<-lm(log(flow) ~ 
               log(drain_area_va) +
               log(non.zero(DrainageClass)) + 
                    log(non.zero(PercentSandy))+
#                     log(non.zero(HydrologicGroupAB))+ log(non.zero(SurficialCoarseC)) + 
               log(non.zero(Forest)) + 
               log(non.zero(Agriculture)) +     
#                     log(non.zero(Impervious))+
               log(non.zero(OffChannelWaterSqKM)) + 
#                     log(non.zero(OffChannelWetlandSqKM))+
#                     log(non.zero(OffChannelWaterSqKM+OffChannelWetlandSqKM)) + 
#                log(non.zero(BasinSlopePCNT))+ 
                    log(non.zero(ReachSlopePCNT)) +               
               log(non.zero(precip.e)) +  log(non.zero(precip.e.lag1)) + 
#                ((precip.e)) +  ((precip.e.lag1)) + 
#                log(non.zero(precip.e.lag2)) +
               1,
          na.action=NULL, #to ensure we attached fitted values w/ correct site_no
         data=d.calib) 

#create model for each season
# Each season is built in two update() steps: first refit the base model on
# that season's records only (subset=), then add/drop terms so the formula
# is tailored to the season. Results are stored in a list keyed by season name.
m.fixed.season<-list()
# looping didn't work!  model would change when called later on depending on *current* value of i... oh dear
# (presumably because the stored call keeps `i` unevaluated, so a later
#  update() re-reads whatever i is then; hence one explicit block per season)

# winter: drops drainage class, sandy soils and forest, adds impervious
# cover, and uses untransformed precipitation instead of log precipitation
m.fixed.season[["winter"]]<-update(m.fixed, subset=season=="winter")
m.fixed.season$winter<-update(m.fixed.season$winter,formula=.~.
                              -log(non.zero(DrainageClass))
                              -log(non.zero(PercentSandy))
                              +log(non.zero(Impervious))
                              -log(non.zero(Forest))
                              -log(non.zero(precip.e)) -log(non.zero(precip.e.lag1))  
                              +precip.e +  precip.e.lag1 
#                               -log(non.zero(ReachSlopePCNT))
#                               + log(AnnualTminC+20)+ log(non.zero(frozen))+frozen+log(non.zero(gdd)) +gdd
                              )


# spring: drops drainage class and off-channel water, adds gdd and pet
m.fixed.season[["spring"]]<-update(m.fixed, subset=season=="spring")
m.fixed.season$spring<-update(m.fixed.season$spring,formula=.~.
                              -log(non.zero(DrainageClass))
                              -log(non.zero(OffChannelWaterSqKM))
#                               -log(non.zero(ReachSlopePCNT))
#                               +gdd
                              +log(non.zero(gdd))
                              +log(pet)
                              )
                              

# summer: adds the second precipitation lag, gdd and pet; drops sandy soils
# and off-channel water
m.fixed.season[["summer"]]<-update(m.fixed, subset=season=="summer")
m.fixed.season$summer<-update(m.fixed.season$summer,formula=.~.
                              +log(non.zero(precip.e.lag2))
                              - log(non.zero(PercentSandy)) 
                              -log(non.zero(OffChannelWaterSqKM)) 
                              +log(non.zero(gdd))
                              +log(pet)
#                               -log(non.zero(ReachSlopePCNT))  +log(non.zero(Impervious))-log(non.zero(Forest))
                              )


# fall: drops drainage class and reach slope, adds gdd and pet
m.fixed.season[["fall"]]<-update(m.fixed, subset=season=="fall")
m.fixed.season$fall<-update(m.fixed.season$fall,formula=.~.
                            -log(non.zero(DrainageClass))
                            -log(non.zero(ReachSlopePCNT))  
                              +log(non.zero(gdd))
                              +log(pet) 
                            )

# the all-season template is no longer needed once the seasonal fits exist
rm(m.fixed)



# summaries of fixed models
# For every season print a banner, the regression summary, and the variance
# inflation factors used to check the predictors for collinearity.
for (i in season.names) {
     print(paste("#####",i,"#####"))
     print(summary(m.fixed.season[[i]]))
     # Values are not auto-printed inside a for loop, so without an explicit
     # print() the VIFs were computed and silently discarded.
     # vif() is not provided by a package loaded in this file - presumably it
     # comes from the sourced helpers; verify.
     print(vif(m.fixed.season[[i]]))
}

[1] "##### winter #####"

Call:
lm(formula = log(flow) ~ log(drain_area_va) + log(non.zero(Agriculture)) + 
    log(non.zero(OffChannelWaterSqKM)) + log(non.zero(ReachSlopePCNT)) + 
    log(non.zero(Impervious)) + precip.e + precip.e.lag1, data = d.calib, 
    subset = season == "winter", na.action = NULL)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8802 -0.1965  0.0269  0.2421  2.1654 

Coefficients:
                                    Estimate Std. Error t value Pr(>|t|)
(Intercept)                        -0.591345   0.093548   -6.32  4.8e-10
log(drain_area_va)                  0.971027   0.021614   44.93  < 2e-16
log(non.zero(Agriculture))         -0.012747   0.004758   -2.68  0.00757
log(non.zero(OffChannelWaterSqKM))  0.020969   0.006269    3.35  0.00087
log(non.zero(ReachSlopePCNT))       0.075527   0.018521    4.08  5.1e-05
log(non.zero(Impervious))          -0.026342   0.008213   -3.21  0.00140
precip.e                            0.003552   0.000164   21.64  < 2e-16
precip.e.lag1                       0.001649   0.000180    9.16  < 2e-16
                                      
(Intercept)                        ***
log(drain_area_va)                 ***
log(non.zero(Agriculture))         ** 
log(non.zero(OffChannelWaterSqKM)) ***
log(non.zero(ReachSlopePCNT))      ***
log(non.zero(Impervious))          ** 
precip.e                           ***
precip.e.lag1                      ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.427 on 657 degrees of freedom
Multiple R-squared:  0.824,	Adjusted R-squared:  0.822 
F-statistic:  440 on 7 and 657 DF,  p-value: <2e-16

[1] "##### spring #####"

Call:
lm(formula = log(flow) ~ log(drain_area_va) + log(non.zero(PercentSandy)) + 
    log(non.zero(Forest)) + log(non.zero(Agriculture)) + log(non.zero(ReachSlopePCNT)) + 
    log(non.zero(precip.e)) + log(non.zero(precip.e.lag1)) + 
    log(non.zero(gdd)) + log(pet), data = d.calib, subset = season == 
    "spring", na.action = NULL)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.4318 -0.1673  0.0308  0.1936  1.5957 

Coefficients:
                              Estimate Std. Error t value Pr(>|t|)    
(Intercept)                   -6.24571    0.53736  -11.62  < 2e-16 ***
log(drain_area_va)             0.95985    0.01490   64.42  < 2e-16 ***
log(non.zero(PercentSandy))   -0.00639    0.00200   -3.19  0.00147 ** 
log(non.zero(Forest))          0.25214    0.04075    6.19  1.1e-09 ***
log(non.zero(Agriculture))    -0.01123    0.00299   -3.75  0.00019 ***
log(non.zero(ReachSlopePCNT))  0.03012    0.01756    1.72  0.08680 .  
log(non.zero(precip.e))        0.66673    0.03923   17.00  < 2e-16 ***
log(non.zero(precip.e.lag1))   0.11878    0.01866    6.37  3.6e-10 ***
log(non.zero(gdd))             0.13374    0.03008    4.45  1.0e-05 ***
log(pet)                      -0.96215    0.24985   -3.85  0.00013 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.311 on 660 degrees of freedom
Multiple R-squared:  0.885,	Adjusted R-squared:  0.884 
F-statistic:  566 on 9 and 660 DF,  p-value: <2e-16

[1] "##### summer #####"

Call:
lm(formula = log(flow) ~ log(drain_area_va) + log(non.zero(DrainageClass)) + 
    log(non.zero(Forest)) + log(non.zero(Agriculture)) + log(non.zero(ReachSlopePCNT)) + 
    log(non.zero(precip.e)) + log(non.zero(precip.e.lag1)) + 
    log(non.zero(precip.e.lag2)) + log(non.zero(gdd)) + log(pet), 
    data = d.calib, subset = season == "summer", na.action = NULL)

Residuals:
   Min     1Q Median     3Q    Max 
-1.873 -0.337  0.037  0.361  1.740 

Coefficients:
                               Estimate Std. Error t value Pr(>|t|)    
(Intercept)                   -28.14305    1.66340  -16.92  < 2e-16 ***
log(drain_area_va)              0.94399    0.02671   35.34  < 2e-16 ***
log(non.zero(DrainageClass))   -0.55772    0.11058   -5.04  5.9e-07 ***
log(non.zero(Forest))           0.24277    0.06676    3.64  0.00030 ***
log(non.zero(Agriculture))     -0.04225    0.00494   -8.55  < 2e-16 ***
log(non.zero(ReachSlopePCNT))  -0.10694    0.03075   -3.48  0.00054 ***
log(non.zero(precip.e))         1.38931    0.07721   17.99  < 2e-16 ***
log(non.zero(precip.e.lag1))    0.36951    0.06832    5.41  8.9e-08 ***
log(non.zero(precip.e.lag2))    0.03773    0.03617    1.04  0.29735    
log(non.zero(gdd))              1.84067    0.17853   10.31  < 2e-16 ***
log(pet)                       -7.01483    0.75970   -9.23  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.534 on 647 degrees of freedom
Multiple R-squared:  0.778,	Adjusted R-squared:  0.774 
F-statistic:  226 on 10 and 647 DF,  p-value: <2e-16

[1] "##### fall #####"

Call:
lm(formula = log(flow) ~ log(drain_area_va) + log(non.zero(PercentSandy)) + 
    log(non.zero(Forest)) + log(non.zero(Agriculture)) + log(non.zero(OffChannelWaterSqKM)) + 
    log(non.zero(precip.e)) + log(non.zero(precip.e.lag1)) + 
    log(non.zero(gdd)) + log(pet), data = d.calib, subset = season == 
    "fall", na.action = NULL)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.7668 -0.2519  0.0383  0.3090  2.6068 

Coefficients:
                                    Estimate Std. Error t value Pr(>|t|)
(Intercept)                        -17.09324    1.36631  -12.51  < 2e-16
log(drain_area_va)                   0.93750    0.02784   33.67  < 2e-16
log(non.zero(PercentSandy))          0.01217    0.00352    3.46  0.00058
log(non.zero(Forest))                0.10261    0.04462    2.30  0.02179
log(non.zero(Agriculture))          -0.04540    0.00551   -8.24    1e-15
log(non.zero(OffChannelWaterSqKM))  -0.01687    0.00731   -2.31  0.02139
log(non.zero(precip.e))              1.73004    0.06640   26.06  < 2e-16
log(non.zero(precip.e.lag1))         0.72811    0.07212   10.10  < 2e-16
log(non.zero(gdd))                   0.17523    0.09065    1.93  0.05368
log(pet)                            -0.88661    0.41523   -2.14  0.03313
                                      
(Intercept)                        ***
log(drain_area_va)                 ***
log(non.zero(PercentSandy))        ***
log(non.zero(Forest))              *  
log(non.zero(Agriculture))         ***
log(non.zero(OffChannelWaterSqKM)) *  
log(non.zero(precip.e))            ***
log(non.zero(precip.e.lag1))       ***
log(non.zero(gdd))                 .  
log(pet)                           *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.515 on 630 degrees of freedom
Multiple R-squared:  0.808,	Adjusted R-squared:  0.806 
F-statistic:  295 on 9 and 630 DF,  p-value: <2e-16

# Manual record of model diagnostics per season: first number is the adjusted
# R-squared of the current fixed model (matches the summaries above), then the
# highest VIF, then values from earlier model versions for comparison.
# The assignments only set the loop variable by hand (presumably to step
# through one season interactively); after this chunk i is "fall".
# # all DID...have adj-r-sqr around 0.8976 with old calibration set, but that set was too small....
i<-"winter" #0.8224     highest vif 3.00                                    previous 0.827     0.8321 v 0.8249
i<-"spring" #0.8838     highest vif 4.04       without gdd & pet 0.8776     previous0.8929 v .8697
i<-"summer" #0.7741     highest vif 5.12       without gdd & pet 0.7356     previous 0.7553 v 0.715
i<-"fall" #0.8056       highest vif 6.03       without gdd & pet 0.7833     previous0.7985 v 0.7827


### next addition: add avg temp, and total number of days <0C
#   might help account for runoff when ground is frozen in winter,
#   and account for ET in summer and fall (when r-sqr values are lower)

Create mixed model: random effects for year and site

# Base mixed-effects model: same fixed-effect terms as the base lm() model,
# plus random intercepts for date and for site (site_no). Fitted to all
# calibration records; used only as the template for the per-season models.
# NOTE(review): the section title speaks of a random effect for year; within
# one season the records shown carry one date per year, so (1|date) presumably
# acts as a year effect - confirm.
m.mixed<-lmer(log(flow) ~ 
               log(drain_area_va) +
               log(non.zero(DrainageClass)) + 
               log(non.zero(PercentSandy))+
#                     log(non.zero(HydrologicGroupAB))+ log(non.zero(SurficialCoarseC)) + 
               log(non.zero(Forest)) + 
               log(non.zero(Agriculture)) +     
#                     log(non.zero(Impervious))+
               log(non.zero(OffChannelWaterSqKM)) + 
#                     log(non.zero(OffChannelWetlandSqKM))+
#                     log(non.zero(OffChannelWaterSqKM+OffChannelWetlandSqKM)) + 
#                log(non.zero(BasinSlopePCNT))+ 
               log(non.zero(ReachSlopePCNT)) +
               log(non.zero(precip.e)) +  log(non.zero(precip.e.lag1)) + 
               (1|date)+(1|site_no),
          na.action=NULL, #to ensure we attached fitted values w/ correct site_no
          data=d.calib)

# Per-season mixed models, stored in a list keyed by season name.
# Built the same way as the fixed models: refit the base model on one
# season's records, then adjust the fixed-effect terms for that season.
m.mixed.season<-list()

# Earlier per-season specification, kept for reference (not run):
# m.mixed.season[["winter"]]<-update(m.mixed, subset=season=="winter")
# m.mixed.season$winter<-update(m.mixed.season$winter,formula=.~.
#                               -log(non.zero(DrainageClass))
#                               -log(non.zero(PercentSandy)) )
# 
# 
# m.mixed.season[["spring"]]<-update(m.mixed, subset=season=="spring")
# m.mixed.season$spring<-update(m.mixed.season$spring,formula=.~.
#                               -log(non.zero(DrainageClass))
#                               -log(non.zero(ReachSlopePCNT))
#                               )
#                               
# 
# m.mixed.season[["summer"]]<-update(m.mixed, subset=season=="summer")
# m.mixed.season$summer<-update(m.mixed.season$summer,formula=.~.
#                               - log(non.zero(PercentSandy)) 
#                               -log(non.zero(OffChannelWaterSqKM)) 
#                               +log(non.zero(Impervious))
#                               -log(non.zero(Forest))
#                               )
# 
# 
# m.mixed.season[["fall"]]<-update(m.mixed, subset=season=="fall")
# m.mixed.season$fall<-update(m.mixed.season$fall,formula=.~.
#                             -log(non.zero(DrainageClass))
#                             -log(non.zero(ReachSlopePCNT))   
#                             )

# Current specification: the term changes per season are the same ones
# applied to the fixed-effects seasonal models.

# winter: untransformed precipitation, impervious cover added
m.mixed.season[["winter"]]<-update(m.mixed, subset=season=="winter")
m.mixed.season$winter<-update(m.mixed.season$winter,formula=.~.
                              -log(non.zero(DrainageClass))
                              -log(non.zero(PercentSandy))
                              +log(non.zero(Impervious))
                              -log(non.zero(Forest))
                              -log(non.zero(precip.e)) -log(non.zero(precip.e.lag1))  
                              +precip.e +  precip.e.lag1 
#                               -log(non.zero(ReachSlopePCNT))
#                               + log(AnnualTminC+20)+ log(non.zero(frozen))+frozen+log(non.zero(gdd)) +gdd
                              )


# spring: drops drainage class and off-channel water, adds gdd and pet
m.mixed.season[["spring"]]<-update(m.mixed, subset=season=="spring")
m.mixed.season$spring<-update(m.mixed.season$spring,formula=.~.
                              -log(non.zero(DrainageClass))
                              -log(non.zero(OffChannelWaterSqKM))
#                               -log(non.zero(ReachSlopePCNT))
#                               +gdd
                              +log(non.zero(gdd))
                              +log(pet)
                              )
                              

# summer: adds second precipitation lag, gdd and pet; drops sandy soils and
# off-channel water
m.mixed.season[["summer"]]<-update(m.mixed, subset=season=="summer")
m.mixed.season$summer<-update(m.mixed.season$summer,formula=.~.
                              +log(non.zero(precip.e.lag2))
                              - log(non.zero(PercentSandy)) 
                              -log(non.zero(OffChannelWaterSqKM)) 
                              +log(non.zero(gdd))
                              +log(pet)
#                               -log(non.zero(ReachSlopePCNT))  +log(non.zero(Impervious))-log(non.zero(Forest))
                              )


# fall: drops drainage class and reach slope, adds gdd and pet
m.mixed.season[["fall"]]<-update(m.mixed, subset=season=="fall")
m.mixed.season$fall<-update(m.mixed.season$fall,formula=.~.
                            -log(non.zero(DrainageClass))
                            -log(non.zero(ReachSlopePCNT))  
                              +log(non.zero(gdd))
                              +log(pet) 
                            )
# the all-season template is no longer needed once the seasonal fits exist
rm(m.mixed)

Calculate goodness of fit for calibration data set

#create pred df
# Calibration records plus the observed log(flow) and one (initially empty)
# prediction column per model type.
pred.calib<-d.calib
pred.calib$obs<-log(pred.calib$flow)
pred.calib$pred.fixed<-NA
pred.calib$pred.mixed<-NA

# Start from empty accumulators. Assigning NULL resets any tables left over
# from an earlier run without the warning rm() gives when they do not exist.
goodness.fixed.calib<-NULL
goodness.mixed.calib<-NULL

#predict and calc goodness of fit metrics of fixed and mixed models, calibration set
# Result: goodness.fixed.calib / goodness.mixed.calib, one row per season,
# with `season` as the first column followed by the metrics from goodness().
# Assumes goodness() (defined outside this section) returns a one-row data
# frame - confirm.
for (i in season.names) {
     in.season<-pred.calib$season==i

     pred.calib[in.season,"pred.fixed"]<-
          predict(m.fixed.season[[i]],
                  newdata=pred.calib[in.season,])
     # allow.new.levels lets predict() handle grouping levels the model has
     # not seen; kept for symmetry with the validation run
     pred.calib[in.season,"pred.mixed"]<-
          predict(m.mixed.season[[i]],
                  newdata=pred.calib[in.season,],
                  allow.new.levels=TRUE)
     print(i)
     # columns selected by name rather than position, so the preview keeps
     # working if columns are added to or reordered in the data
     print(head(pred.calib[in.season,c("site_no","date","obs","pred.fixed","pred.mixed")]))

     goodness.fixed.calib<-rbind(goodness.fixed.calib,
          data.frame(season=i,
                     goodness(pred.calib[in.season,c("obs","pred.fixed")]),
                     stringsAsFactors=FALSE))

     goodness.mixed.calib<-rbind(goodness.mixed.calib,
          data.frame(season=i,
                     goodness(pred.calib[in.season,c("obs","pred.mixed")]),
                     stringsAsFactors=FALSE))
}

[1] "winter"
    site_no       date     obs pred.fixed pred.mixed
6  01049221 1978-02-28  2.9619     2.7203    2.69928
10 01049221 1979-02-28  1.9881     2.6532    2.40122
13 01049300 1964-02-28  0.4580     0.3985    0.17880
17 01049300 1965-02-28  0.2263     0.2801   -0.04823
21 01049300 1966-02-28 -0.8406     0.3343   -0.27860
25 01049300 1967-02-28 -0.3188     0.2511   -0.16082
[1] "spring"
    site_no       date    obs pred.fixed pred.mixed
7  01049221 1978-05-28 3.2361     3.6209     3.3658
11 01049221 1979-05-28 3.6801     3.8252     3.6457
14 01049300 1964-05-28 1.0799     0.8626     1.0433
18 01049300 1965-05-28 0.9475     0.6512     0.7077
22 01049300 1966-05-28 1.2241     0.9721     1.1564
26 01049300 1967-05-28 1.3933     1.0629     1.4427
[1] "summer"
    site_no       date     obs pred.fixed pred.mixed
8  01049221 1978-08-28  1.4648     0.9289    1.52273
12 01049221 1979-08-28  1.9010     1.1459    1.58876
15 01049300 1964-08-28 -1.0920    -0.8118   -0.97803
19 01049300 1965-08-28 -1.6607    -1.6080   -1.42808
23 01049300 1966-08-28 -1.1445    -1.2227   -1.23390
27 01049300 1967-08-28 -0.1766    -0.1509    0.01603
[1] "fall"
    site_no       date     obs pred.fixed pred.mixed
9  01049221 1978-11-28 -2.4327     0.3341  -1.346726
16 01049300 1964-11-28 -0.2044    -0.7673  -0.491215
20 01049300 1965-11-28 -0.5933    -0.9947  -0.737250
24 01049300 1966-11-28  0.3026    -0.1065   0.008043
28 01049300 1967-11-28 -0.3520    -0.6402  -0.200381
32 01049300 1968-11-28 -0.4775    -1.1692  -0.861738

Calculate goodness of fit for validation data set

#create pred df
# Validation records plus the observed log(flow) and one (initially empty)
# prediction column per model type.
pred.valid<-d.valid
pred.valid$obs<-log(pred.valid$flow)
pred.valid$pred.fixed<-NA
pred.valid$pred.mixed<-NA

# Start from empty accumulators. Assigning NULL resets any tables left over
# from an earlier run without the warning rm() gives when they do not exist.
goodness.fixed.valid<-NULL
goodness.mixed.valid<-NULL

#predict and calc goodness of fit metrics of fixed and mixed models, validation set
# Result: goodness.fixed.valid / goodness.mixed.valid, one row per season,
# with `season` as the first column followed by the metrics from goodness().
# Assumes goodness() (defined outside this section) returns a one-row data
# frame - confirm.
for (i in season.names) {
     in.season<-pred.valid$season==i

     pred.valid[in.season,"pred.fixed"]<-
          predict(m.fixed.season[[i]],
                  newdata=pred.valid[in.season,])
     # held-out gages and years are grouping levels the mixed model never saw,
     # so new levels must be allowed when predicting
     pred.valid[in.season,"pred.mixed"]<-
          predict(m.mixed.season[[i]],
                  newdata=pred.valid[in.season,],
                  allow.new.levels=TRUE)
     print(i)
     # columns selected by name rather than position, so the preview keeps
     # working if columns are added to or reordered in the data
     print(head(pred.valid[in.season,c("site_no","date","obs","pred.fixed","pred.mixed")]))

     goodness.fixed.valid<-rbind(goodness.fixed.valid,
          data.frame(season=i,
                     goodness(pred.valid[in.season,c("obs","pred.fixed")]),
                     stringsAsFactors=FALSE))

     goodness.mixed.valid<-rbind(goodness.mixed.valid,
          data.frame(season=i,
                     goodness(pred.valid[in.season,c("obs","pred.mixed")]),
                     stringsAsFactors=FALSE))
}

[1] "winter"
     site_no       date    obs pred.fixed pred.mixed
89  01049300 1983-02-28 0.4487  -0.005257   -0.02617
112 01049396 1983-02-28 3.0626   2.683441    2.73753
248 01063310 1999-02-28 0.6748  -0.012671    0.21805
276 01063310 2006-02-28 1.2215   1.099568    1.05249
538 01064400 1983-02-28 2.3438   1.586063    1.73208
701 01073000 1954-02-28 3.0298   3.001064    2.89553
[1] "spring"
     site_no       date   obs pred.fixed pred.mixed
90  01049300 1983-05-28 1.817      1.615      1.804
113 01049396 1983-05-28 4.394      3.997      4.136
249 01063310 1999-05-28 1.413      1.102      1.361
277 01063310 2006-05-28 1.078      1.121      1.339
539 01064400 1983-05-28 3.321      3.211      3.274
702 01073000 1954-05-28 4.082      3.920      4.090
[1] "summer"
     site_no       date      obs pred.fixed pred.mixed
91  01049300 1983-08-28 -0.09347    -0.4877    -0.4675
114 01049396 1983-08-28  2.16918     1.5965     1.5707
250 01063310 1999-08-28 -1.16176    -0.7503    -0.4526
278 01063310 2006-08-28  1.01271     0.5530     0.7645
540 01064400 1983-08-28  1.43664     1.2508     1.2546
703 01073000 1954-08-28  2.11206     2.1995     2.1029
[1] "fall"
     site_no       date     obs pred.fixed pred.mixed
251 01063310 1999-11-28  0.9863    0.59900     0.5254
279 01063310 2006-11-28  0.9391    0.93637     0.9527
297 01063452 1999-11-28 -0.5893   -0.05364    -0.1743
541 01064400 1983-11-28  2.2004    1.98137     2.0546
704 01073000 1954-11-28  3.5397    2.90088     2.7322
712 01073000 1956-11-28  1.6046    1.68829     1.5126

Goodness of fit statistics

# fixed-effects models, calibration records (one row per season)
goodness.fixed.calib

  season sample.n  mean  RMSE NSEff bias percent.bias pearsonR CV.error
1 winter      665 2.001 0.300 0.824    0            0    0.908    0.212
2 spring      670 2.608 0.218 0.885    0            0    0.941    0.118
3 summer      658 1.141 0.374 0.778    0            0    0.882    0.464
4   fall      640 1.469 0.361 0.808    0            0    0.899    0.348

# mixed-effects models, calibration records (one row per season)
goodness.mixed.calib

  season sample.n  mean  RMSE NSEff bias percent.bias pearsonR CV.error
1 winter      665 2.001 0.185 0.933    0            0    0.966    0.131
2 spring      670 2.608 0.108 0.972    0            0    0.986    0.059
3 summer      658 1.141 0.242 0.907    0            0    0.953    0.300
4   fall      640 1.469 0.253 0.906    0            0    0.952    0.244

# fixed-effects models, validation records (one row per season)
goodness.fixed.valid

  season sample.n  mean  RMSE NSEff   bias percent.bias pearsonR CV.error
1 winter      120 1.922 0.274 0.794 -0.126       -6.549    0.904    0.191
2 spring      122 2.279 0.277 0.807  0.077        3.388    0.903    0.169
3 summer      121 1.086 0.382 0.780 -0.066       -6.095    0.887    0.496
4   fall      118 1.370 0.399 0.799  0.135        9.860    0.907    0.401

# mixed-effects models, validation records (one row per season)
goodness.mixed.valid

  season sample.n  mean  RMSE NSEff   bias percent.bias pearsonR CV.error
1 winter      120 1.922 0.272 0.797 -0.147       -7.655    0.910    0.185
2 spring      122 2.279 0.253 0.839  0.040        1.771    0.917    0.157
3 summer      121 1.086 0.348 0.818 -0.018       -1.688    0.906    0.454
4   fall      118 1.370 0.396 0.802  0.077        5.602    0.901    0.406

Melt data frames for ggplot

# Long-format ("melted") copies of the prediction tables, as ggplot expects:
# one row per record and model/variable, with the name in `variable` and the
# number in `value`.

# Correlation plots: observed value stays an id column, the two prediction
# columns are stacked. Season becomes a capitalised factor in calendar order
# so the facets appear Winter, Spring, Summer, Fall.
dd.corr.calib <- melt(
     pred.calib,
     id.vars = c("site_no", "season", "year", "obs"),
     measure.vars = c("pred.fixed", "pred.mixed")
)
dd.corr.calib$season <- factor(
     capitalize(dd.corr.calib$season),
     levels = c("Winter", "Spring", "Summer", "Fall")
)

dd.corr.valid <- melt(
     pred.valid,
     id.vars = c("site_no", "season", "year", "obs"),
     measure.vars = c("pred.fixed", "pred.mixed")
)
dd.corr.valid$season <- factor(
     capitalize(dd.corr.valid$season),
     levels = c("Winter", "Spring", "Summer", "Fall")
)



# Hydrographs and other plots: site descriptors stay id columns; the
# precipitation, flow, observed and predicted columns are stacked.
dd.calib <- melt(
     pred.calib,
     id.vars = c("site_no", "FEATUREID", "date", "year", "season",
                 "qseasonal", "da_sqkm", "OnChannelWaterSqKM",
                 "OnChannelWetlandSqKM", "large_barriers", "small_barriers"),
     measure.vars = c("precip.e", "precip.e.lag1", "precip.e.lag2", "flow",
                      "obs", "pred.fixed", "pred.mixed")
)

dd.valid <- melt(
     pred.valid,
     id.vars = c("site_no", "FEATUREID", "date", "year", "season",
                 "qseasonal", "da_sqkm", "OnChannelWaterSqKM",
                 "OnChannelWetlandSqKM", "large_barriers", "small_barriers"),
     measure.vars = c("precip.e", "precip.e.lag1", "precip.e.lag2", "flow",
                      "obs", "pred.fixed", "pred.mixed")
)

###View correlation of observed vs predicted values

# Observed vs predicted scatter plot specification, coloured and shaped by
# model type. Points and facets are added when the plot is printed so the
# same specification can be reused with the validation data.
gg.corr<- ggplot(dd.corr.calib,
               aes(x=obs,y=value,colour=variable,shape=variable))+ 
          # 1:1 reference line (perfect prediction). geom_abline() takes
          # intercept/slope; the former a=/b= arguments are not parameters of
          # this geom, so the line only appeared because of the defaults.
          geom_abline(intercept=0,slope=1,colour="black",linetype=2)+     
          theme_bw()+xlab("observed")+ylab("predicted")+
          #scale_colour_hue(l=40) 
          scale_colour_hue(l=40, 
               name="Model Type",
               breaks=c("pred.fixed", "pred.mixed"),
               labels=c("Fixed", "Mixed/Random Effects"))+
     scale_shape_manual(values=c(16,5))
gg.corr + geom_point() + facet_wrap(~season) + ggtitle("Calibration")

![plot of chunk plots correlation](figure/plots correlation1.png)

# same plot specification with the validation predictions swapped in via %+%
gg.corr %+% dd.corr.valid + geom_point() + facet_wrap(~season) + ggtitle("Validation")

![plot of chunk plots correlation](figure/plots correlation2.png)

View hydrograph of sample sites from calibration set, and all validation sites

# Hydrograph specification: observed and predicted log flow by year, one line
# per series. The default data are the validation gages; other data are
# swapped in with %+% below.
gg.hydrograph<-ggplot(data=subset(dd.valid,
                             variable %in% c("obs","pred.fixed","pred.mixed") & site_no %in% val.gages),
                      aes(x=year,y=value,colour=variable,linetype=variable)) +
     theme_bw()+scale_colour_brewer(type="qual",palette="Set1") + scale_linetype_manual(values=c("dotted","solid","dashed"))

#pull sample sites from calibration set
# Sample from the DISTINCT sites with qseasonal >= 20. dd.calib is melted, so
# each site_no repeats once per record and variable; sampling that column
# directly could return the same site several times (fewer than 10 panels)
# and favoured sites with more records. Note that this changes which sites
# are drawn for the given seed.
set.seed(94802834)
sample.calib<-unique(dd.calib[dd.calib$qseasonal>=20,"site_no"])
# index-based draw; size is capped so a short candidate list cannot error
sample.calib<-sample.calib[sample.int(length(sample.calib),
                                      size=min(10,length(sample.calib)),
                                      replace=FALSE)]
#calibration hydrographs
# one figure per season; small open points mark every record, large diamonds
# mark records whose year is in val.year
for (i in season.names) {
     print(
     gg.hydrograph %+% subset(dd.calib,site_no %in% sample.calib & variable %in% c("obs","pred.fixed","pred.mixed"))+
          geom_line(subset=.(season==i)) +
          geom_point(subset=.(season==i),cex=2,pch=1)+
          geom_point(subset=.(season==i & year %in% val.year),cex=4,pch=18)+
          facet_grid(site_no~.)+
          ggtitle(paste("Calibration:",capitalize(i)))
     )
}

![plot of chunk hydrograph of sample sites from calibration set, and all validation sites](figure/hydrograph of sample sites from calibration set, and all validation sites1.png) ![plot of chunk hydrograph of sample sites from calibration set, and all validation sites](figure/hydrograph of sample sites from calibration set, and all validation sites2.png) ![plot of chunk hydrograph of sample sites from calibration set, and all validation sites](figure/hydrograph of sample sites from calibration set, and all validation sites3.png) ![plot of chunk hydrograph of sample sites from calibration set, and all validation sites](figure/hydrograph of sample sites from calibration set, and all validation sites4.png)

#validation hydrographs
# One figure per season for the validation gages (the default data of
# gg.hydrograph), one panel per gage. Small open points mark every record;
# large diamonds mark records whose year is in val.year.
# print() is required because plots are not drawn automatically inside a loop.
for (i in season.names) {
     print(
     gg.hydrograph + 
          geom_line(subset=.(season==i)) +
          geom_point(subset=.(season==i),cex=2,pch=1)+
          geom_point(subset=.(season==i & year %in% val.year),cex=4,pch=18)+
          facet_grid(site_no~.)+
          ggtitle(paste("Validation:",capitalize(i)))
     )
}

![plot of chunk hydrograph of sample sites from calibration set, and all validation sites](figure/hydrograph of sample sites from calibration set, and all validation sites5.png) ![plot of chunk hydrograph of sample sites from calibration set, and all validation sites](figure/hydrograph of sample sites from calibration set, and all validation sites6.png) ![plot of chunk hydrograph of sample sites from calibration set, and all validation sites](figure/hydrograph of sample sites from calibration set, and all validation sites7.png) ![plot of chunk hydrograph of sample sites from calibration set, and all validation sites](figure/hydrograph of sample sites from calibration set, and all validation sites8.png)

Trial of plots for validating by year

# Error plot specification: values per site, panelled by season (rows) and
# year (columns). The data are replaced with %+% before plotting.
gg.error<-ggplot(data=dd.valid,
                 aes(x=site_no,y=value,col=variable)) +
     geom_boxplot(fill="grey30",col="grey30")+geom_point(pch=15,cex=3)+
     scale_colour_brewer(type="qual",palette="Set1")+
     facet_grid(season~year, scales="free_x", space="free_x")


#create sample of years from calibration data set
# Sample from the DISTINCT years. dd.calib has one row per record and
# variable, so sampling the raw column could return the same year several
# times (fewer than 10 panels) and favoured years with more records. Note
# that this changes which years are drawn for the given seed.
set.seed(96536573)
sample.calib.year<-unique(dd.calib[,"year"])
# index-based draw: safe even if only one year is present (sample() on a
# single number would sample from 1:that number); size is capped so a short
# list cannot error
sample.calib.year<-sample.calib.year[sample.int(length(sample.calib.year),
                                                size=min(10,length(sample.calib.year)),
                                                replace=FALSE)]
#calibration plot
gg.error %+% subset(dd.calib, variable %in% c("obs","pred.mixed") & year %in% sample.calib.year )+ ggtitle("Calibration")

![plot of chunk error plots](figure/error plots1.png)

#validation plot
# observed vs mixed-model predictions for the held-out years only
gg.error %+% subset(dd.valid, variable %in% c("obs","pred.mixed") & year %in% val.year )+ ggtitle("Validation")

![plot of chunk error plots](figure/error plots2.png)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

C_I_seasonal_regression_v1.md

C_I_seasonal_regression_v1.md

Seasonal flow timeseries model
from paired flow and meteorological/weather record

ALR July 2014
Conte Anadromous Fish Research Center

Load libraries, functions, etc

Load seasonal data, and look at number of records, sites, and years

Create calibration and validation sets

Create mixed model

Create mixed model: random effects for year and site

Calculate goodness of fit for calibration data set

Calculate goodness of fit for validation data set

Goodness of fit statistics

Melt data frames for ggplot

View hydrograph of sample sites from calibration set, and all validation sites

Trial of plots for validating by year

Files

C_I_seasonal_regression_v1.md

Latest commit

History

C_I_seasonal_regression_v1.md

File metadata and controls

Seasonal flow timeseries model from paired flow and meteorological/weather record

ALR July 2014Conte Anadromous Fish Research Center

Load libraries, functions, etc

Load seasonal data, and look at number of records, sites, and years

Create calibration and validation sets

Create mixed model

Create mixed model: random effects for year and site

Calculate goodness of fit for calibration data set

Calculate goodness of fit for validation data set

Goodness of fit statistics

Melt data frames for ggplot

View hydrograph of sample sites from calibration set, and all validation sites

Trial of plots for validating by year

Seasonal flow timeseries model
from paired flow and meteorological/weather record

ALR July 2014
Conte Anadromous Fish Research Center