MW_Modelfitting(byyearspeciesplot).Rmd

---
title: "Maximum likelihood models: per year/plot/species"
author: "Janneke Hille Ris Lambers, Aji John, Meera Sethi, Elli Theobald"
date: "12/15/2020"
output: html_document
---

# Setup for R script and R markdown
1. Load all libraries
2. Specify string behavior
3. Load packages (should we add all packages here?)

```{r setup, include=FALSE}
# Echo the code of every chunk in the knitted document
knitr::opts_chunk$set(echo = TRUE)
# Keep strings as character vectors when data frames are built
options(stringsAsFactors = FALSE)
# tidyverse supplies the pipe, dplyr verbs, ggplot2, read_csv() and str_sub()
# used in later chunks
library(tidyverse)
#library(leaflet)
# lubridate is not attached; later chunks call lubridate::yday() with the
# namespace prefix, so the package still has to be installed
#library(lubridate)
# NOTE(review): nothing in this file calls boot directly - presumably it is
# needed by HRL_phenology_functions.R; confirm
library(boot)
#library(readr) 
```


# Read in clean phenology data
This chunk of code:

1. Reads in the cleaned up data (from data wrangling script), and defines the years, species, etc considered.

2. Reads in SDD data, merges it with PhenoSite data

```{r}
# Load the cleaned phenology observations and the snow disappearance date
# (SDD) table produced by the data wrangling script
PhenoSite_0 <- read.csv("cleandata/PhenoSite_Clean.csv", header = TRUE)
SDDdat_0 <- read.csv("cleandata/MW_SDDall.csv", header = TRUE)

# Final SDD: the observed value where one exists, otherwise the prediction
# rounded to a whole day (observed SDD is NA where the sensor failed)
sdd_missing <- is.na(SDDdat_0$SDD)
SDDfin <- SDDdat_0$SDD
SDDfin[sdd_missing] <- round(SDDdat_0$predSDD[sdd_missing], 0)

# Keep only the columns needed downstream; the raw SDD and predSDD columns
# are dropped too and replaced by the combined SDDfin under the name SDD
unwanted_cols <- c("Site_Num", "calibration", "snow_appearance_date",
                   "snow_disappearance_date", "snow_cover_duration",
                   "minimum_soil_temp", "SDD", "notes", "predSDD")
SDDdat <- SDDdat_0[, !(names(SDDdat_0) %in% unwanted_cols), drop = FALSE]
SDDdat$SDD <- SDDfin

# Attach SDD to every phenology record of the same year / plot / transect
PhenoSite <- merge(PhenoSite_0, SDDdat,
                   by = c("Year", "Site_Code", "Transect"))

# Years and species present in the merged data, and the largest distyes
# value among flowering records (used as the outlier threshold)
yrs <- unique(PhenoSite$Year)
species <- unique(PhenoSite$Species)
nyrs <- length(yrs)
nspp <- length(species)
is_flowering <- PhenoSite$Flower == 1
out_thresh <- max(PhenoSite$distyes[is_flowering])

# Print this out as reminder
print("Number of flowering observations, columns")
print(dim(PhenoSite))
print("Number of years, species, outlier threshold")
print(c(nyrs, nspp, out_thresh))

```


# Model fitting: per plot / year / species
This code does the following:

1. Loads phenology functions (see HRL_phenology_functions.R).

2. For each plot / year / species data, fits null model and phenology curve to yes / no observations of flowering phenology. Model fit parameters (peak, range and maximum parameters) are written to object pars_yrpltsp, and saved.

3. For each plot / year / species combo, calculates the AICs of the null model and of the phenology curve. These are written to aics_yrpltsp, and saved.


```{r}
#read in appropriate functions; nullfit and curvefit_perplot are not defined
#in this file and are expected to come from the sourced script
source("./HRL_phenology_functions.R")

#collect one row of parameters / AIC values per fit in lists; the rows are
#bound into matrices once after the loops instead of growing a matrix with
#rbind() on every iteration
pars_rows <- list()
aics_rows <- list()

#For loops to fit curves per year/plot/species and save parameters
for(i in seq_along(yrs)){ # First for loop: runs through years

  #extract data for that year
  PhenoSite_Year <- PhenoSite[PhenoSite$Year==yrs[i],]

  #pull out unique plots for year in question
  plots <- unique(PhenoSite_Year$Site_Code)

  #Nested for loop for each plot
  for(j in seq_along(plots)){ # Second for loop: each plot
    PhenoSite_YearPlot <- PhenoSite_Year[PhenoSite_Year$Site_Code==plots[j],]

    #Identify species in plot; seq_along() below simply does nothing when
    #there are none
    spinplt <- unique(PhenoSite_YearPlot$Species)

    #Nested for loop for each species
    for(k in seq_along(spinplt)){ # Third for loop: each species in the plot

      #Extract data for the species in question
      PhenoSite_YearPlotSpecies <- PhenoSite_YearPlot[
                                   PhenoSite_YearPlot$Species==spinplt[k],]

      #Extract other identifiers (taken from the first record)
      trsct <- PhenoSite_YearPlotSpecies$Transect[1]
      stcd <- PhenoSite_YearPlotSpecies$Site_Loc[1]

      #define data for curve fitting
      #NOTE(review): nullfit and curvefit_perplot are called with the
      #parameter vector only, so they presumably read `days` and `phenophase`
      #from the global environment - do not rename these two objects
      days <- PhenoSite_YearPlotSpecies$DOY #explanatory variable: DOY
      phenophase <- PhenoSite_YearPlotSpecies$Flower #yes / no flowering

      #add three weekly "not flowering" records before the earliest SDD
      SDDplt <- min(PhenoSite_YearPlotSpecies$SDD)
      days <- c(SDDplt-21, SDDplt-14, SDDplt-7, days)
      phenophase <- c(0,0,0,phenophase)

      #now fit null model (a single constant parameter)
      model0 <- optimize(nullfit, c(0.000001,0.999999))

      #now fit alternative model - curve; the first starting value is the
      #mean day on which flowering was observed
      param <- c(mean(days[phenophase==1]), -0.001, 0) # initial parameters
      model1 <- optim(param, curvefit_perplot, control = list(maxit = 50000))
      #optim() reports success as convergence code 0; every other code
      #(1 = maxit reached, 10 = degenerate Nelder-Mead simplex, ...) is a
      #fit that did not converge
      if(model1$convergence!=0){
        print(paste(spinplt[k],"no convergence", sep="-"))}

      #AIC = 2 * (objective + number of parameters): 1 parameter for the
      #null model, 3 for the curve. This treats both objective values as
      #negative log-likelihoods - confirm against the sourced functions
      AICnull <- round(2*(model0$objective+1),1)
      AICalt <- round(2*(model1$value + 3),1)
      #likelihood ratio test of curve vs. null: the statistic is twice the
      #drop in the objective, compared with a chi-square distribution with
      #2 degrees of freedom (3 - 1 parameters), upper tail.
      #pcurve is not stored in the output tables
      pcurve <- signif(pchisq(2*(model0$objective-model1$value), df=2,
                              lower.tail=FALSE),3)

      #AIC values for this fit
      tmp_aic_vals <- c(yrs[i], as.character(plots[j]),
                        as.character(spinplt[k]),AICnull,AICalt)

      #parameters for this fit; mixing numbers and strings in c() makes the
      #whole row character, columns are converted back further down
      tmp_pars <- c(yrs[i], trsct, stcd, as.character(plots[j]), SDDplt,
                    as.character(spinplt[k]), model1$par[1:3])

      pars_rows[[length(pars_rows)+1]] <- tmp_pars
      aics_rows[[length(aics_rows)+1]] <- tmp_aic_vals

    }
  }
}

#bind the collected rows into character matrices
pars_yrpltsp <- do.call(rbind, pars_rows)
aics_yrpltsp <- do.call(rbind, aics_rows)

# turn pars_yrpltsp into a data frame
# note the column naming: "site_code" holds Site_Loc and "plot" holds
# Site_Code from PhenoSite
dimnames(pars_yrpltsp) <- list(NULL, c("year","transect","site_code","plot",
                                       "SDD","species","peak","duration","max"))
pars_yrpltsp <- data.frame(pars_yrpltsp)

#change storage type to numeric - all except plot (since a few plots have a, b)
pars_num_cols <- c("year","SDD","peak","duration","max")
pars_fac_cols <- c("transect","site_code","species")
pars_yrpltsp[pars_num_cols] <- lapply(pars_yrpltsp[pars_num_cols], as.numeric)
pars_yrpltsp[pars_fac_cols] <- lapply(pars_yrpltsp[pars_fac_cols], as.factor)

# turn aic matrix into a data frame
dimnames(aics_yrpltsp) <- list(NULL, c("year", "plot", "species",
                                       "AICnull", "AICalt"))
aics_yrpltsp <- data.frame(aics_yrpltsp)
aics_yrpltsp$year <- as.numeric(aics_yrpltsp$year)
aics_yrpltsp$species <- as.factor(aics_yrpltsp$species)
aics_yrpltsp$AICnull <- as.numeric(aics_yrpltsp$AICnull)
aics_yrpltsp$AICalt <- as.numeric(aics_yrpltsp$AICalt)

#examine pars, aics
head(pars_yrpltsp)
head(aics_yrpltsp)

#Write output to data folder
write.csv(pars_yrpltsp, "output/Parameters_plotmodel.csv", quote=FALSE,
          row.names=FALSE)

#Write AIC output to data folder
write.csv(aics_yrpltsp, "output/AIC_plotmodel.csv", quote=FALSE,
          row.names=FALSE)

```


# Graph to visualize peak flowering by year & trail
This code takes estimates of peak flowering from year-plot-species specific model fits (in pars_yrpltsp) and plots those estimates by year (year on the x-axis, DOY on the y axis) and by trail.

```{r}
## One figure with the peak flowering estimate of every species, trail and
## plot: year on the x-axis, day of year on the y-axis
par(mfrow=c(1,1), omi=c(0,0,0,0), mai=c(0.5,0.4,0.4,0.2),
    tck=-0.01, mgp=c(1.25,0.25,0), xpd=TRUE)

# fill colours, one per species - 17 in total to accommodate all species
plotcol <- c("yellowgreen","magenta","orange","purple","yellow","springgreen",
             "pink","purple","navyblue","azure4","yellow4","orchid",
             "turquoise","salmon","maroon","black","grey")

# fixed y-axis limits (day of year); the data-driven alternative would be
# earlypk <- min(pars_yrpltsp$peak); latepk <- max(pars_yrpltsp$peak)
earlypk <- 135
latepk <- 255

# empty frame that the points are added to; month names replace the
# numeric y-axis
plot(2016, 175, type="n", xlim=c(2012, 2020), ylim=c(earlypk, latepk),
     xaxp=c(2012,2020,8), yaxt="n", xlab="Year", ylab="Flowering")
text(x=2011.5, y=c(135, 165, 195, 225, 255),
     labels=c("May", "Jun", "Jul", "Aug", "Sept"), adj=-0.1, srt=90)

# per-trail settings: plot-name prefix, plotting symbol and x offset
trail_prefix <- c("RL", "GB")
trail_pch <- c(21, 24)
trail_shift <- c(0, 0.333)

for(tr in seq_along(trail_prefix)){
  # fits belonging to this trail (first two characters of the plot name)
  on_trail <- substr(pars_yrpltsp$plot, 1, 2) == trail_prefix[tr]
  trail_pars <- pars_yrpltsp[on_trail, ]

  # the years with fits differ per trail
  for(yr in unique(trail_pars$year)){
    year_pars <- trail_pars[trail_pars$year == yr, ]

    # one batch of points per species, coloured by species index
    for(sp in seq_along(species)){
      sp_rows <- year_pars[year_pars$species == species[sp], ]
      if(nrow(sp_rows) == 0){next}
      sp_peaks <- sp_rows$peak
      n_peaks <- length(sp_peaks)
      # x position: the year plus a jittered trail-specific offset
      x_pos <- rep(yr, n_peaks) + jitter(rep(trail_shift[tr], n_peaks), 2.5)
      points(x_pos, sp_peaks, pch=trail_pch[tr], bg=plotcol[sp], cex=1.25)
    }
  }
}

# legends: symbol per trail, colour per species
legend(x="topleft", legend=c("RL","GB"), pch=c(21,24),
       pt.bg="gray", cex=0.75, pt.cex=1.25)
legend(x="bottomleft", legend=species, pch=21,
       pt.bg=plotcol[1:length(species)], cex=0.75, pt.cex=1.25)
```

# AIC graph
This graph plots the AIC for the null model (flowering probability does not vary over time) vs. the alternative model (flowering probability is a unimodal curve over time). Plotted also is the 1:1 line. Each point represents a year / plot / species fit.
```{r}
# Scatter the AIC of the null model (x) against the AIC of the curve model
# (y), one point per year / plot / species fit
with(aics_yrpltsp,
     plot(AICnull, AICalt, pch=21, bg="grey",
          xlab="AIC Null", ylab="AIC Curve"))
title("Compare Null to Curve")
# 1:1 reference line
abline(a=0, b=1)
```

## Graph of plot / species / year-specific model fits
Graph of all year-plot specific flowering curves for each species, on the same plot, with different colored lines per year. 
```{r}
#species with fitted parameters, in sorted order
species <- unique(pars_yrpltsp$species)
species <- species[order(species)]
#one colour per year; a year is mapped to a colour by (year - 2012), so the
#first colour belongs to 2013
yrcols <- c("green","lightblue","orange","purple","pink","red","grey")
#days of year over which the fitted curves are drawn
xx_DOY <- seq(120,270)

for(i in seq_along(species)){
  sp_pars <- pars_yrpltsp[pars_yrpltsp$species==species[i],]
  yrcolind <- sp_pars$year - 2012

  #make a dummy plot to add lines to
  #NOTE(review): pin is documented as settable only through par(); confirm
  #whether passing it to plot() has any effect here
  par(mfrow=c(1,1), omi=c(0,0,0,0), mai=c(0.5,0.6,0.5,0.5),
      tck=-0.02, mgp=c(1.1,0.5,0))
  plot(1,1, type="n", xlim=c(min(xx_DOY),max(xx_DOY)),
       ylim=c(0,1), xlab="", ylab="P(flowering)", pin=c(6,4),
       xaxp=c(min(xx_DOY),max(xx_DOY),(max(xx_DOY)-min(xx_DOY))/30))
  title(species[i])
  #legend colours use the same (year - 2012) mapping as the lines, so the
  #two cannot drift apart when a year is missing from yrs
  legend(x="topleft", legend=yrs, col=yrcols[yrs - 2012], lwd=2)

  #Now go through all fits for this species, add one curve per fit
  for(j in seq_len(nrow(sp_pars))){
    pars <- c(sp_pars$peak[j], sp_pars$duration[j], sp_pars$max[j])
    #predflower is not defined in this file (expected from the sourced
    #phenology functions)
    predf <- predflower(xx_DOY, pars)
    lines(xx_DOY, predf, col=yrcols[yrcolind[j]], lwd=2)
  }
}

```

## Graph of SDD vs peak flowering
Graph of SDD vs. peak flowering for each species, different colored lines per year

```{r}
#species with fitted parameters, in sorted order
species <- unique(pars_yrpltsp$species)
species <- species[order(species)]
#one colour per year; a year is mapped to a colour by (year - 2012)
yrcols <- c("green","lightblue","orange","purple","pink","red","grey")

##Plot SDD vs. peak per species
for(i in seq_along(species)){
  sp_pars <- pars_yrpltsp[pars_yrpltsp$species==species[i],]
  yrcolind <- sp_pars$year - 2012

  #correlate SDD with estimated peak flowering
  SDDtest <- cor.test(sp_pars$SDD, sp_pars$peak)

  #set up the plot; y limits are padded by 5% of the range of peak values
  #NOTE(review): pty is documented as settable only through par(); confirm
  #whether passing it to plot() has any effect here
  par(mfrow=c(1,1), omi=c(0,0,0,0), mai=c(0.5,0.6,0.5,0.5),
      tck=-0.02, mgp=c(1.1,0.5,0))
  tiny <- 0.05*(max(sp_pars$peak)-min(sp_pars$peak))
  ylms <- c(min(sp_pars$peak)-tiny, max(sp_pars$peak)+tiny)
  plot(sp_pars$SDD, sp_pars$peak, ylim=ylms, pty="s",
       pch=21, bg=yrcols[yrcolind], cex=1.5,
       xlab="Snow Disappearance Date", ylab="Peak Flowering")
  #label every point with its plot name, just above the point
  text(sp_pars$SDD, (sp_pars$peak+tiny), labels=sp_pars$plot, cex=0.5)
  #1:1 line (peak flowering on the day of snow disappearance)
  abline(0,1)
  title(species[i])
  #legend colours use the same (year - 2012) mapping as the points
  legend(x="topleft", legend=yrs, pt.bg=yrcols[yrs - 2012], pch=21)

  #add cor.test results
  mtext(paste0("r=",round(SDDtest$estimate,3),", p=",
               round(SDDtest$p.value, 3)), side=3, line=-1, cex=0.75)

  # print average number of days after snow disappearance at which the
  # species is at peak flower (peak minus SDD, so later peaks are positive)
  daysafter <- mean(sp_pars$peak - sp_pars$SDD)
  print(paste(species[i], "days after SDD in peak flower",
              round(daysafter,2), sep="-"))

}

```


# Plot-level parameters related to SDD, year
A quick lm analysis examining the relationship between the three sets of parameters and SDD and year. On the whole, peak clearly varies with SDD, and for several species that relationship also varies by year. The max and duration parameters generally don't vary by SDD, but in a few cases vary by year. 
```{r}
# Part 1: peak flowering against SDD and year, one species at a time
species <- species[order(species)]
peak_rows <- vector("list", length(species))

for(s in seq_along(species)){
  sp_dat <- pars_yrpltsp[pars_yrpltsp$species==species[s],]

  # full model with the SDD x year interaction: print its ANOVA table
  peak_test_all <- lm(peak~SDD*as.factor(year), data=sp_dat)
  print(species[s])
  print(anova(peak_test_all))

  # SDD-only model: keep intercept, slope and adjusted R squared
  peak_test <- lm(peak~SDD, data=sp_dat)
  peak_rows[[s]] <- c(as.character(species[s]), coef(peak_test),
                      summary(peak_test)$adj.r.squared)
}

# one row per species; the rows are character, so the three numeric columns
# are converted back after building the data frame
peak_pars <- do.call(rbind, peak_rows)
dimnames(peak_pars) <- list(NULL, c("species","intercept","slope","Rsquared"))
peak_pars <- data.frame(peak_pars)
peak_pars[2:4] <- lapply(peak_pars[2:4], as.numeric)
head(peak_pars)

# Part 2: duration and max parameters - test whether affected by SDD, year
for(s in seq_along(species)){
  sp_dat <- pars_yrpltsp[pars_yrpltsp$species==species[s],]
  # number of transects with fits for this species (not used further in
  # this chunk)
  ntrans <- length(unique(sp_dat$transect))
  range_test <- lm(duration~as.factor(year)*SDD, data=sp_dat)
  max_test <- lm(max~as.factor(year)*SDD, data=sp_dat)

  print(species[s])
  print("range")
  print(anova(range_test))
  print("max")
  print(anova(max_test))
  # two empty lines separate the species in the printed output
  for(blank in 1:2){print("")}
}


```

# Model fit to QA/QC vs. non QA/QC data
This code checks whether model fits to data collected by volunteers differ from fits to data collected by professional scientists (QA/QC observations).
```{r}
#collect one row of parameters per year/plot/species in a list; the rows are
#bound into a matrix once after the loops instead of growing it with rbind()
qaqc_rows <- list()

#set years, species
yrs <- unique(PhenoSite$Year)
nyrs <- length(yrs)
species <- unique(PhenoSite$Species)

#For loops to fit curves per year/plot/species, separately for QA/QC records
#(QA.QC == 1) and all other records (volunteers)
for(i in seq_along(yrs)){ # First for loop: runs through years

  #extract data for that year
  PhenoSite_Year <- PhenoSite[PhenoSite$Year==yrs[i],]

  #pull out unique plots for year in question
  plots <- unique(PhenoSite_Year$Site_Code)

  #Nested for loop for each plot
  for(j in seq_along(plots)){ # Second for loop: each plot
    PhenoSite_YearPlot <- PhenoSite_Year[PhenoSite_Year$Site_Code==plots[j],]

    #Identify species in plot; seq_along() below simply does nothing when
    #there are none
    spinplt <- unique(PhenoSite_YearPlot$Species)

    #Nested for loop for each species
    for(k in seq_along(spinplt)){ #Third for loop: each species in the plot

      #Extract data for the species in question, split by observer type
      PhenoSite_yps <- PhenoSite_YearPlot[PhenoSite_YearPlot$Species
                                          ==spinplt[k],]
      PhenoSite_QAQC <- PhenoSite_yps[PhenoSite_yps$QA.QC==1,]
      PhenoSite_vol <- PhenoSite_yps[PhenoSite_yps$QA.QC!=1,]
      #need at least 9 QA/QC records to attempt a fit
      if(nrow(PhenoSite_QAQC)<9){next}

      #Extract other identifiers (taken from the first QA/QC record)
      trsct <- PhenoSite_QAQC$Transect[1]
      stcd <- PhenoSite_QAQC$Site_Loc[1]

      #define data for curve fitting: QAQC
      #NOTE(review): curvefit_perplot is called with the parameter vector
      #only, so it presumably reads `days` and `phenophase` from the global
      #environment - do not rename these two objects
      days <- PhenoSite_QAQC$DOY #explanatory variable: DOY
      phenophase <- PhenoSite_QAQC$Flower #yes / no flowering

      #add three weekly "not flowering" records before the earliest SDD
      SDDplt <- min(PhenoSite_QAQC$SDD)
      days <- c(SDDplt-21, SDDplt-14, SDDplt-7,days)
      phenophase <- c(0,0,0,phenophase)

      #need at least two flowering records
      if(sum(phenophase)<2){next}

      # fit phenological model to the QA/QC records
      param <- c(mean(days[phenophase==1]), -0.001, 0) # initial parameters
      model1 <- optim(param, curvefit_perplot, control = list(maxit = 50000))
      #optim() reports success as convergence code 0; every other code
      #(1 = maxit reached, 10 = degenerate Nelder-Mead simplex, ...) is a
      #fit that did not converge
      if(model1$convergence!=0){
        print(paste(spinplt[k],"no convergence", sep="-"))}

      #no volunteer records at all: nothing to compare with (without this
      #check min() of an empty vector returns Inf with a warning before the
      #combination is skipped below anyway)
      if(nrow(PhenoSite_vol)==0){next}

      #define data for curve fitting: volunteers
      days <- PhenoSite_vol$DOY #explanatory variable: DOY
      phenophase <- PhenoSite_vol$Flower #yes / no flowering

      #add three weekly "not flowering" records before the earliest SDD
      SDDplt <- min(PhenoSite_vol$SDD)
      days <- c(SDDplt-21, SDDplt-14, SDDplt-7,days)
      phenophase <- c(0,0,0,phenophase)

      #need at least two flowering records
      if(sum(phenophase)<2){next}

      # fit phenological model to the volunteer records
      param <- c(mean(days[phenophase==1]), -0.001, 0) # initial parameters
      model2 <- optim(param, curvefit_perplot, control = list(maxit = 50000))
      if(model2$convergence!=0){
        print(paste(spinplt[k],"no convergence", sep="-"))}

      #save parameters: QA/QC fit first, volunteer fit second. SDDplt is the
      #value taken from the volunteer records at this point
      tmp_pars <- c(yrs[i], trsct, stcd, as.character(plots[j]), SDDplt,
                    as.character(spinplt[k]), model1$par[1:3], model2$par[1:3])

      qaqc_rows[[length(qaqc_rows)+1]] <- tmp_pars

    }
  }
}

#bind the collected rows into a character matrix
pars_QAQC <- do.call(rbind, qaqc_rows)

# turn pars_QAQC into a data frame
# note the column naming: "site_code" holds Site_Loc and "plot" holds
# Site_Code from PhenoSite
dimnames(pars_QAQC) <- list(NULL, c("year","transect","site_code","plot",
                                    "SDD","species","peak_Q","duration_Q",
                                    "max_Q", "peak_v","duration_v", "max_v"))
pars_QAQC <- data.frame(pars_QAQC)

#change storage type to numeric - all except plot (since a few plots have a, b)
qaqc_num_cols <- c("year","SDD","peak_Q","duration_Q","max_Q",
                   "peak_v","duration_v","max_v")
qaqc_fac_cols <- c("transect","site_code","species")
pars_QAQC[qaqc_num_cols] <- lapply(pars_QAQC[qaqc_num_cols], as.numeric)
pars_QAQC[qaqc_fac_cols] <- lapply(pars_QAQC[qaqc_fac_cols], as.factor)

#show pars_QAQC
head(pars_QAQC)

```

# Species-level comparison: parameters
These graphs show fitted peak, max and duration parameters as fit to data collected by lab members vs those collected by volunteers.

Note - peak parameters are very well correlated (with one exception, LUAR in 2018 in GB plot 9 - see note below), but max and duration parameters are not. The max parameter represents the overall probability of observing flowers across all observations, and volunteers vs HRL lab members have a different frequency of visitation, so this is unsurprising. The duration parameter is correlated with the max parameter, which may also influence the comparison.

* LUAR in GB plot 9 in 2018 seems to have flowered twice (potentially b/c the plant was herbivorized, potentially because there were 2 plants in the plot on a very different schedule, potentially b/c it rained mid summer providing a second flush of flowers). Because the second batch of flowering occurred very near the last day volunteers collected data, the estimate of peak flowering for volunteers in that plot / year is unrealistically late.
```{r}
# For every species draw three panels comparing the parameters fitted to
# QA/QC (professional) data on the x-axis with those fitted to volunteer data
# on the y-axis; the 1:1 line marks perfect agreement
for(i in seq_along(species)){
  # extract data for species in question
  pars_QAQC_sp <- pars_QAQC[pars_QAQC$species==species[i],]

  # species comes from the raw data, so it can include species for which no
  # paired fit exists; plot() cannot derive axis limits from empty vectors
  if(nrow(pars_QAQC_sp)==0){next}

  #set up plot: three square panels side by side
  par(mfrow=c(1,3), omi=c(0,0,0,0), mai=c(0.5,0.5,0.4,0.3),
      mgp=c(1.2,0.5,0), tck=-0.02, pty="s")

  # panel 1: peak
  plot(pars_QAQC_sp$peak_Q, pars_QAQC_sp$peak_v, pch=21, bg="grey",
       cex=1.25, xlab="peak-prof", ylab="peak-vol")
  abline(0,1)
  # panel 2: duration (carries the species name as title)
  plot(pars_QAQC_sp$duration_Q, pars_QAQC_sp$duration_v, pch=21, bg="grey",
       cex=1.25, xlab="duration-prof", ylab="duration-vol",
       main=species[i])
  abline(0,1)
  # panel 3: max
  plot(pars_QAQC_sp$max_Q, pars_QAQC_sp$max_v, pch=21, bg="grey",
       cex=1.25, xlab="max-prof", ylab="max-vol")
  abline(0,1)
}
```


# QAQC graph - peak flowering comparison volunteer vs. professional
This is Supplemental Figure E2.
```{r}

# Write the figure to a TIFF file. To draw on the active graphics device
# instead, comment out this call and the matching dev.off() at the end of
# the chunk
tiff(file="output/figures/FigE2.tif", width=5, height=5, units="in", res=600)

#set up plot: a single square panel
par(mfrow=c(1,1), omi=c(0,0,0,0), mai=c(0.5,0.5,0.5,0.5),
      mgp=c(1.35,0.45,0), tck=-0.02, pty="s")

#remove species not considered: ANAR, LICA, MEPA, MIAL
excluded_spp <- c("ANAR","LICA","MEPA","MIAL")
pars_QAQC2 <- pars_QAQC[!(pars_QAQC$species %in% excluded_spp),]
species2 <- unique(pars_QAQC2$species); species2 <- species2[order(species2)]

#one fill colour per remaining species, in sorted species order
spcols <- c("yellowgreen","gold3","pink","orange","magenta","yellow",
            "lightyellow2","plum1","lightsteelblue4","purple","goldenrod4",
            "greenyellow","navyblue","grey")

#make dummy plot with the 1:1 line and the species legend
plot(180,180, type="n", xlim=c(160,275), ylim=c(160,275),
     xlab="Estimate - Ecologist", ylab="Estimate -Volunteers")
abline(0,1)
legend(x="bottomright", legend=species2, pt.bg=spcols[seq_along(species2)],
       pch=21, pt.cex=1.25,cex=0.65)

#correlation between professional and volunteer peak estimates; volunteer
#peaks of day 280 or later are left out of the test (they are still plotted
#if they fall inside the axis limits)
pars_QAQC3 <- pars_QAQC2[pars_QAQC2$peak_v<280,]
ctest <- cor.test(pars_QAQC3$peak_Q, pars_QAQC3$peak_v)
Qcor <- round(ctest$estimate,3)
Qp0 <- round(ctest$p.value,3)
#a p-value that rounds to 0 is reported as "p<0.001"
Qp <- if(Qp0==0) "p<0.001" else paste0("p=",Qp0)
mtext(paste0("cor=",Qcor),side=3, adj=0.025, line=-0.9, cex=0.8)
mtext(Qp, side=3, adj=0.025, line=-1.8, cex=0.8)

# add the points, one colour per species
for(i in seq_along(species2)){
  # extract data for species in question
  pars_QAQC_sp <- pars_QAQC2[pars_QAQC2$species==species2[i],]

  # professional (x) vs. volunteer (y) estimate of peak flowering
  points(pars_QAQC_sp$peak_Q, pars_QAQC_sp$peak_v,
         pch=21, bg=spcols[i], cex=1.25)

}

#close the TIFF device so the file is written
dev.off()
```

###########
ODDS AND ENDS BELOW (mostly figures for other purposes)


################
```{r}
# Per-plot curve parameters written by an earlier analysis
plot_pars_years <- read_csv("data/PerPlotCurves.csv")

# site = first two characters of the plot ID, i.e. either RL or GB
sanity_dat <- mutate(plot_pars_years, site = str_sub(plot, 1, 2))

# Predicted peak flowering (y) by year (x), coloured by species, with one
# facet column per plot. Each horizontal line and its rotated label mark the
# first day of a month (given as day of year).
sanity_plot <- ggplot(sanity_dat) +
  geom_point(aes(as.factor(year), peak, color = species)) +
  facet_grid(. ~ plot) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 5)) +
  geom_hline(aes(yintercept = 152)) +
  geom_text(mapping = aes(label = 'June 1st', y = 152, x = 0.1),
            angle = 90, alpha = .4, hjust = 0) +
  geom_hline(aes(yintercept = 183)) +
  geom_text(mapping = aes(label = 'July 1st', y = 183, x = 0.1),
            angle = 90, alpha = .4, hjust = 0) +
  geom_hline(aes(yintercept = 213)) +
  geom_text(mapping = aes(label = 'Aug 1st', y = 213, x = 0.1),
            angle = 90, alpha = .4, hjust = 0) +
  geom_hline(aes(yintercept = 244)) +
  geom_text(mapping = aes(label = 'Sep 1st', y = 244, x = 0.1),
            angle = 90, alpha = .4, hjust = 0) +
  # facet_grid(.~year) +
  labs(x = " Year", y = "Predicted Peak flowering (DOY)", color = "Species")

# show the figure in the document
sanity_plot

# It is a wide figure, so it is also saved for viewing separately
ggsave("figs/sanity_check.png", plot = sanity_plot,
       width = 70, height = 20, units = "cm")

``` 

# Alternative graph of model fits
Alternative plot to visualize outliers. Site (RL or GB) defines the panel rows and year defines the panel columns; within each panel the x-axis is DOY and the y-axis is the focal species. Outliers sit close to the start or towards the end of the x-axis scale. SDD gradient can be ignored (no variation).
```{r}
# Jittered peak estimates (x) per species (y), one panel per site (rows, RL or
# GB taken from the first two characters of the plot ID) and year (columns).
# The aesthetics include 'fill', mapped to SDD.
peaks_plot <- plot_pars_years %>%
  mutate(site = str_sub(plot, 1, 2)) %>%
  ggplot(aes(x = peak, y = species, fill = SDD)) +
  # shape 21 is a filled circle: the default point shape has no fill, so the
  # SDD mapping and its viridis scale would otherwise not show on the points
  geom_jitter(shape = 21) +
  facet_grid(site ~ year) +
  scale_fill_viridis_c(name = "Snow Melt Date") +
  labs(title = 'Peaks across all the years',
       subtitle = 'Modelled peak flowering across 10 focal species',
       x = "Day of the year") +
  theme_minimal(base_size = 14) + theme(axis.title.y = element_blank())

# show the figure in the document
peaks_plot

# save the same figure to file
ggsave("figs/sanity_check_revised.png", plot = peaks_plot,
       width = 35, height = 20, units = "cm")
```

# Observation Effort
This code calculates the total number of observations for each plot / year / species combination.
```{r}

# Number of observations in every transect / year / plot / species group
obs_per_group <- tally(group_by(PhenoSite, Transect, Year, Site_Code, Species))

# Show the counts (as a tibble)
obs_per_group

# The same counts as a plain data frame, used by the effort graph below
tallyObservations_typs <- as.data.frame(obs_per_group)
```

# Graph showing species / year / transect observer effort
Notice a perfect cluster in the number of observations in 2015 at Reflection Lakes. Also note the low number of observations for some of the plot / species combinations.

```{r}
# Observation counts (converted from a tibble to a data frame in the previous
# chunk) by year, coloured by species, one panel per transect; points are
# jittered so overlapping counts stay visible
effort_plot <- ggplot(tallyObservations_typs) +
  geom_jitter(aes(as.factor(Year), n, color = Species)) +
  facet_grid(. ~ Transect) +
  labs(y = "Number of observations", x = "Year") +
  theme_minimal(base_size = 14)

effort_plot

```

# Calculate outlier 'yes' observations
This code calculates all pairwise differences in dates between all yes observations in each trail / year / plot / species combo, and identifies any observation where the minimum value is > 7.

For example, imagine a specific query : Species =='ANOC' & Year=='2013' & Site_Code =='RL9'

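1. *group_by* groups the rows by the given set of fields; it does not reorder them, so the difference to the previous row is only meaningful when the rows within each group are in date order
2. *lag* gets the previous row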

Returns

```
# Groups:   Transect, Year, Site_Code, Species [1]
  Transect          Year Site_Code Species Date      Observer                           QA.QC
  <chr>            <int> <chr>     <chr>   <chr>     <chr>                              <int>
1 Reflection Lakes  2013 RL9       ANOC    7/21/2013 Dave Purdon                            0
2 Reflection Lakes  2013 RL9       ANOC    7/23/2013 Anna Wilson; Cherry Chen               1
3 Reflection Lakes  2013 RL9       ANOC    7/23/2013 Weedy McCauley                         0
4 Reflection Lakes  2013 RL9       ANOC    7/24/2013 Brooke Upton                           0
5 Reflection Lakes  2013 RL9       ANOC    7/24/2013 Rita Moore; Dan Paquette               0
6 Reflection Lakes  2013 RL9       ANOC    7/25/2013 Carol Miltimore                        0
7 Reflection Lakes  2013 RL9       ANOC    7/26/2013 Carol Clingan                          0
8 Reflection Lakes  2013 RL9       ANOC    8/4/2013  Johndavid Hascup; Kristalyn Hascup     0
9 Reflection Lakes  2013 RL9       ANOC    8/10/2013 Bonnie Scott                           0
```

Rows 7 and 8 are the pair we use here: the difference between them (7/26 to 8/4, 9 days) is greater than 7.

TODO: I suggest making the pdiff threshold a variable we can play around with.

Question: does this only look at the difference between each observation and the previous flowering observation, or also at the difference between the previous and the next? In the latter case, what happens to the first observation of flowering (no previous observation)?

```{r}
# Gaps of more than 7 days between consecutive flowering ('yes') observations
# within each transect / year / plot / species group
PhenoSite %>%
  filter(Flower == 1) %>%
  mutate(doy = lubridate::yday(as.Date(Date, "%m/%d/%Y"))) %>%
  group_by(Transect, Year, Site_Code, Species) %>%
  # lag() compares each row with the row before it, so the observations must
  # be in date order within each group; group_by() alone does not sort them
  arrange(doy, .by_group = TRUE) %>%
  # difference to the previous flowering observation; the first observation
  # of a group is compared with itself and therefore gets 0
  mutate(pdiff = doy - lag(doy, default = doy[1])) %>%
  ungroup() %>%
  filter(pdiff > 7) %>%
  ggplot(aes(x = Site_Code, y = Species, fill = pdiff)) +
  # shape 21 is a filled circle: the default point shape has no fill, so the
  # pdiff mapping would otherwise not show on the points
  geom_jitter(shape = 21) +
  facet_grid(Transect ~ Year) +
  scale_fill_viridis_c(name = "Difference in DOY") +
  labs(title = 'Pairwise differences',
       subtitle = 'All species',
       x = "Site Code") +
  theme_minimal(base_size = 14) +
  theme(axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
```

# Pairwise differences but filter it only by focal species
#I got rid of this snippet of code, as I changed code above to only look at focal species

# Plot pairwise for RL only, looks like 128 observations for RL (no jitter)

```{r}
# Gaps of more than 7 days between consecutive flowering observations,
# Reflection Lakes only, restricted to the species in `species`; points are
# not jittered
PhenoSite %>%
  filter(Transect == 'Reflection Lakes' & Flower == 1 &
           Species %in% species) %>%
  mutate(doy = lubridate::yday(as.Date(Date, "%m/%d/%Y"))) %>%
  group_by(Transect, Year, Site_Code, Species) %>%
  # lag() compares each row with the row before it, so the observations must
  # be in date order within each group; group_by() alone does not sort them
  arrange(doy, .by_group = TRUE) %>%
  # difference to the previous flowering observation; the first observation
  # of a group is compared with itself and therefore gets 0
  mutate(pdiff = doy - lag(doy, default = doy[1])) %>%
  ungroup() %>%
  filter(pdiff > 7) %>%
  ggplot(aes(x = Site_Code, y = Species, fill = pdiff)) +
  # shape 21 is a filled circle: the default point shape has no fill, so the
  # pdiff mapping would otherwise not show on the points
  geom_point(shape = 21) +
  facet_grid(Transect ~ Year) +
  scale_fill_viridis_c(name = "Difference in DOY") +
  labs(title = 'Pairwise differences',
       subtitle = 'All species',
       x = "Site Code") +
  theme_minimal(base_size = 14) +
  theme(axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

```

# Plot pairwise for GB only
```{r}
# Gaps of more than 7 days between consecutive flowering observations,
# Glacier Basin only, restricted to the species in `species`; points are
# jittered
PhenoSite %>%
  filter(Transect == 'Glacier Basin' & Flower == 1 &
           Species %in% species) %>%
  mutate(doy = lubridate::yday(as.Date(Date, "%m/%d/%Y"))) %>%
  group_by(Transect, Year, Site_Code, Species) %>%
  # lag() compares each row with the row before it, so the observations must
  # be in date order within each group; group_by() alone does not sort them
  arrange(doy, .by_group = TRUE) %>%
  # difference to the previous flowering observation; the first observation
  # of a group is compared with itself and therefore gets 0
  mutate(pdiff = doy - lag(doy, default = doy[1])) %>%
  ungroup() %>%
  filter(pdiff > 7) %>%
  ggplot(aes(x = Site_Code, y = Species, fill = pdiff)) +
  # shape 21 is a filled circle: the default point shape has no fill, so the
  # pdiff mapping would otherwise not show on the points
  geom_jitter(shape = 21) +
  facet_grid(Transect ~ Year) +
  scale_fill_viridis_c(name = "Difference in DOY") +
  labs(title = 'Pairwise differences',
       subtitle = 'All species (jittered)',
       x = "Site Code") +
  theme_minimal(base_size = 14) +
  theme(axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

```