Merge branch 'release/v0.1.0' into main
ChristopherMarais committed Sep 2, 2024
2 parents 5a6396e + d3ffdb6 commit e062568
Showing 65 changed files with 1,268 additions and 464 deletions.
File renamed without changes.
File renamed without changes.
2 changes: 2 additions & 0 deletions 01_Data_raw/Biodiversity/PLACEHOLDER.txt
@@ -0,0 +1,2 @@
REMOVE THIS FILE AFTER ADDING DATA TO THIS FOLDER.
This is just so the folders are saved to Git.
2 changes: 2 additions & 0 deletions 01_Data_raw/Hydrology/PLACEHOLDER.txt
@@ -0,0 +1,2 @@
REMOVE THIS FILE AFTER ADDING DATA TO THIS FOLDER.
This is just so the folders are saved to Git.
2 changes: 2 additions & 0 deletions 01_Data_raw/Shellfish/PLACEHOLDER.txt
@@ -0,0 +1,2 @@
REMOVE THIS FILE AFTER ADDING DATA TO THIS FOLDER.
This is just so the folders are saved to Git.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -1,9 +1,10 @@
site,alternate_name,site_friendly,station_code,site_acronym,lat,long,start_date,end,wbid,location
site,alternate_name,site_friendly,StationCode,site_acronym,Latitude,Longitude,start_date,end,wbid,location
MICKLERS,N,Micklers,GTMMKNUT,MK,30.16073611,-81.36027778,,N,Lake,water_control
DEPGL1,N,Guana Lake 1,GTMGL1NUT,GL1,30.1504,-81.3604,,N,Lake,open_water
DEPGL2,N,Guana Lake 2,GTMGL2NUT,GL2,30.1161,-81.3511,,N,Lake,open_water
LAKE MIDDLE,DEPGL3,Lake Middle,GTMLMNUT,LM,30.08302,-81.34286,,N,Lake,open_water
LAKE MIDDLE,N,Lake Middle,GTMOLNUT,LM,30.08302,-81.34286,,Y,Lake,open_water
LAKE MIDDLE,N,Lake Middle,GTMOLNUT_dup,LM,30.08302,-81.34286,,Y,Lake,open_water
DEPGL4,N,Guana Lake 4,GTMGL4NUT,GL4,30.0451,-81.3351,,N,Lake,open_water
LAKE SOUTH,N,Lake South,GTMDNNUT,LS,30.023763,-81.327928,,Y,Lake,water_control
LAKE SOUTH,N,Lake South,GTMLSNUT,LS,30.023763,-81.327928,,N,Lake,water_control
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 2 additions & 0 deletions 01_Data_raw/Weather/PLACEHOLDER.txt
@@ -0,0 +1,2 @@
REMOVE THIS FILE AFTER ADDING DATA TO THIS FOLDER.
This is just so the folders are saved to Git.
11 changes: 3 additions & 8 deletions 02_Cleaning_scripts/HAB_FWC.R
@@ -11,13 +11,7 @@

library(tidyverse)

HAB <- read_csv("01_Data_raw/HAB/HAB_FWC.csv")

# Save it as an .Rdata file so it can be read into the Shiny app
save(HAB, file = "03_Data_for_app/HAB.RData")

saveRDS(HAB, "03_Data_for_app/HAB.Rds")

HAB <- read_csv("01_Data_raw/Water_Quality/HAB/HAB_FWC.csv")


# Create long format so it can be used in the Shiny app
@@ -28,7 +22,8 @@ HAB <- HAB %>%
names_to = "vars", values_to = "vals")
HAB$Date <- dmy(HAB$`Sample Date`)
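# (For context: dmy() parses day-month-year text, so a hypothetical `Sample Date`
# value such as "02/09/2024" becomes the Date 2024-09-02.)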

save(HAB, file = "03_Data_for_app/HAB.RData")
# Save it as an .Rds file so it can be read into the Shiny app
saveRDS(HAB, "03_Data_for_app/HAB.Rds")

# Check all unique categories
unique(HAB$Description)
54 changes: 32 additions & 22 deletions 02_Cleaning_scripts/WIN_data_clean.R
@@ -9,7 +9,24 @@ library(tidyverse)
GTMNERR <- st_read("03_Data_for_app/shapefiles_new/counties_GTMNERR.shp")
# CRS: NAD83 / UTM zone 17N
# WIN Data
gps_data <- read.csv("./01_Data_raw/WIN/WIN_data_merged_20240501.csv")

# Select only the variables we want straight away; this saves resources. Pick
# the columns that have "Org" in the name because those are the units/names/etc.
# submitted by the sampling organization - which should be the GTM for this (?)
gps_data <- read_csv("./01_Data_raw/Water_Quality/WIN/WIN_data_merged_20240501.csv",
col_select = c(`Monitoring Location ID`, `Activity Type`,
`Activity Start Date Time`, `Activity Depth`,
`DEP Result ID`, `Org Analyte Name`, `Org Result Value`,
`Org Result Unit`, `Org MDL`, `RowID`, `LocationID`,
`Station ID`, `Station Name`, `Station Type`, `County`,
`Location_1`, `Location_2`))

lookup_names <- read_csv("03_Data_for_app/WQ_lookup_names.csv")

# Change column names so we can later merge this with other WQ data
recode_vec <- setNames(lookup_names$original_name, lookup_names$dashboard_name)
gps_data <- gps_data %>%
rename(any_of(recode_vec))
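
# Toy illustration of the rename step (hypothetical lookup rows, not the real
# WQ_lookup_names.csv): if dashboard_name is c("Latitude", "Longitude") and
# original_name is c("Location_1", "Location_2"), setNames() builds
# c(Latitude = "Location_1", Longitude = "Location_2"), and rename(any_of(...))
# then renames only those columns that are actually present, e.g.
# tibble(Location_1 = 30.1, Location_2 = -81.3) %>%
#   rename(any_of(c(Latitude = "Location_1", Longitude = "Location_2")))
# returns a tibble with columns Latitude and Longitude.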

#### GTMNERR boundary and aquatic preserves ####

@@ -41,7 +58,7 @@ bound_box <- st_bbox(st_sfc(pt1, pt3, pt4, pt2, crs = st_crs(GTMNERR)))

# Filter GPS coordinates
# Convert to sf object
gps_sf <- st_as_sf(gps_data, coords = c("Location_2", "Location_1"), crs = 4326)
gps_sf <- st_as_sf(gps_data, coords = c("Longitude", "Latitude"), crs = 4326)

# Crop GPS points within the bounding box
gps_cropped <- st_crop(gps_sf, bound_box)
@@ -65,16 +82,16 @@ coordinates <- st_coordinates(gps_cropped)
WIN_df <- cbind(WIN_df, coordinates)

# Rename the coordinates columns if necessary
colnames(WIN_df)[(ncol(WIN_df)-1):ncol(WIN_df)] <- c("longitude", "latitude")
colnames(WIN_df)[(ncol(WIN_df)-1):ncol(WIN_df)] <- c("Longitude", "Latitude")

#### Keep only columns with varying information ####
# Function to remove columns with the same value in the whole column
remove_constant_columns <- function(df) {
df <- df[, sapply(df, function(col) length(unique(col)) > 1)]
return(df)
}

WIN_df <- remove_constant_columns(WIN_df)
# remove_constant_columns <- function(df) {
# df <- df[, sapply(df, function(col) length(unique(col)) > 1)]
# return(df)
# }
#
# WIN_df <- remove_constant_columns(WIN_df)

#### Reformat data to visualize easily ####
# turn WIN_df into long format with the following columns
@@ -83,29 +100,22 @@ WIN_df <- remove_constant_columns(WIN_df)

# Convert all columns to character before pivoting and retain the original row identifier
WIN_df <- WIN_df %>%
select(-all_of(c("Station.ID",
"Station.Name",
"Org.Analyte.Name",
"DEP.Result.Value.Number",
"DEP.MDL",
"DEP.PQL",
"Org.Detection.Unit",
"Org.Result.Unit",
"Activity.End.Date.Time"))) %>%
# Add a column to record the data source/provider
mutate(data_source = "WIN") %>% # or change this to DEP?
mutate(across(everything(), as.character)) %>%
mutate_all(~ na_if(., "")) %>%
pivot_longer(
cols = -c(RowID), # Exclude the Row_ID column from pivoting
names_to = "variable",
values_to = "value"
) %>%
mutate(value = na_if(value, ""))
)
# filter(!is.na(value) & value != "") # use this if space is an issue
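
# Shape sketch (made-up values): a wide row such as
#   tibble(RowID = "1", `Station Name` = "Micklers", `Org Result Value` = "7.2")
# pivots into three-column long format,
#   RowID  variable          value
#   1      Station Name      Micklers
#   1      Org Result Value  7.2
# i.e. every measurement and every piece of station metadata becomes its own row
# keyed by RowID.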

#### Save data ####
# Save the filtered data to a new CSV file
# write.csv(WIN_df,
# "03_Data_for_app/Filtered_WIN_data_merged_20240501.csv",
# row.names = FALSE)
# Save the filtered data to a .RData file
save(WIN_df, file = "03_Data_for_app/WIN.RData")
# Save the filtered data to a .RDs file

saveRDS(WIN_df, "03_Data_for_app/WIN.Rds")
60 changes: 45 additions & 15 deletions 02_Cleaning_scripts/WQ_GTMNERR.R
@@ -13,7 +13,7 @@ library(tidyverse)
library(readxl)

### 1. Read in data -----------------------------------------------------------
WQ <- read_excel("01_Data_raw/Guana_WQ/Guana_masterdata.xlsx",
WQ <- read_excel("01_Data_raw/Water_Quality/Guana_WQ/Guana_masterdata.xlsx",
sheet = 1, # There is only one sheet, but just to be safe
guess_max = 13000) # This is not ideal + cols 14 and 16 have a
# mix of logical and numbers. Lord.
@@ -24,10 +24,17 @@ WQ <- read_excel("01_Data_raw/Guana_WQ/Guana_masterdata.xlsx",
#col_types = c("SampleDate" = "date", "#RQ" = "text")) # If not specified you get
# warnings (as it expects logical; text only starts after row 1445)

WQ_meta <- read_csv("01_Data_raw/Guana_WQ/guana_data_dictionary_updateGK.csv")
WQ_meta <- read_csv("01_Data_raw/Water_Quality/Guana_WQ/guana_data_dictionary_updateGK.csv")
# Some stations have two codes due to a name change (see Word doc with metadata)
# Don't remove

lookup_names <- read_csv("03_Data_for_app/WQ_lookup_names.csv")

# Change column names so we can later merge this with other WQ data
recode_vec <- setNames(lookup_names$original_name, lookup_names$dashboard_name)
WQ <- WQ %>%
rename(any_of(recode_vec))

### 2. Check categorical values ------------------------------------------------
# Check station names, componentLong and componentShort (spelling etc)
unique(WQ$StationCode)
@@ -60,38 +67,61 @@ unique(WQ$Remark) # inconsistent... But there are some capital letters that
# Change station code column name so it is the same as the column in the data
names(WQ_meta)[names(WQ_meta) == "station_code"] <- "StationCode"

WQ$StationCode <- str_trim(WQ$StationCode)

WQ <- WQ %>%
left_join(WQ_meta) %>%
select(-Lat, -Long)
left_join(WQ_meta, by = c("StationCode")) %>% # not all stations in the original have lat/lon
mutate(Latitude = coalesce(Latitude.y, Latitude.x),
Longitude = coalesce(Longitude.y, Longitude.x)) %>%
select(-c(Latitude.x, Latitude.y, Longitude.x, Longitude.y))
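
# coalesce() keeps the first non-NA value, so the metadata coordinates (the .y
# columns from WQ_meta) take precedence and the original columns only fill in
# where the metadata is missing, e.g. coalesce(c(30.16, NA), c(30.10, 30.02))
# returns c(30.16, 30.02) (made-up coordinates).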

# Stations missing from metadata: GL1.5, GL2.5 and GL3.5 -> added manually and
# emailed Nikki

which(is.na(WQ$lat))
which(is.na(WQ$long))
# Check
which(is.na(WQ$Latitude))
which(is.na(WQ$Longitude))

# There is GTMOLNUT and GTMOLNUT_dup...? I think we can leave them in for now; I give
# them the same site, site acronym and site friendly name...
# And GTMLMNUT is the same? But DEP code? Fix this...

# WQ[which(is.na(WQ$Latitude)),] # duplicates??
# WQ <- WQ[-which(is.na(WQ$Latitude)),]

# Some station codes appear to have changed over time. Make sure only one code is
# reflected (otherwise we have issues with the dashboard)
# GTMDNNUT -> GTMLSNUT (Lake South)
# GTMDSNUT -> GTMRNNUT (River North)
# GTMOLNUT and GTMOLNUT_dup -> GTMLMNUT

replacement <- data.frame(StationCode = c("GTMDNNUT", "GTMDSNUT", "GTMOLNUT", "GTMOLNUT_dup"),
StationCode_repl = c("GTMLSNUT", "GTMRNNUT", "GTMLMNUT", "GTMLMNUT"))

WQ[which(is.na(WQ$lat)),] # duplicates?? Remove for now; emailed Nikki
WQ <- WQ[-which(is.na(WQ$lat)),]
WQ <- WQ %>%
left_join(replacement, by = "StationCode") %>%
mutate(StationCode = coalesce(StationCode_repl, StationCode)) %>%
select(-StationCode_repl)

# Create a separate dataframe with only station info, not the data (makes map
# too heavy)
WQ_locations <- WQ %>%
mutate(Year = year(SampleDate)) %>%
select(site_friendly, Year, site_acronym, lat, long, wbid, location) %>%
group_by(site_friendly, site_acronym, lat, long, wbid, location) %>%
summarize(maxYear = max(Year), minYear = min(Year))
select(site_friendly, Year, site_acronym, Latitude, Longitude, wbid, location) %>%
group_by(site_friendly, site_acronym, Latitude, Longitude, wbid, location) %>%
summarize(maxYear = max(Year), minYear = min(Year)) %>%
mutate(type = "Water quality",
dataset = "Guana Water Quality Monitoring (GTMNERR)")

WQ_data_available <- WQ %>%
mutate(Year = year(SampleDate)) %>%
select(StationCode, Year, SampleType, ComponentShort, ComponentLong, site_friendly,
site_acronym, lat, long, wbid, location) %>%
site_acronym, Latitude, Longitude, wbid, location) %>%
distinct()

### 4. Save data ---------------------------------------------------------------

# Save it as an .Rdata (and .Rds?) file so it can be read into the Shiny app
save(WQ, file = "03_Data_for_app/WQ.RData")
# Save it as an .Rds file so it can be read into the Shiny app
saveRDS(WQ, "03_Data_for_app/WQ.Rds")

save(WQ_locations, file = "03_Data_for_app/WQ_locations.RData")
saveRDS(WQ_locations, "03_Data_for_app/WQ_locations.Rds")
125 changes: 125 additions & 0 deletions 02_Cleaning_scripts/WQ_WIN_merge.R
@@ -0,0 +1,125 @@
########################################################################
########## NERRS Science Transfer project - GTMNERR #############
########################################################################

# Geraldine Klarenberg, PhD
# [email protected]
# 12 August 2024

# Load packages
library(tidyverse)
library(sf) # st_as_sf() further down needs the sf package

# Merge WIN and WQ data into one

WIN <- readRDS("03_Data_for_app/WIN.Rds")
WQ_GTMNERR <- readRDS("03_Data_for_app/WQ.Rds")

# Make WQ_GTMNERR long format as well, just like WIN
# Having a column for the dates is advisable though, as it is a separate data
# type. Update that later, not right now.

#### We need to update the variable list so the names are the same!

# First make sure that every row has a UNID and also add a column for the data
# source / provider
WQ_GTMNERR <- WQ_GTMNERR %>%
mutate(data_source = "GTMNERR") %>%
arrange(UNID, StationCode, SampleDate, ComponentShort)

for (i in 1:nrow(WQ_GTMNERR)){
if (is.na(WQ_GTMNERR$UNID[i])){
WQ_GTMNERR$UNID[i] <- WQ_GTMNERR$UNID[i-1] + 1
}
}
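
# Toy example of what the loop does (hypothetical UNIDs): after the arrange(),
# c(100, NA, NA, 250) becomes c(100, 101, 102, 250) -- each missing UNID is the
# previous row's UNID plus one. This assumes the first row always has a UNID.
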
# Check there are no duplicates
sum(duplicated(WQ_GTMNERR$UNID))

# Add a geometry column, to later use for clicking markers (we might change this)
WQ_GTMNERR <- st_as_sf(WQ_GTMNERR, coords = c("Longitude", "Latitude"),
crs = 4326, remove = FALSE)
# Turn back into dataframe with geometry as a column
WQ_GTMNERR <- as.data.frame(WQ_GTMNERR)

WQ_GTMNERR_long <- WQ_GTMNERR %>%
mutate(across(everything(), as.character)) %>%
mutate_all(~ na_if(., "")) %>%
pivot_longer(cols = -UNID,
names_to = "variable",
values_to = "value")

# How to deal with UNID when merging? Start counting anew (or add however far the
# one dataset is?)

min(as.numeric(WQ_GTMNERR_long$UNID)) # 1
max(as.numeric(WQ_GTMNERR_long$UNID)) # 17098

min(as.numeric(WIN$RowID)) # 55135
max(as.numeric(WIN$RowID)) # 3677602
# Appears there will be no overlap. Merge.

WQ_GTMNERR_long <- WQ_GTMNERR_long %>%
rename(RowID = UNID)

WQ_all <- WIN %>%
full_join(WQ_GTMNERR_long)

unique(WQ_all$variable)
unique(WQ_all$value)

# Read in the WQ vars lookup table and replace variables with the names we need
lookup_WQ_vars <- read_csv("03_Data_for_app/WQ_lookup_variables.csv")

# # I am sure there is a nicer/quicker/tidyverse way of doing this, but whatevs for now
# for (i in 1:nrow(WQ_all)){
# if (is.na(WQ_all$value[i])){
# next
# }
# for (j in 1:nrow(lookup_WQ_vars)){
# if (WQ_all$value[i] == lookup_WQ_vars$value[j]){
# WQ_all$value[i] <- lookup_WQ_vars$new[j]
# }
# }
# }

# Quicker way
WQ_all <- WQ_all %>%
left_join(lookup_WQ_vars, by = "value") %>%
mutate(value = coalesce(new, value)) %>%
select(-new)

# Filter for only the things that we need:

# In ComponentLong:

# Air temperature
# Ammonium (filtered)
# Chlorophyll
# Chlorophyll a (corrected)
# Chlorophyll a (uncorrected)
# Fecal coliform
# Dissolved oxygen
# Organic carbon
# pH
# Salinity
# Specific conductance
# Total nitrogen (TKN + nitrate + nitrite)
# Phosphorus (total)
# Total dissolved solids
# Turbidity
# Water temperature

# I did this as follows for now but I am NOT happy about it and we should update it,
# because this only removes the variable names but keeps everything else
# associated with that station (which might make things unnecessarily slow).
# Maybe we can do it by RowID? (see the sketch below the filter)
selected_values <- c("Air temperature", "Ammonium (filtered)", "Chlorophyll", "Chlorophyll a (corrected)",
"Chlorophyll a (uncorrected)", "Fecal coliform", "Dissolved oxygen", "Organic carbon",
"pH", "Salinity", "Specific conductance", "Total nitrogen (TKN + nitrate + nitrite)",
"Phosphorus (total)", "Total dissolved solids", "Turbidity", "Water temperature")

WQ_all <- WQ_all %>%
filter((variable == "ComponentLong" & value %in% selected_values) | variable != "ComponentLong")
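
# A possible RowID-based alternative (untested sketch): first find the RowIDs
# whose ComponentLong is one of the selected values, then keep only the rows with
# those RowIDs, so every other variable tied to a dropped record disappears too.
# This assumes each record we want to keep actually has a ComponentLong row.
# keep_ids <- WQ_all %>%
#   filter(variable == "ComponentLong", value %in% selected_values) %>%
#   pull(RowID)
# WQ_all <- WQ_all %>%
#   filter(RowID %in% keep_ids)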


# Save data
saveRDS(WQ_all, "03_Data_for_app/WQ_all.Rds")