Skip to content

Commit

Permalink
Merge branch 'feature/36_feature' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
gklarenberg committed Aug 30, 2024
2 parents 2da7f6d + b9afb0f commit d3ffdb6
Show file tree
Hide file tree
Showing 16 changed files with 567 additions and 124 deletions.
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
site,alternate_name,site_friendly,station_code,site_acronym,Latitude,Longitude,start_date,end,wbid,location
site,alternate_name,site_friendly,StationCode,site_acronym,Latitude,Longitude,start_date,end,wbid,location
MICKLERS,N,Micklers,GTMMKNUT,MK,30.16073611,-81.36027778,,N,Lake,water_control
DEPGL1,N,Guana Lake 1,GTMGL1NUT,GL1,30.1504,-81.3604,,N,Lake,open_water
DEPGL2,N,Guana Lake 2,GTMGL2NUT,GL2,30.1161,-81.3511,,N,Lake,open_water
LAKE MIDDLE,DEPGL3,Lake Middle,GTMLMNUT,LM,30.08302,-81.34286,,N,Lake,open_water
LAKE MIDDLE,N,Lake Middle,GTMOLNUT,LM,30.08302,-81.34286,,Y,Lake,open_water
LAKE MIDDLE,N,Lake Middle,GTMOLNUT_dup,LM,30.08302,-81.34286,,Y,Lake,open_water
DEPGL4,N,Guana Lake 4,GTMGL4NUT,GL4,30.0451,-81.3351,,N,Lake,open_water
LAKE SOUTH,N,Lake South,GTMDNNUT,LS,30.023763,-81.327928,,Y,Lake,water_control
LAKE SOUTH,N,Lake South,GTMLSNUT,LS,30.023763,-81.327928,,N,Lake,water_control
Expand Down
36 changes: 18 additions & 18 deletions 02_Cleaning_scripts/WIN_data_clean.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,17 @@ library(tidyverse)
GTMNERR <- st_read("03_Data_for_app/shapefiles_new/counties_GTMNERR.shp")
# CRS: NAD83 / UTM zone 17N
# WIN Data
gps_data <- read_csv("./01_Data_raw/Water_Quality/WIN/WIN_data_merged_20240501.csv")

# Read the merged WIN export, keeping only the columns we need. Selecting the
# columns at read time (col_select) saves resources compared with reading
# everything and dropping columns afterwards.
# The "Org ..." columns are picked because those hold the names/values/units
# as submitted by the sampling organization - which should be the GTM for this
# dataset (TODO: confirm).
gps_data <- read_csv("./01_Data_raw/Water_Quality/WIN/WIN_data_merged_20240501.csv",
col_select = c(`Monitoring Location ID`, `Activity Type`,
`Activity Start Date Time`, `Activity Depth`,
`DEP Result ID`, `Org Analyte Name`, `Org Result Value`,
`Org Result Unit`, `Org MDL`, `RowID`, `LocationID`,
`Station ID`, `Station Name`, `Station Type`, `County`,
`Location_1`, `Location_2`))

lookup_names <- read_csv("03_Data_for_app/WQ_lookup_names.csv")

Expand Down Expand Up @@ -76,12 +86,12 @@ colnames(WIN_df)[(ncol(WIN_df)-1):ncol(WIN_df)] <- c("Longitude", "Latitude")

#### Keep only columns with varying information ####
# Remove columns that hold the same value in every row.
#
# df: a data frame (or tibble).
# Returns `df` restricted to the columns that contain more than one distinct
# value (NA counts as a value, as in unique()). The result is always a data
# frame, also when only one column - or no column at all - remains.
remove_constant_columns <- function(df) {
  # vapply() always yields a logical vector. sapply() returns an empty list
  # for a zero-column input, which is not a valid column index.
  varies <- vapply(df, function(col) length(unique(col)) > 1, logical(1))
  # drop = FALSE keeps a single surviving column from collapsing to a vector.
  df[, varies, drop = FALSE]
}

WIN_df <- remove_constant_columns(WIN_df)
# remove_constant_columns <- function(df) {
# df <- df[, sapply(df, function(col) length(unique(col)) > 1)]
# return(df)
# }
#
# WIN_df <- remove_constant_columns(WIN_df)

#### Reformat data to visualize easily ####
# turn WIN_df into long format with the following columns
Expand All @@ -90,15 +100,6 @@ WIN_df <- remove_constant_columns(WIN_df)

# Convert all columns to character before pivoting and retain the original row identifier
WIN_df <- WIN_df %>%
select(-all_of(c("StationID",
"StationName",
"OrgAnalyteName",
"OrgResultValue", #GK: changed this from "DEPResultValueNumber", as I believe it is the DEP value we want to keep?
"DEPMDL",
"DEPPQL",
"OrgDetectionUnit",
"OrgResultUnit",
"ActivityEndDateTime"))) %>%
# Add a column to record the data source/provider
mutate(data_source = "WIN") %>% # or change this to DEP?
mutate(across(everything(), as.character)) %>%
Expand All @@ -107,8 +108,7 @@ WIN_df <- WIN_df %>%
cols = -c(RowID), # Exclude the Row_ID column from pivoting
names_to = "variable",
values_to = "value"
) %>%
mutate(value = na_if(value, ""))
)
# filter(!is.na(value) & value != "") # use this if space is an issue

#### Save data ####
Expand Down
30 changes: 27 additions & 3 deletions 02_Cleaning_scripts/WQ_GTMNERR.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,17 +67,41 @@ unique(WQ$Remark) # inconsistent... But there are some capital letters that
# Change station code column name so it is the same as the column in the data
names(WQ_meta)[names(WQ_meta) == "station_code"] <- "StationCode"

WQ$StationCode <- str_trim(WQ$StationCode)

WQ <- WQ %>%
left_join(WQ_meta)
left_join(WQ_meta, by = c("StationCode")) %>% # not all stations in the original have lat/lon
mutate(Latitude = coalesce(Latitude.y, Latitude.x),
Longitude = coalesce(Longitude.y, Longitude.x)) %>%
select(-c(Latitude.x, Latitude.y, Longitude.x, Longitude.y))

# Stations missing from metadata: GL1.5, GL2.5 and GL3.5 -> added manually and
# emailed Nikki

# Check
which(is.na(WQ$Latitude))
which(is.na(WQ$Longitude))

WQ[which(is.na(WQ$Latitude)),] # duplicates?? Remove for now; emailed Nikki
WQ <- WQ[-which(is.na(WQ$Latitude)),]
# There are both GTMOLNUT and GTMOLNUT_dup...? I think we can leave them in for
# now; I give them the same site, site acronym and site friendly name...
# And GTMLMNUT is the same site? But with a DEP code (alternate name DEPGL3)? Fix this...

# WQ[which(is.na(WQ$Latitude)),] # duplicates??
# WQ <- WQ[-which(is.na(WQ$Latitude)),]

# Some station codes appear to have changed over time. Make sure only one code
# per station is used (otherwise we have issues with the dashboard):
# GTMDNNUT -> GTMLSNUT (Lake South)
# GTMDSNUT -> GTMRNNUT (River North)
# GTMOLNUT and GTMOLNUT_dup -> GTMLMNUT

# Lookup table: old station code (StationCode) and the code that replaces it
# (StationCode_repl)
replacement <- data.frame(StationCode = c("GTMDNNUT", "GTMDSNUT", "GTMOLNUT", "GTMOLNUT_dup"),
StationCode_repl = c("GTMLSNUT", "GTMRNNUT", "GTMLMNUT", "GTMLMNUT"))

# Attach the replacement code to every row; StationCode_repl is NA for station
# codes that are not listed in the lookup table
WQ <- WQ %>%
left_join(replacement, by = "StationCode") %>%
# Use the replacement code where there is one, otherwise keep the original
mutate(StationCode = coalesce(StationCode_repl, StationCode)) %>%
# The helper column is no longer needed
select(-StationCode_repl)

# Create a separate dataframe with only station info, not the data (makes map
# too heavy)
Expand Down
60 changes: 59 additions & 1 deletion 02_Cleaning_scripts/WQ_WIN_merge.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ WQ_GTMNERR <- readRDS("03_Data_for_app/WQ.Rds")
# Having a column for the dates is advisable though, as it is a separate data
# type. Update that later, not right now.

#### We need to update the variable list so the names are the same!

# First make sure that every row has a UNID and also add a column for the data
# source / provider
WQ_GTMNERR <- WQ_GTMNERR %>%
Expand Down Expand Up @@ -49,7 +51,7 @@ WQ_GTMNERR_long <- WQ_GTMNERR %>%
# one dataset is?)

min(as.numeric(WQ_GTMNERR_long$UNID)) # 1
max(as.numeric(WQ_GTMNERR_long$UNID)) # 5016 (used to be 17098?)
max(as.numeric(WQ_GTMNERR_long$UNID)) # 17098

min(as.numeric(WIN$RowID)) # 55135
max(as.numeric(WIN$RowID)) # 3677602
Expand All @@ -62,6 +64,62 @@ WQ_all <- WIN %>%
full_join(WQ_GTMNERR_long)

unique(WQ_all$variable)
unique(WQ_all$value)

# Read in the WQ vars lookup table and replace variables with the names we need
lookup_WQ_vars <- read_csv("03_Data_for_app/WQ_lookup_variables.csv")

# # I am sure there is a nicer/quicker/tidyverse way of doing this, but whatevs for now
# for (i in 1:nrow(WQ_all)){
# if (is.na(WQ_all$value[i])){
# next
# }
# for (j in 1:nrow(lookup_WQ_vars)){
# if (WQ_all$value[i] == lookup_WQ_vars$value[j]){
# WQ_all$value[i] <- lookup_WQ_vars$new[j]
# }
# }
# }

# Quicker way: a vectorised lookup join instead of the nested loop.
#
# Several analyte names in the lookup table differ only by trailing white
# space, and read_csv() trims unquoted fields by default (trim_ws = TRUE), so
# the lookup can end up with repeated keys. With repeated keys left_join()
# returns one row per match and silently duplicates measurements. Values in
# the data that still carry trailing white space would also fail to match.
# Therefore: normalise white space on both sides, keep one lookup row per key,
# and check that the join did not change the number of rows.
lookup_WQ_vars_clean <- lookup_WQ_vars %>%
  mutate(value = trimws(value), new = trimws(new)) %>%
  distinct(value, .keep_all = TRUE)

n_rows_before_lookup <- nrow(WQ_all)

WQ_all <- WQ_all %>%
  # Join on a trimmed copy so the original value is kept where nothing matches
  mutate(value_key = trimws(value)) %>%
  left_join(lookup_WQ_vars_clean, by = c("value_key" = "value")) %>%
  # Use the harmonised name where there is one, otherwise keep the value as is
  mutate(value = coalesce(new, value)) %>%
  select(-new, -value_key)

# A lookup join must never add or remove rows
stopifnot(nrow(WQ_all) == n_rows_before_lookup)

# Filter for only the things that we need:

# In ComponentLong:

# Air temperature
# Ammonium (filtered)
# Chlorophyll
# Chlorophyll a (corrected)
# Chlorophyll a (uncorrected)
# Fecal coliform
# Dissolved oxygen
# Organic carbon
# pH
# Salinity
# Specific conductance
# Total nitrogen (TKN + nitrate + nitrite)
# Phosphorus (total)
# Total dissolved solids
# Turbidity
# Water temperature

# I did this as follows for now, but I am NOT happy about it and we should
# update it: this only removes the ComponentLong rows of the variables we do
# not need, and keeps every other row (which might make things unnecessarily
# slow). Maybe we can do it by RowID instead?
# Variable names (after harmonising with the lookup table) to keep
selected_values <- c("Air temperature", "Ammonium (filtered)", "Chlorophyll", "Chlorophyll a (corrected)",
"Chlorophyll a (uncorrected)", "Fecal coliform", "Dissolved oxygen", "Organic carbon",
"pH", "Salinity", "Specific conductance", "Total nitrogen (TKN + nitrate + nitrite)",
"Phosphorus (total)", "Total dissolved solids", "Turbidity", "Water temperature")

# Keep ComponentLong rows only if their value is in selected_values; rows of
# any other variable are kept regardless of their value
WQ_all <- WQ_all %>%
filter((variable == "ComponentLong" & value %in% selected_values) | variable != "ComponentLong")


# Save data
saveRDS(WQ_all, "03_Data_for_app/WQ_all.Rds")
Binary file modified 03_Data_for_app/WIN.Rds
Binary file not shown.
Binary file modified 03_Data_for_app/WQ.Rds
Binary file not shown.
Binary file modified 03_Data_for_app/WQ_all.Rds
Binary file not shown.
Binary file modified 03_Data_for_app/WQ_locations.Rds
Binary file not shown.
18 changes: 9 additions & 9 deletions 03_Data_for_app/WQ_lookup_names.csv
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,17 @@ WIN,Activity Updated Date,ActivityUpdatedDate
WIN,DEP Result ID,DEPResultID
WIN,Org Result ID,OrgResultID
WIN,DEP Analyte Group,DEPAnalyteGroup
WIN,DEP Analyte Name,ComponentLong
WIN,DEP Analyte Name,DEP Analyte Name
WIN,ADaPT Analyte ID,ADaPTAnalyteID
WIN,Org Analyte Name,OrgAnalyteName
WIN,Org Result Value,OrgResultValue
WIN,Org Result Unit,OrgResultUnit
WIN,Org Analyte Name,ComponentLong
WIN,Org Result Value,Result
WIN,Org Result Unit,Unit
WIN,Org MDL,OrgMDL
WIN,Org PQL,OrgPQL
WIN,Org Detection Unit,OrgDetectionUnit
WIN,DEP Result Value Number,Result
WIN,DEP Result Value Number,DEP Result Value Number
WIN,DEP Result Value Text,DEPResultValueText
WIN,DEP Result Unit,Unit
WIN,DEP Result Unit,DEP Result Unit
WIN,DEP MDL,DEPMDL
WIN,DEP PQL,DEPPQL
WIN,Value Qualifier,ValueQualifier
Expand Down Expand Up @@ -77,9 +77,9 @@ WIN,Result Updated Date,ResultUpdatedDate
WIN,RowID,RowID
WIN,LocationID,LocationID
WIN,ProgramID,ProgramID
WIN,Station ID,StationID
WIN,Station Name,StationName
WIN,Station Type,StationType
WIN,Station ID,StationCode
WIN,Station Name,site_friendly
WIN,Station Type,wbid
WIN,County,County
WIN,Start Date,StartDate
WIN,End Date,EndDate
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ App_dev/01_Data_raw/Water_Quality/Guana_WQ/Guana_masterdata.xlsx.
1. Copy-pasted the header names to App_dev/03_Data_for_app/WQ_lookup_names.csv and named the column original_name
2. Added a column data_sources with either WIN or GTMNERR
3. Created a column dashboard_name and copied all the names from original_name to it
4. For now (08/13/2024) I am only changing latitude, longitude, dates, sample type, components/analytes measured (DEP, not org). For visualization purposes. We can rename the rest later (analytes themselves will
also need to be renamed, as well as info on station names and types).
5. For the new names I also removed the spaces (because R inserts points there).
4. For now (08/13/2024) I am only changing latitude, longitude, dates, sample type, components/analytes measured (DEP, not org) to the names in the GTMNERR data. For visualization purposes. We can rename the rest later (analytes themselves will also need to be renamed, as well as info on station names and types).
5. For the new names I also removed the spaces (because R inserts points there).

23 August 2024
Updated lookup table so that the org result value, result unit and analyte name are used instead of the DEP ones (they now map to Result, Unit and ComponentLong). Also turned "Station ID" into "StationCode", "Station Type" into "wbid", and "Station Name" into "site_friendly".
109 changes: 109 additions & 0 deletions 03_Data_for_app/WQ_lookup_variables.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
value,new
"2,4-Dichlorophenoxyacetic acid","2,4-Dichlorophenoxyacetic acid"
"2,4,5-T","2,4,5-T"
Acesulfame K,Acesulfame K
Acetaminophen ,Acetaminophen
Acetamiprid,Acetamiprid
Afidopyropen,Afidopyropen
Air temperature,Air temperature
"Ammonium, Filtered",Ammonium (filtered)
"Ammonium, Filtered ",Ammonium (filtered)
AMPA,AMPA
Bentazon,Bentazon
Bentazon ,Bentazon
Benzovindiflupyr,Benzovindiflupyr
bird specific Helicobacter GFD genetic marker,bird specific Helicobacter GFD genetic marker
Bromide,Bromide
canine-specific DG3 Bacteroides genetic marker,canine-specific DG3 Bacteroides genetic marker
Carbamazepine,Carbamazepine
Carbamazepine ,Carbamazepine
Chloride,Chloride
Chlorophyll,Chlorophyll
Chlorophyll a- corrected,Chlorophyll a (corrected)
Chlorophyll a- uncorrected,Chlorophyll a (uncorrected)
"Chlorophyll a, Corrected (Monochromatic)",Chlorophyll a (corrected)
"Chlorophyll a, Uncorrected (Trichromatic)",Chlorophyll a (uncorrected)
Chlorophyll b (Trichromatic),Chlorophyll b (Trichromatic)
Chlorophyll c (Trichromatic),Chlorophyll c (Trichromatic)
Chlorophyll_bottom,Chlorophyll_bottom
Clothianidin,Clothianidin
coastal bird specific Catellicoccus marimammalium Gull2 genetic marker,coastal bird specific Catellicoccus marimammalium Gull2 genetic marker
"Coliform, Fecal",Fecal coliform
Color (true),Color (true)
Copper,Copper
Dinotefuran,Dinotefuran
Dissolved Oxygen,Dissolved oxygen
Dissolved Oxygen Saturation,Dissolved Oxygen Saturation
"Dissolved oxygen, bottom","Dissolved oxygen, bottom"
"Dissolved oxygen, percent saturation","Dissolved oxygen, percent saturation"
"Dissolved oxygen, percent saturation, bottom","Dissolved oxygen, percent saturation, bottom"
Diuron,Diuron
Endothall,Endothall
Enterococcus,Enterococcus
Escherichia Coli-Quanti-Tray,Escherichia Coli-Quanti-Tray
Fenuron,Fenuron
Fluoride,Fluoride
Fluridone,Fluridone
Fluridone ,Fluridone
Glufosinate,Glufosinate
Glyphosate,Glyphosate
human-specific HF183 Bacteroides genetic marker,human-specific HF183 Bacteroides genetic marker
Hydrocodone,Hydrocodone
Hydrocodone ,Hydrocodone
Ibuprofen,Ibuprofen
Ibuprofen ,Ibuprofen
Imazapyr,Imazapyr
Imidacloprid,Imidacloprid
"Kjeldahl Nitrogen, Filtered","Kjeldahl Nitrogen, Filtered"
"Kjeldahl Nitrogen, Filtered ","Kjeldahl Nitrogen, Filtered "
Linuron,Linuron
Mandestrobin,Mandestrobin
Methylchlorophenoxypropionic acid,Methylchlorophenoxypropionic acid
Naproxen,Naproxen
Nitrate+Nitrite,Nitrate+Nitrite
"Nitrogen, Kjeldahl",TKN (Total Kjeldahl nitrogen: ammonia nitrogen + organic nitrogen)
"Nitrogen, Nitrite (NO2) + Nitrate (NO3) as N",Total nitrogen (TKN + nitrate + nitrite)
OD664b/OD665a,OD664b/OD665a
Organic Carbon,Organic carbon
Orthophosphate,Orthophosphate
Orthophosphate ,Orthophosphate
"Orthophosphate, Filtered","Orthophosphate, Filtered"
pH,pH
"pH, bottom","pH, bottom"
Phaeophytin a,Phaeophytin a
Pheophytin a,Pheophytin a
Phosphorus- Total,Phosphorus (total)
Phycoerythrin,Phycoerythrin
"Phycoerythrin, bottom","Phycoerythrin, bottom"
Primidone,Primidone
Pyraclostrobin,Pyraclostrobin
Ruminant specific Bacteroidetes BacR genetic marker,Ruminant specific Bacteroidetes BacR genetic marker
Salinity,Salinity
"Salinity, bottom","Salinity, bottom"
Secchi Disk,Secchi Disk
Silvex,Silvex
Specific Conductance,Specific conductance
"Specific Conductance, bottom","Specific Conductance, bottom"
Sucralose,Sucralose
Sulfate,Sulfate
TDS,Total dissolved solids
"Temperature, Water",Water temperature
Thiamethoxam,Thiamethoxam
Tolfenpyrad,Tolfenpyrad
Total Alkalinity ,Total Alkalinity
Total Kjeldahl Nitrogen,TKN (Total Kjeldahl nitrogen: ammonia nitrogen + organic nitrogen)
Total Kjeldahl Nitrogen ,TKN (Total Kjeldahl nitrogen: ammonia nitrogen + organic nitrogen)
Total Kjeldahl Nitrogen ,TKN (Total Kjeldahl nitrogen: ammonia nitrogen + organic nitrogen)
Total Kjeldahl Nitrogen Filtered,Total Kjeldahl Nitrogen Filtered
Total Nitrogen,Total nitrogen (TKN + nitrate + nitrite)
Total Phosphorus,Phosphorus (total)
Total Phosphorus ,Phosphorus (total)
Total Suspended Solids,Total Suspended Solids
Triclopyr,Triclopyr
Turbidity,Turbidity
Turbidity ,Turbidity
Water Depth,Water Depth
Water temperature,Water temperature
"Water temperature, bottom","Water temperature, bottom"
Wind Direction,Wind Direction
Wind Speed,Wind Speed
31 changes: 31 additions & 0 deletions 03_Data_for_app/WQ_lookup_variables_creation.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Name: Geraldine Klarenberg
Date: 26 August 2024

Created a .csv file as a lookup table to convert names of variables (i.e. analytes) to the same names so we can actually use it in a dropdown. For now I am only changing the ones that we will use in the first prototype of the dashboard.

1. From WIN_data_merged_20240501 I took all the unique names listed in the column "Org Analyte Name".
2. For the Guana WQ (NUTS?) data I took the unique names in the column ComponentLong in Guana_masterdata.xlsx
3. These were put in the column "value" in the lookup table: 108 variables. Note that some variable names appear duplicated, but that is because some have trailing blanks/white spaces after the names (which R will pick up as a "different" name).
4. These were sorted alphabetically.
5. The following consistent variable names were used and put in the column "new", on the row with the associated original variable name:

Air temperature
Ammonium (filtered)
Chlorophyll
Chlorophyll a (corrected)
Chlorophyll a (uncorrected)
Fecal coliform
Dissolved oxygen
Organic carbon
pH
Salinity
Specific conductance
Total nitrogen (TKN + nitrate + nitrite)
Phosphorus (total)
Total dissolved solids
Turbidity
Water temperature

NOTE: The nitrogen variables need to be updated/checked, as right now I do not have the detailed data dictionary for the WIN data.

6. For the remaining variables the original name was copy-pasted in the "new" column.
Loading

0 comments on commit d3ffdb6

Please sign in to comment.